From a9464fadec85393f0344cba9c9e94b125f170445 Mon Sep 17 00:00:00 2001
From: Adam Yang <31109344+adam-yang@users.noreply.github.com>
Date: Fri, 18 Apr 2025 11:14:14 -0700
Subject: [PATCH 01/11] Adding remat piece by piece

---
 llvm/lib/Target/AMDGPU/AMDGPU.h               |    4 +
 .../AMDGPU/AMDGPUHotBlockRematerialize.cpp    | 1303 +++++++++++++++++
 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp     |  217 +++
 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h       |   62 +
 .../AMDGPUOccupancyAndLatencyHelper.cpp       |   18 +
 .../AMDGPU/AMDGPUOccupancyAndLatencyHelper.h  |   53 +
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   11 +
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |    3 +
 llvm/lib/Target/AMDGPU/GCNRegPressure.h       |    4 +
 9 files changed, 1675 insertions(+)
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 4ff761ec19b3c..1ba8e3e2a54d3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -530,6 +530,10 @@ extern char &GCNRewritePartialRegUsesID;
 void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &);
 extern char &AMDGPUWaitSGPRHazardsLegacyID;
 
+void initializeAMDGPUHotBlockRematerializePass(llvm::PassRegistry &);
+FunctionPass *createAMDGPUHotBlockRematerializePass();
+extern char &AMDGPUHotBlockRematerializeID;
+
 namespace AMDGPU {
 enum TargetIndex {
   TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
new file mode 100644
index 0000000000000..70b25beeb22b9
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -0,0 +1,1303 @@
+//===- AMDGPUHotBlockRematerialize.cpp - AMDGPU Hot Block Rematerialize ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU hot block rematerialization pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMIRUtils.h"
+#include "AMDGPUOccupancyAndLatencyHelper.h"
+#include "AMDGPU.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "GCNRegPressure.h"
+
+#define DEBUG_TYPE "amdgpu-hot-block-remat"
+
+using namespace llvm;
+
+static cl::opt<unsigned> TargetOccupancy("amdgpu-remat-target-occupancy");
+
+namespace {
+
+typedef DenseSet<MachineInstr *> InstSet;
+typedef DenseSet<MachineBasicBlock *> BlockSet;
+
+struct RematNode {
+  enum class RematKind {
+    Candidate, // Not ready yet.
+    OneDefOneUse,
+    Clone,
+  };
+  RematNode()
+      : Reg(0), DefMI(nullptr), InsertBlock(nullptr), InsertPointMI(nullptr),
+        Kind(RematKind::Candidate), Size(0) {}
+  RematNode(unsigned R, MachineInstr *MI, unsigned S)
+      : Reg(R), DefMI(MI), InsertBlock(nullptr), InsertPointMI(nullptr),
+        Kind(RematKind::Candidate), Size(S) {}
+  unsigned Reg;
+  MachineInstr *DefMI;
+  MachineBasicBlock *InsertBlock;
+  union {
+    MachineInstr *InsertPointMI;
+    unsigned UserCount;
+  };
+  RematKind Kind;
+  unsigned Size;
+};
+
+struct BlockLiveInfo {
+  MachineBasicBlock *BB;
+  unsigned MaxSReg;
+  unsigned MaxVReg;
+  // Input live is the set of registers that are live across blocks.
+  const GCNRPTracker::LiveRegSet InputLive;
+};
+
+struct RematStatus {
+  unsigned TargetOcc;
+  unsigned TargetVLimit;
+  unsigned TargetSLimit;
+  unsigned MaxVPressure;
+  unsigned MaxSPressure;
+  unsigned InputPhysicalVPressure;
+  unsigned InputPhysicalSPressure;
+  // More occupancy can help more than the latency cost of reaching it.
+  bool MemBound;
+  // abs(VTargetOcc - STargetOcc) > 1.
+  bool NotBalance;
+  DenseMap<MachineBasicBlock *, GCNRegPressure> MBBPressureMap;
+  DenseMap<MachineBasicBlock *, GCNRPTracker::LiveRegSet> MBBInputLiveMap;
+  DenseMap<MachineBasicBlock *, GCNRPTracker::LiveRegSet> MBBOutputLiveMap;
+  // Collect MBBs which have memory writes. When moving instructions across an
+  // MBB, skip mem instructions if the MBB has a memory write. To make things
+  // fast, just check mayStore and isBarrier.
+  DenseSet<MachineBasicBlock *> MemWriteMBBSet;
+};
+
+class AMDGPUHotBlockRematerialize : public MachineFunctionPass {
+
+public:
+  static char ID;
+
+  DenseSet<const MachineInstr *> TotalUniformInsts;
+  DenseSet<const MachineInstr *> SafeToRemoveInsts;
+  DenseSet<const MachineInstr *> DivergentInsts;
+  void removeInst(const MachineInstr *MI) {
+    TotalUniformInsts.erase(MI);
+    SafeToRemoveInsts.erase(MI);
+    DivergentInsts.erase(MI);
+  }
+
+  AMDGPUHotBlockRematerialize() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  void applyRemat(MapVector<Register, RematNode> &RematMap,
+                  std::vector<BlockLiveInfo> &HotBlocks,
+                  MachineDominatorTree *DT, llvm::SlotIndexes *SlotIndexes,
+                  MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+                  const SIInstrInfo *SIII, MachineFunction &MF);
+  void applyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI,
+                              llvm::SlotIndexes *SlotIndexes,
+                              const SIRegisterInfo *SIRI,
+                              const SIInstrInfo *SIII);
+  void applyCloneRemat(RematNode &Node, std::vector<BlockLiveInfo> &HotBlocks,
+                       MachineDominatorTree *DT, MachineRegisterInfo &MRI,
+                       llvm::SlotIndexes *SlotIndexes,
+                       const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
+                       MachineFunction &MF);
+  bool hotBlockRemat(MachineFunction &MF, MachineLoopInfo *MLI,
+                     LiveIntervals *LIS, MachineDominatorTree *DT,
+                     MachinePostDominatorTree *PDT, bool &IsNearTarget);
+
+  StringRef getPassName() const override { return "AMDGPU rematerialize"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineLoopInfoWrapperPass>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
+    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
+    AU.addRequired<SlotIndexesWrapperPass>();
+    AU.addRequired<LiveIntervalsWrapperPass>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+void AMDGPUHotBlockRematerialize::applyRemat(
+    MapVector<Register, RematNode> &RematMap,
+    std::vector<BlockLiveInfo> &HotBlocks, MachineDominatorTree *DT,
+    llvm::SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI,
+    const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, MachineFunction &MF) {
+  std::vector<RematNode> UpdateList;
+  for (auto &It : RematMap) {
+    UpdateList.emplace_back(It.second);
+  }
+  // Sort the update list by SlotIndex to make sure a def is moved before its
+  // use. If a use were moved before the def, it might not be the first use
+  // anymore.
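+  // SlotIndexes assigns every instruction in the function a unique, totally
+  // ordered index, so comparing the defs' indices below is enough to visit
+  // earlier defs first.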
+  std::sort(UpdateList.begin(), UpdateList.end(),
+            [&SlotIndexes](RematNode &I, RematNode &J) {
+              SlotIndex A = SlotIndexes->getInstructionIndex(*I.DefMI);
+              SlotIndex B = SlotIndexes->getInstructionIndex(*J.DefMI);
+              return A < B;
+            });
+
+  for (RematNode &Node : UpdateList) {
+    if (Node.Kind == RematNode::RematKind::OneDefOneUse) {
+      applyOneDefOneUseRemat(Node, MRI, SlotIndexes, SIRI, SIII);
+    } else if (Node.Kind == RematNode::RematKind::Clone) {
+      applyCloneRemat(Node, HotBlocks, DT, MRI, SlotIndexes, SIRI, SIII, MF);
+    }
+  }
+}
+
+unsigned collectMBBPressure(MachineBasicBlock &MBB, LiveIntervals *LIS,
+                            const GCNSubtarget *ST, unsigned &MaxVPressure,
+                            unsigned &MaxSPressure, RematStatus &Status) {
+  // Skip processing the current block if it has only debug instructions.
+  if (MBB.getFirstNonDebugInstr() == MBB.end())
+    return ST->getOccupancyWithNumVGPRs(0);
+  auto BBEnd = MBB.rbegin();
+  GCNUpwardRPTracker RPTracker(*LIS);
+  // R.End doesn't point to the boundary instruction.
+  // Skip debug instructions.
+  if (!llvm::getNonDebugMBBEnd(BBEnd, MBB))
+    return ST->getOccupancyWithNumVGPRs(0);
+
+  GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[&MBB];
+  RPTracker.reset(*BBEnd, &OutputLive, true);
+
+  for (auto I = MBB.rbegin(), B = MBB.rend(); I != B;) {
+    MachineInstr &MI = (*I++);
+    RPTracker.recede(MI);
+    if (MI.mayStore() || (MI.isBarrier() && MI.getOpcode() != AMDGPU::S_BRANCH))
+      Status.MemWriteMBBSet.insert(&MBB);
+  }
+
+  GCNRegPressure RP = RPTracker.getMaxPressureAndReset();
+  unsigned SPressure = RP.getMaxSGPR();
+  if (SPressure > MaxSPressure) {
+    MaxSPressure = SPressure;
+  }
+  if (RP.getVGPRNum(ST->hasGFX90AInsts()) > MaxVPressure) {
+    MaxVPressure = RP.getVGPRNum(ST->hasGFX90AInsts());
+  }
+  Status.MBBPressureMap[&MBB] = RP;
+  return RP.getOccupancy(*ST);
+}
+
+unsigned collectFnPressure(MachineFunction &MF, LiveIntervals *LIS,
+                           const MachineRegisterInfo &MRI,
+                           const GCNSubtarget *ST, unsigned &MaxVPressure,
+                           unsigned &MaxSPressure, RematStatus &Status) {
+  unsigned TgtOcc = ST->getOccupancyWithWorkGroupSizes(MF).second;
+  // If there is only one block, the input/output virtual live sets are empty.
+  if (MF.size() > 1) {
+    // Build the input/output live regs first.
+    auto *SlotIndexes = LIS->getSlotIndexes();
+    DenseMap<MachineBasicBlock *, SlotIndex> MBBInputSlotMap;
+    DenseMap<MachineBasicBlock *, SlotIndex> MBBOutputSlotMap;
+    for (MachineBasicBlock &MBB : MF) {
+      auto BBBegin = MBB.getFirstNonDebugInstr();
+      if (BBBegin != MBB.end()) {
+        auto SI = SlotIndexes->getInstructionIndex(*BBBegin);
+        MBBInputSlotMap[&MBB] = SI;
+      }
+
+      auto BBEnd = MBB.rbegin();
+
+      // R.End doesn't point to the boundary instruction.
+      // Skip debug instructions.
+      if (llvm::getNonDebugMBBEnd(BBEnd, MBB)) {
+        auto SI = SlotIndexes->getInstructionIndex(*BBEnd);
+        MBBOutputSlotMap[&MBB] = SI;
+      }
+    }
+
+    for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+      auto Reg = Register::index2VirtReg(I);
+      if (!LIS->hasInterval(Reg))
+        continue;
+
+      const auto &LI = LIS->getInterval(Reg);
+
+      // Skip local live intervals to make live input/output collection faster.
+      if (llvm::isLocalLiveInterval(LI, SlotIndexes))
+        continue;
+
+      for (auto InputIt : MBBInputSlotMap) {
+        MachineBasicBlock *MBB = InputIt.first;
+        auto SI = InputIt.second;
+
+        auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI);
+        if (LiveMask.any())
+          Status.MBBInputLiveMap[MBB][Reg] |= LiveMask;
+      }
+
+      for (auto OutputIt : MBBOutputSlotMap) {
+        MachineBasicBlock *MBB = OutputIt.first;
+        auto SI = OutputIt.second;
+
+        auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI);
+        if (LiveMask.any())
+          Status.MBBOutputLiveMap[MBB][Reg] |= LiveMask;
+      }
+    }
+  }
+
+  LLVM_DEBUG(
+      const SIRegisterInfo *SIRI = ST->getRegisterInfo();
+      dbgs() << "output live"; for (auto &It : Status.MBBOutputLiveMap) {
+        unsigned Idx = It.first->getNumber();
+        auto LiveReg = It.second;
+        dbgs() << "MBB" << Idx << ":";
+        llvm::dumpLiveSet(LiveReg, SIRI);
+      } dbgs() << "input live";
+      for (auto &It : Status.MBBInputLiveMap) {
+        unsigned Idx = It.first->getNumber();
+        auto LiveReg = It.second;
+        dbgs() << "MBB" << Idx << ":";
+        llvm::dumpLiveSet(LiveReg, SIRI);
+      });
+
+  for (auto It = MF.begin(); It != MF.end(); ++It) {
+    MachineBasicBlock &MBB = *It;
+    unsigned Occ =
+        collectMBBPressure(MBB, LIS, ST, MaxVPressure, MaxSPressure, Status);
+    if (TgtOcc > Occ)
+      TgtOcc = Occ;
+  }
+  return TgtOcc;
+}
+
+RematStatus getRematStatus(MachineFunction &MF, MachineLoopInfo *MLI,
+                           LiveIntervals *LIS, const MachineRegisterInfo &MRI,
+                           const GCNSubtarget *ST) {
+  unsigned MaxSPressure = 0;
+  unsigned MaxVPressure = 0;
+  RematStatus Status;
+  unsigned TgtOcc =
+      collectFnPressure(MF, LIS, MRI, ST, MaxVPressure, MaxSPressure, Status);
+  const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second;
+  if (TgtOcc >= MaxOcc) {
+    Status.TargetOcc = TgtOcc;
+    Status.TargetVLimit = 0;
+    Status.TargetSLimit = 0;
+    Status.MaxVPressure = 0;
+    Status.MaxSPressure = 0;
+    Status.InputPhysicalVPressure = 0;
+    Status.InputPhysicalSPressure = 0;
+    Status.MemBound = false;
+    Status.NotBalance = false;
+    return Status;
+  }
+
+  MaxSPressure += RegForVCC;
+  MaxVPressure = std::min(MaxVPressure, ST->getMaxNumVGPRs(MF));
+  unsigned STgtOcc = ST->getOccupancyWithNumSGPRs(MaxSPressure);
+  unsigned VTgtOcc = ST->getOccupancyWithNumVGPRs(MaxVPressure);
+
+  llvm::SchedScore TotalScore = llvm::collectLatency(MF, *ST, MLI);
+  bool MemBound =
+      TotalScore.isMemBound(TgtOcc, std::max(STgtOcc, VTgtOcc) - TgtOcc);
+
+  bool NotBalance = false;
+
+  const unsigned MaxOccupancy = ST->AMDGPUSubtarget::getMaxWavesPerEU();
+  // Currently, only an sgpr bound can be fixed with remat.
+  if (STgtOcc < VTgtOcc) {
+    unsigned BigOcc = std::max(STgtOcc, VTgtOcc);
+    // Change TgtOcc to the bigger occupancy when sgpr and vgpr are not
+    // balanced.
+    if (BigOcc > TgtOcc) {
+      TgtOcc = BigOcc;
+      NotBalance = true;
+      if (TgtOcc >= MaxOccupancy)
+        TgtOcc = MaxOccupancy - 1;
+    }
+  }
+
+  // Collect input physical pressure.
+  const SIRegisterInfo *SIRI = ST->getRegisterInfo();
+
+  unsigned VInputPressure = 0;
+  uint64_t SInputMask = 0;
+  for (const auto &Livein : MRI.liveins()) {
+    const Register Reg = Livein.first;
+    const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg);
+    assert(Reg.isPhysical() && "input must be physical reg");
+    unsigned RegSize = RC->getLaneMask().getNumLanes();
+    if (SIRI->isVGPR(MRI, Reg)) {
+      VInputPressure += RegSize;
+    } else {
+      unsigned RegIndex = SIRI->getHWRegIndex(Reg);
+      uint64_t Mask = ((1ull << RegSize) - 1) << RegIndex;
+      SInputMask |= Mask;
+    }
+  }
+  // SGPRs need to be aligned to 4 for the 4-dword/8-dword descriptors, which
+  // causes high pressure.
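+  // For example, if the live-in SGPRs occupy hardware registers 1 and 6, both
+  // 4-register groups (0-3 and 4-7) are counted below, so SInputPressure is 8.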
+  unsigned SInputPressure = 0;
+  uint64_t Mask = 0xf;
+  while (Mask != 0) {
+    if (Mask & SInputMask) {
+      SInputPressure += 4;
+    }
+    Mask = Mask << 4;
+  }
+
+  // If balanced, try the next occupancy.
+  TgtOcc = NotBalance ? TgtOcc : (TgtOcc + 1);
+
+  auto CC = MF.getFunction().getCallingConv();
+  bool IsPsCs = CC == CallingConv::AMDGPU_CS || CC == CallingConv::AMDGPU_PS;
+  // For shader profiles other than ps/cs, cap the target occupancy at 4.
+  if (!IsPsCs) {
+    TgtOcc = TgtOcc > 4 ? 4 : TgtOcc;
+  }
+  if (TargetOccupancy)
+    TgtOcc = TargetOccupancy;
+
+  unsigned SLimit = ST->getMaxNumSGPRs(TgtOcc, true);
+  unsigned VLimit = ST->getMaxNumVGPRs(TgtOcc);
+
+  Status.TargetOcc = TgtOcc;
+  Status.TargetVLimit = VLimit;
+  Status.TargetSLimit = SLimit;
+  Status.MaxVPressure = MaxVPressure;
+  Status.MaxSPressure = MaxSPressure;
+  Status.InputPhysicalVPressure = VInputPressure;
+  Status.InputPhysicalSPressure = SInputPressure;
+  Status.MemBound = MemBound;
+  Status.NotBalance = NotBalance;
+  return Status;
+}
+
+// For a case like
+//   %477:sreg_32_xm0 = S_AND_B32 %472.sub0:sreg_64_xexec, %304:sreg_32_xm0,
+//       implicit-def dead $scc; xb.uniform
+//   S_CMP_EQ_U32 %302:sreg_32_xm0, %475:sreg_32_xm0, implicit-def $scc;
+//       xb.uniform
+//   %2489:sreg_32_xm0 = S_CSELECT_B32 %477:sreg_32_xm0, 16, implicit
+//       killed $scc; xb.uniform
+// sinking S_AND right before S_CSELECT would overwrite SCC.
+// To avoid that, skip the case where DefMI and UseMI have implicit defs/uses.
+bool isImplicitDefUse(MachineInstr *DefMI, MachineInstr *UseMI) {
+  if (DefMI->getDesc().NumImplicitDefs == 0)
+    return false;
+
+  auto *TRI = DefMI->getMF()->getSubtarget().getRegisterInfo();
+  for (MachineOperand &Def : DefMI->implicit_operands()) {
+    if (!Def.isReg())
+      continue;
+    if (Def.isUse())
+      continue;
+    Register Reg = Def.getReg();
+    if (UseMI->readsRegister(Reg, TRI))
+      return true;
+  }
+  return false;
+}
+
+// SGPRs have an alignment requirement, so we cannot get an accurate reg
+// number.
+const unsigned NearTargetRegLimit = 10;
+bool nearSgprSpill(unsigned MaxSPressure, const GCNSubtarget *ST,
+                   MachineFunction &MF) {
+  unsigned MaxSGPR = ST->getAddressableNumSGPRs();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  Register ScratchRSrcReg = MFI->getScratchRSrcReg();
+  if (ScratchRSrcReg)
+    MaxSGPR -= 4;
+
+  const unsigned AlignmentDelta = 3;
+  MaxSGPR -= AlignmentDelta;
+
+  return MaxSPressure > MaxSGPR;
+}
+
+// Skip live regs already remat'ed into other blocks.
+void updateLiveInfo(MapVector<Register, RematNode> &RematMap,
+                    GCNRPTracker::LiveRegSet &LiveSet,
+                    const GCNRPTracker::LiveRegSet &InputLive,
+                    MachineBasicBlock *CurBB,
+                    DenseMap<MachineBasicBlock *, unsigned> &RPOTIndexMap) {
+  for (auto &It : RematMap) {
+    unsigned Reg = It.first;
+    // Skip regs not in the live set.
+    if (!LiveSet.count(Reg))
+      continue;
+    // Skip regs already in the input set.
+    // The input set is taken care of in getReducedSize.
+    if (InputLive.count(Reg))
+      continue;
+
+    auto &Node = It.second;
+    if (Node.Kind == RematNode::RematKind::OneDefOneUse) {
+      MachineBasicBlock *InsertBB = Node.InsertBlock;
+      // If LiveInfo.BB is after InsertBB in reverse post order, the def is
+      // still before LiveInfo.BB, so it is still live.
+      unsigned LiveBBIndex = RPOTIndexMap[CurBB];
+      unsigned InsertBBIndex = RPOTIndexMap[InsertBB];
+      if (LiveBBIndex > InsertBBIndex) {
+        continue;
+      }
+    }
+    // Already in the remat map; no need to check it again, remove it from the
+    // candidates.
+    LiveSet.erase(Reg);
+  }
+}
+
+int rematGain(MachineInstr *DefMI, unsigned Reg, const MachineRegisterInfo &MRI,
+              const SIRegisterInfo *SIRI, bool IsVGPR) {
+  int RematSize = SIRI->getRegSizeInBits(*MRI.getRegClass(Reg));
+  for (MachineOperand &MO : DefMI->operands()) {
+    if (MO.isImm())
+      continue;
+    if (!MO.isReg())
+      continue;
+    if (MO.isDef())
+      continue;
+    if (MO.isTied())
+      continue;
+
+    if (MO.getReg() == AMDGPU::EXEC)
+      continue;
+
+    // Don't move users of VCC.
+    if (MO.getReg() == AMDGPU::VCC) {
+      RematSize = 0;
+      break;
+    }
+    Register Reg = MO.getReg();
+
+    // Don't move physical register uses.
+    if (Reg.isPhysical()) {
+      RematSize = 0;
+      break;
+    }
+
+    if (IsVGPR != SIRI->isVGPR(MRI, Reg)) {
+      // Mixing v and s is not supported when remat'ing for now.
+      // TODO: count the possible pressure change here.
+      RematSize = 0;
+      break;
+    }
+    bool IsSingleDef = MRI.hasOneDef(Reg);
+    if (!IsSingleDef) {
+      IsSingleDef = llvm::isSub0Sub1SingleDef(Reg, MRI);
+    }
+
+    if (IsSingleDef) {
+      // The reg might be shared with other candidates; check it here.
+      // Shared regs are counted in getReducedSize.
+      const TargetRegisterClass *OpRC = MRI.getRegClass(Reg);
+      if (unsigned SubIdx = MO.getSubReg()) {
+        if (OpRC)
+          OpRC = SIRI->getSubRegisterClass(OpRC, SubIdx);
+      }
+      int InputSize = SIRI->getRegSizeInBits(*OpRC);
+      // If the input is not live in the hotspot, moving it across the hotspot
+      // should use fewer registers than DefMI.
+      if (RematSize > InputSize) {
+        RematSize -= InputSize;
+        continue;
+      }
+    }
+
+    RematSize = 0;
+    break;
+  }
+  return RematSize;
+}
+
+MachineBasicBlock *findNonLoopDominator(MachineBasicBlock *BB,
+                                        MachineDominatorTree *DT,
+                                        MachineLoopInfo *LI) {
+  while (LI->getLoopDepth(BB) > 0) {
+    MachineDomTreeNode *N = DT->getNode(BB);
+    if (N == nullptr)
+      return nullptr;
+    MachineDomTreeNode *IDom = N->getIDom();
+    if (IDom == nullptr)
+      return nullptr;
+
+    BB = IDom->getBlock();
+  }
+
+  return BB;
+}
+
+MachineBasicBlock *nearestCommonDominator(MachineDominatorTree *DT,
+                                          BlockSet &Blocks) {
+  auto I = Blocks.begin(), E = Blocks.end();
+
+  MachineBasicBlock *DomB = cast<MachineBasicBlock>(*(I++));
+  while (I != E) {
+    MachineBasicBlock *B = cast<MachineBasicBlock>(*(I++));
+    DomB = DT->findNearestCommonDominator(DomB, B);
+    if (DomB == nullptr)
+      return nullptr;
+  }
+  // For a split block like:
+  // bb.42:
+  //   %632.sub2:vreg_128 = V_MOV_B32_e32 %717.sub2:vreg_128, implicit $exec,
+  //       implicit $exec
+  //   %130:sreg_64 = S_AND_SAVEEXEC_B64 %533:sreg_64, implicit-def $exec,
+  //       implicit-def $scc, implicit $exec
+  //
+  // bb.68:
+  // ; predecessors: %bb.42
+  //   successors: %bb.45(0x40000000), %bb.43(0x40000000); %bb.45(50.00%),
+  //   %bb.43(50.00%)
+  //
+  //   SI_MASK_BRANCH %bb.43, implicit $exec
+  //   S_BRANCH %bb.45
+  // which came from
+  // bb.42:
+  //   %129:vgpr_32 = V_MOV_B32_e32 killed %548:vgpr_32, implicit $exec,
+  //       implicit $exec
+  //   %130:sreg_64 = S_AND_SAVEEXEC_B64 %533:sreg_64, implicit-def $exec,
+  //   SI_MASK_BRANCH %bb.43, implicit $exec
+  //   S_BRANCH %bb.45
+  // the real common dominator is bb.42.
+  // TODO: use the _term version of the exec update instructions so we don't
+  // need this anymore.
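+  // The check below pattern-matches such a split block: a single-predecessor
+  // block that starts with the EXECZ branch, whose predecessor falls through
+  // without a branch of its own.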
+  if (DomB && DomB->pred_size() == 1 && !DomB->empty()) {
+    // Upstreaming note: This used to be SI_MASK_BRANCH
+    if (DomB->begin()->getOpcode() == AMDGPU::S_CBRANCH_EXECZ) {
+      MachineBasicBlock *Pred = *DomB->pred_begin();
+      if (Pred->succ_size() == 1 &&
+          (Pred->empty() || !Pred->back().isBranch())) {
+        DomB = Pred;
+      }
+    }
+  }
+
+  return DomB;
+}
+
+MachineBasicBlock *
+findInsertBlock(MachineInstr &DefMI, unsigned Reg, MachineDominatorTree *DT,
+                MachinePostDominatorTree *PDT, MachineLoopInfo *MLI,
+                const MachineRegisterInfo &MRI, bool MemBound) {
+
+  BlockSet BBSet;
+  for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+    BBSet.insert(UseMI.getParent());
+  }
+  if (BBSet.size() == 0)
+    return nullptr;
+
+  MachineBasicBlock *BB = *BBSet.begin();
+  if (BBSet.size() > 1) {
+    MachineBasicBlock *BDom = nearestCommonDominator(DT, BBSet);
+    if (!BDom)
+      return nullptr;
+    BB = BDom;
+  }
+  // Try to find a non-loop dominator.
+  if (!MemBound) {
+    BB = findNonLoopDominator(BB, DT, MLI);
+  }
+  if (!BB)
+    return nullptr;
+
+  // If BB is already a hot block, moving to BB will not help.
+  // hotBlockRemat will fail it when processing BB.
+
+  // Must be reachable from DefMI.
+  if (!llvm::reach_block(DefMI.getParent(), DT, PDT, MLI, BB))
+    return nullptr;
+
+  return BB;
+}
+
+// Maybe expensive to be called all over the place.
+bool isUsedByPhi(MachineInstr *DefMI, MachineRegisterInfo &MRI) {
+  for (auto &Def : DefMI->defs()) {
+    for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Def.getReg())) {
+      if (UseMI.isPHI())
+        return true;
+    }
+  }
+  return false;
+}
+
+bool isSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) {
+  // Do not move PHI nodes.
+  if (isUsedByPhi(DefMI, MRI))
+    return false;
+
+  unsigned OpNum = DefMI->getNumOperands();
+  // Only move a DefMI whose operands are all uniquely defined.
+  for (unsigned I = 0; I < OpNum; I++) {
+    MachineOperand &Op = DefMI->getOperand(I);
+    if (!Op.isReg())
+      continue;
+    if (!MRI.getUniqueVRegDef(Op.getReg()) &&
+        !llvm::isSub0Sub1SingleDef(Op.getReg(), MRI)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+void addOneDefOneUseCandidate(RematNode &Node,
+                              std::vector<RematNode> &RematList,
+                              MachineRegisterInfo &MRI, int &RematCnt,
+                              MachineDominatorTree *DT,
+                              MachinePostDominatorTree *PDT,
+                              MachineLoopInfo *MLI, bool IsVGPR,
+                              bool MemBound) {
+  unsigned Reg = Node.Reg;
+  MachineInstr *DefMI = Node.DefMI;
+
+  unsigned Size = Node.Size;
+  MachineInstr *UseMI = &*MRI.use_nodbg_instructions(Reg).begin();
+  MachineBasicBlock *InsertBB = UseMI->getParent();
+
+  // For a VGPR, always move next to the only user to avoid WQM or exec
+  // issues. But doing this causes issues when DefMI is in WQM and the user is
+  // not in WQM. Disable VGPR remat for now.
+  // TODO: make sure a single user doesn't need WQM.
+  if (!IsVGPR) {
+    if (MachineBasicBlock *NewInsertBB =
+            findInsertBlock(*DefMI, Reg, DT, PDT, MLI, MRI, MemBound)) {
+      if (InsertBB != NewInsertBB) {
+        InsertBB = NewInsertBB;
+        // If we can find a non-loop insert block, go to the insert block.
+        if (DefMI->getParent() != InsertBB) {
+          if (!InsertBB->empty()) {
+            auto It = InsertBB->getFirstNonPHI();
+            It = skipDebugInstructionsForward(It, InsertBB->end());
+            if (It == InsertBB->end())
+              UseMI = nullptr;
+            else
+              UseMI = &*It;
+          }
+        }
+      }
+    }
+  }
+
+  if (IsVGPR) {
+    // Don't count regs in the same block for valu.
+    if (UseMI->getParent() == DefMI->getParent())
+      return;
+  }
+
+  // Skip the case where DefMI has an implicit def that is used by UseMI.
+  if (isImplicitDefUse(DefMI, UseMI)) {
+    return;
+  }
+
+  Node.InsertBlock = InsertBB;
+  Node.InsertPointMI = UseMI;
+  Node.Kind = RematNode::RematKind::OneDefOneUse;
+  RematList.emplace_back(Node);
+  RematCnt += Size;
+}
+
+void buildRematCandidates(std::vector<RematNode> &Candidates,
+                          GCNRPTracker::LiveRegSet &CandidateRegSet,
+                          DenseSet<unsigned> &PinnedRegSet,
+                          const MachineRegisterInfo &MRI,
+                          const SIInstrInfo *SIII, const SIRegisterInfo *SIRI,
+                          bool IsVGPR) {
+
+  for (auto LiveRegIt : CandidateRegSet) {
+    unsigned Reg = LiveRegIt.first;
+    // Skip unsafe regs.
+    if (PinnedRegSet.count(Reg))
+      continue;
+
+    if (SIRI->isVGPR(MRI, Reg) != IsVGPR)
+      continue;
+    bool IsSafeCandidate = true;
+    MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
+    if (MI) {
+      if (IsVGPR) {
+        // Only remat valu for now.
+        if (!SIII->isVALU(MI->getOpcode()) && MI->getOpcode() != AMDGPU::COPY)
+          IsSafeCandidate = false;
+        if (MI->getOpcode() == AMDGPU::COPY) {
+          // Make sure the source has a unique def.
+          if (MI->getOperand(1).isReg() &&
+              nullptr == MRI.getUniqueVRegDef(MI->getOperand(1).getReg()))
+            IsSafeCandidate = false;
+        } else {
+          // Skip convergent valu.
+          if (MI->isConvergent())
+            IsSafeCandidate = false;
+        }
+      }
+      // Skip instructions with more than one def.
+      if (MI->getDesc().NumDefs > 1)
+        IsSafeCandidate = false;
+    } else {
+      IsSafeCandidate = false;
+    }
+
+    if (IsSafeCandidate) {
+      int Gain = rematGain(MI, Reg, MRI, SIRI, IsVGPR);
+      if (Gain > 0) {
+        Candidates.emplace_back(RematNode(Reg, MI, Gain >> 5));
+      } else {
+        IsSafeCandidate = false;
+      }
+    }
+    // Save unsafe regs.
+    if (!IsSafeCandidate)
+      PinnedRegSet.insert(Reg);
+  }
+
+  // Sort by gain.
+  std::sort(Candidates.begin(), Candidates.end(),
+            [](RematNode &I, RematNode &J) { return I.Size > J.Size; });
+}
+
+void addCloneCandidate(std::vector<RematNode *> &CloneList,
+                       std::vector<RematNode> &RematList,
+                       DenseSet<unsigned> &PinnedRegSet,
+                       MachineRegisterInfo &MRI, int &RematCnt) {
+  // Group users by block.
+  std::vector<BlockSet> UserSetList(CloneList.size());
+
+  for (size_t I = 0; I < CloneList.size(); I++) {
+    auto *Node = CloneList[I];
+    unsigned Reg = Node->Reg;
+    MachineInstr *DefMI = Node->DefMI;
+    // Group users by block.
+    BlockSet &UserSet = UserSetList[I];
+
+    for (auto UseIt = MRI.use_instr_nodbg_begin(Reg);
+         UseIt != MRI.use_instr_nodbg_end();) {
+      MachineInstr &UseMI = *(UseIt++);
+      UserSet.insert(UseMI.getParent());
+    }
+
+    if (UserSet.size() == 1) {
+      // All users are in the same block as DefMI.
+      if (*UserSet.begin() == DefMI->getParent()) {
+        // Mark as cannot-remat for now.
+        // TODO: try to split if it is bigger than 4 and only used once per
+        // channel.
+        PinnedRegSet.insert(Reg);
+        continue;
+      }
+    }
+
+    int Size = Node->Size;
+    Size <<= 16;
+    // Pack the userSet size into Size.
+    Size |= UserSet.size();
+    Node->UserCount = Size;
+  }
+
+  std::sort(CloneList.begin(), CloneList.end(),
+            // Sort based on the userSet size.
+            [](const RematNode *A, const RematNode *B) {
+              static constexpr int Mask = 0xffff;
+              return (A->UserCount & Mask) < (B->UserCount & Mask);
+            });
+
+  for (RematNode *Node : CloneList) {
+    Node->Kind = RematNode::RematKind::Clone;
+    RematList.emplace_back(*Node);
+    RematCnt += Node->Size;
+  }
+}
+
+int filterRematCandidates(std::vector<RematNode> &Candidates,
+                          std::vector<RematNode> &RematList,
+                          DenseSet<unsigned> &PinnedRegSet,
+                          MachineDominatorTree *DT,
+                          MachinePostDominatorTree *PDT, MachineLoopInfo *MLI,
+                          MachineRegisterInfo &MRI, bool IsVGPR,
+                          bool MemBound) {
+  int RematCnt = 0;
+  // Work on one-def-one-use candidates first.
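+  // One-def-one-use candidates are the cheapest form of remat: the def is
+  // simply moved next to its single use, so no cloning is needed. Multi-use
+  // SGPR candidates are handled below by cloning into each user block.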
+  for (auto &Node : Candidates) {
+    unsigned Reg = Node.Reg;
+    if (!MRI.hasOneNonDBGUse(Reg)) {
+      continue;
+    }
+    MachineInstr *DefMI = Node.DefMI;
+    if (!isSafeToMove(DefMI, MRI)) {
+      PinnedRegSet.insert(Reg);
+      continue;
+    }
+
+    addOneDefOneUseCandidate(Node, RematList, MRI, RematCnt, DT, PDT, MLI,
+                             IsVGPR, MemBound);
+  }
+
+  if (!IsVGPR) {
+    std::vector<RematNode *> CloneList;
+    // Try the multi-use case.
+    for (auto &Node : Candidates) {
+      unsigned Reg = Node.Reg;
+      if (MRI.hasOneNonDBGUse(Reg)) {
+        continue;
+      }
+      MachineInstr *DefMI = Node.DefMI;
+      if (!isSafeToMove(DefMI, MRI)) {
+        PinnedRegSet.insert(Reg);
+        continue;
+      }
+
+      // Clone for each user.
+      CloneList.emplace_back(&Node);
+    }
+
+    addCloneCandidate(CloneList, RematList, PinnedRegSet, MRI, RematCnt);
+  }
+
+  return RematCnt;
+}
+
+int getReducedSize(MapVector<Register, RematNode> &RematMap,
+                   GCNRPTracker::LiveRegSet &CandidateSet,
+                   InstSet &ReducedInsts, const MachineRegisterInfo &MRI,
+                   BlockLiveInfo &LiveInfo,
+                   DenseMap<MachineBasicBlock *, unsigned> &RPOTIndexMap) {
+  int ReducedSize = 0;
+  for (auto &It : RematMap) {
+    Register Reg = It.first;
+
+    if (!CandidateSet.count(Reg))
+      continue;
+
+    bool IsReduced = false;
+    auto &Node = It.second;
+    if (Node.Kind == RematNode::RematKind::OneDefOneUse) {
+      MachineBasicBlock *InsertBB = Node.InsertBlock;
+      // If LiveInfo.BB is before InsertBB in reverse post order, the def is
+      // moved after LiveInfo.BB, so it is not live anymore.
+      unsigned LiveBBIndex = RPOTIndexMap[LiveInfo.BB];
+      unsigned InsertBBIndex = RPOTIndexMap[InsertBB];
+      if (LiveBBIndex < InsertBBIndex)
+        IsReduced = true;
+    } else {
+      // Clone.
+      IsReduced = true;
+      // If there is a use in LiveInfo.BB, it cannot be reduced from the input
+      // live set.
+      for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+        if (UseMI.getParent() == LiveInfo.BB) {
+          IsReduced = false;
+          break;
+        }
+      }
+    }
+    if (IsReduced) {
+      ReducedSize += Node.Size;
+      ReducedInsts.insert(Node.DefMI);
+    }
+
+    // Already in the remat map; no need to check again, remove from the
+    // candidates.
+    CandidateSet.erase(Reg);
+  }
+
+  return ReducedSize;
+}
+
+int getSharedReducedSize(InstSet &ReducedInsts, bool IsVGPR,
+                         const MachineRegisterInfo &MRI,
+                         const SIRegisterInfo *SIRI) {
+
+  // Find shared operands in ReducedInsts.
+  int SharedSize = 0;
+  DenseMap<unsigned, LaneBitmask> SharedRegMaskMap;
+  for (MachineInstr *DefMI : ReducedInsts) {
+    for (MachineOperand &MO : DefMI->operands()) {
+      if (MO.isImm())
+        continue;
+      if (!MO.isReg())
+        continue;
+      if (MO.isDef())
+        continue;
+      if (MO.isTied())
+        continue;
+      Register Reg = MO.getReg();
+
+      if (Reg == AMDGPU::EXEC)
+        continue;
+      if (!Reg.isVirtual())
+        continue;
+
+      if (IsVGPR != SIRI->isVGPR(MRI, MO.getReg())) {
+        // Mixing v and s is not supported when remat'ing for now.
+        continue;
+      }
+
+      const TargetRegisterClass *OpRC = MRI.getRegClass(Reg);
+      int MOSize = SIRI->getRegSizeInBits(*OpRC) >> 5;
+      unsigned Mask;
+      if (unsigned SubIdx = MO.getSubReg()) {
+        OpRC = SIRI->getSubRegisterClass(OpRC, SubIdx);
+        int SubMOSize = SIRI->getRegSizeInBits(*OpRC) >> 5;
+        Mask = (1 << SubMOSize) - 1;
+      } else {
+        Mask = (1 << MOSize) - 1;
+      }
+      auto SharedRegIt = SharedRegMaskMap.find(Reg);
+      if (SharedRegIt == SharedRegMaskMap.end()) {
+        SharedRegMaskMap[Reg] = LaneBitmask(Mask);
+      } else {
+        unsigned PrevMask = SharedRegIt->second.getAsInteger();
+        if (unsigned SharedMask = (PrevMask & Mask)) {
+          // Something is shared.
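+          // A lane shared by more than one remat'ed def stays live only once,
+          // so each extra occurrence counts as additional savings.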
+          for (int I = 0; I < MOSize; I++) {
+            if (SharedMask & (1 << I)) {
+              SharedSize += 1;
+            }
+          }
+        }
+        LaneBitmask MoMask = LaneBitmask(Mask | PrevMask);
+        SharedRegMaskMap[Reg] = MoMask;
+      }
+    }
+  }
+  return SharedSize;
+}
+
+void dumpRematMap(MapVector<Register, RematNode> &RematMap,
+                  const SIRegisterInfo *SIRI) {
+  dbgs() << "\n rematMap: \n";
+  for (auto It : RematMap) {
+    int Reg = It.first;
+    dbgs() << printReg(Reg, SIRI);
+    dbgs() << "\n";
+  }
+}
+int DebugBlockIndex = 42;
+void dumpHotBlock(const GCNRPTracker::LiveRegSet &LiveSet,
+                  MapVector<Register, RematNode> &VRematMap,
+                  MapVector<Register, RematNode> &SRematMap, int BlockIndex,
+                  const SIRegisterInfo *SIRI) {
+  if (DebugBlockIndex != BlockIndex)
+    return;
+  llvm::dumpLiveSet(LiveSet, SIRI);
+  dumpRematMap(VRematMap, SIRI);
+  dumpRematMap(SRematMap, SIRI);
+}
+
+void dumpCandidates(std::vector<RematNode> &RematCandidates, int BlockIndex,
+                    const SIRegisterInfo *SIRI) {
+  if (DebugBlockIndex != BlockIndex)
+    return;
+  dbgs() << "\n Candidates: \n";
+  unsigned TotalSize = 0;
+  for (RematNode &Node : RematCandidates) {
+    dbgs() << printReg(Node.Reg, SIRI) << " size:" << Node.Size;
+    dbgs() << "\n";
+    TotalSize += Node.Size;
+  }
+  dbgs() << "Total Size:" << TotalSize << "\n";
+}
+
+bool AMDGPUHotBlockRematerialize::hotBlockRemat(
+    MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS,
+    MachineDominatorTree *DT, MachinePostDominatorTree *PDT,
+    bool &IsNearTarget) {
+  const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+
+  const SIInstrInfo *SIII = ST->getInstrInfo();
+  const SIRegisterInfo *SIRI = ST->getRegisterInfo();
+
+  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+  DenseMap<MachineBasicBlock *, unsigned> RPOTIndexMap;
+  for (MachineBasicBlock *MBB : RPOT) {
+    RPOTIndexMap[MBB] = RPOTIndexMap.size();
+  }
+
+  auto &MRI = MF.getRegInfo();
+
+  bool IsUpdated = false;
+  RematStatus Status = getRematStatus(MF, MLI, LIS, MRI, ST);
+
+  const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second;
+  if (Status.TargetOcc >= MaxOcc)
+    return false;
+
+  unsigned VLimit = Status.TargetVLimit;
+  unsigned SLimit = Status.TargetSLimit;
+
+  int RematSCnt = Status.MaxSPressure - SLimit;
+
+  bool IsSGPRSpill = false;
+  if (RematSCnt > 0) {
+    IsSGPRSpill = nearSgprSpill(Status.MaxSPressure, ST, MF);
+  }
+
+  const bool IsForceRematSgpr = IsSGPRSpill || Status.NotBalance;
+
+  // If bound by lds, skip.
+  if (Status.TargetOcc > ST->getOccupancyWithWorkGroupSizes(MF).second &&
+      !IsForceRematSgpr)
+    return false;
+
+  MachineBasicBlock *EntryMBB = &MF.front();
+
+  auto *SlotIndexes = LIS->getSlotIndexes();
+
+  // Regs which are already marked for remat.
+  MapVector<Register, RematNode> VRematMap;
+  MapVector<Register, RematNode> SRematMap;
+  // Regs which cannot be moved around to remat.
+  DenseSet<unsigned> PinnedRegSet;
+  std::vector<BlockLiveInfo> HotBlocks;
+  for (auto It = po_begin(EntryMBB); It != po_end(EntryMBB); It++) {
+    MachineBasicBlock *MBB = *It;
+    auto &RP = Status.MBBPressureMap[MBB];
+    // Ignore blocks that are not hot.
+    if (RP.getVGPRNum(ST->hasGFX90AInsts()) < Status.TargetVLimit &&
+        (RP.getMaxSGPR() + RegForVCC + Status.InputPhysicalSPressure) <
+            Status.TargetSLimit)
+      continue;
+    // Collect reg pressure.
+    unsigned MaxVPressure = 0;
+    unsigned MaxSPressure = 0;
+    const GCNRPTracker::LiveRegSet InputLive = Status.MBBInputLiveMap[MBB];
+
+    const GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[MBB];
+    LLVM_DEBUG(
+        dumpHotBlock(InputLive, VRematMap, SRematMap, MBB->getNumber(), SIRI));
+
+    GCNDownwardRPTracker Tracker(*LIS);
+
+    Tracker.reset(*MBB->begin(), &InputLive);
+
+    for (MachineInstr &MI : *MBB) {
+      if (MI.isDebugInstr())
+        continue;
+      Tracker.advance();
+      auto LISLR = Tracker.getLiveRegs();
+      // Update the live set for things already remat'ed.
+      updateLiveInfo(VRematMap, LISLR, InputLive, MBB, RPOTIndexMap);
+      updateLiveInfo(SRematMap, LISLR, InputLive, MBB, RPOTIndexMap);
+
+      const GCNRPTracker::LiveRegSet &LiveSet = LISLR;
+      unsigned VPressure = 0;
+      unsigned SPressure = 0;
+      collectLiveSetPressure(LiveSet, MRI, SIRI, VPressure, SPressure);
+      if (MaxVPressure < VPressure)
+        MaxVPressure = VPressure;
+      if (MaxSPressure < SPressure)
+        MaxSPressure = SPressure;
+    }
+    MaxSPressure += RegForVCC + Status.InputPhysicalSPressure;
+    if (MaxVPressure <= VLimit && MaxSPressure <= SLimit)
+      continue;
+
+    // Build block live info.
+    // Use OutputLive for EntryMBB.
+    BlockLiveInfo LiveInfo = {MBB, MaxSPressure, MaxVPressure,
+                              MBB != EntryMBB ? InputLive : OutputLive};
+    // Skip the entry block when saving hot blocks to reduce cloning, because
+    // we do not clone into the entry block.
+    if (MBB != EntryMBB)
+      HotBlocks.emplace_back(LiveInfo);
+    GCNRPTracker::LiveRegSet CandidateRegs = LiveInfo.InputLive;
+
+    // Update reg pressure based on the remat list.
+    InstSet VReducedInsts;
+    InstSet SReducedInsts;
+    int VReduced = getReducedSize(VRematMap, CandidateRegs, VReducedInsts, MRI,
+                                  LiveInfo, RPOTIndexMap);
+    int SReduced = getReducedSize(SRematMap, CandidateRegs, SReducedInsts, MRI,
+                                  LiveInfo, RPOTIndexMap);
+
+    // Calculate the size that needs to be remat'ed.
+    int RematVCnt = MaxVPressure - VReduced - VLimit;
+    int RematSCnt = MaxSPressure - SReduced - SLimit;
+
+    bool IsSGPRSpill = false;
+    if (RematSCnt > 0) {
+      IsSGPRSpill = nearSgprSpill(MaxSPressure, ST, MF);
+    }
+    bool IsForceRematSgpr = IsSGPRSpill || Status.NotBalance;
+    // Try to add candidates into the remat list.
+
+    int NewRematSCnt = 0;
+    if (RematSCnt > 0) {
+      // Build candidate nodes.
+      std::vector<RematNode> SRematCandidates;
+      buildRematCandidates(SRematCandidates, CandidateRegs, PinnedRegSet, MRI,
+                           SIII, SIRI, /*IsVGPR*/ false);
+
+      LLVM_DEBUG(dumpCandidates(SRematCandidates, MBB->getNumber(), SIRI));
+      std::vector<RematNode> SRematList;
+      // Filter candidates.
+      NewRematSCnt = filterRematCandidates(SRematCandidates, SRematList,
+                                           PinnedRegSet, DT, PDT, MLI, MRI,
+                                           /*IsVGPR*/ false, Status.MemBound);
+      if (NewRematSCnt > RematSCnt) {
+        // There are enough remat nodes to cover RematSCnt.
+        int RematCnt = 0;
+        for (RematNode &Node : SRematList) {
+          SRematMap[Node.Reg] = Node;
+          RematCnt += Node.Size;
+          if (RematCnt > RematSCnt)
+            break;
+        }
+        NewRematSCnt = 0;
+      } else {
+
+        for (RematNode &Node : SRematList) {
+          SReducedInsts.insert(Node.DefMI);
+        }
+        // Check the shared size.
+        int SharedReducedSize =
+            getSharedReducedSize(SReducedInsts, /*IsVGPR*/ false, MRI, SIRI);
+        if (((NewRematSCnt + SharedReducedSize) + (int)NearTargetRegLimit) >=
+            RematSCnt) {
+          for (RematNode &Node : SRematList) {
+            SRematMap[Node.Reg] = Node;
+          }
+        } else {
+          if (!IsForceRematSgpr)
+            return false;
+          for (RematNode &Node : SRematList) {
+            SRematMap[Node.Reg] = Node;
+          }
+          // Find local one-def-one-use candidates.
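+          // As a last resort when forced (near SGPR spill), sink defs that
+          // have a single use inside this hot block right next to that use to
+          // shorten local live ranges.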
+          for (MachineInstr &MI : *MBB) {
+            if (MI.isDebugInstr())
+              continue;
+            if (MI.getDesc().NumDefs != 1)
+              continue;
+            MachineOperand &DstMO = MI.getOperand(0);
+            Register Reg = DstMO.getReg();
+            if (!SIRI->isSGPRReg(MRI, Reg))
+              continue;
+            if (!MRI.hasOneNonDBGUse(Reg))
+              continue;
+            if (!MRI.hasOneDef(Reg))
+              continue;
+            if (Reg.isPhysical())
+              continue;
+            MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(Reg);
+            if (UseMI.getParent() != MBB)
+              continue;
+            int Gain = rematGain(&MI, Reg, MRI, SIRI,
+                                 /*IsVGPR*/ false);
+            if (Gain > 0) {
+              // Skip the case where DefMI has an implicit def used by UseMI.
+              if (isImplicitDefUse(&MI, &UseMI)) {
+                continue;
+              }
+              RematNode Node = {Reg, &MI, (unsigned)Gain >> 5};
+              Node.InsertPointMI = &UseMI;
+              Node.Kind = RematNode::RematKind::OneDefOneUse;
+              SRematMap[Reg] = Node;
+              SharedReducedSize += Node.Size;
+            }
+          }
+        }
+        NewRematSCnt = RematSCnt - NewRematSCnt - SharedReducedSize;
+      }
+    }
+    // If that works, continue.
+
+    // Collect live ranges from hot instructions.
+    // Find common live ranges in hot instructions.
+    // Remat these common live ranges.
+    // Apply the remat.
+
+    int NewRematVCnt = 0;
+    if (RematVCnt > 0) {
+      // TODO: V remat.
+    }
+
+    bool NeedSRemat = RematSCnt > 0;
+    bool NeedVRemat = RematVCnt > 0;
+    // If there is an sgpr spill, always do remat.
+    bool IsSRematOK =
+        (NewRematSCnt <= 0 && !SRematMap.empty()) || IsForceRematSgpr;
+    bool IsVRematOK =
+        (Status.NotBalance || NewRematVCnt <= 0) && !VRematMap.empty();
+    if (NeedSRemat && NeedVRemat) {
+      if (IsVRematOK && IsSRematOK) {
+        IsUpdated = true;
+      } else if (IsSGPRSpill) {
+        IsUpdated = true;
+      }
+    } else if (NeedSRemat) {
+      if (IsSRematOK) {
+        IsUpdated = true;
+      }
+    } else if (NeedVRemat) {
+      if (IsVRematOK) {
+        IsUpdated = true;
+      }
+    }
+    // TODO: what to do when the target cannot be reached?
+    if (NewRematSCnt > 0) {
+      if ((unsigned)NewRematSCnt <= NearTargetRegLimit) {
+        IsNearTarget = true;
+      } else {
+        if (!IsSGPRSpill)
+          return false;
+      }
+    }
+  }
+
+  if (SRematMap.empty() && VRematMap.empty()) {
+    return IsUpdated;
+  }
+
+  if (!SRematMap.empty()) {
+    IsUpdated = true;
+    applyRemat(SRematMap, HotBlocks, DT, SlotIndexes, MRI, SIRI, SIII, MF);
+    LLVM_DEBUG(llvm::dbgs() << "after hotremat"; MF.print(dbgs()););
+  }
+
+  // Balance between vector and scalar if possible.
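+  // (VGPR rematerialization is still a TODO above, so only the SGPR remat map
+  // is applied for now.)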
+  return IsUpdated;
+}
+
+bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) {
+  if (MF.size() < 2)
+    return false;
+  LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
+  MachineDominatorTree *DT =
+      &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+  MachinePostDominatorTree *PDT =
+      &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
+  MachineLoopInfo *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
+
+  bool IsNearTarget = false;
+  return hotBlockRemat(MF, MLI, LIS, DT, PDT, IsNearTarget);
+}
+
+} // namespace
+
+INITIALIZE_PASS_BEGIN(AMDGPUHotBlockRematerialize, DEBUG_TYPE,
+                      "AMDGPU rematerialize", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
+INITIALIZE_PASS_END(AMDGPUHotBlockRematerialize, DEBUG_TYPE,
+                    "AMDGPU rematerialize", false, false)
+
+char AMDGPUHotBlockRematerialize::ID = 0;
+char &llvm::AMDGPUHotBlockRematerializeID = AMDGPUHotBlockRematerialize::ID;
+
+FunctionPass *llvm::createAMDGPUHotBlockRematerializePass() {
+  return new AMDGPUHotBlockRematerialize();
+}
+
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
new file mode 100644
index 0000000000000..dc8b67e368516
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
@@ -0,0 +1,217 @@
+//===------- AMDGPUMIRUtils.cpp - Helpers for MIR passes ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Helper functions for MIR passes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMIRUtils.h"
+#include "SIRegisterInfo.h"
+#include "SIInstrInfo.h"
+
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+
+#define DEBUG_TYPE "xb-mir-util"
+using namespace llvm;
+
+namespace llvm {
+bool getNonDebugMBBEnd(MachineBasicBlock::reverse_iterator &BBEnd,
+                       MachineBasicBlock &MBB) {
+  // R.End doesn't point to the boundary instruction.
+  // Skip debug instructions.
+  while (BBEnd != MBB.rend() && BBEnd->isDebugInstr())
+    BBEnd++;
+  return BBEnd != MBB.rend();
+}
+} // namespace llvm
+
+namespace {
+bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes,
+                    SmallDenseSet<MachineBasicBlock *, 2> &TouchedMBBSet) {
+  MachineInstr *StartMI = Indexes->getInstructionFromIndex(Seg->start);
+  MachineInstr *EndMI = Indexes->getInstructionFromIndex(Seg->end);
+  // Treat a non-instruction boundary as not local.
+  if (!StartMI || !EndMI)
+    return false;
+  // It is local when the parent MBBs are the same.
+  bool IsSameMBB = StartMI->getParent() == EndMI->getParent();
+  if (!IsSameMBB)
+    return false;
+  // Collect the touched MBB.
+  MachineBasicBlock *MBB = StartMI->getParent();
+  TouchedMBBSet.insert(MBB);
+  return true;
+}
+
+bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes,
+                      SmallDenseSet<MachineBasicBlock *, 2> &TouchedMBBSet) {
+  for (const LiveRange::Segment &Seg : Range->segments) {
+    if (!isLocalSegment(&Seg, Indexes, TouchedMBBSet))
+      return false;
+  }
+  return true;
+}
+
+bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes) {
+  MachineInstr *StartMI = Indexes->getInstructionFromIndex(Seg->start);
+  MachineInstr *EndMI = Indexes->getInstructionFromIndex(Seg->end);
+  // Treat a non-instruction boundary as not local.
+  if (!StartMI || !EndMI)
+    return false;
+  // It is local when the parent MBBs are the same.
+  return StartMI->getParent() == EndMI->getParent();
+}
+
+bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes) {
+  for (const LiveRange::Segment &Seg : Range->segments) {
+    if (!isLocalSegment(&Seg, Indexes))
+      return false;
+  }
+  return true;
+}
+
+} // namespace
+
+// In a case like float4 v, where v.x is used and defined in one block and v.y
+// is used and defined in another, one live interval can touch more than one
+// MBB. TouchedMBBSet is used for scheduling, where a local live interval can
+// cross multiple regions; the live regs must be calculated for each region
+// inside the touched MBBs.
+bool llvm::isLocalLiveInterval(
+    const LiveInterval &LI, SlotIndexes *Indexes,
+    SmallDenseSet<MachineBasicBlock *, 2> &TouchedMBBSet) {
+  if (LI.hasSubRanges()) {
+    for (const auto &S : LI.subranges()) {
+      if (!isLocalLiveRange(&S, Indexes, TouchedMBBSet))
+        return false;
+    }
+  }
+  return isLocalLiveRange(&LI, Indexes, TouchedMBBSet);
+}
+
+bool llvm::isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes) {
+  if (LI.hasSubRanges()) {
+    for (const auto &S : LI.subranges()) {
+      if (!isLocalLiveRange(&S, Indexes))
+        return false;
+    }
+  }
+  return isLocalLiveRange(&LI, Indexes);
+}
+
+void llvm::dumpLiveSet(const LiveSet &LiveSet, const SIRegisterInfo *SIRI) {
+
+  dbgs() << "\n live set: \n";
+  for (auto It : LiveSet) {
+    int Reg = It.first;
+    dbgs() << printReg(Reg, SIRI);
+    if (It.second.any()) {
+      dbgs() << " mask:" << It.second.getAsInteger();
+    }
+    dbgs() << "\n";
+  }
+}
+
+namespace llvm {
+unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask,
+                    const llvm::MachineRegisterInfo &MRI,
+                    const llvm::SIRegisterInfo *SIRI) {
+  unsigned Size = SIRI->getRegSizeInBits(*MRI.getRegClass(Reg));
+  Size >>= 5;
+  if (Mask.any()) {
+    if (unsigned MaskSize = Mask.getNumLanes()) {
+      if (MaskSize < Size)
+        Size = MaskSize;
+    }
+  }
+  return Size;
+}
+
+void collectLiveSetPressure(const LiveSet &LiveSet,
+                            const MachineRegisterInfo &MRI,
+                            const SIRegisterInfo *SIRI, unsigned &VPressure,
+                            unsigned &SPressure) {
+  VPressure = 0;
+  SPressure = 0;
+  for (auto LiveIt : LiveSet) {
+    unsigned Reg = LiveIt.first;
+    unsigned Size = getRegSize(Reg, LiveIt.second, MRI, SIRI);
+    if (SIRI->isVGPR(MRI, Reg)) {
+      VPressure += Size;
+    } else {
+      SPressure += Size;
+    }
+  }
+}
+
+bool isSub0Sub1SingleDef(unsigned Reg, const MachineRegisterInfo &MRI) {
+  // Support multiple defs for the pointer pattern:
+  //   undef %808.sub0:sgpr_64 = COPY killed %795:sgpr_32
+  //   %808.sub1:sgpr_64 = S_MOV_B32 0
+  bool HasSub0 = false;
+  bool HasSub1 = false;
+  for (MachineOperand &UserDefMO : MRI.def_operands(Reg)) {
+    if (unsigned SubReg = UserDefMO.getSubReg()) {
+      bool IsSingleSubReg = false;
+      switch (SubReg) {
+      default:
+        break;
+      case AMDGPU::sub0:
+        if (!HasSub0) {
+          HasSub0 = true;
+          IsSingleSubReg = true;
+        }
+        break;
+      case AMDGPU::sub1:
+        if (!HasSub1) {
+          HasSub1 = true;
+          IsSingleSubReg = true;
+        }
+        break;
+      }
+      if (!IsSingleSubReg) {
+        HasSub0 = false;
+        break;
+      }
+    } else {
+      HasSub0 = false;
+      break;
+    }
+  }
+
+  return (HasSub0 && HasSub1);
+}
+
+bool reach_block(MachineBasicBlock *FromBB, MachineDominatorTree *DT,
+                 MachinePostDominatorTree *PDT, MachineLoopInfo *LI,
+                 MachineBasicBlock *ToBB) {
+  if (FromBB == ToBB) {
+    return true;
+  }
+
+  if (DT->dominates(FromBB, ToBB)) {
+    return true;
+  }
+
+  if (PDT->dominates(ToBB, FromBB)) {
+    return true;
+  }
+
+  if (loopContainsBoth(LI, ToBB, FromBB)) {
+    return true;
+  }
+  // TODO: cover the case where the hot BB is in a loop,
+  // and one block in that loop dominates BB, or
+  // BB post-dominates one block in that loop.
+  return false;
+}
+} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
new file mode 100644
index 0000000000000..c4452c91a43a8
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
@@ -0,0 +1,62 @@
+//===------- AMDGPUMIRUtils.h - Helpers for MIR passes --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Helper functions for MIR passes.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMIRUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMIRUTILS_H
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+
+namespace llvm {
+
+class LiveInterval;
+class SlotIndexes;
+class MachineRegisterInfo;
+class SIRegisterInfo;
+class MachineDominatorTree;
+class MachinePostDominatorTree;
+
+constexpr unsigned RegForVCC = 2;
+
+bool getNonDebugMBBEnd(llvm::MachineBasicBlock::reverse_iterator &BBEnd,
+                       llvm::MachineBasicBlock &MBB);
+
+// Check if LI lives across basic blocks; save all touched basic blocks if it
+// is local.
+bool isLocalLiveInterval(
+    const llvm::LiveInterval &LI, llvm::SlotIndexes *Indexes,
+    llvm::SmallDenseSet<llvm::MachineBasicBlock *, 2> &TouchedMBBSet);
+bool isLocalLiveInterval(const llvm::LiveInterval &LI,
+                         llvm::SlotIndexes *Indexes);
+
+bool isSub0Sub1SingleDef(unsigned Reg, const llvm::MachineRegisterInfo &MRI);
+
+using LiveSet = llvm::DenseMap<unsigned, llvm::LaneBitmask>;
+void dumpLiveSet(const LiveSet &LiveSet, const llvm::SIRegisterInfo *SIRI);
+
+unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask,
+                    const llvm::MachineRegisterInfo &MRI,
+                    const llvm::SIRegisterInfo *SIRI);
+void collectLiveSetPressure(const LiveSet &LiveSet,
+                            const llvm::MachineRegisterInfo &MRI,
+                            const llvm::SIRegisterInfo *SIRI,
+                            unsigned &VPressure, unsigned &SPressure);
+
+bool reach_block(llvm::MachineBasicBlock *FromBB,
+                 llvm::MachineDominatorTree *DT,
+                 llvm::MachinePostDominatorTree *PDT, llvm::MachineLoopInfo *LI,
+                 llvm::MachineBasicBlock *ToBB);
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
new file mode 100644
index 0000000000000..32301130606a7
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
@@ -0,0 +1,18 @@
+//==- AMDGPUOccupancyAndLatencyHelper.cpp - Helpers for occupancy + latency ==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==------------------------------------------------------------------------==//
+//
+/// \file
+/// \brief Helper functions for occupancy and latency.
+//
+//==------------------------------------------------------------------------==//
+
+namespace llvm {
+}
+
+
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
new file mode 100644
index 0000000000000..f9be0a2c73d86
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
@@ -0,0 +1,53 @@
+//==- AMDGPUOccupancyAndLatencyHelper.h - Helpers for occupancy + latency --==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Helper functions for occupancy and latency.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUOCCUPANCYANDLATENCYHELPER_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUOCCUPANCYANDLATENCYHELPER_H
+
+namespace llvm {
+
+class MachineFunction;
+class GCNSubtarget;
+class MachineLoopInfo;
+
+struct SchedScore {
+  // Score for this Sched result.
+  unsigned Occupancy = 0;
+  bool SgprSpill = false;
+  unsigned LatencyHide = 0; // Only latency hiding will split 2 loads into 2
+                            // passes?
+  unsigned MemLatency = 0;  // Only save mem latency.
+  // We want mem latency small and hiding big. Compare
+  // MemLatency - LatencyHide * Occ; smaller is better.
+  unsigned MixAlu = 0; // VALU and SALU can run in parallel if Occ > 1.
+  unsigned Alu = 0;    // Avoid sequences of s_alu instructions with a count
+                       // less than the occupancy.
+  unsigned Lds = 0;    // TODO: count lds.
+  SchedScore() {}
+
+  // Other info which can help compare schedule results.
+  float computeScore() const;
+  float computeScore2() const;
+
+  void sum(const SchedScore &S, unsigned LoopDepth = 0);
+  bool isBetter(const SchedScore &S) const;
+  bool isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc = 1) const;
+  // More latency can be hidden with ExtraOcc.
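+  // Presumably the extra latency (in cycles) that could be hidden if the
+  // occupancy were raised from TargetOccupancy by ExtraOcc waves; the
+  // implementation lands later in this patch series.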
+  unsigned latencyGain(unsigned TargetOccupancy, unsigned ExtraOcc) const;
+};
+
+SchedScore collectLatency(llvm::MachineFunction &MF,
+                          const llvm::GCNSubtarget &ST,
+                          const llvm::MachineLoopInfo *MLI = nullptr);
+
+} // namespace llvm
+#endif
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 90e3489ced923..9c1aec6cd047d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -397,6 +397,12 @@ static cl::opt<bool>
     cl::desc("Enable s_delay_alu insertion"), cl::init(true), cl::Hidden);
 
+// Enable Hot block rematerialize
+static cl::opt<bool>
+    EnableHotBlockRemat("amdgpu-enable-hot-block-remat",
+                        cl::desc("Enable HotBlock Rematerialize optimization"),
+                        cl::init(false), cl::Hidden);
+
 // Enable GFX11+ VOPD
 static cl::opt<bool>
     EnableVOPD("amdgpu-enable-vopd",
@@ -521,6 +527,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUAtomicOptimizerPass(*PR);
   initializeAMDGPULowerKernelArgumentsPass(*PR);
   initializeAMDGPUPromoteKernelArgumentsPass(*PR);
+  initializeAMDGPUHotBlockRematerializePass(*PR);
   initializeAMDGPULowerKernelAttributesPass(*PR);
   initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(*PR);
   initializeAMDGPUPostLegalizerCombinerPass(*PR);
@@ -1539,6 +1546,10 @@ void GCNPassConfig::addOptimizedRegAlloc() {
   if (TM->getOptLevel() > CodeGenOptLevel::Less)
     insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
 
+  // Rematerialize must be run before phi elimination
+  if (isPassEnabled(EnableHotBlockRemat))
+    addPass(&AMDGPUHotBlockRematerializeID);
+
   TargetPassConfig::addOptimizedRegAlloc();
 }
 
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 09a3096602fc3..79fdbba1d0db1 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -59,6 +59,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUFrameLowering.cpp
   AMDGPUGlobalISelDivergenceLowering.cpp
   AMDGPUGlobalISelUtils.cpp
+  AMDGPUHotBlockRematerialize.cpp
   AMDGPUHSAMetadataStreamer.cpp
   AMDGPUInsertDelayAlu.cpp
   AMDGPUInstCombineIntrinsic.cpp
@@ -81,10 +82,12 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUMacroFusion.cpp
   AMDGPUMCInstLower.cpp
   AMDGPUMemoryUtils.cpp
+  AMDGPUMIRUtils.cpp
   AMDGPUIGroupLP.cpp
   AMDGPUMCResourceInfo.cpp
   AMDGPUMarkLastScratchLoad.cpp
   AMDGPUMIRFormatter.cpp
+  AMDGPUOccupancyAndLatencyHelper.cpp
   AMDGPUPerfHintAnalysis.cpp
   AMDGPUPostLegalizerCombiner.cpp
   AMDGPUPreLegalizerCombiner.cpp
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 7554b9f578fcb..aa4b3f948b726 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -47,6 +47,10 @@ struct GCNRegPressure {
 
   void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); }
 
+  unsigned getMaxSGPR() const {
+    return std::max(getSGPRNum(), getSGPRTuplesWeight());
+  }
+
   /// \returns the SGPR32 pressure
   unsigned getSGPRNum() const { return Value[SGPR32]; }
   /// \returns the aggregated ArchVGPR32, AccVGPR32 pressure dependent upon \p

From 6854976b4d2ae4af1d3caba6ef2b5c39c7925d2d Mon Sep 17 00:00:00 2001
From: Adam Yang <31109344+adam-yang@users.noreply.github.com>
Date: Fri, 18 Apr 2025 15:24:29 -0700
Subject: [PATCH 02/11] First build

---
 .../AMDGPU/AMDGPUHotBlockRematerialize.cpp    | 237 ++++++++-
 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp     | 467 +++++++++++++++++-
 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h       |  40 ++
 .../AMDGPUOccupancyAndLatencyHelper.cpp       | 151 ++++++
 .../AMDGPU/AMDGPUOccupancyAndLatencyHelper.h  |  27 +
 5 files changed, 909 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
index 70b25beeb22b9..95237062a6093 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -37,6 +37,7 @@ namespace {
 
 typedef DenseSet<MachineInstr *> InstSet;
 typedef DenseSet<MachineBasicBlock *> BlockSet;
+template <typename T> using BlockMap = MapVector<MachineBasicBlock *, T>;
 
 struct RematNode {
   enum class RematKind {
@@ -107,20 +108,17 @@ class AMDGPUHotBlockRematerialize : public MachineFunctionPass {
   AMDGPUHotBlockRematerialize() : MachineFunctionPass(ID) {}
 
   bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void applyCloneRemat(RematNode &Node, std::vector<BlockLiveInfo> &HotBlocks,
+                       MachineDominatorTree *DT, MachineRegisterInfo &MRI,
+                       SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
+                       const SIInstrInfo *SIII, MachineFunction &MF);
   void applyRemat(MapVector<Register, RematNode> &RematMap,
                   std::vector<BlockLiveInfo> &HotBlocks,
                   MachineDominatorTree *DT, llvm::SlotIndexes *SlotIndexes,
                   MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
                   const SIInstrInfo *SIII, MachineFunction &MF);
-  void applyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI,
-                              llvm::SlotIndexes *SlotIndexes,
-                              const SIRegisterInfo *SIRI,
-                              const SIInstrInfo *SIII);
-  void applyCloneRemat(RematNode &Node, std::vector<BlockLiveInfo> &HotBlocks,
-                       MachineDominatorTree *DT, MachineRegisterInfo &MRI,
-                       llvm::SlotIndexes *SlotIndexes,
-                       const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
-                       MachineFunction &MF);
   bool hotBlockRemat(MachineFunction &MF, MachineLoopInfo *MLI,
                      LiveIntervals *LIS, MachineDominatorTree *DT,
                      MachinePostDominatorTree *PDT, bool &IsNearTarget);
@@ -138,6 +136,227 @@ class AMDGPUHotBlockRematerialize : public MachineFunctionPass {
   }
 };
 
+MachineBasicBlock::iterator adjustInsertPointToAvoidSccSmash(
+    MachineInstr *InstructionToMove, MachineBasicBlock *MBB,
+    MachineBasicBlock::iterator CurrentInsertPoint, MachineRegisterInfo &MRI,
+    const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
+  const bool WillSmashScc =
+      InstructionToMove->modifiesRegister(AMDGPU::SCC, SIRI);
+  if (WillSmashScc) {
+    CurrentInsertPoint = llvm::findOrCreateInsertionPointForSccDef(
+        MBB, CurrentInsertPoint, SIRI, SIII, &MRI);
+  }
+
+  return CurrentInsertPoint;
+}
+
+DenseMap<MachineBasicBlock *, BlockSet> reduceClonedMBBs(
+    unsigned Reg, BlockMap<SmallVector<MachineInstr *, 2>> &UserBlocks,
+    DenseSet<MachineBasicBlock *> &UserMBBSet,
+    std::vector<BlockLiveInfo> &HotBlocks, MachineDominatorTree *DT) {
+  // Collect the hot blocks which the reg is live into.
+  DenseSet<MachineBasicBlock *> HotBlockSet;
+  for (BlockLiveInfo &HotBlock : HotBlocks) {
+    if (HotBlock.InputLive.count(Reg)) {
+      HotBlockSet.insert(HotBlock.BB);
+    }
+  }
+
+  // For user blocks which dominate all hot blocks, no clone is needed because
+  // the value does not cross the hot blocks once the later blocks are cloned.
+  // User blocks which are dominated by all hot blocks can share clones
+  // because, once past the hot blocks, the pressure is OK.
+  DenseSet<MachineBasicBlock *> AfterHotRangeMBBs;
+  for (MachineBasicBlock *MBB : UserMBBSet) {
+    // Always clone in a hot block.
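+    // Hot blocks always get their own clone, so they are excluded from the
+    // dominance-based pruning below.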
+ if (HotBlockSet.count(MBB)) + continue; + + bool IsDomAllHotBlocks = true; + bool IsDomedByAllHotBlocks = true; + for (MachineBasicBlock *HotMBB : HotBlockSet) { + if (!DT->dominates(MBB, HotMBB)) { + IsDomAllHotBlocks = false; + } + if (!DT->dominates(HotMBB, MBB)) { + IsDomedByAllHotBlocks = false; + } + if (!IsDomAllHotBlocks && !IsDomedByAllHotBlocks) { + break; + } + } + if (IsDomAllHotBlocks) { + UserBlocks.erase(MBB); + } else if (IsDomedByAllHotBlocks) { + AfterHotRangeMBBs.insert(MBB); + } + } + + // Split after hotRange block set by domtree. + DenseMap DomMap; + if (!AfterHotRangeMBBs.empty()) { + for (MachineBasicBlock *MBB : AfterHotRangeMBBs) { + for (MachineBasicBlock *MBB2 : AfterHotRangeMBBs) { + if (MBB == MBB2) + continue; + if (DT->dominates(MBB, MBB2)) { + auto &Dom = DomMap[MBB]; + Dom.insert(MBB2); + auto &Dom2 = DomMap[MBB2]; + Dom.insert(Dom2.begin(), Dom2.end()); + } + } + } + for (MachineBasicBlock *MBB : AfterHotRangeMBBs) { + auto &Dom = DomMap[MBB]; + for (MachineBasicBlock *DomedMBB : Dom) { + // Remove domedMBB. + DomMap.erase(DomedMBB); + UserMBBSet.erase(DomedMBB); + } + } + } + + return DomMap; +} + +void updateUsers(unsigned Reg, unsigned NewReg, bool IsSubRegDef, + SmallVector &UserMIs) { + for (MachineInstr *UseMI : UserMIs) { + for (MachineOperand &MO : UseMI->operands()) { + if (!MO.isReg()) + continue; + if (MO.getReg() == Reg) { + MO.setReg(NewReg); + if (IsSubRegDef) + MO.setSubReg(0); + } + } + } +} + +void AMDGPUHotBlockRematerialize::applyCloneRemat(RematNode &Node, + std::vector &HotBlocks, + MachineDominatorTree *DT, MachineRegisterInfo &MRI, + SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, MachineFunction &MF) { + unsigned Reg = Node.Reg; + + MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); + auto DefOp = DefMI->getOperand(0); + const MCInstrDesc &Desc = DefMI->getDesc(); + const TargetRegisterClass *RC = MRI.getRegClass(Reg); + // When the unique def has subReg, just create newReg for the subReg part. + bool IsSubRegDef = false; + if (DefOp.getSubReg() != 0) { + RC = SIRI->getSubRegisterClass(RC, DefOp.getSubReg()); + IsSubRegDef = true; + } + const DebugLoc DL = DefMI->getDebugLoc(); + unsigned OpNum = DefMI->getNumOperands(); + + Node.Kind = RematNode::RematKind::Clone; + + // Group user in same blocks. + BlockMap> UserMap; + DenseSet UserMBBSet; + for (auto UseIt = MRI.use_instr_nodbg_begin(Reg); + UseIt != MRI.use_instr_nodbg_end();) { + MachineInstr &UseMI = *(UseIt++); + UserMap[UseMI.getParent()].emplace_back(&UseMI); + UserMBBSet.insert(UseMI.getParent()); + } + + DenseMap DomMap = + reduceClonedMBBs(Reg, UserMap, UserMBBSet, HotBlocks, DT); + + for (auto UseIt : UserMap) { + MachineBasicBlock *MBB = UseIt.first; + // Skip same block uses. + if (MBB == DefMI->getParent()) { + continue; + } + // Skip MBB which share clone from other MBBs. 
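
The pruning above amounts to a three-way classification of each user block against the hot-block set. Here is a minimal sketch of that policy, assuming the MachineDominatorTree and hot-block set that reduceClonedMBBs already has in scope; the enum and helper name are illustrative, not part of the patch:

// Illustrative restatement of the reduceClonedMBBs policy (not in the patch).
enum class CloneAction { SkipClone, ShareClone, CloneHere };

static CloneAction classifyUserBlock(MachineBasicBlock *MBB,
                                     const DenseSet<MachineBasicBlock *> &HotBlockSet,
                                     const MachineDominatorTree *DT) {
  if (HotBlockSet.count(MBB))
    return CloneAction::CloneHere; // Always clone inside a hot block.
  bool DomAll = true, DomedByAll = true;
  for (MachineBasicBlock *Hot : HotBlockSet) {
    DomAll &= DT->dominates(MBB, Hot);     // Value crosses the hot range anyway.
    DomedByAll &= DT->dominates(Hot, MBB); // Pressure peak is already behind us.
  }
  if (DomAll)
    return CloneAction::SkipClone; // Keep using the original def.
  return DomedByAll ? CloneAction::ShareClone : CloneAction::CloneHere;
}
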
+ if (UserMBBSet.count(MBB) == 0) + continue; + + Register NewReg = MRI.createVirtualRegister(RC); + auto NewDef = BuildMI(MF, DL, Desc).addDef(NewReg); + for (unsigned I = 1; I < OpNum; I++) { + NewDef = NewDef.add(DefMI->getOperand(I)); + } + + MachineInstr *InsertPointMI = UseIt.second.front(); + SlotIndex LastSlot = SlotIndexes->getInstructionIndex(*InsertPointMI); + + for (MachineInstr *UseMI : UseIt.second) { + SlotIndex Slot = SlotIndexes->getInstructionIndex(*UseMI); + if (LastSlot > Slot) { + LastSlot = Slot; + InsertPointMI = UseMI; + } + } + + MachineBasicBlock::iterator InsertPoint = adjustInsertPointToAvoidSccSmash( + DefMI, InsertPointMI->getParent(), InsertPointMI, MRI, SIRI, SIII); + + for (MachineMemOperand *MO : DefMI->memoperands()) { + NewDef->addMemOperand(MF, MO); + } + + MBB->insert(InsertPoint, NewDef); + + SlotIndexes->insertMachineInstrInMaps(*NewDef); + + SmallVector &UserMIs = UseIt.second; + updateUsers(Reg, NewReg, IsSubRegDef, UserMIs); + + // update users in dom MBBs. + auto DomMapIt = DomMap.find(MBB); + if (DomMapIt != DomMap.end()) { + for (MachineBasicBlock *UpdateMBB : DomMapIt->second) { + SmallVector &UserMIs = UserMap[UpdateMBB]; + updateUsers(Reg, NewReg, IsSubRegDef, UserMIs); + } + } + + llvm::removeUnusedLanes(*NewDef.getInstr(), MRI, SIRI, SIII, SlotIndexes); + } + if (MRI.use_empty(Reg)) { + SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI); + } +} + +void applyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI, + SlotIndexes *SlotIndexes, + const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII) { + MachineInstr *DefMI = Node.DefMI; + MachineInstr *InsertPointMI = Node.InsertPointMI; + MachineBasicBlock *MBB = nullptr; + + // Find a valid insert point. + MachineBasicBlock::iterator InsertPoint; + if (InsertPointMI) { + InsertPoint = InsertPointMI->getIterator(); + MBB = InsertPointMI->getParent(); + } else { + InsertPoint = Node.InsertBlock->getFirstTerminator(); + MBB = Node.InsertBlock; + } + + InsertPoint = adjustInsertPointToAvoidSccSmash(DefMI, MBB, InsertPoint, MRI, + SIRI, SIII); + + // Move instruction to new location. + DefMI->removeFromParent(); + InsertPoint->getParent()->insert(InsertPoint, DefMI); + + // Update slot index. + SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI); + SlotIndexes->insertMachineInstrInMaps(*DefMI); +} + void AMDGPUHotBlockRematerialize::applyRemat(MapVector &RematMap, std::vector &HotBlocks, MachineDominatorTree *DT, llvm::SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp index dc8b67e368516..6d6bd38c61c06 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp @@ -19,6 +19,7 @@ #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #define DEBUG_TYPE "xb-mir-util" using namespace llvm; @@ -79,14 +80,132 @@ bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes) { return true; } +// LoopInfo contains a mapping from basic block to the innermost loop. Find +// the outermost loop in the loop nest that contains BB. 
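
applyCloneRemat above places each per-block clone in front of the earliest user of that block, using SlotIndexes for the ordering. Pulled out as a standalone helper, the selection looks like this (a sketch under the assumption that every user is already present in the slot index maps):

// Pick the user with the smallest slot index; the clone is inserted there.
static MachineInstr *findEarliestUser(ArrayRef<MachineInstr *> Users,
                                      const SlotIndexes &Indexes) {
  assert(!Users.empty() && "need at least one user");
  MachineInstr *Earliest = Users.front();
  SlotIndex Best = Indexes.getInstructionIndex(*Earliest);
  for (MachineInstr *UseMI : Users.drop_front()) {
    SlotIndex S = Indexes.getInstructionIndex(*UseMI);
    if (S < Best) {
      Best = S;
      Earliest = UseMI;
    }
  }
  return Earliest;
}
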
+const MachineLoop *getOutermostLoop(const MachineLoopInfo *LI, + const MachineBasicBlock *BB) { + const MachineLoop *L = LI->getLoopFor(BB); + if (L) { + while (const MachineLoop *Parent = L->getParentLoop()) + L = Parent; + } + return L; +} + +bool loopContainsBoth(const MachineLoopInfo *LI, const MachineBasicBlock *BB1, + const MachineBasicBlock *BB2) { + const MachineLoop *L1 = getOutermostLoop(LI, BB1); + const MachineLoop *L2 = getOutermostLoop(LI, BB2); + return L1 != nullptr && L1 == L2; +} + } // namespace + +namespace llvm { + +bool isSccLiveAt(llvm::MachineBasicBlock *MBB, + llvm::MachineBasicBlock::iterator MI) { + const TargetRegisterInfo *TRI = + MBB->getParent()->getRegInfo().getTargetRegisterInfo(); + for (auto It = MI; It != MBB->end(); ++It) { + const MachineInstr &CurMI = *It; + // Hit use of scc, it is live. + if (CurMI.readsRegister(AMDGPU::SCC, TRI)) + return true; + // Hit def of scc first, not live. + if (CurMI.definesRegister(AMDGPU::SCC, TRI)) + return false; + } + // Reach the end of MBB, check live-ins of MBB successors. + for (const MachineBasicBlock *Succ : MBB->successors()) { + if (Succ->isLiveIn(AMDGPU::SCC)) + return true; + } + return false; +} + +// +// This function is useful for when we need to insert a new +// instruction that defines scc in a block and we need to find +// a location that will not smash the existing value. +// +// Starting at `BeforeInst` it will look backwards to try to find +// a place in the block where scc is dead so we can insert our new +// def there. If no location can be found it will save and restore +// scc around BeforeInst. This way BeforeInst can safely be used +// as the new insert location. +// +MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef( + MachineBasicBlock *MBB, MachineBasicBlock::iterator MI, + const TargetRegisterInfo *TRI, const SIInstrInfo *TII, + MachineRegisterInfo *MRI, SccDefInsertPointConstraintFlags Constraints) { + // If SCC is dead at MI when we can use MI as the insert point. + if (!llvm::isSccLiveAt(MBB, MI)) { + return MI; + } + + const bool CheckForExecWrite = + Constraints & SccDefInsertPointConstraintFlags::NoExecWrite; + + // Get the starting reverse iterator taking care to handle the MBB->end() + // case. + MachineBasicBlock::reverse_iterator Start; + if (MI == MBB->end()) { + Start = MBB->rbegin(); + } else { + Start = MI.getReverse(); + } + + // Otherwise, walk backwards through the block looking for a location where + // SCC is dead. + for (MachineBasicBlock::reverse_iterator It = Start, End = MBB->rend(); + It != End; ++It) { + // If the instruction modifies exec then we cannot use it as + // an insertion point (if that is a constraint from the caller). + // The check for EXEC works for both wave64 and wave32 because + // it will also catch Writes to the subregisters (e.g. exec_lo). + if (CheckForExecWrite && It->modifiesRegister(AMDGPU::EXEC, TRI)) { + break; + } + + if (It->modifiesRegister(AMDGPU::SCC, TRI) && + !It->readsRegister(AMDGPU::SCC, TRI)) { + return It->getIterator(); + } + } + + // If no safe location can be found in the block we can save and restore + // SCC around MI. There is no way to directly read or Write SCC so we use + // s_cselect to read the current value of SCC and s_cmp to Write the saved + // value back to SCC. + // + // The generated code will look like this; + // + // S_CSELECT_B32 %SavedSCC, -1, 0 # Save SCC + // <----- Newly created safe insert point. 
+ // MI + // S_CMP_LG_U32 %SavedSCC, 0 # Restore SCC + // + Register TmpScc = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + DebugLoc DL = MI->getDebugLoc(); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), TmpScc) + .addImm(-1) + .addImm(0); + BuildMI(*MBB, std::next(MI->getIterator()), DL, + TII->get(AMDGPU::S_CMP_LG_U32)) + .addReg(TmpScc, RegState::Kill) + .addImm(0); + + return MI; +} + // In case like float4 v, v.x used and defined in one block, v.y used and define // in another block, one live interval could touch more than one MBB. // TouchedMBBSet is used for scheduling where local live interval could cross // multiple regions, need to calculate livereg for each region inside touched // MBB. -bool llvm::isLocalLiveInterval( +bool isLocalLiveInterval( const LiveInterval &LI, SlotIndexes *Indexes, SmallDenseSet &TouchedMBBSet) { if (LI.hasSubRanges()) { @@ -98,7 +217,7 @@ bool llvm::isLocalLiveInterval( return isLocalLiveRange(&LI, Indexes, TouchedMBBSet); } -bool llvm::isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes) { +bool isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes) { if (LI.hasSubRanges()) { for (const auto &S : LI.subranges()) { if (!isLocalLiveRange(&S, Indexes)) @@ -108,7 +227,7 @@ bool llvm::isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes) { return isLocalLiveRange(&LI, Indexes); } -void llvm::dumpLiveSet(const LiveSet &LiveSet, const SIRegisterInfo *SIRI) { +void dumpLiveSet(const LiveSet &LiveSet, const SIRegisterInfo *SIRI) { dbgs() << "\n live set: \n"; for (auto It : LiveSet) { @@ -121,7 +240,347 @@ void llvm::dumpLiveSet(const LiveSet &LiveSet, const SIRegisterInfo *SIRI) { } } -namespace llvm { +LaneBitmask getRegMask(const MachineOperand &MO, + const MachineRegisterInfo &MRI) { + // We don't rely on read-undef_ flag because in case of tentative schedule + // tracking it isn't set correctly yet. This works correctly however since + // use mask has been tracked before using LIS. + return MO.getSubReg() == 0 + ? MRI.getMaxLaneMaskForVReg(MO.getReg()) + : MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask( + MO.getSubReg()); +} + +struct Piece { + unsigned Reg; + unsigned Offset; + unsigned Size; + static SmallVector split(std::bitset<32> Mask) { + + SmallVector Pieces; + Piece Piece = {0, 0, 0}; + for (unsigned i = 0; i < 32; i++) { + if (Mask.test(i)) { + if (Piece.Size == 0) + Piece.Offset = i; + + Piece.Size++; + // Make sure no piece bigger than 8. + if (Piece.Size == 8) { + Pieces.emplace_back(Piece); + Piece.Size = 0; + } + } else { + if (Piece.Size == 0) { + continue; + } + Pieces.emplace_back(Piece); + Piece.Size = 0; + } + } + return Pieces; + } +}; + +static unsigned getNumLanesIn32BitReg(Register Reg, const SIRegisterInfo *SIRI, + const MachineRegisterInfo &MRI) { + const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg); + const TargetRegisterClass *SubregRC = + SIRI->getSubRegisterClass(RC, AMDGPU::sub0); + return SubregRC->LaneMask.getNumLanes(); +} + +static std::vector +getMinimalSpanningSubRegIdxSetForLaneMask(const TargetRegisterInfo *TRI, + const TargetRegisterClass *RC, + LaneBitmask Mask) { + // TODO: this could replace the code it was copied from in SplitKit.cpp + + // First pass: Try to find a perfectly matching subregister index. + // If none exists find the one covering the most lanemask bits. 
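
reduceChannel later in this file uses this helper when it materializes a new S_ADD_U32 for the soffset case. A hedged sketch of such a call site, where DstReg, SrcReg, and the immediate are placeholders rather than names from the patch:

// Place an SCC-clobbering S_ADD_U32 where SCC is provably dead; otherwise the
// helper saves SCC before MI and restores it after, returning MI as the point.
MachineBasicBlock::iterator IP =
    llvm::findOrCreateInsertionPointForSccDef(MBB, MI, SIRI, SIII, &MRI);
BuildMI(*MBB, IP, MI->getDebugLoc(), SIII->get(AMDGPU::S_ADD_U32), DstReg)
    .addReg(SrcReg)
    .addImm(16); // Operand values are illustrative.
// Passing SccDefInsertPointConstraintFlags::NoExecWrite (declared in
// AMDGPUMIRUtils.h later in this patch) additionally forbids walking the
// search back across an exec write.
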
+ SmallVector PossibleIndexes; + unsigned BestIdx = 0; + const LaneBitmask Avoid = ~Mask; + { + unsigned BestCover = 0; + for (unsigned Idx = 1, E = TRI->getNumSubRegIndices(); Idx < E; ++Idx) { + // Is this index even compatible with the given class? + if (TRI->getSubClassWithSubReg(RC, Idx) != RC) + continue; + LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx); + // Early exit if we found a perfect match. + if (SubRegMask == Mask) { + BestIdx = Idx; + break; + } + + // The index must not cover any lanes outside + if ((SubRegMask & Avoid).any()) + continue; + + unsigned PopCount = SubRegMask.getNumLanes(); + PossibleIndexes.push_back(Idx); + if (PopCount > BestCover) { + BestCover = PopCount; + BestIdx = Idx; + } + } + } + + // Abort if we cannot possibly implement the COPY with the given indexes. + if (BestIdx == 0) { + LLVM_DEBUG(dbgs() << "Unable to find minimal spanning sub register(s) for " + << TRI->getRegClassName(RC) << " mask " + << PrintLaneMask(Mask) << '\n'); + assert(false && "Impossible to span reg class"); + return std::vector(); + } + + std::vector Result; + Result.push_back(BestIdx); + + // Greedy heuristic: Keep iterating keeping the best covering subreg index + // each time. + Mask &= ~(TRI->getSubRegIndexLaneMask(BestIdx)); + while (Mask.any()) { + BestIdx = 0; + int BestCover = std::numeric_limits::min(); + for (unsigned Idx : PossibleIndexes) { + LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx); + // Early exit if we found a perfect match. + if (SubRegMask == Mask) { + BestIdx = Idx; + break; + } + + // Guaranteed above + assert((SubRegMask & Avoid).none()); + + // Try to cover as much of the remaining lanes as possible but as few of + // the already covered lanes as possible. + int Cover = (SubRegMask & Mask).getNumLanes() - + (SubRegMask & ~Mask).getNumLanes(); + if (Cover > BestCover) { + BestCover = Cover; + BestIdx = Idx; + } + } + + if (BestIdx == 0) { + LLVM_DEBUG( + dbgs() << "Unable to find minimal spanning sub register(s) for " + << TRI->getRegClassName(RC) << " mask " << PrintLaneMask(Mask) + << '\n'); + assert(false && "Impossible to span reg class"); + return std::vector(); + } + + Result.push_back(BestIdx); + Mask &= ~TRI->getSubRegIndexLaneMask(BestIdx); + } + + return Result; +} + +static void updateSubReg(MachineOperand &UseMO, + const llvm::TargetRegisterClass *NewRC, + unsigned Offset, const SIRegisterInfo *SIRI) { + unsigned Size = NewRC->getLaneMask().getNumLanes(); + if (Size == 1) { + UseMO.setSubReg(0); + } else { + const uint32_t SubReg = UseMO.getSubReg(); + LaneBitmask LaneMask = SIRI->getSubRegIndexLaneMask(SubReg); + + unsigned Mask = LaneMask.getAsInteger() >> Offset; + + unsigned NewSubReg = getMinimalSpanningSubRegIdxSetForLaneMask( + SIRI, NewRC, LaneBitmask(Mask)) + .front(); + + UseMO.setSubReg(NewSubReg); + } +} + +bool reduceChannel(unsigned Offset, MachineInstr &MI, const MCInstrDesc &Desc, + MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, SlotIndexes *SlotIndexes) { + MachineOperand &DstMO = MI.getOperand(0); + // Skip case when dst subReg not 0. 
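
To see what the greedy covering returns, consider asking for lanes .y and .z of an sgpr_128. This is a sketch against the file-local helper above; whether a combined index is returned depends on the target's subregister index table:

LaneBitmask Mask = TRI->getSubRegIndexLaneMask(AMDGPU::sub1) |
                   TRI->getSubRegIndexLaneMask(AMDGPU::sub2);
// Expected: {sub1_sub2} when that index exists for the class, otherwise the
// greedy loop yields {sub1, sub2} over two rounds.
std::vector<unsigned> Idxs = getMinimalSpanningSubRegIdxSetForLaneMask(
    TRI, &AMDGPU::SGPR_128RegClass, Mask);
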
+  if (DstMO.getSubReg()) {
+    return false;
+  }
+  Register Reg = DstMO.getReg();
+
+  SmallVector<MachineOperand *> UseMOs;
+  for (MachineOperand &UseMO : MRI.use_nodbg_operands(Reg)) {
+    UseMOs.emplace_back(&UseMO);
+  }
+
+  const llvm::TargetRegisterClass *NewRC =
+      SIRI->getRegClass(Desc.operands().front().RegClass);
+  if (!NewRC->isAllocatable()) {
+    if (SIRI->isSGPRClass(NewRC))
+      NewRC = SIRI->getSGPRClassForBitWidth(NewRC->MC->RegSizeInBits);
+    else if (SIRI->isVGPRClass(NewRC))
+      NewRC = SIRI->getVGPRClassForBitWidth(NewRC->MC->RegSizeInBits);
+    else
+      return false;
+
+    if (!NewRC->isAllocatable())
+      return false;
+  }
+
+  unsigned NumLanes = NewRC->getLaneMask().getNumLanes();
+  if (Offset > 0) {
+    // Update the offset operand in MI.
+    MachineOperand *OffsetOp =
+        SIII->getNamedOperand(MI, AMDGPU::OpName::offset);
+
+    const uint32_t LaneSize = sizeof(uint32_t);
+    if (OffsetOp) {
+      if (OffsetOp->isImm()) {
+        int64_t NewImm = OffsetOp->getImm() + Offset * LaneSize;
+        if (!SIII->isLegalMUBUFImmOffset(NewImm)) {
+          return false;
+        }
+        OffsetOp->setImm(NewImm);
+      } else {
+        return false;
+      }
+    } else {
+      OffsetOp = SIII->getNamedOperand(MI, AMDGPU::OpName::soffset);
+      if (OffsetOp) {
+        Register NewOffsetReg =
+            MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+        auto OffsetAdd = BuildMI(*MI.getParent()->getParent(), MI.getDebugLoc(),
+                                 SIII->get(AMDGPU::S_ADD_U32))
+                             .addDef(NewOffsetReg)
+                             .add(*OffsetOp)
+                             .addImm(Offset * LaneSize);
+        MachineInstr *OffsetAddMI = OffsetAdd.getInstr();
+        MachineBasicBlock::iterator InsertPoint =
+            llvm::findOrCreateInsertionPointForSccDef(MI.getParent(), MI, SIRI,
+                                                      SIII, &MRI);
+        MI.getParent()->insert(InsertPoint, OffsetAddMI);
+        SIII->legalizeOperands(*OffsetAddMI);
+        OffsetOp->setReg(NewOffsetReg);
+        OffsetOp->setSubReg(0);
+        if (SlotIndexes)
+          SlotIndexes->insertMachineInstrInMaps(*OffsetAddMI);
+      } else {
+        return false;
+      }
+    }
+    // Update the subreg for users.
+    for (MachineOperand *UseMO : UseMOs) {
+      updateSubReg(*UseMO, NewRC, Offset, SIRI);
+    }
+  } else if (NumLanes == getNumLanesIn32BitReg(Reg, SIRI, MRI)) {
+    // Clear the subreg when it is a single 32-bit reg.
+    for (MachineOperand *UseMO : UseMOs) {
+      UseMO->setSubReg(0);
+    }
+  }
+
+  MI.setDesc(Desc);
+  // Mutate the reg class of Reg.
+  MRI.setRegClass(Reg, NewRC);
+  return true;
+}
+
+bool removeUnusedLanes(llvm::MachineInstr &MI, MachineRegisterInfo &MRI,
+                       const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
+                       SlotIndexes *SlotIndexes) {
+  bool IsImm = false;
+  switch (MI.getOpcode()) {
+  default:
+    break;
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX16_IMM:
+    IsImm = true;
+    LLVM_FALLTHROUGH;
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: {
+    Register Reg = MI.getOperand(0).getReg();
+    if (!MRI.getUniqueVRegDef(Reg))
+      return false;
+    LaneBitmask DstMask = getRegMask(MI.getOperand(0), MRI);
+    LaneBitmask UseMask;
+    for (MachineOperand &MO : MRI.use_operands(Reg)) {
+      UseMask |= llvm::getRegMask(MO, MRI);
+    }
+
+    const unsigned FullMask = DstMask.getAsInteger();
+    unsigned Mask = UseMask.getAsInteger();
+    if (Mask == FullMask)
+      return false;
+    // Split the mask where there are gaps, then group pieces into runs of at
+    // most 8 lanes.
+    auto Pieces = Piece::split(std::bitset<32>(Mask));
+    // Only a single piece is supported for now.
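
Piece::split, defined earlier in this file, turns the used-lane mask into contiguous runs of at most eight lanes. A quick worked case, illustrative only:

// Lanes 1 and 2 used out of a wider register: mask 0b0110 yields exactly one
// piece {Offset = 1, Size = 2}, which the switch below can serve with a
// narrower load.
auto Pieces = Piece::split(std::bitset<32>(0b0110));
assert(Pieces.size() == 1);
assert(Pieces[0].Offset == 1 && Pieces[0].Size == 2);
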
+ if (Pieces.size() != 1) + return false; + auto Piece = Pieces[0]; + if (Piece.Size > 8) + return false; + + // TODO: enable offset support when IsImm is true. + // Now if break different test when mul LaneSize or not mul for the offset. + if (IsImm && Piece.Offset != 0) + return false; + + const unsigned Num32BitLanes = + Piece.Size / getNumLanesIn32BitReg(Reg, SIRI, MRI); + + switch (Num32BitLanes) { + default: + return false; + case 1: + return reduceChannel(Piece.Offset, MI, + SIII->get(IsImm ? AMDGPU::S_BUFFER_LOAD_DWORD_IMM + : AMDGPU::S_BUFFER_LOAD_DWORD_SGPR), + MRI, SIRI, SIII, SlotIndexes); + case 2: + return reduceChannel(Piece.Offset, MI, + SIII->get(IsImm + ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM + : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR), + MRI, SIRI, SIII, SlotIndexes); + case 3: + if (FullMask == 0xff) + return false; + LLVM_FALLTHROUGH; + case 4: + return reduceChannel(Piece.Offset, MI, + SIII->get(IsImm + ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM + : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR), + MRI, SIRI, SIII, SlotIndexes); + case 5: + case 6: + case 7: + if (FullMask == 0xffff) + return false; + LLVM_FALLTHROUGH; + case 8: + return reduceChannel(Piece.Offset, MI, + SIII->get(IsImm + ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM + : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR), + MRI, SIRI, SIII, SlotIndexes); + } + + } break; + } + return false; +} + unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask, const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h index c4452c91a43a8..6b9079e5d65fb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h @@ -24,6 +24,7 @@ class LiveInterval; class SlotIndexes; class MachineRegisterInfo; class SIRegisterInfo; +class SIInstrInfo; class MachineDominatorTree; class MachinePostDominatorTree; @@ -45,6 +46,45 @@ bool isSub0Sub1SingleDef(unsigned Reg, const llvm::MachineRegisterInfo &MRI); using LiveSet = llvm::DenseMap; void dumpLiveSet(const LiveSet &LiveSet, const llvm::SIRegisterInfo *SIRI); +bool isSccLiveAt(llvm::MachineBasicBlock *MBB, + llvm::MachineBasicBlock::iterator MI); + +// An enum used to pass additional constraints to +// `FindOrCreateInsertionPointForSccDef()`. This will further +// constrain the location where the scc def can be inserted. +enum SccDefInsertPointConstraintFlags { + None = 0, // No additional constraints. + NoExecWrite = 1, // Should be no modification of exec between BeforeInst and + // insert point. +}; + +// Look for a safe place to insert an instruction that defines scc. +// +// +// This function is useful for when we need to insert a new +// instruction that defines scc in a block and we need to find +// a location that will not smash the existing value. +// +// Starting at `BeforeInst` it will look backwards to try to find +// a place in the block where scc is dead so we can insert our new +// def there. If no location can be found it will save and restore +// scc around BeforeInst. This way BeforeInst can safely be used +// as the new insert location. 
+// +llvm::MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef( + llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator BeforeInst, + const llvm::TargetRegisterInfo *TRI, const llvm::SIInstrInfo *TII, + llvm::MachineRegisterInfo *MRI, + SccDefInsertPointConstraintFlags Constraints = + SccDefInsertPointConstraintFlags::None); + +// For inst like S_BUFFER_LOAD_DWORDX16, change to S_BUFFER_LOAD_DWORDX4 if only +// used 4 lanes. +bool removeUnusedLanes(llvm::MachineInstr &MI, llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *TRI, + const llvm::SIInstrInfo *TII, + llvm::SlotIndexes *SlotIndexes); + unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask, const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp index 32301130606a7..c2dbf1a8b297e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp @@ -12,7 +12,158 @@ // //==------------------------------------------------------------------------==// +#include "AMDGPUOccupancyAndLatencyHelper.h" +#include "GCNSubtarget.h" +#include "SIInstrInfo.h" + +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineLoopInfo.h" + +#include + namespace llvm { + +void SchedScore::sum(const SchedScore &S, unsigned LoopDepth) { + unsigned LoopCount = LoopDepth > 0 ? std::pow(3, LoopDepth) : 1; + LatencyHide += LoopCount * S.LatencyHide; + MemLatency += LoopCount * S.MemLatency; + MixAlu += LoopCount * S.MixAlu; + Alu += LoopCount * S.Alu; + Lds += LoopCount * S.Lds; + SgprSpill |= S.SgprSpill; +} +// Does more occupancy give more perf. +bool SchedScore::isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc) const { + unsigned Gain = latencyGain(TargetOccupancy, ExtraOcc); + // 10% is good enough. + if ((10 * Gain) >= Alu) + return true; + return false; +} + +unsigned SchedScore::latencyGain(unsigned TgtOcc, unsigned ExtraOcc) const { + unsigned Latency = MemLatency; + return (Latency / (TgtOcc)) - (Latency / (TgtOcc + ExtraOcc)); +} + +// AMDGPULatencyTracker +AMDGPULatencyTracker::AMDGPULatencyTracker(const GCNSubtarget &ST) + : SIII(ST.getInstrInfo()), ItinerayData(ST.getInstrItineraryData()) {} + +void AMDGPULatencyTracker::scan(const MachineInstr &MI) { + if (MI.isDebugInstr()) + return; + int Latency = SIII->getInstrLatency(ItinerayData, MI); + // If inside latency hide. + if (!LatencyMIs.empty()) { + bool IsWaitCnt = false; + for (auto &MO : MI.operands()) { + if (MO.isReg()) { + Register Reg = MO.getReg(); + auto It = LatencyMIs.find(Reg); + if (It != LatencyMIs.end()) { + IsWaitCnt = true; + // If MI use mem result, update latency to mem latency. + int Cycle = It->second; + if (Cycle > Latency) + Latency = Cycle; + } + } + } + // Update latency for each mem latency inst. + for (auto It = LatencyMIs.begin(); It != LatencyMIs.end();) { + auto Prev = It; + auto L = (It++); + int Cycle = L->second; + if (Cycle <= Latency) { + // Only left cycles. + // Remove the reg. + LatencyMIs.erase(Prev); + if (IsWaitCnt && Cycle == Latency) { + Score.MemLatency += Cycle; + // Only count memLatency once, the rest is hide. + IsWaitCnt = false; + } else { + // Hide cycle or count mem latency? + Score.LatencyHide += Cycle; + } + } else { + L->second -= Latency; + // Hide latency. 
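
The two predicates above combine into a simple occupancy test: an extra wave pays off when the latency it can hide is at least a tenth of the ALU work. Worked numbers, as a sketch that assumes the SchedScore counters start at zero:

// With 4000 cycles of memory latency spread over 4 waves, one extra wave
// hides 4000/4 - 4000/5 = 200 more cycles; 10 * 200 >= 1500 ALU cycles, so
// the function counts as memory bound.
SchedScore S{};
S.MemLatency = 4000;
S.Alu = 1500;
assert(S.latencyGain(/*TgtOcc=*/4, /*ExtraOcc=*/1) == 200);
assert(S.isMemBound(/*TargetOccupancy=*/4, /*ExtraOcc=*/1));
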
+ Score.LatencyHide += Latency; + } + } + + } else { + // TODO: check branch/lds? + // TODO: check prevVAlu? + auto GetAluStatus = [](const MachineInstr &MI, + const llvm::SIInstrInfo *SIII) { + AluStatus Status = AluStatus::Nothing; + if (SIII->isVALU(MI.getOpcode())) { + Status = AluStatus::Vector; + } else if (SIII->isSALU(MI.getOpcode())) { + Status = AluStatus::Scalar; + } + return Status; + }; + AluStatus Status = GetAluStatus(MI, SIII); + + switch (PrevStatus) { + case AluStatus::Nothing: { + Score.Alu += Latency; + Score.MixAlu += Latency; + PrevStatus = Status; + } break; + case AluStatus::Vector: + case AluStatus::Scalar: { + Score.Alu += Latency; + // Ignore mix alu. + if (PrevStatus != Status) { + PrevStatus = AluStatus::Nothing; + } else { + Score.MixAlu += Latency; + } + } break; + } + } + // Update latency inst. + if (SIII->isHighLatencyDef(MI.getOpcode()) && MI.mayLoad()) { + Register Reg = MI.getOperand(0).getReg(); + // TODO: get correct latency. + // SIII->getInstrLatency(ItinerayData, MI); + constexpr unsigned kHighLetency = 180; + LatencyMIs[Reg] = kHighLetency; + } else if (SIII->isLowLatencyInstruction(MI) && MI.mayLoad()) { + Register Reg = MI.getOperand(0).getReg(); + // TODO: get correct latency. + // SIII->getInstrLatency(ItinerayData, MI); + constexpr unsigned kLowLetency = 35; + LatencyMIs[Reg] = kLowLetency; + } } +SchedScore collectLatency(MachineFunction &MF, const llvm::GCNSubtarget &ST, + const llvm::MachineLoopInfo *MLI) { + SchedScore TotalScore; + for (auto &MFI : MF) { + MachineBasicBlock &MBB = MFI; + MachineBasicBlock::iterator Next; + AMDGPULatencyTracker LatencyTracker(ST); + for (auto &MI : MBB) { + LatencyTracker.scan(MI); + } + unsigned LoopDepth = 0; + if (MLI) { + LoopDepth = MLI->getLoopDepth(&MBB); + } + TotalScore.sum(LatencyTracker.Score, LoopDepth); + } + return TotalScore; +} + +} // namespace llvm + + diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h index f9be0a2c73d86..b513e7335ffe4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h @@ -15,11 +15,16 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUOCCUPANCYANDLATENCYHELPER_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUOCCUPANCYANDLATENCYHELPER_H +#include "llvm/MC/MCInstrItineraries.h" +#include "llvm/ADT/DenseMap.h" + namespace llvm { +class MachineInstr; class MachineFunction; class GCNSubtarget; class MachineLoopInfo; +class SIInstrInfo; struct SchedScore { // Score for this Sched result. @@ -45,6 +50,28 @@ struct SchedScore { unsigned latencyGain(unsigned TargetOccupancy, unsigned ExtraOcc) const; }; +struct AMDGPULatencyTracker { + AMDGPULatencyTracker(const llvm::GCNSubtarget &ST); + const llvm::SIInstrInfo *SIII; + const llvm::InstrItineraryData *ItinerayData; + // Latency MI dst reg to cycle map. + llvm::DenseMap LatencyMIs; + SchedScore Score; + // Low latency MI not wait. + unsigned HideLatency = 0; + unsigned MemLatency = 0; + // For simple, only consider mixture as one valu one salu. + // Not group now. 
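
collectLatency above folds the per-block scores with SchedScore::sum, which weights a block at loop depth D by 3^D as a rough trip-count guess. Illustrative use, under the same zero-initialization assumption as before:

SchedScore Total{}, Block{};
Block.Alu = 10;
Total.sum(Block, /*LoopDepth=*/2); // Counted 3^2 = 9 times.
assert(Total.Alu == 90);
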
+ unsigned PrevSAlu = 0; + unsigned PrevVAlu = 0; + enum class AluStatus { + Nothing, + Vector, + Scalar, + } PrevStatus = AluStatus::Nothing; + void scan(const llvm::MachineInstr &MI); +}; + SchedScore collectLatency(llvm::MachineFunction &MF, const llvm::GCNSubtarget &ST, const llvm::MachineLoopInfo *MLI = nullptr); From 3c2b1f3acd43503c7f90781784687cd473af09fc Mon Sep 17 00:00:00 2001 From: Adam Yang <31109344+adam-yang@users.noreply.github.com> Date: Mon, 21 Apr 2025 15:29:26 -0700 Subject: [PATCH 03/11] Tests --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 13 +- .../CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir | 565 ++++++++++++++++++ .../test/CodeGen/AMDGPU/remat/simple_sgpr.mir | 452 ++++++++++++++ 3 files changed, 1029 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir create mode 100644 llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index 95237062a6093..5c628a89766c3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -31,6 +31,8 @@ using namespace llvm; +static cl::opt + EnableAggressive("amdgpu-remat-enable-hot-block-remat-aggressive"); static cl::opt TargetOccupancy("amdgpu-remat-target-occupancy"); namespace { @@ -723,6 +725,12 @@ int rematGain(MachineInstr *DefMI, unsigned Reg, const MachineRegisterInfo &MRI, if (IsSingleDef) { // The reg might share with other candidates, check It here. // Count share reg in getReducedSize. + if (EnableAggressive) { + // In case of aggressive remat, treat multi use reg as shared reg and + // ignore size of shared reg. + if (!MRI.hasOneNonDBGUse(Reg)) + continue; + } const TargetRegisterClass *OpRC = MRI.getRegClass(Reg); if (unsigned SubIdx = MO.getSubReg()) { if (OpRC) @@ -1253,6 +1261,9 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF, MachineLoop unsigned SLimit = Status.TargetSLimit; int RematSCnt = Status.MaxSPressure - SLimit; + // when agressive sgpr remat, reserve some for allocation lost. + if (EnableAggressive) + RematSCnt += NearTargetRegLimit; bool IsSGPRSpill = false; if (RematSCnt > 0) { @@ -1367,7 +1378,7 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF, MachineLoop for (RematNode &Node : SRematList) { SRematMap[Node.Reg] = Node; RematCnt += Node.Size; - if (RematCnt > RematSCnt) + if (RematCnt > RematSCnt && !EnableAggressive) break; } NewRematSCnt = 0; diff --git a/llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir b/llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir new file mode 100644 index 0000000000000..02a9836313360 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir @@ -0,0 +1,565 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-remat-enable-hot-block-remat-aggressive -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat | FileCheck %s + +# Check that the buffer loads have been moved to the use and the lanes are reduced +# correctly. 
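
For reference, the new pass stays off in the default pipeline (the amdgpu-enable-hot-block-remat switch from the first patch gates it), so these tests drive it directly with -run-pass. An equivalent standalone invocation, with the file path as the only assumption:

llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-hot-block-remat \
    -amdgpu-remat-enable-hot-block-remat-aggressive -verify-machineinstrs \
    llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir -o -
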
+# +# CHECK: bb.2: +#========================================================================== +# X4_IMM, Using .x +# CHECK: %[[#reg0:]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %{{.+}}, 0, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg0]], %{{.+}}, 0, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg0]], %{{.+}}, 4, 0 +# X4_IMM, Using .xy +# CHECK: %[[#reg1:]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM %{{.+}}, 16, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg1]].sub0, %{{.+}}, 16, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg1]].sub1, %{{.+}}, 20, 0 +# X4_IMM, Using .xyz +# CHECK: %[[#reg2:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 32, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg2]].sub0, %{{.+}}, 32, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg2]].sub1, %{{.+}}, 36, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg2]].sub2, %{{.+}}, 40, 0 +# X4_IMM, Using .yz +# CHECK: %[[#reg3:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 48, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg3]].sub1, %{{.+}}, 48, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg3]].sub2, %{{.+}}, 52, 0 +# X4_IMM, Using .yzw +# CHECK: %[[#reg4:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 64, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg4]].sub1, %{{.+}}, 64, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg4]].sub2, %{{.+}}, 68, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg4]].sub3, %{{.+}}, 72, 0 +#========================================================================== +# X8_IMM, Using .x +# CHECK: %[[#reg5:]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %{{.+}}, 80, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg5]], %{{.+}}, 80, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg5]], %{{.+}}, 84, 0 +# X8_IMM, Using .xy +# CHECK: %[[#reg6:]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM %{{.+}}, 96, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg6]].sub0, %{{.+}}, 96, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg6]].sub1, %{{.+}}, 100, 0 +# X8_IMM, Using .xyz +# CHECK: %[[#reg7:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 112, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg7]].sub0, %{{.+}}, 112, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg7]].sub1, %{{.+}}, 116, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg7]].sub2, %{{.+}}, 120, 0 +# X8_IMM, Using .xyzw +# CHECK: %[[#reg8:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 128, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub0, %{{.+}}, 128, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub1, %{{.+}}, 132, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub2, %{{.+}}, 136, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub3, %{{.+}}, 140, 0 +# X8_IMM, Using .xyzw + 5th dword +# CHECK: %[[#reg9:]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %{{.+}}, 144, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub0, %{{.+}}, 144, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub1, %{{.+}}, 148, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub2, %{{.+}}, 152, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub3, %{{.+}}, 156, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub4, %{{.+}}, 160, 0 +#========================================================================== +# X16_IMM, Using .xy and .zw +# CHECK: %[[#reg10:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 160, 0 +# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg10]].sub0_sub1, %{{.+}}, 160, 0 +# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg10]].sub2_sub3, %{{.+}}, 164, 0 +#========================================================================== +# X4_SGPR, Using .x +# CHECK: %[[#reg11:]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %{{.+}}, %{{.+}}, 0 
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg11]], %{{.+}}, 176, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg11]], %{{.+}}, 180, 0 +# X8_SGPR, Using .xy +# CHECK: %[[#reg12:]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_SGPR %{{.+}}, %{{.+}}, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg12]].sub0, %{{.+}}, 192, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg12]].sub1, %{{.+}}, 196, 0 +# X16_SGPR, Using .xy + .zw +# CHECK: %[[#reg13:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR %{{.+}}, %{{.+}}, 0 +# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg13]].sub0_sub1, %{{.+}}, 208, 0 +# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg13]].sub2_sub3, %{{.+}}, 216, 0 +#========================================================================== +# +# +# CHECK: %[[#reg14:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 224, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg14]], %{{.+}}, 224, 0 +# CHECK: %[[#reg15:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 240, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg15]], %{{.+}}, 240, 0 +# CHECK: %[[#reg16:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 256, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg16]], %{{.+}}, 256, 0 +# CHECK: %[[#reg17:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 272, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg17]], %{{.+}}, 272, 0 +# CHECK: %[[#reg18:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 288, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg18]], %{{.+}}, 288, 0 +# CHECK: %[[#reg19:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 304, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg19]], %{{.+}}, 304, 0 +# CHECK: %[[#reg20:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 320, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg20]], %{{.+}}, 320, 0 +# CHECK: %[[#reg21:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 336, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg21]], %{{.+}}, 336, 0 +# CHECK: %[[#reg22:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 352, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg22]], %{{.+}}, 352, 0 +# CHECK: %[[#reg23:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 368, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg23]], %{{.+}}, 368, 0 +# CHECK: %[[#reg24:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 384, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg24]], %{{.+}}, 384, 0 +# CHECK: %[[#reg25:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 400, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg25]], %{{.+}}, 400, 0 +# CHECK: %[[#reg26:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 416, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg26]], %{{.+}}, 416, 0 +# CHECK: %[[#reg27:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 432, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg27]], %{{.+}}, 432, 0 +# CHECK: %[[#reg28:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 448, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg28]], %{{.+}}, 448, 0 +# CHECK: %[[#reg29:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 464, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg29]], %{{.+}}, 464, 0 +# CHECK: %[[#reg30:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 480, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg30]], %{{.+}}, 480, 0 +# CHECK: %[[#reg31:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 496, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg31]], %{{.+}}, 496, 0 +# CHECK: %[[#reg32:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 512, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg32]], %{{.+}}, 512, 0 +# CHECK: %[[#reg33:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 528, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg33]], %{{.+}}, 
528, 0 +# CHECK: %[[#reg34:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 544, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg34]], %{{.+}}, 544, 0 +# CHECK: %[[#reg35:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 560, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg35]], %{{.+}}, 560, 0 +# CHECK: %[[#reg36:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 576, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg36]], %{{.+}}, 576, 0 +# CHECK: %[[#reg37:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 592, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg37]], %{{.+}}, 592, 0 +# CHECK: %[[#reg38:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 608, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg38]], %{{.+}}, 608, 0 +# CHECK: %[[#reg39:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 624, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg39]], %{{.+}}, 624, 0 +# CHECK: %[[#reg40:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 640, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg40]], %{{.+}}, 640, 0 +# CHECK: %[[#reg41:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 656, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg41]], %{{.+}}, 656, 0 +# CHECK: %[[#reg42:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 672, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg42]], %{{.+}}, 672, 0 +# CHECK: %[[#reg43:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 688, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg43]], %{{.+}}, 688, 0 +# CHECK: %[[#reg44:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 704, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg44]], %{{.+}}, 704, 0 +# CHECK: %[[#reg45:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 720, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg45]], %{{.+}}, 720, 0 +# CHECK: %[[#reg46:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 736, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg46]], %{{.+}}, 736, 0 +# CHECK: %[[#reg47:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 752, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg47]], %{{.+}}, 752, 0 +# CHECK: %[[#reg48:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 768, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg48]], %{{.+}}, 768, 0 +# CHECK: %[[#reg49:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 784, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg49]], %{{.+}}, 784, 0 +# CHECK: %[[#reg50:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 800, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg50]], %{{.+}}, 800, 0 +# CHECK: %[[#reg51:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 816, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg51]], %{{.+}}, 816, 0 +# CHECK: %[[#reg52:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 832, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg52]], %{{.+}}, 832, 0 +# CHECK: %[[#reg53:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 848, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg53]], %{{.+}}, 848, 0 +# CHECK: %[[#reg54:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 864, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg54]], %{{.+}}, 864, 0 +# CHECK: %[[#reg55:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 880, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg55]], %{{.+}}, 880, 0 +# CHECK: %[[#reg56:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 896, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg56]], %{{.+}}, 896, 0 +# CHECK: %[[#reg57:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 912, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg57]], %{{.+}}, 912, 0 +# CHECK: %[[#reg58:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 928, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg58]], %{{.+}}, 928, 
0 +# CHECK: %[[#reg59:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 944, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg59]], %{{.+}}, 944, 0 +# CHECK: %[[#reg60:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 960, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg60]], %{{.+}}, 960, 0 +# CHECK: %[[#reg61:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 976, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg61]], %{{.+}}, 976, 0 +# CHECK: %[[#reg62:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 992, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg62]], %{{.+}}, 992, 0 +# CHECK: %[[#reg63:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 1008, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg63]], %{{.+}}, 1008, 0 + + +--- | + source_filename = ".\main.ll" + define amdgpu_ps void @main() #1 { + ret void + } + attributes #1 = { "target-cpu"="gfx1010" } + !llvm.ident = !{!0} + !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"} +... +--- +name: main +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } + - { reg: '$sgpr1' } + - { reg: '$sgpr2' } + - { reg: '$sgpr3' } + - { reg: '$sgpr4' } + - { reg: '$sgpr5' } + - { reg: '$sgpr6' } + - { reg: '$sgpr7' } + - { reg: '$sgpr8' } + - { reg: '$sgpr8' } + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $vgpr0, $vgpr1 + + %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1 + %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3 + %2:sgpr_128 = REG_SEQUENCE $sgpr8, %subreg.sub0, $sgpr9, %subreg.sub1, $sgpr10, %subreg.sub2, $sgpr11, %subreg.sub3 + + ; X4_IMM + %3000:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 0, 0 + %3001:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 16, 0 + %3002:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 32, 0 + %3003:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 48, 0 + %3004:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 64, 0 + + ; X8_IMM + %3005:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 80, 0 + %3006:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 96, 0 + %3007:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 112, 0 + %3008:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 128, 0 + %3009:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 144, 0 + + ; X16_IMM + %30010:sgpr_512 = S_BUFFER_LOAD_DWORDX16_IMM %2:sgpr_128, 160, 0 + + ; X4_SGPR + %50:sgpr_32 = COPY $sgpr0 + %30011:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR %2:sgpr_128, %50, 0 + + ; X8_SGPR + %51:sgpr_32 = COPY $sgpr1 + %30012:sgpr_256 = S_BUFFER_LOAD_DWORDX8_SGPR %2:sgpr_128, %51, 0 + + ; X16_SGPR + %52:sgpr_32 = COPY $sgpr2 + %30013:sgpr_512 = S_BUFFER_LOAD_DWORDX16_SGPR %2:sgpr_128, %52, 0 + + %30014:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 224, 0 + %30015:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 240, 0 + %30016:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 256, 0 + %30017:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 272, 0 + %30018:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 288, 0 + %30019:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 304, 0 + %30020:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 320, 0 + %30021:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 336, 0 + %30022:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 352, 0 + %30023:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 368, 0 + %30024:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 384, 0 + %30025:sgpr_128 = 
S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 400, 0 + %30026:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 416, 0 + %30027:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 432, 0 + %30028:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 448, 0 + %30029:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 464, 0 + %30030:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 480, 0 + %30031:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 496, 0 + %30032:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 512, 0 + %30033:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 528, 0 + %30034:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 544, 0 + %30035:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 560, 0 + %30036:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 576, 0 + %30037:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 592, 0 + %30038:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 608, 0 + %30039:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 624, 0 + %30040:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 640, 0 + %30041:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 656, 0 + %30042:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 672, 0 + %30043:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 688, 0 + %30044:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 704, 0 + %30045:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 720, 0 + %30046:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 736, 0 + %30047:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 752, 0 + %30048:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 768, 0 + %30049:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 784, 0 + %30050:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 800, 0 + %30051:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 816, 0 + %30052:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 832, 0 + %30053:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 848, 0 + %30054:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 864, 0 + %30055:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 880, 0 + %30056:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 896, 0 + %30057:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 912, 0 + %30058:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 928, 0 + %30059:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 944, 0 + %30060:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 960, 0 + %30061:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 976, 0 + %30062:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 992, 0 + %30063:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 1008, 0 + + %100:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %101:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %102:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %103:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %104:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %105:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %106:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %107:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %108:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %109:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1010:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1011:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1012:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1013:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1014:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1015:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1016:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1017:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1018:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1019:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + 
%1020:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1021:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1022:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1023:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1024:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1025:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1026:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1027:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1028:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1029:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1030:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1031:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1032:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1033:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1034:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1035:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1036:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1037:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1038:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1039:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1040:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1041:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1042:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1043:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1044:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1045:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1046:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1047:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1048:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1049:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1050:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1051:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1052:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1053:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1054:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1055:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1056:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1057:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1058:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1059:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1060:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1061:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1062:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1063:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + + + %8000:vgpr_32 = IMPLICIT_DEF + %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %116:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + %8001:vgpr_32 = COPY %8000 + S_BRANCH %bb.2 + + bb.2: + + %3:vgpr_32 = IMPLICIT_DEF + ;========================================================================== + ; X4_IMM, Using .x + S_BUFFER_STORE_DWORD_IMM %3000.sub0, %1:sgpr_128, 0, 0 + S_BUFFER_STORE_DWORD_IMM %3000.sub0, %1:sgpr_128, 4, 0 ; Do it a second time, since the lane reduction triggers on clone, and clone only happens when there are multiple uses. 
+ + ; X4_IMM, Using .xy + S_BUFFER_STORE_DWORD_IMM %3001.sub0, %1:sgpr_128, 16, 0 + S_BUFFER_STORE_DWORD_IMM %3001.sub1, %1:sgpr_128, 20, 0 + + ; X4_IMM, Using .xyz + S_BUFFER_STORE_DWORD_IMM %3002.sub0, %1:sgpr_128, 32, 0 + S_BUFFER_STORE_DWORD_IMM %3002.sub1, %1:sgpr_128, 36, 0 + S_BUFFER_STORE_DWORD_IMM %3002.sub2, %1:sgpr_128, 40, 0 + + ; X4_IMM, Using .yz + S_BUFFER_STORE_DWORD_IMM %3003.sub1, %1:sgpr_128, 48, 0 + S_BUFFER_STORE_DWORD_IMM %3003.sub2, %1:sgpr_128, 52, 0 + + ; X4_IMM, Using .yzw + S_BUFFER_STORE_DWORD_IMM %3004.sub1, %1:sgpr_128, 64, 0 + S_BUFFER_STORE_DWORD_IMM %3004.sub2, %1:sgpr_128, 68, 0 + S_BUFFER_STORE_DWORD_IMM %3004.sub3, %1:sgpr_128, 72, 0 + + ;========================================================================== + ; X8_IMM, Using .x + S_BUFFER_STORE_DWORD_IMM %3005.sub0, %1:sgpr_128, 80, 0 + S_BUFFER_STORE_DWORD_IMM %3005.sub0, %1:sgpr_128, 84, 0 + + ; X8_IMM, Using .xy + S_BUFFER_STORE_DWORD_IMM %3006.sub0, %1:sgpr_128, 96, 0 + S_BUFFER_STORE_DWORD_IMM %3006.sub1, %1:sgpr_128, 100, 0 + + ; X8_IMM, Using .xyz + S_BUFFER_STORE_DWORD_IMM %3007.sub0, %1:sgpr_128, 112, 0 + S_BUFFER_STORE_DWORD_IMM %3007.sub1, %1:sgpr_128, 116, 0 + S_BUFFER_STORE_DWORD_IMM %3007.sub2, %1:sgpr_128, 120, 0 + + ; X8_IMM, Using .xyzw + S_BUFFER_STORE_DWORD_IMM %3008.sub0, %1:sgpr_128, 128, 0 + S_BUFFER_STORE_DWORD_IMM %3008.sub1, %1:sgpr_128, 132, 0 + S_BUFFER_STORE_DWORD_IMM %3008.sub2, %1:sgpr_128, 136, 0 + S_BUFFER_STORE_DWORD_IMM %3008.sub3, %1:sgpr_128, 140, 0 + + ; X8_IMM, Using .xyzw + 5th dword + S_BUFFER_STORE_DWORD_IMM %3009.sub0, %1:sgpr_128, 144, 0 + S_BUFFER_STORE_DWORD_IMM %3009.sub1, %1:sgpr_128, 148, 0 + S_BUFFER_STORE_DWORD_IMM %3009.sub2, %1:sgpr_128, 152, 0 + S_BUFFER_STORE_DWORD_IMM %3009.sub3, %1:sgpr_128, 156, 0 + S_BUFFER_STORE_DWORD_IMM %3009.sub4, %1:sgpr_128, 160, 0 + + ;========================================================================== + ; X16_IMM, Using .xy and .zw + S_BUFFER_STORE_DWORDX2_IMM %30010.sub0_sub1, %1:sgpr_128, 160, 0 + S_BUFFER_STORE_DWORDX2_IMM %30010.sub2_sub3, %1:sgpr_128, 164, 0 + + ;========================================================================== + ; X4_SGPR, Using .x + S_BUFFER_STORE_DWORD_IMM %30011.sub0, %1:sgpr_128, 176, 0 + S_BUFFER_STORE_DWORD_IMM %30011.sub0, %1:sgpr_128, 180, 0 + + ; X8_SGPR, Using .xy + S_BUFFER_STORE_DWORD_IMM %30012.sub0, %1:sgpr_128, 192, 0 + S_BUFFER_STORE_DWORD_IMM %30012.sub1, %1:sgpr_128, 196, 0 + + ; X16_SGPR, Using .xy + .zw + S_BUFFER_STORE_DWORDX2_IMM %30013.sub0_sub1, %1:sgpr_128, 208, 0 + S_BUFFER_STORE_DWORDX2_IMM %30013.sub2_sub3, %1:sgpr_128, 216, 0 + + ;========================================================================== + S_BUFFER_STORE_DWORDX4_IMM killed %30014:sgpr_128, %1:sgpr_128, 224, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30015:sgpr_128, %1:sgpr_128, 240, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30016:sgpr_128, %1:sgpr_128, 256, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30017:sgpr_128, %1:sgpr_128, 272, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30018:sgpr_128, %1:sgpr_128, 288, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30019:sgpr_128, %1:sgpr_128, 304, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30020:sgpr_128, %1:sgpr_128, 320, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30021:sgpr_128, %1:sgpr_128, 336, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30022:sgpr_128, %1:sgpr_128, 352, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30023:sgpr_128, %1:sgpr_128, 368, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30024:sgpr_128, %1:sgpr_128, 384, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30025:sgpr_128, 
%1:sgpr_128, 400, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30026:sgpr_128, %1:sgpr_128, 416, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30027:sgpr_128, %1:sgpr_128, 432, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30028:sgpr_128, %1:sgpr_128, 448, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30029:sgpr_128, %1:sgpr_128, 464, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30030:sgpr_128, %1:sgpr_128, 480, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30031:sgpr_128, %1:sgpr_128, 496, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30032:sgpr_128, %1:sgpr_128, 512, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30033:sgpr_128, %1:sgpr_128, 528, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30034:sgpr_128, %1:sgpr_128, 544, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30035:sgpr_128, %1:sgpr_128, 560, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30036:sgpr_128, %1:sgpr_128, 576, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30037:sgpr_128, %1:sgpr_128, 592, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30038:sgpr_128, %1:sgpr_128, 608, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30039:sgpr_128, %1:sgpr_128, 624, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30040:sgpr_128, %1:sgpr_128, 640, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30041:sgpr_128, %1:sgpr_128, 656, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30042:sgpr_128, %1:sgpr_128, 672, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30043:sgpr_128, %1:sgpr_128, 688, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30044:sgpr_128, %1:sgpr_128, 704, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30045:sgpr_128, %1:sgpr_128, 720, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30046:sgpr_128, %1:sgpr_128, 736, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30047:sgpr_128, %1:sgpr_128, 752, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30048:sgpr_128, %1:sgpr_128, 768, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30049:sgpr_128, %1:sgpr_128, 784, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30050:sgpr_128, %1:sgpr_128, 800, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30051:sgpr_128, %1:sgpr_128, 816, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30052:sgpr_128, %1:sgpr_128, 832, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30053:sgpr_128, %1:sgpr_128, 848, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30054:sgpr_128, %1:sgpr_128, 864, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30055:sgpr_128, %1:sgpr_128, 880, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30056:sgpr_128, %1:sgpr_128, 896, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30057:sgpr_128, %1:sgpr_128, 912, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30058:sgpr_128, %1:sgpr_128, 928, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30059:sgpr_128, %1:sgpr_128, 944, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30060:sgpr_128, %1:sgpr_128, 960, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30061:sgpr_128, %1:sgpr_128, 976, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30062:sgpr_128, %1:sgpr_128, 992, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30063:sgpr_128, %1:sgpr_128, 1008, 0 + + EXP 0, killed %100, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %101, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %102, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %103, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %104, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %105, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %106, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %107, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %108, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %109, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1010, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1011, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1012, 
%3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1013, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1014, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1015, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1016, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1017, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1018, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1019, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1020, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1021, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1022, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1023, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1024, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1025, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1026, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1027, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1028, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1029, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1030, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1031, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1032, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1033, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1034, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1035, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1036, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1037, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1038, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1039, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1040, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1041, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1042, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1043, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1044, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1045, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1046, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1047, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1048, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1049, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1050, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1051, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1052, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1053, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1054, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1055, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1056, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1057, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1058, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1059, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1060, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1061, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1062, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1063, %3, %3, %3, -1, -1, 15, implicit $exec + + + S_ENDPGM 0 +... 
+ + + + diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir new file mode 100644 index 0000000000000..69875261b74e9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir @@ -0,0 +1,452 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat | FileCheck %s + +# Check that the loads have been moved to the use +# CHECK: bb.2: +# CHECK: %[[#reg0:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 0, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg0]], %{{.+}}, 0, 0 +# CHECK: %[[#reg1:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 16, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg1]], %{{.+}}, 16, 0 +# CHECK: %[[#reg2:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 32, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg2]], %{{.+}}, 32, 0 +# CHECK: %[[#reg3:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 48, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg3]], %{{.+}}, 48, 0 +# CHECK: %[[#reg4:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 64, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg4]], %{{.+}}, 64, 0 +# CHECK: %[[#reg5:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 80, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg5]], %{{.+}}, 80, 0 +# CHECK: %[[#reg6:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 96, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg6]], %{{.+}}, 96, 0 +# CHECK: %[[#reg7:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 112, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg7]], %{{.+}}, 112, 0 +# CHECK: %[[#reg8:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 128, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg8]], %{{.+}}, 128, 0 +# CHECK: %[[#reg9:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 144, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg9]], %{{.+}}, 144, 0 +# CHECK: %[[#reg10:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 160, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg10]], %{{.+}}, 160, 0 +# CHECK: %[[#reg11:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 176, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg11]], %{{.+}}, 176, 0 +# CHECK: %[[#reg12:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 192, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg12]], %{{.+}}, 192, 0 +# CHECK: %[[#reg13:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 208, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg13]], %{{.+}}, 208, 0 +# CHECK: %[[#reg14:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 224, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg14]], %{{.+}}, 224, 0 +# CHECK: %[[#reg15:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 240, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg15]], %{{.+}}, 240, 0 +# CHECK: %[[#reg16:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 256, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg16]], %{{.+}}, 256, 0 +# CHECK: %[[#reg17:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 272, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg17]], %{{.+}}, 272, 0 +# CHECK: %[[#reg18:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 288, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg18]], %{{.+}}, 288, 0 +# CHECK: %[[#reg19:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 304, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg19]], %{{.+}}, 304, 0 +# CHECK: %[[#reg20:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 320, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg20]], %{{.+}}, 320, 0 +# CHECK: %[[#reg21:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 336, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg21]], %{{.+}}, 336, 0 +# CHECK: %[[#reg22:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 352, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg22]], %{{.+}}, 352, 0 +# CHECK: 
%[[#reg23:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 368, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg23]], %{{.+}}, 368, 0 +# CHECK: %[[#reg24:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 384, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg24]], %{{.+}}, 384, 0 +# CHECK: %[[#reg25:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 400, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg25]], %{{.+}}, 400, 0 +# CHECK: %[[#reg26:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 416, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg26]], %{{.+}}, 416, 0 +# CHECK: %[[#reg27:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 432, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg27]], %{{.+}}, 432, 0 +# CHECK: %[[#reg28:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 448, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg28]], %{{.+}}, 448, 0 +# CHECK: %[[#reg29:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 464, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg29]], %{{.+}}, 464, 0 +# CHECK: %[[#reg30:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 480, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg30]], %{{.+}}, 480, 0 +# CHECK: %[[#reg31:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 496, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg31]], %{{.+}}, 496, 0 +# CHECK: %[[#reg32:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 512, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg32]], %{{.+}}, 512, 0 +# CHECK: %[[#reg33:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 528, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg33]], %{{.+}}, 528, 0 +# CHECK: %[[#reg34:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 544, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg34]], %{{.+}}, 544, 0 +# CHECK: %[[#reg35:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 560, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg35]], %{{.+}}, 560, 0 +# CHECK: %[[#reg36:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 576, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg36]], %{{.+}}, 576, 0 +# CHECK: %[[#reg37:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 592, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg37]], %{{.+}}, 592, 0 +# CHECK: %[[#reg38:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 608, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg38]], %{{.+}}, 608, 0 +# CHECK: %[[#reg39:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 624, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg39]], %{{.+}}, 624, 0 +# CHECK: %[[#reg40:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 640, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg40]], %{{.+}}, 640, 0 +# CHECK: %[[#reg41:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 656, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg41]], %{{.+}}, 656, 0 +# CHECK: %[[#reg42:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 672, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg42]], %{{.+}}, 672, 0 +# CHECK: %[[#reg43:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 688, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg43]], %{{.+}}, 688, 0 +# CHECK: %[[#reg44:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 704, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg44]], %{{.+}}, 704, 0 +# CHECK: %[[#reg45:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 720, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg45]], %{{.+}}, 720, 0 +# CHECK: %[[#reg46:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 736, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg46]], %{{.+}}, 736, 0 +# CHECK: %[[#reg47:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 752, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg47]], %{{.+}}, 752, 0 +# CHECK: %[[#reg48:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 768, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg48]], %{{.+}}, 768, 0 +# CHECK: %[[#reg49:]]:sgpr_128 = S_LOAD_DWORDX4_IMM 
%{{.+}}, 784, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg49]], %{{.+}}, 784, 0 +# CHECK: %[[#reg50:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 800, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg50]], %{{.+}}, 800, 0 +# CHECK: %[[#reg51:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 816, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg51]], %{{.+}}, 816, 0 +# CHECK: %[[#reg52:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 832, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg52]], %{{.+}}, 832, 0 +# CHECK: %[[#reg53:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 848, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg53]], %{{.+}}, 848, 0 +# CHECK: %[[#reg54:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 864, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg54]], %{{.+}}, 864, 0 +# CHECK: %[[#reg55:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 880, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg55]], %{{.+}}, 880, 0 +# CHECK: %[[#reg56:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 896, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg56]], %{{.+}}, 896, 0 +# CHECK: %[[#reg57:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 912, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg57]], %{{.+}}, 912, 0 +# CHECK: %[[#reg58:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 928, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg58]], %{{.+}}, 928, 0 +# CHECK: %[[#reg59:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 944, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg59]], %{{.+}}, 944, 0 +# CHECK: %[[#reg60:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 960, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg60]], %{{.+}}, 960, 0 +# CHECK: %[[#reg61:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 976, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg61]], %{{.+}}, 976, 0 +# CHECK: %[[#reg62:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 992, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg62]], %{{.+}}, 992, 0 +# CHECK: %[[#reg63:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 1008, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg63]], %{{.+}}, 1008, 0 + + +--- | + source_filename = ".\main.ll" + define amdgpu_ps void @main() #1 { + ret void + } + attributes #1 = { "target-cpu"="gfx1010" } + !llvm.ident = !{!0} + !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"} +... 
+--- +name: main +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } + - { reg: '$sgpr1' } + - { reg: '$sgpr2' } + - { reg: '$sgpr3' } + - { reg: '$sgpr4' } + - { reg: '$sgpr5' } + - { reg: '$sgpr6' } + - { reg: '$sgpr7' } + - { reg: '$sgpr8' } + - { reg: '$sgpr8' } + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1 + + %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1 + ; undef %0.sub0:sgpr_64 = COPY $sgpr0 + ; undef %0.sub1:sgpr_64 = COPY $sgpr1 + + %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3 + ; undef %1.sub0:sgpr_128 = COPY $sgpr4 + ; undef %1.sub1:sgpr_128 = COPY $sgpr5 + ; undef %1.sub2:sgpr_128 = COPY $sgpr6 + ; undef %1.sub3:sgpr_128 = COPY $sgpr7 + + %3000:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 0, 0 + %3001:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 16, 0 + %3002:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 32, 0 + %3003:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 48, 0 + %3004:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 64, 0 + %3005:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 80, 0 + %3006:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 96, 0 + %3007:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 112, 0 + %3008:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 128, 0 + %3009:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 144, 0 + %30010:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 160, 0 + %30011:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 176, 0 + %30012:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 192, 0 + %30013:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 208, 0 + %30014:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 224, 0 + %30015:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 240, 0 + %30016:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 256, 0 + %30017:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 272, 0 + %30018:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 288, 0 + %30019:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 304, 0 + %30020:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 320, 0 + %30021:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 336, 0 + %30022:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 352, 0 + %30023:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 368, 0 + %30024:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 384, 0 + %30025:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 400, 0 + %30026:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 416, 0 + %30027:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 432, 0 + %30028:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 448, 0 + %30029:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 464, 0 + %30030:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 480, 0 + %30031:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 496, 0 + %30032:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 512, 0 + %30033:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 528, 0 + %30034:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 544, 0 + %30035:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 560, 0 + %30036:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 576, 0 + %30037:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 592, 0 + %30038:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 608, 0 + %30039:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 624, 0 + %30040:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 640, 0 + %30041:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 656, 0 + %30042:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 672, 0 + %30043:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 688, 0 + %30044:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 704, 0 + %30045:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 720, 0 + 
%30046:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 736, 0 + %30047:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 752, 0 + %30048:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 768, 0 + %30049:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 784, 0 + %30050:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 800, 0 + %30051:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 816, 0 + %30052:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 832, 0 + %30053:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 848, 0 + %30054:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 864, 0 + %30055:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 880, 0 + %30056:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 896, 0 + %30057:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 912, 0 + %30058:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 928, 0 + %30059:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 944, 0 + %30060:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 960, 0 + %30061:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 976, 0 + %30062:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 992, 0 + %30063:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 1008, 0 + + %100:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %101:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %102:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %103:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %104:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %105:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %106:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %107:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %108:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %109:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1010:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1011:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1012:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1013:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1014:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1015:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1016:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1017:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1018:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1019:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1020:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1021:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1022:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1023:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1024:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1025:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1026:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1027:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1028:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1029:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1030:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1031:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1032:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1033:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1034:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1035:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1036:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1037:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1038:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1039:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1040:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1041:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1042:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1043:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1044:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1045:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1046:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1047:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1048:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1049:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1050:vgpr_32 = 
V_MOV_B32_e32 0, implicit $exec + %1051:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1052:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1053:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1054:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1055:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1056:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1057:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1058:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1059:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1060:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1061:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1062:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1063:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + + + %8000:vgpr_32 = IMPLICIT_DEF + %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %116:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + %8001:vgpr_32 = COPY %8000 + S_BRANCH %bb.2 + + bb.2: + + %3:vgpr_32 = IMPLICIT_DEF + S_BUFFER_STORE_DWORDX4_IMM killed %3000:sgpr_128, %1:sgpr_128, 0, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3001:sgpr_128, %1:sgpr_128, 16, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3002:sgpr_128, %1:sgpr_128, 32, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3003:sgpr_128, %1:sgpr_128, 48, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3004:sgpr_128, %1:sgpr_128, 64, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3005:sgpr_128, %1:sgpr_128, 80, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3006:sgpr_128, %1:sgpr_128, 96, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3007:sgpr_128, %1:sgpr_128, 112, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3008:sgpr_128, %1:sgpr_128, 128, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3009:sgpr_128, %1:sgpr_128, 144, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30010:sgpr_128, %1:sgpr_128, 160, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30011:sgpr_128, %1:sgpr_128, 176, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30012:sgpr_128, %1:sgpr_128, 192, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30013:sgpr_128, %1:sgpr_128, 208, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30014:sgpr_128, %1:sgpr_128, 224, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30015:sgpr_128, %1:sgpr_128, 240, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30016:sgpr_128, %1:sgpr_128, 256, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30017:sgpr_128, %1:sgpr_128, 272, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30018:sgpr_128, %1:sgpr_128, 288, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30019:sgpr_128, %1:sgpr_128, 304, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30020:sgpr_128, %1:sgpr_128, 320, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30021:sgpr_128, %1:sgpr_128, 336, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30022:sgpr_128, %1:sgpr_128, 352, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30023:sgpr_128, %1:sgpr_128, 368, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30024:sgpr_128, %1:sgpr_128, 384, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30025:sgpr_128, %1:sgpr_128, 400, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30026:sgpr_128, %1:sgpr_128, 416, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30027:sgpr_128, %1:sgpr_128, 432, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30028:sgpr_128, %1:sgpr_128, 448, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30029:sgpr_128, %1:sgpr_128, 464, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30030:sgpr_128, %1:sgpr_128, 480, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30031:sgpr_128, %1:sgpr_128, 496, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30032:sgpr_128, %1:sgpr_128, 512, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30033:sgpr_128, %1:sgpr_128, 528, 0 + 
S_BUFFER_STORE_DWORDX4_IMM killed %30034:sgpr_128, %1:sgpr_128, 544, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30035:sgpr_128, %1:sgpr_128, 560, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30036:sgpr_128, %1:sgpr_128, 576, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30037:sgpr_128, %1:sgpr_128, 592, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30038:sgpr_128, %1:sgpr_128, 608, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30039:sgpr_128, %1:sgpr_128, 624, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30040:sgpr_128, %1:sgpr_128, 640, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30041:sgpr_128, %1:sgpr_128, 656, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30042:sgpr_128, %1:sgpr_128, 672, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30043:sgpr_128, %1:sgpr_128, 688, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30044:sgpr_128, %1:sgpr_128, 704, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30045:sgpr_128, %1:sgpr_128, 720, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30046:sgpr_128, %1:sgpr_128, 736, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30047:sgpr_128, %1:sgpr_128, 752, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30048:sgpr_128, %1:sgpr_128, 768, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30049:sgpr_128, %1:sgpr_128, 784, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30050:sgpr_128, %1:sgpr_128, 800, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30051:sgpr_128, %1:sgpr_128, 816, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30052:sgpr_128, %1:sgpr_128, 832, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30053:sgpr_128, %1:sgpr_128, 848, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30054:sgpr_128, %1:sgpr_128, 864, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30055:sgpr_128, %1:sgpr_128, 880, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30056:sgpr_128, %1:sgpr_128, 896, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30057:sgpr_128, %1:sgpr_128, 912, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30058:sgpr_128, %1:sgpr_128, 928, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30059:sgpr_128, %1:sgpr_128, 944, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30060:sgpr_128, %1:sgpr_128, 960, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30061:sgpr_128, %1:sgpr_128, 976, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30062:sgpr_128, %1:sgpr_128, 992, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30063:sgpr_128, %1:sgpr_128, 1008, 0 + + EXP 0, killed %100, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %101, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %102, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %103, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %104, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %105, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %106, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %107, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %108, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %109, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1010, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1011, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1012, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1013, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1014, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1015, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1016, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1017, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1018, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1019, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1020, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1021, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, 
killed %1022, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1023, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1024, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1025, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1026, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1027, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1028, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1029, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1030, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1031, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1032, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1033, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1034, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1035, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1036, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1037, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1038, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1039, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1040, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1041, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1042, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1043, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1044, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1045, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1046, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1047, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1048, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1049, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1050, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1051, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1052, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1053, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1054, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1055, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1056, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1057, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1058, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1059, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1060, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1061, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1062, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1063, %3, %3, %3, -1, -1, 15, implicit $exec + + + S_ENDPGM 0 +... 
From dbdc9a48b78f7cc97f25d7e0195d1e5423d69265 Mon Sep 17 00:00:00 2001 From: Adam Yang <31109344+adam-yang@users.noreply.github.com> Date: Mon, 21 Apr 2025 15:59:28 -0700 Subject: [PATCH 04/11] Added test for the phi crash in pressure tracker --- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 18 +- llvm/test/CodeGen/AMDGPU/remat/phi.mir | 607 ++++++++++++++++++++++ 2 files changed, 618 insertions(+), 7 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/remat/phi.mir diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index f74d12cfab0c0..7f76d14eb9ab0 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -549,22 +549,26 @@ bool GCNDownwardRPTracker::advanceBeforeNext(MachineInstr *MI, if (!S.liveAt(SI)) { if (It == LiveRegs.end()) { It = LiveRegs.find(MO.getReg()); - if (It == LiveRegs.end()) + if (!MRI->isSSA() && It == LiveRegs.end()) llvm_unreachable("register isn't live"); } - auto PrevMask = It->second; - It->second &= ~S.LaneMask; - CurPressure.inc(MO.getReg(), PrevMask, It->second, *MRI); + if (It != LiveRegs.end()) { + auto PrevMask = It->second; + It->second &= ~S.LaneMask; + CurPressure.inc(MO.getReg(), PrevMask, It->second, *MRI); + } } } if (It != LiveRegs.end() && It->second.none()) LiveRegs.erase(It); } else if (!LI.liveAt(SI)) { auto It = LiveRegs.find(MO.getReg()); - if (It == LiveRegs.end()) + if (!MRI->isSSA() && It == LiveRegs.end()) llvm_unreachable("register isn't live"); - CurPressure.inc(MO.getReg(), It->second, LaneBitmask::getNone(), *MRI); - LiveRegs.erase(It); + if (It != LiveRegs.end()) { + CurPressure.inc(MO.getReg(), It->second, LaneBitmask::getNone(), *MRI); + LiveRegs.erase(It); + } } } diff --git a/llvm/test/CodeGen/AMDGPU/remat/phi.mir b/llvm/test/CodeGen/AMDGPU/remat/phi.mir new file mode 100644 index 0000000000000..2d22e9fba2593 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/phi.mir @@ -0,0 +1,607 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -amdgpu-remat-enable-hot-block-remat-aggressive -run-pass=amdgpu-hot-block-remat -o - | FileCheck %s + +# This test simply checks that GCNDownwardRPTracker does not crash when PHIs are +# present. + +# CHECK: S_ENDPGM + +--- | + source_filename = ".\main.ll" + define amdgpu_ps void @main() #1 { + ret void + } + attributes #1 = { "target-cpu"="gfx1010" } + !llvm.ident = !{!0} + !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"} +... 
+--- +name: main +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } + - { reg: '$sgpr1' } + - { reg: '$sgpr2' } + - { reg: '$sgpr3' } + - { reg: '$sgpr4' } + - { reg: '$sgpr5' } + - { reg: '$sgpr6' } + - { reg: '$sgpr7' } + - { reg: '$sgpr8' } + - { reg: '$sgpr8' } + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1 + + %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1 + ; undef %0.sub0:sgpr_64 = COPY $sgpr0 + ; undef %0.sub1:sgpr_64 = COPY $sgpr1 + + %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3 + ; undef %1.sub0:sgpr_128 = COPY $sgpr4 + ; undef %1.sub1:sgpr_128 = COPY $sgpr5 + ; undef %1.sub2:sgpr_128 = COPY $sgpr6 + ; undef %1.sub3:sgpr_128 = COPY $sgpr7 + + + %2000:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2001:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2002:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2003:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2004:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2005:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2006:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2007:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2008:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2009:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2010:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2011:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2012:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2013:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2014:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2015:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2016:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2017:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2018:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2019:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2020:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2021:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2022:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2023:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2024:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2025:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2026:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2027:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2028:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2029:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2030:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2031:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2032:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2033:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2034:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2035:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2036:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2037:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2038:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2039:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2040:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2041:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2042:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2043:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2044:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2045:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2046:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2047:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2048:vgpr_32 = 
V_MOV_B32_e32 $vgpr0, implicit $exec + %2049:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2050:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2051:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2052:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2053:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2054:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2055:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2056:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2057:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2058:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2059:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2060:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2061:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2062:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2063:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2064:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2065:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2066:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2067:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2068:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2069:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2070:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2071:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2072:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2073:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2074:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2075:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2076:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2077:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2078:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2079:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2080:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2081:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2082:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2083:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2084:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2085:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2086:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2087:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2088:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2089:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2090:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2091:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2092:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2093:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2094:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2095:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2096:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2097:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2098:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2099:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %3000:sgpr_32 = S_MOV_B32 0 + %3001:sgpr_32 = S_MOV_B32 1 + %3002:sgpr_32 = S_MOV_B32 2 + %3003:sgpr_32 = S_MOV_B32 3 + %3004:sgpr_32 = S_MOV_B32 4 + %3005:sgpr_32 = S_MOV_B32 5 + %3006:sgpr_32 = S_MOV_B32 6 + %3007:sgpr_32 = S_MOV_B32 7 + %3008:sgpr_32 = S_MOV_B32 8 + %3009:sgpr_32 = S_MOV_B32 9 + %3010:sgpr_32 = S_MOV_B32 10 + %3011:sgpr_32 = S_MOV_B32 11 + %3012:sgpr_32 = S_MOV_B32 12 + %3013:sgpr_32 = S_MOV_B32 13 + %3014:sgpr_32 = S_MOV_B32 14 + %3015:sgpr_32 = S_MOV_B32 15 + %3016:sgpr_32 = S_MOV_B32 16 + %3017:sgpr_32 = S_MOV_B32 17 + %3018:sgpr_32 = S_MOV_B32 18 + %3019:sgpr_32 = S_MOV_B32 19 + %3020:sgpr_32 = S_MOV_B32 20 + %3021:sgpr_32 = S_MOV_B32 21 + %3022:sgpr_32 = S_MOV_B32 22 + 
%3023:sgpr_32 = S_MOV_B32 23 + %3024:sgpr_32 = S_MOV_B32 24 + %3025:sgpr_32 = S_MOV_B32 25 + %3026:sgpr_32 = S_MOV_B32 26 + %3027:sgpr_32 = S_MOV_B32 27 + %3028:sgpr_32 = S_MOV_B32 28 + %3029:sgpr_32 = S_MOV_B32 29 + %3030:sgpr_32 = S_MOV_B32 30 + %3031:sgpr_32 = S_MOV_B32 31 + %3032:sgpr_32 = S_MOV_B32 32 + %3033:sgpr_32 = S_MOV_B32 33 + %3034:sgpr_32 = S_MOV_B32 34 + %3035:sgpr_32 = S_MOV_B32 35 + %3036:sgpr_32 = S_MOV_B32 36 + %3037:sgpr_32 = S_MOV_B32 37 + %3038:sgpr_32 = S_MOV_B32 38 + %3039:sgpr_32 = S_MOV_B32 39 + %3040:sgpr_32 = S_MOV_B32 40 + %3041:sgpr_32 = S_MOV_B32 41 + %3042:sgpr_32 = S_MOV_B32 42 + %3043:sgpr_32 = S_MOV_B32 43 + %3044:sgpr_32 = S_MOV_B32 44 + %3045:sgpr_32 = S_MOV_B32 45 + %3046:sgpr_32 = S_MOV_B32 46 + %3047:sgpr_32 = S_MOV_B32 47 + %3048:sgpr_32 = S_MOV_B32 48 + %3049:sgpr_32 = S_MOV_B32 49 + %3050:sgpr_32 = S_MOV_B32 50 + %3051:sgpr_32 = S_MOV_B32 51 + %3052:sgpr_32 = S_MOV_B32 52 + %3053:sgpr_32 = S_MOV_B32 53 + %3054:sgpr_32 = S_MOV_B32 54 + %3055:sgpr_32 = S_MOV_B32 55 + %3056:sgpr_32 = S_MOV_B32 56 + %3057:sgpr_32 = S_MOV_B32 57 + %3058:sgpr_32 = S_MOV_B32 58 + %3059:sgpr_32 = S_MOV_B32 59 + %3060:sgpr_32 = S_MOV_B32 60 + %3061:sgpr_32 = S_MOV_B32 61 + %3062:sgpr_32 = S_MOV_B32 62 + %3063:sgpr_32 = S_MOV_B32 63 + %3064:sgpr_32 = S_MOV_B32 64 + %3065:sgpr_32 = S_MOV_B32 65 + %3066:sgpr_32 = S_MOV_B32 66 + %3067:sgpr_32 = S_MOV_B32 67 + %3068:sgpr_32 = S_MOV_B32 68 + %3069:sgpr_32 = S_MOV_B32 69 + %3070:sgpr_32 = S_MOV_B32 70 + %3071:sgpr_32 = S_MOV_B32 71 + %3072:sgpr_32 = S_MOV_B32 72 + %3073:sgpr_32 = S_MOV_B32 73 + %3074:sgpr_32 = S_MOV_B32 74 + %3075:sgpr_32 = S_MOV_B32 75 + %3076:sgpr_32 = S_MOV_B32 76 + %3077:sgpr_32 = S_MOV_B32 77 + %3078:sgpr_32 = S_MOV_B32 78 + %3079:sgpr_32 = S_MOV_B32 79 + %3080:sgpr_32 = S_MOV_B32 80 + %3081:sgpr_32 = S_MOV_B32 81 + %3082:sgpr_32 = S_MOV_B32 82 + %3083:sgpr_32 = S_MOV_B32 83 + %3084:sgpr_32 = S_MOV_B32 84 + %3085:sgpr_32 = S_MOV_B32 85 + %3086:sgpr_32 = S_MOV_B32 86 + %3087:sgpr_32 = S_MOV_B32 87 + %3088:sgpr_32 = S_MOV_B32 88 + %3089:sgpr_32 = S_MOV_B32 89 + %3090:sgpr_32 = S_MOV_B32 90 + %3091:sgpr_32 = S_MOV_B32 91 + %3092:sgpr_32 = S_MOV_B32 92 + %3093:sgpr_32 = S_MOV_B32 93 + %3094:sgpr_32 = S_MOV_B32 94 + %3095:sgpr_32 = S_MOV_B32 95 + %3096:sgpr_32 = S_MOV_B32 96 + %3097:sgpr_32 = S_MOV_B32 97 + %3098:sgpr_32 = S_MOV_B32 98 + %3099:sgpr_32 = S_MOV_B32 99 + + + %8000:vgpr_32 = IMPLICIT_DEF + %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %116:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + + %8001:vgpr_32 = COPY %8000 + %8002:vgpr_32 = COPY %8000 + %8003:vgpr_32 = COPY %8000 + %8004:vgpr_32 = COPY %8000 + %8005:vgpr_32 = COPY %8000 + %8006:vgpr_32 = COPY %8000 + %8007:vgpr_32 = COPY %8000 + %8008:vgpr_32 = COPY %8000 + %8009:vgpr_32 = COPY %8000 + %8010:vgpr_32 = COPY %8000 + %8011:vgpr_32 = COPY %8000 + %8012:vgpr_32 = COPY %8000 + %8013:vgpr_32 = COPY %8000 + %8014:vgpr_32 = COPY %8000 + %8015:vgpr_32 = COPY %8000 + %8016:vgpr_32 = COPY %8000 + %8017:vgpr_32 = COPY %8000 + + %9001:vgpr_32 = COPY %8001 + %9002:vgpr_32 = COPY %8002 + %9003:vgpr_32 = COPY %8003 + %9004:vgpr_32 = COPY %8004 + %9005:vgpr_32 = COPY %8005 + %9006:vgpr_32 = COPY %8006 + %9007:vgpr_32 = COPY %8007 + %9008:vgpr_32 = COPY %8008 + %9009:vgpr_32 = COPY %8009 + %9010:vgpr_32 = COPY %8010 + %9011:vgpr_32 = COPY %8011 + %9012:vgpr_32 = COPY %8012 + %9013:vgpr_32 = COPY %8013 
+ %9014:vgpr_32 = COPY %8014 + %9015:vgpr_32 = COPY %8015 + %9016:vgpr_32 = COPY %8016 + %9017:vgpr_32 = COPY %8017 + + S_BRANCH %bb.2 + + bb.2: + %5000:sgpr_32 = PHI %3000, %bb.0, %8001, %bb.1 + %5001:sgpr_32 = PHI %3001, %bb.0, %8001, %bb.1 + %5002:sgpr_32 = PHI %3002, %bb.0, %8001, %bb.1 + %5003:sgpr_32 = PHI %3003, %bb.0, %8001, %bb.1 + %5004:sgpr_32 = PHI %3004, %bb.0, %8001, %bb.1 + %5005:sgpr_32 = PHI %3005, %bb.0, %8001, %bb.1 + %5006:sgpr_32 = PHI %3006, %bb.0, %8001, %bb.1 + %5007:sgpr_32 = PHI %3007, %bb.0, %8001, %bb.1 + %5008:sgpr_32 = PHI %3008, %bb.0, %8001, %bb.1 + %5009:sgpr_32 = PHI %3009, %bb.0, %8001, %bb.1 + %5010:sgpr_32 = PHI %3010, %bb.0, %8001, %bb.1 + %5011:sgpr_32 = PHI %3011, %bb.0, %8001, %bb.1 + %5012:sgpr_32 = PHI %3012, %bb.0, %8001, %bb.1 + %5013:sgpr_32 = PHI %3013, %bb.0, %8001, %bb.1 + %5014:sgpr_32 = PHI %3014, %bb.0, %8001, %bb.1 + %5015:sgpr_32 = PHI %3015, %bb.0, %8001, %bb.1 + %5016:sgpr_32 = PHI %3016, %bb.0, %8001, %bb.1 + %5017:sgpr_32 = PHI %3017, %bb.0, %8001, %bb.1 + %5018:sgpr_32 = PHI %3018, %bb.0, %8001, %bb.1 + %5019:sgpr_32 = PHI %3019, %bb.0, %8001, %bb.1 + %5020:sgpr_32 = PHI %3020, %bb.0, %8001, %bb.1 + %5021:sgpr_32 = PHI %3021, %bb.0, %8001, %bb.1 + %5022:sgpr_32 = PHI %3022, %bb.0, %8001, %bb.1 + %5023:sgpr_32 = PHI %3023, %bb.0, %8001, %bb.1 + %5024:sgpr_32 = PHI %3024, %bb.0, %8001, %bb.1 + %5025:sgpr_32 = PHI %3025, %bb.0, %8001, %bb.1 + %5026:sgpr_32 = PHI %3026, %bb.0, %8001, %bb.1 + %5027:sgpr_32 = PHI %3027, %bb.0, %8001, %bb.1 + %5028:sgpr_32 = PHI %3028, %bb.0, %8001, %bb.1 + %5029:sgpr_32 = PHI %3029, %bb.0, %8001, %bb.1 + %5030:sgpr_32 = PHI %3030, %bb.0, %8001, %bb.1 + %5031:sgpr_32 = PHI %3031, %bb.0, %8001, %bb.1 + %5032:sgpr_32 = PHI %3032, %bb.0, %8001, %bb.1 + %5033:sgpr_32 = PHI %3033, %bb.0, %8001, %bb.1 + %5034:sgpr_32 = PHI %3034, %bb.0, %8001, %bb.1 + %5035:sgpr_32 = PHI %3035, %bb.0, %8001, %bb.1 + %5036:sgpr_32 = PHI %3036, %bb.0, %8001, %bb.1 + %5037:sgpr_32 = PHI %3037, %bb.0, %8001, %bb.1 + %5038:sgpr_32 = PHI %3038, %bb.0, %8001, %bb.1 + %5039:sgpr_32 = PHI %3039, %bb.0, %8001, %bb.1 + %5040:sgpr_32 = PHI %3040, %bb.0, %8001, %bb.1 + %5041:sgpr_32 = PHI %3041, %bb.0, %8001, %bb.1 + %5042:sgpr_32 = PHI %3042, %bb.0, %8001, %bb.1 + %5043:sgpr_32 = PHI %3043, %bb.0, %8001, %bb.1 + %5044:sgpr_32 = PHI %3044, %bb.0, %8001, %bb.1 + %5045:sgpr_32 = PHI %3045, %bb.0, %8001, %bb.1 + %5046:sgpr_32 = PHI %3046, %bb.0, %8001, %bb.1 + %5047:sgpr_32 = PHI %3047, %bb.0, %8001, %bb.1 + %5048:sgpr_32 = PHI %3048, %bb.0, %8001, %bb.1 + %5049:sgpr_32 = PHI %3049, %bb.0, %8001, %bb.1 + %5050:sgpr_32 = PHI %3050, %bb.0, %8001, %bb.1 + %5051:sgpr_32 = PHI %3051, %bb.0, %8001, %bb.1 + %5052:sgpr_32 = PHI %3052, %bb.0, %8001, %bb.1 + %5053:sgpr_32 = PHI %3053, %bb.0, %8001, %bb.1 + %5054:sgpr_32 = PHI %3054, %bb.0, %8001, %bb.1 + %5055:sgpr_32 = PHI %3055, %bb.0, %8001, %bb.1 + %5056:sgpr_32 = PHI %3056, %bb.0, %8001, %bb.1 + %5057:sgpr_32 = PHI %3057, %bb.0, %8001, %bb.1 + %5058:sgpr_32 = PHI %3058, %bb.0, %8001, %bb.1 + %5059:sgpr_32 = PHI %3059, %bb.0, %8001, %bb.1 + %5060:sgpr_32 = PHI %3060, %bb.0, %8001, %bb.1 + %5061:sgpr_32 = PHI %3061, %bb.0, %8001, %bb.1 + %5062:sgpr_32 = PHI %3062, %bb.0, %8001, %bb.1 + %5063:sgpr_32 = PHI %3063, %bb.0, %8001, %bb.1 + %5064:sgpr_32 = PHI %3064, %bb.0, %8001, %bb.1 + %5065:sgpr_32 = PHI %3065, %bb.0, %8001, %bb.1 + %5066:sgpr_32 = PHI %3066, %bb.0, %8001, %bb.1 + %5067:sgpr_32 = PHI %3067, %bb.0, %8001, %bb.1 + %5068:sgpr_32 = PHI %3068, %bb.0, %8001, %bb.1 + %5069:sgpr_32 = PHI %3069, 
%bb.0, %8001, %bb.1 + %5070:sgpr_32 = PHI %3070, %bb.0, %8001, %bb.1 + %5071:sgpr_32 = PHI %3071, %bb.0, %8001, %bb.1 + %5072:sgpr_32 = PHI %3072, %bb.0, %8001, %bb.1 + %5073:sgpr_32 = PHI %3073, %bb.0, %8001, %bb.1 + %5074:sgpr_32 = PHI %3074, %bb.0, %8001, %bb.1 + %5075:sgpr_32 = PHI %3075, %bb.0, %8001, %bb.1 + %5076:sgpr_32 = PHI %3076, %bb.0, %8001, %bb.1 + %5077:sgpr_32 = PHI %3077, %bb.0, %8001, %bb.1 + %5078:sgpr_32 = PHI %3078, %bb.0, %8001, %bb.1 + %5079:sgpr_32 = PHI %3079, %bb.0, %8001, %bb.1 + %5080:sgpr_32 = PHI %3080, %bb.0, %8001, %bb.1 + %5081:sgpr_32 = PHI %3081, %bb.0, %8001, %bb.1 + %5082:sgpr_32 = PHI %3082, %bb.0, %8001, %bb.1 + %5083:sgpr_32 = PHI %3083, %bb.0, %8001, %bb.1 + %5084:sgpr_32 = PHI %3084, %bb.0, %8001, %bb.1 + %5085:sgpr_32 = PHI %3085, %bb.0, %8001, %bb.1 + %5086:sgpr_32 = PHI %3086, %bb.0, %8001, %bb.1 + %5087:sgpr_32 = PHI %3087, %bb.0, %8001, %bb.1 + %5088:sgpr_32 = PHI %3088, %bb.0, %8001, %bb.1 + %5089:sgpr_32 = PHI %3089, %bb.0, %8001, %bb.1 + %5090:sgpr_32 = PHI %3090, %bb.0, %8001, %bb.1 + %5091:sgpr_32 = PHI %3091, %bb.0, %8001, %bb.1 + %5092:sgpr_32 = PHI %3092, %bb.0, %8001, %bb.1 + %5093:sgpr_32 = PHI %3093, %bb.0, %8001, %bb.1 + %5094:sgpr_32 = PHI %3094, %bb.0, %8001, %bb.1 + %5095:sgpr_32 = PHI %3095, %bb.0, %8001, %bb.1 + %5096:sgpr_32 = PHI %3096, %bb.0, %8001, %bb.1 + %5097:sgpr_32 = PHI %3097, %bb.0, %8001, %bb.1 + %5098:sgpr_32 = PHI %3098, %bb.0, %8001, %bb.1 + %5099:sgpr_32 = PHI %3099, %bb.0, %8001, %bb.1 + + + %3:vgpr_32 = IMPLICIT_DEF + + %6000:vgpr_32 = V_MOV_B32_e32 %5000, implicit $exec + %6001:vgpr_32 = V_MOV_B32_e32 %5001, implicit $exec + %6002:vgpr_32 = V_MOV_B32_e32 %5002, implicit $exec + %6003:vgpr_32 = V_MOV_B32_e32 %5003, implicit $exec + %6004:vgpr_32 = V_MOV_B32_e32 %5004, implicit $exec + %6005:vgpr_32 = V_MOV_B32_e32 %5005, implicit $exec + %6006:vgpr_32 = V_MOV_B32_e32 %5006, implicit $exec + %6007:vgpr_32 = V_MOV_B32_e32 %5007, implicit $exec + %6008:vgpr_32 = V_MOV_B32_e32 %5008, implicit $exec + %6009:vgpr_32 = V_MOV_B32_e32 %5009, implicit $exec + %6010:vgpr_32 = V_MOV_B32_e32 %5010, implicit $exec + %6011:vgpr_32 = V_MOV_B32_e32 %5011, implicit $exec + %6012:vgpr_32 = V_MOV_B32_e32 %5012, implicit $exec + %6013:vgpr_32 = V_MOV_B32_e32 %5013, implicit $exec + %6014:vgpr_32 = V_MOV_B32_e32 %5014, implicit $exec + %6015:vgpr_32 = V_MOV_B32_e32 %5015, implicit $exec + %6016:vgpr_32 = V_MOV_B32_e32 %5016, implicit $exec + %6017:vgpr_32 = V_MOV_B32_e32 %5017, implicit $exec + %6018:vgpr_32 = V_MOV_B32_e32 %5018, implicit $exec + %6019:vgpr_32 = V_MOV_B32_e32 %5019, implicit $exec + %6020:vgpr_32 = V_MOV_B32_e32 %5020, implicit $exec + %6021:vgpr_32 = V_MOV_B32_e32 %5021, implicit $exec + %6022:vgpr_32 = V_MOV_B32_e32 %5022, implicit $exec + %6023:vgpr_32 = V_MOV_B32_e32 %5023, implicit $exec + %6024:vgpr_32 = V_MOV_B32_e32 %5024, implicit $exec + %6025:vgpr_32 = V_MOV_B32_e32 %5025, implicit $exec + %6026:vgpr_32 = V_MOV_B32_e32 %5026, implicit $exec + %6027:vgpr_32 = V_MOV_B32_e32 %5027, implicit $exec + %6028:vgpr_32 = V_MOV_B32_e32 %5028, implicit $exec + %6029:vgpr_32 = V_MOV_B32_e32 %5029, implicit $exec + %6030:vgpr_32 = V_MOV_B32_e32 %5030, implicit $exec + %6031:vgpr_32 = V_MOV_B32_e32 %5031, implicit $exec + %6032:vgpr_32 = V_MOV_B32_e32 %5032, implicit $exec + %6033:vgpr_32 = V_MOV_B32_e32 %5033, implicit $exec + %6034:vgpr_32 = V_MOV_B32_e32 %5034, implicit $exec + %6035:vgpr_32 = V_MOV_B32_e32 %5035, implicit $exec + %6036:vgpr_32 = V_MOV_B32_e32 %5036, implicit $exec + %6037:vgpr_32 = V_MOV_B32_e32 
%5037, implicit $exec + %6038:vgpr_32 = V_MOV_B32_e32 %5038, implicit $exec + %6039:vgpr_32 = V_MOV_B32_e32 %5039, implicit $exec + %6040:vgpr_32 = V_MOV_B32_e32 %5040, implicit $exec + %6041:vgpr_32 = V_MOV_B32_e32 %5041, implicit $exec + %6042:vgpr_32 = V_MOV_B32_e32 %5042, implicit $exec + %6043:vgpr_32 = V_MOV_B32_e32 %5043, implicit $exec + %6044:vgpr_32 = V_MOV_B32_e32 %5044, implicit $exec + %6045:vgpr_32 = V_MOV_B32_e32 %5045, implicit $exec + %6046:vgpr_32 = V_MOV_B32_e32 %5046, implicit $exec + %6047:vgpr_32 = V_MOV_B32_e32 %5047, implicit $exec + %6048:vgpr_32 = V_MOV_B32_e32 %5048, implicit $exec + %6049:vgpr_32 = V_MOV_B32_e32 %5049, implicit $exec + %6050:vgpr_32 = V_MOV_B32_e32 %5050, implicit $exec + %6051:vgpr_32 = V_MOV_B32_e32 %5051, implicit $exec + %6052:vgpr_32 = V_MOV_B32_e32 %5052, implicit $exec + %6053:vgpr_32 = V_MOV_B32_e32 %5053, implicit $exec + %6054:vgpr_32 = V_MOV_B32_e32 %5054, implicit $exec + %6055:vgpr_32 = V_MOV_B32_e32 %5055, implicit $exec + %6056:vgpr_32 = V_MOV_B32_e32 %5056, implicit $exec + %6057:vgpr_32 = V_MOV_B32_e32 %5057, implicit $exec + %6058:vgpr_32 = V_MOV_B32_e32 %5058, implicit $exec + %6059:vgpr_32 = V_MOV_B32_e32 %5059, implicit $exec + %6060:vgpr_32 = V_MOV_B32_e32 %5060, implicit $exec + %6061:vgpr_32 = V_MOV_B32_e32 %5061, implicit $exec + %6062:vgpr_32 = V_MOV_B32_e32 %5062, implicit $exec + %6063:vgpr_32 = V_MOV_B32_e32 %5063, implicit $exec + %6064:vgpr_32 = V_MOV_B32_e32 %5064, implicit $exec + %6065:vgpr_32 = V_MOV_B32_e32 %5065, implicit $exec + %6066:vgpr_32 = V_MOV_B32_e32 %5066, implicit $exec + %6067:vgpr_32 = V_MOV_B32_e32 %5067, implicit $exec + %6068:vgpr_32 = V_MOV_B32_e32 %5068, implicit $exec + %6069:vgpr_32 = V_MOV_B32_e32 %5069, implicit $exec + %6070:vgpr_32 = V_MOV_B32_e32 %5070, implicit $exec + %6071:vgpr_32 = V_MOV_B32_e32 %5071, implicit $exec + %6072:vgpr_32 = V_MOV_B32_e32 %5072, implicit $exec + %6073:vgpr_32 = V_MOV_B32_e32 %5073, implicit $exec + %6074:vgpr_32 = V_MOV_B32_e32 %5074, implicit $exec + %6075:vgpr_32 = V_MOV_B32_e32 %5075, implicit $exec + %6076:vgpr_32 = V_MOV_B32_e32 %5076, implicit $exec + %6077:vgpr_32 = V_MOV_B32_e32 %5077, implicit $exec + %6078:vgpr_32 = V_MOV_B32_e32 %5078, implicit $exec + %6079:vgpr_32 = V_MOV_B32_e32 %5079, implicit $exec + %6080:vgpr_32 = V_MOV_B32_e32 %5080, implicit $exec + %6081:vgpr_32 = V_MOV_B32_e32 %5081, implicit $exec + %6082:vgpr_32 = V_MOV_B32_e32 %5082, implicit $exec + %6083:vgpr_32 = V_MOV_B32_e32 %5083, implicit $exec + %6084:vgpr_32 = V_MOV_B32_e32 %5084, implicit $exec + %6085:vgpr_32 = V_MOV_B32_e32 %5085, implicit $exec + %6086:vgpr_32 = V_MOV_B32_e32 %5086, implicit $exec + %6087:vgpr_32 = V_MOV_B32_e32 %5087, implicit $exec + %6088:vgpr_32 = V_MOV_B32_e32 %5088, implicit $exec + %6089:vgpr_32 = V_MOV_B32_e32 %5089, implicit $exec + %6090:vgpr_32 = V_MOV_B32_e32 %5090, implicit $exec + %6091:vgpr_32 = V_MOV_B32_e32 %5091, implicit $exec + %6092:vgpr_32 = V_MOV_B32_e32 %5092, implicit $exec + %6093:vgpr_32 = V_MOV_B32_e32 %5093, implicit $exec + %6094:vgpr_32 = V_MOV_B32_e32 %5094, implicit $exec + %6095:vgpr_32 = V_MOV_B32_e32 %5095, implicit $exec + %6096:vgpr_32 = V_MOV_B32_e32 %5096, implicit $exec + %6097:vgpr_32 = V_MOV_B32_e32 %5097, implicit $exec + %6098:vgpr_32 = V_MOV_B32_e32 %5098, implicit $exec + %6099:vgpr_32 = V_MOV_B32_e32 %5099, implicit $exec + EXP 0, %6000, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6001, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6002, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6003, %3, 
%3, %3, -1, -1, 15, implicit $exec + EXP 0, %6004, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6005, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6006, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6007, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6008, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6009, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6010, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6011, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6012, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6013, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6014, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6015, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6016, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6017, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6018, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6019, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6020, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6021, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6022, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6023, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6024, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6025, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6026, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6027, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6028, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6029, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6030, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6031, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6032, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6033, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6034, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6035, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6036, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6037, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6038, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6039, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6040, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6041, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6042, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6043, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6044, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6045, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6046, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6047, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6048, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6049, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6050, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6051, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6052, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6053, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6054, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6055, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6056, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6057, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6058, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6059, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6060, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6061, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6062, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6063, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6064, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6065, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6066, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6067, %3, %3, %3, -1, -1, 15, implicit $exec 
+ EXP 0, %6068, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6069, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6070, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6071, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6072, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6073, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6074, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6075, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6076, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6077, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6078, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6079, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6080, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6081, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6082, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6083, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6084, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6085, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6086, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6087, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6088, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6089, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6090, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6091, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6092, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6093, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6094, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6095, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6096, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6097, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6098, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6099, %3, %3, %3, -1, -1, 15, implicit $exec + + + S_ENDPGM 0 +... 
+
From d4fd382d1a23303d1804c3169a589f2aa55a58b4 Mon Sep 17 00:00:00 2001
From: Adam Yang <31109344+adam-yang@users.noreply.github.com>
Date: Mon, 21 Apr 2025 15:59:36 -0700
Subject: [PATCH 05/11] clang format

---
 .../AMDGPU/AMDGPUHotBlockRematerialize.cpp    | 69 +++++++++----------
 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp     | 14 ++--
 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h       |  6 +-
 .../AMDGPUOccupancyAndLatencyHelper.cpp       |  5 +-
 .../AMDGPU/AMDGPUOccupancyAndLatencyHelper.h  |  4 +-
 5 files changed, 46 insertions(+), 52 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
index 5c628a89766c3..3c5d592602c6f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -12,20 +12,20 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "AMDGPU.h"
 #include "AMDGPUMIRUtils.h"
 #include "AMDGPUOccupancyAndLatencyHelper.h"
-#include "AMDGPU.h"
+#include "GCNRegPressure.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/ADT/MapVector.h"
 #include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
-#include "llvm/ADT/MapVector.h"
 #include "llvm/CodeGen/SlotIndexes.h"
-#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
-#include "SIRegisterInfo.h"
-#include "GCNRegPressure.h"
 
 #define DEBUG_TYPE "amdgpu-hot-block-remat"
 
@@ -111,19 +111,18 @@ class AMDGPUHotBlockRematerialize : public MachineFunctionPass {
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
-  void applyCloneRemat(RematNode &Node,
-                       std::vector<BlockLiveInfo> &HotBlocks,
-                       MachineDominatorTree *DT, MachineRegisterInfo &MRI,
-                       SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
-                       const SIInstrInfo *SIII, MachineFunction &MF);
+  void applyCloneRemat(RematNode &Node, std::vector<BlockLiveInfo> &HotBlocks,
+                       MachineDominatorTree *DT, MachineRegisterInfo &MRI,
+                       SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
+                       const SIInstrInfo *SIII, MachineFunction &MF);
   void applyRemat(MapVector<Register, RematNode> &RematMap,
-                  std::vector<BlockLiveInfo> &HotBlocks, MachineDominatorTree *DT,
-                  llvm::SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI,
-                  const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
-                  MachineFunction &MF);
+                  std::vector<BlockLiveInfo> &HotBlocks,
+                  MachineDominatorTree *DT, llvm::SlotIndexes *SlotIndexes,
+                  MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+                  const SIInstrInfo *SIII, MachineFunction &MF);
   bool hotBlockRemat(MachineFunction &MF, MachineLoopInfo *MLI,
-                     LiveIntervals *LIS, MachineDominatorTree *DT,
-                     MachinePostDominatorTree *PDT, bool &IsNearTarget);
+                     LiveIntervals *LIS, MachineDominatorTree *DT,
+                     MachinePostDominatorTree *PDT, bool &IsNearTarget);
 
   StringRef getPassName() const override { return "AMDGPU rematerialize"; }
 
@@ -237,11 +236,11 @@ void updateUsers(unsigned Reg, unsigned NewReg, bool IsSubRegDef,
   }
 }
 
-void AMDGPUHotBlockRematerialize::applyCloneRemat(RematNode &Node,
-    std::vector<BlockLiveInfo> &HotBlocks,
-    MachineDominatorTree *DT, MachineRegisterInfo &MRI,
-    SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
-    const SIInstrInfo *SIII, MachineFunction &MF) {
+void AMDGPUHotBlockRematerialize::applyCloneRemat(
+    RematNode &Node, std::vector<BlockLiveInfo> &HotBlocks,
+    MachineDominatorTree *DT, MachineRegisterInfo &MRI,
+    SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
+    const SIInstrInfo *SIII, MachineFunction &MF) {
   unsigned Reg = Node.Reg;
 
   MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
@@ -359,11 +358,11 @@ void applyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI,
   SlotIndexes->insertMachineInstrInMaps(*DefMI);
 }
 
-void AMDGPUHotBlockRematerialize::applyRemat(MapVector<Register, RematNode> &RematMap,
-    std::vector<BlockLiveInfo> &HotBlocks, MachineDominatorTree *DT,
-    llvm::SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI,
-    const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
-    MachineFunction &MF) {
+void AMDGPUHotBlockRematerialize::applyRemat(
+    MapVector<Register, RematNode> &RematMap,
+    std::vector<BlockLiveInfo> &HotBlocks, MachineDominatorTree *DT,
+    llvm::SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI,
+    const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, MachineFunction &MF) {
   std::vector<RematNode> UpdateList;
   for (auto &It : RematMap) {
     UpdateList.emplace_back(It.second);
@@ -381,8 +380,7 @@ void AMDGPUHotBlockRematerialize::applyRemat(MapVector &Rem
     if (Node.Kind == RematNode::RematKind::OneDefOneUse) {
       applyOneDefOneUseRemat(Node, MRI, SlotIndexes, SIRI, SIII);
     } else if (Node.Kind == RematNode::RematKind::Clone) {
-      applyCloneRemat(Node, HotBlocks, DT, MRI, SlotIndexes, SIRI, SIII,
-                      MF);
+      applyCloneRemat(Node, HotBlocks, DT, MRI, SlotIndexes, SIRI, SIII, MF);
     }
   }
 }
@@ -1234,9 +1232,12 @@ void dumpCandidates(std::vector<RematNode> &RematCandidates, int BlockIndex,
   dbgs() << "Total Size:" << TotalSize << "\n";
 }
 
-bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF, MachineLoopInfo *MLI,
-    LiveIntervals *LIS, MachineDominatorTree *DT,
-    MachinePostDominatorTree *PDT, bool &IsNearTarget) {
+bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
+                                                MachineLoopInfo *MLI,
+                                                LiveIntervals *LIS,
+                                                MachineDominatorTree *DT,
+                                                MachinePostDominatorTree *PDT,
+                                                bool &IsNearTarget) {
 
   const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *SIII = ST->getInstrInfo();
@@ -1489,8 +1490,7 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF, MachineLoop
 
   if (!SRematMap.empty()) {
     IsUpdated = true;
-    applyRemat(SRematMap, HotBlocks, DT, SlotIndexes, MRI, SIRI, SIII,
-               MF);
+    applyRemat(SRematMap, HotBlocks, DT, SlotIndexes, MRI, SIRI, SIII, MF);
     LLVM_DEBUG(llvm::dbgs() << "after hotremat"; MF.print(dbgs()););
   }
 
@@ -1530,4 +1530,3 @@ char &llvm::AMDGPUHotBlockRematerializeID = AMDGPUHotBlockRematerialize::ID;
 FunctionPass *llvm::createAMDGPUHotBlockRematerializePass() {
   return new AMDGPUHotBlockRematerialize();
 }
-

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
index 6d6bd38c61c06..dfb90e5545c8e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
@@ -13,13 +13,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUMIRUtils.h"
-#include "SIRegisterInfo.h"
 #include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
 
 #include "llvm/CodeGen/LiveInterval.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 
 #define DEBUG_TYPE "xb-mir-util"
 using namespace llvm;
@@ -101,11 +101,10 @@ bool loopContainsBoth(const MachineLoopInfo *LI, const MachineBasicBlock *BB1,
 
 } // namespace
 
-
 namespace llvm {
 
 bool isSccLiveAt(llvm::MachineBasicBlock *MBB,
-                llvm::MachineBasicBlock::iterator MI) {
+                 llvm::MachineBasicBlock::iterator MI) {
   const TargetRegisterInfo *TRI =
       MBB->getParent()->getRegInfo().getTargetRegisterInfo();
   for (auto It = MI; It != MBB->end(); ++It) {
@@ -205,9 +204,8 @@ MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef(
 // TouchedMBBSet is used for scheduling where local live interval could cross
 // multiple regions, need to calculate livereg for each region inside touched
 // MBB.
-bool isLocalLiveInterval(
-    const LiveInterval &LI, SlotIndexes *Indexes,
-    SmallDenseSet<MachineBasicBlock *> &TouchedMBBSet) {
+bool isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes,
+                         SmallDenseSet<MachineBasicBlock *> &TouchedMBBSet) {
   if (LI.hasSubRanges()) {
     for (const auto &S : LI.subranges()) {
       if (!isLocalLiveRange(&S, Indexes, TouchedMBBSet))
        return false;

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
index 6b9079e5d65fb..2470e2bed482f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
@@ -86,8 +86,8 @@ bool removeUnusedLanes(llvm::MachineInstr &MI, llvm::MachineRegisterInfo &MRI,
                        llvm::SlotIndexes *SlotIndexes);
 
 unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask,
-                   const llvm::MachineRegisterInfo &MRI,
-                   const llvm::SIRegisterInfo *SIRI);
+                    const llvm::MachineRegisterInfo &MRI,
+                    const llvm::SIRegisterInfo *SIRI);
 void collectLiveSetPressure(const LiveSet &LiveSet,
                             const llvm::MachineRegisterInfo &MRI,
                             const llvm::SIRegisterInfo *SIRI,
@@ -97,6 +97,6 @@ bool reach_block(llvm::MachineBasicBlock *FromBB,
                  llvm::MachineDominatorTree *DT,
                  llvm::MachinePostDominatorTree *PDT, llvm::MachineLoopInfo *LI,
                  llvm::MachineBasicBlock *ToBB);
-}
+} // namespace llvm
 
 #endif

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
index c2dbf1a8b297e..5c2b7904c46be 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
@@ -16,8 +16,8 @@
 
 #include "GCNSubtarget.h"
 #include "SIInstrInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 
 #include
@@ -144,7 +144,6 @@ void AMDGPULatencyTracker::scan(const MachineInstr &MI) {
   }
 }
 
-
 SchedScore collectLatency(MachineFunction &MF, const llvm::GCNSubtarget &ST,
                           const llvm::MachineLoopInfo *MLI) {
   SchedScore TotalScore;
@@ -165,5 +164,3 @@ SchedScore collectLatency(MachineFunction &MF, const llvm::GCNSubtarget &ST,
 }
 
 } // namespace llvm
-
-

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
index b513e7335ffe4..e30df0d457863 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
@@ -15,8 +15,8 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUOCCUPANCYANDLATENCYHELPER_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUOCCUPANCYANDLATENCYHELPER_H
 
-#include "llvm/MC/MCInstrItineraries.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/MC/MCInstrItineraries.h"
 
 namespace llvm {
 
@@ -76,5 +76,5 @@ SchedScore collectLatency(llvm::MachineFunction &MF,
                           const llvm::GCNSubtarget &ST,
                           const llvm::MachineLoopInfo *MLI = nullptr);
 
-}
+} // namespace llvm
 
 #endif

From 4f7d0dad93c64d94667e74dbd80fdabed3146144 Mon Sep 17 00:00:00 2001
From: Adam Yang
Date: Tue, 22 Apr 2025 11:54:29 -0700
Subject: [PATCH 06/11] LLVM Style

---
 .../AMDGPU/AMDGPUHotBlockRematerialize.cpp    | 87 +++++++------------
 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp     | 59 +++++--------
 .../AMDGPUOccupancyAndLatencyHelper.cpp       | 16 ++--
 .../AMDGPU/AMDGPUOccupancyAndLatencyHelper.h  |  5 --
 4 files changed, 60 insertions(+), 107 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
index 3c5d592602c6f..e165b83b18850 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -176,21 +176,17 @@ DenseMap reduceClonedMBBs(
     bool IsDomAllHotBlocks = true;
     bool IsDomedByAllHotBlocks = true;
     for (MachineBasicBlock *HotMBB : HotBlockSet) {
-      if (!DT->dominates(MBB, HotMBB)) {
+      if (!DT->dominates(MBB, HotMBB))
         IsDomAllHotBlocks = false;
-      }
-      if (!DT->dominates(HotMBB, MBB)) {
+      if (!DT->dominates(HotMBB, MBB))
         IsDomedByAllHotBlocks = false;
-      }
-      if (!IsDomAllHotBlocks && !IsDomedByAllHotBlocks) {
+      if (!IsDomAllHotBlocks && !IsDomedByAllHotBlocks)
         break;
-      }
     }
-    if (IsDomAllHotBlocks) {
+    if (IsDomAllHotBlocks)
       UserBlocks.erase(MBB);
-    } else if (IsDomedByAllHotBlocks) {
+    else if (IsDomedByAllHotBlocks)
       AfterHotRangeMBBs.insert(MBB);
-    }
   }
 
   // Split after hotRange block set by domtree.
@@ -274,18 +270,16 @@ void AMDGPUHotBlockRematerialize::applyCloneRemat(
   for (auto UseIt : UserMap) {
     MachineBasicBlock *MBB = UseIt.first;
     // Skip same block uses.
-    if (MBB == DefMI->getParent()) {
+    if (MBB == DefMI->getParent())
       continue;
-    }
     // Skip MBB which share clone from other MBBs.
     if (UserMBBSet.count(MBB) == 0)
       continue;
 
     Register NewReg = MRI.createVirtualRegister(RC);
     auto NewDef = BuildMI(MF, DL, Desc).addDef(NewReg);
-    for (unsigned I = 1; I < OpNum; I++) {
+    for (unsigned I = 1; I < OpNum; I++)
       NewDef = NewDef.add(DefMI->getOperand(I));
-    }
 
     MachineInstr *InsertPointMI = UseIt.second.front();
     SlotIndex LastSlot = SlotIndexes->getInstructionIndex(*InsertPointMI);
@@ -364,9 +358,9 @@ void AMDGPUHotBlockRematerialize::applyRemat(
     llvm::SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI,
     const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, MachineFunction &MF) {
   std::vector<RematNode> UpdateList;
-  for (auto &It : RematMap) {
+  for (auto &It : RematMap)
     UpdateList.emplace_back(It.second);
-  }
+
   // Sort update list with slotIndex to make sure def moved before use.
   // If use moved before def, It might not be the first use anymore.
   std::sort(UpdateList.begin(), UpdateList.end(),
@@ -377,11 +371,10 @@ void AMDGPUHotBlockRematerialize::applyRemat(
             });
 
   for (RematNode &Node : UpdateList) {
-    if (Node.Kind == RematNode::RematKind::OneDefOneUse) {
+    if (Node.Kind == RematNode::RematKind::OneDefOneUse)
       applyOneDefOneUseRemat(Node, MRI, SlotIndexes, SIRI, SIII);
-    } else if (Node.Kind == RematNode::RematKind::Clone) {
+    else if (Node.Kind == RematNode::RematKind::Clone)
       applyCloneRemat(Node, HotBlocks, DT, MRI, SlotIndexes, SIRI, SIII, MF);
-    }
   }
 }
 
@@ -410,12 +403,10 @@ unsigned collectMBBPressure(MachineBasicBlock &MBB, LiveIntervals *LIS,
 
   GCNRegPressure RP = RPTracker.getMaxPressureAndReset();
   unsigned SPressure = RP.getMaxSGPR();
-  if (SPressure > MaxSPressure) {
+  if (SPressure > MaxSPressure)
     MaxSPressure = SPressure;
-  }
-  if (RP.getVGPRNum(ST->hasGFX90AInsts()) > MaxVPressure) {
+  if (RP.getVGPRNum(ST->hasGFX90AInsts()) > MaxVPressure)
     MaxVPressure = RP.getVGPRNum(ST->hasGFX90AInsts());
-  }
   Status.MBBPressureMap[&MBB] = RP;
   return RP.getOccupancy(*ST);
 }
@@ -573,9 +564,8 @@ RematStatus getRematStatus(MachineFunction &MF, MachineLoopInfo *MLI,
   unsigned SInputPressure = 0;
   uint64_t Mask = 0xf;
   while (Mask != 0) {
-    if (Mask & SInputMask) {
+    if (Mask & SInputMask)
       SInputPressure += 4;
-    }
     Mask = Mask << 4;
   }
 
@@ -670,9 +660,8 @@ void updateLiveInfo(MapVector &RematMap,
     // still before LiveInfo.BB, It is still live.
     unsigned LiveBBIndex = RPOTIndexMap[CurBB];
     unsigned InsertBBIndex = RPOTIndexMap[InsertBB];
-    if (LiveBBIndex > InsertBBIndex) {
+    if (LiveBBIndex > InsertBBIndex)
       continue;
-    }
     }
     // Already in remat map, don't need to check again, remove from
     // candidate.
@@ -978,11 +967,10 @@ void buildRematCandiates(std::vector &Candidates,
 
     if (IsSafeCandidate) {
       int Gain = rematGain(MI, Reg, MRI, SIRI, IsVGPR);
-      if (Gain > 0) {
+      if (Gain > 0)
         Candidates.emplace_back(RematNode(Reg, MI, Gain >> 5));
-      } else {
+      else
         IsSafeCandidate = false;
-      }
     }
     // Save unsafe reg.
     if (!IsSafeCandidate)
@@ -1056,9 +1044,9 @@ int filterRematCandiates(std::vector &Candidates,
   // Work one def one use first.
   for (auto &Node : Candidates) {
     unsigned Reg = Node.Reg;
-    if (!MRI.hasOneNonDBGUse(Reg)) {
+    if (!MRI.hasOneNonDBGUse(Reg))
       continue;
-    }
+
     MachineInstr *DefMI = Node.DefMI;
     if (!isSafeToMove(DefMI, MRI)) {
       PinnedRegSet.insert(Reg);
@@ -1074,9 +1062,9 @@ int filterRematCandiates(std::vector &Candidates,
   // Try multi use case.
   for (auto &Node : Candidates) {
     unsigned Reg = Node.Reg;
-    if (MRI.hasOneNonDBGUse(Reg)) {
+    if (MRI.hasOneNonDBGUse(Reg))
      continue;
-    }
+
    MachineInstr *DefMI = Node.DefMI;
    if (!isSafeToMove(DefMI, MRI)) {
      PinnedRegSet.insert(Reg);
@@ -1161,10 +1149,9 @@ int getSharedReducedSize(InstSet &ReducedInsts, bool IsVGPR,
       if (!Reg.isVirtual())
         continue;
 
-      if (IsVGPR != SIRI->isVGPR(MRI, MO.getReg())) {
+      if (IsVGPR != SIRI->isVGPR(MRI, MO.getReg()))
        // Not support mix of v and s when remat now.
         continue;
-      }
 
       const TargetRegisterClass *OpRC = MRI.getRegClass(Reg);
       int MOSize = SIRI->getRegSizeInBits(*OpRC) >> 5;
@@ -1245,9 +1232,8 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
 
   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
   DenseMap<MachineBasicBlock *, unsigned> RPOTIndexMap;
-  for (MachineBasicBlock *MBB : RPOT) {
+  for (MachineBasicBlock *MBB : RPOT)
     RPOTIndexMap[MBB] = RPOTIndexMap.size();
-  }
 
   auto &MRI = MF.getRegInfo();
 
@@ -1267,9 +1253,8 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
     RematSCnt += NearTargetRegLimit;
 
   bool IsSGPRSpill = false;
-  if (RematSCnt > 0) {
+  if (RematSCnt > 0)
     IsSGPRSpill = nearSgprSpill(Status.MaxSPressure, ST, MF);
-  }
 
   const bool IsForceRematSgpr = IsSGPRSpill || Status.NotBalance;
 
@@ -1354,9 +1339,9 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
     int RematSCnt = MaxSPressure - SReduced - SLimit;
 
     bool IsSGPRSpill = false;
-    if (RematSCnt > 0) {
+    if (RematSCnt > 0)
       IsSGPRSpill = nearSgprSpill(MaxSPressure, ST, MF);
-    }
+
     bool IsForceRematSgpr = IsSGPRSpill || Status.NotBalance;
 
     // Try to add candidates into remat list.
@@ -1393,15 +1378,13 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
           getSharedReducedSize(SReducedInsts, /*IsVGPR*/ false, MRI, SIRI);
       if (((NewRematSCnt + SharedReducedSize) + (int)NearTargetRegLimit) >=
           RematSCnt) {
-        for (RematNode &Node : SRematList) {
+        for (RematNode &Node : SRematList)
           SRematMap[Node.Reg] = Node;
-        }
       } else {
         if (!IsForceRematSgpr)
           return false;
-        for (RematNode &Node : SRematList) {
+        for (RematNode &Node : SRematList)
           SRematMap[Node.Reg] = Node;
-        }
         // Find local one def one use candidates.
         for (MachineInstr &MI : *MBB) {
           if (MI.isDebugInstr())
@@ -1425,9 +1408,8 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
                                /*IsVGPR*/ false);
           if (Gain > 0) {
             // Skip case when DefMI has implicit define which used by UseMI.
-            if (isImplicitDefUse(&MI, &UseMI)) {
+            if (isImplicitDefUse(&MI, &UseMI))
               continue;
-            }
             RematNode Node = {Reg, &MI, (unsigned)Gain >> 5};
             Node.InsertPointMI = &UseMI;
             Node.Kind = RematNode::RematKind::OneDefOneUse;
@@ -1459,19 +1441,16 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
   bool IsVRematOK =
       (Status.NotBalance || NewRematVCnt <= 0) && !VRematMap.empty();
   if (NeedSRemat && NeedVRemat) {
-    if (IsVRematOK && IsSRematOK) {
+    if (IsVRematOK && IsSRematOK)
       IsUpdated = true;
-    } else if (IsSGPRSpill) {
+    else if (IsSGPRSpill)
       IsUpdated = true;
-    }
   } else if (NeedSRemat) {
-    if (IsSRematOK) {
+    if (IsSRematOK)
       IsUpdated = true;
-    }
   } else if (NeedVRemat) {
-    if (IsVRematOK) {
+    if (IsVRematOK)
       IsUpdated = true;
-    }
   }
   // TODO: what to do when cannot reach target?
   if (NewRematSCnt > 0) {

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
index dfb90e5545c8e..afa1a8853938f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
@@ -140,9 +140,8 @@ MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef(
     const TargetRegisterInfo *TRI, const SIInstrInfo *TII,
     MachineRegisterInfo *MRI, SccDefInsertPointConstraintFlags Constraints) {
   // If SCC is dead at MI when we can use MI as the insert point.
-  if (!llvm::isSccLiveAt(MBB, MI)) {
+  if (!llvm::isSccLiveAt(MBB, MI))
     return MI;
-  }
 
   const bool CheckForExecWrite =
       Constraints & SccDefInsertPointConstraintFlags::NoExecWrite;
 
   // Get the starting reverse iterator taking care to handle the MBB->end()
   // case.
   MachineBasicBlock::reverse_iterator Start;
-  if (MI == MBB->end()) {
+  if (MI == MBB->end())
     Start = MBB->rbegin();
-  } else {
+  else
     Start = MI.getReverse();
-  }
 
   // Otherwise, walk backwards through the block looking for a location where
   // SCC is dead.
@@ -164,14 +162,12 @@ MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef(
     // an insertion point (if that is a constraint from the caller).
     // The check for EXEC works for both wave64 and wave32 because
     // it will also catch Writes to the subregisters (e.g. exec_lo).
-    if (CheckForExecWrite && It->modifiesRegister(AMDGPU::EXEC, TRI)) {
+    if (CheckForExecWrite && It->modifiesRegister(AMDGPU::EXEC, TRI))
       break;
-    }
 
     if (It->modifiesRegister(AMDGPU::SCC, TRI) &&
-        !It->readsRegister(AMDGPU::SCC, TRI)) {
+        !It->readsRegister(AMDGPU::SCC, TRI))
       return It->getIterator();
-    }
   }
 
   // If no safe location can be found in the block we can save and restore
@@ -207,20 +203,18 @@ bool isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes,
                          SmallDenseSet<MachineBasicBlock *> &TouchedMBBSet) {
   if (LI.hasSubRanges()) {
-    for (const auto &S : LI.subranges()) {
+    for (const auto &S : LI.subranges())
       if (!isLocalLiveRange(&S, Indexes, TouchedMBBSet))
         return false;
-    }
   }
   return isLocalLiveRange(&LI, Indexes, TouchedMBBSet);
 }
 
 bool isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes) {
   if (LI.hasSubRanges()) {
-    for (const auto &S : LI.subranges()) {
+    for (const auto &S : LI.subranges())
       if (!isLocalLiveRange(&S, Indexes))
         return false;
-    }
   }
   return isLocalLiveRange(&LI, Indexes);
 }
@@ -231,9 +225,8 @@ void dumpLiveSet(const LiveSet &LiveSet, const SIRegisterInfo *SIRI) {
   for (auto It : LiveSet) {
     int Reg = It.first;
     dbgs() << printReg(Reg, SIRI);
-    if (It.second.any()) {
+    if (It.second.any())
       dbgs() << " mask:" << It.second.getAsInteger();
-    }
     dbgs() << "\n";
   }
 }
@@ -405,15 +398,13 @@ bool reduceChannel(unsigned Offset, MachineInstr &MI, const MCInstrDesc &Desc,
                    const SIInstrInfo *SIII, SlotIndexes *SlotIndexes) {
   MachineOperand &DstMO = MI.getOperand(0);
   // Skip case when dst subReg not 0.
-  if (DstMO.getSubReg()) {
+  if (DstMO.getSubReg())
     return false;
-  }
   Register Reg = DstMO.getReg();
 
   SmallVector<MachineOperand *> UseMOs;
-  for (MachineOperand &UseMO : MRI.use_nodbg_operands(Reg)) {
+  for (MachineOperand &UseMO : MRI.use_nodbg_operands(Reg))
     UseMOs.emplace_back(&UseMO);
-  }
 
   const llvm::TargetRegisterClass *NewRC =
       SIRI->getRegClass(Desc.operands().front().RegClass);
@@ -441,9 +432,8 @@ bool reduceChannel(unsigned Offset, MachineInstr &MI, const MCInstrDesc &Desc,
       assert(OffsetOp != nullptr);
       int64_t Offset = OffsetOp->getImm();
       Offset += Offset * LaneSize;
-      if (!SIII->isLegalMUBUFImmOffset(Offset)) {
+      if (!SIII->isLegalMUBUFImmOffset(Offset))
         return false;
-      }
       OffsetOp->setImm(Offset);
     } else {
       return false;
@@ -473,14 +463,12 @@ bool reduceChannel(unsigned Offset, MachineInstr &MI, const MCInstrDesc &Desc,
       }
     }
     // Update subReg for users.
-    for (MachineOperand *UseMO : UseMOs) {
+    for (MachineOperand *UseMO : UseMOs)
       updateSubReg(*UseMO, NewRC, Offset, SIRI);
-    }
   } else if (NumLanes == getNumLanesIn32BitReg(Reg, SIRI, MRI)) {
     // Clear subReg when it's a single 32-bit reg.
-    for (MachineOperand *UseMO : UseMOs) {
+    for (MachineOperand *UseMO : UseMOs)
       UseMO->setSubReg(0);
-    }
   }
 
   MI.setDesc(Desc);
@@ -511,9 +499,8 @@ bool removeUnusedLanes(llvm::MachineInstr &MI, MachineRegisterInfo &MRI,
       return false;
     LaneBitmask DstMask = getRegMask(MI.getOperand(0), MRI);
     LaneBitmask UseMask;
-    for (MachineOperand &MO : MRI.use_operands(Reg)) {
+    for (MachineOperand &MO : MRI.use_operands(Reg))
       UseMask |= llvm::getRegMask(MO, MRI);
-    }
 
     const unsigned FullMask = DstMask.getAsInteger();
     unsigned Mask = UseMask.getAsInteger();
@@ -602,11 +589,10 @@ void collectLiveSetPressure(const LiveSet &LiveSet,
   for (auto LiveIt : LiveSet) {
     unsigned Reg = LiveIt.first;
     unsigned Size = getRegSize(Reg, LiveIt.second, MRI, SIRI);
-    if (SIRI->isVGPR(MRI, Reg)) {
+    if (SIRI->isVGPR(MRI, Reg))
       VPressure += Size;
-    } else {
+    else
       SPressure += Size;
-    }
   }
 }
 
@@ -651,21 +637,18 @@ bool isSub0Sub1SingleDef(unsigned Reg, const MachineRegisterInfo &MRI) {
 bool reach_block(MachineBasicBlock *FromBB, MachineDominatorTree *DT,
                  MachinePostDominatorTree *PDT, MachineLoopInfo *LI,
                  MachineBasicBlock *ToBB) {
-  if (FromBB == ToBB) {
+  if (FromBB == ToBB)
     return true;
-  }
 
-  if (DT->dominates(FromBB, ToBB)) {
+  if (DT->dominates(FromBB, ToBB))
     return true;
-  }
 
-  if (PDT->dominates(ToBB, FromBB)) {
+  if (PDT->dominates(ToBB, FromBB))
     return true;
-  }
 
-  if (loopContainsBoth(LI, ToBB, FromBB)) {
+  if (loopContainsBoth(LI, ToBB, FromBB))
     return true;
-  }
+
   // TODO: cover case hotBB in loop,
   //       one block in that loop dom BB or
   //       BB post dom one block in that loop.

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
index 5c2b7904c46be..6160fe5471376 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
@@ -101,11 +101,10 @@ void AMDGPULatencyTracker::scan(const MachineInstr &MI) {
   auto GetAluStatus = [](const MachineInstr &MI,
                          const llvm::SIInstrInfo *SIII) {
     AluStatus Status = AluStatus::Nothing;
-    if (SIII->isVALU(MI.getOpcode())) {
+    if (SIII->isVALU(MI.getOpcode()))
       Status = AluStatus::Vector;
-    } else if (SIII->isSALU(MI.getOpcode())) {
+    else if (SIII->isSALU(MI.getOpcode()))
       Status = AluStatus::Scalar;
-    }
     return Status;
   };
   AluStatus Status = GetAluStatus(MI, SIII);
@@ -120,11 +119,10 @@ void AMDGPULatencyTracker::scan(const MachineInstr &MI) {
   case AluStatus::Scalar: {
     Score.Alu += Latency;
     // Ignore mix alu.
-    if (PrevStatus != Status) {
+    if (PrevStatus != Status)
       PrevStatus = AluStatus::Nothing;
-    } else {
+    else
       Score.MixAlu += Latency;
-    }
   } break;
   }
 }
@@ -151,13 +149,11 @@ SchedScore collectLatency(MachineFunction &MF, const llvm::GCNSubtarget &ST,
     MachineBasicBlock &MBB = MFI;
     MachineBasicBlock::iterator Next;
     AMDGPULatencyTracker LatencyTracker(ST);
-    for (auto &MI : MBB) {
+    for (auto &MI : MBB)
      LatencyTracker.scan(MI);
-    }
     unsigned LoopDepth = 0;
-    if (MLI) {
+    if (MLI)
       LoopDepth = MLI->getLoopDepth(&MBB);
-    }
     TotalScore.sum(LatencyTracker.Score, LoopDepth);
   }
   return TotalScore;

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
index e30df0d457863..9c63fa7e6b4a4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
@@ -39,12 +39,7 @@ struct SchedScore {
   unsigned Lds = 0; // Todo: count lds.
 
   SchedScore() {}
-  // Other info which can help compare schedule result.
-  float computeScore() const;
-  float computeScore2() const;
-
   void sum(const SchedScore &S, unsigned LoopDepth = 0);
-  bool isBetter(const SchedScore &S) const;
   bool isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc = 1) const;
   // More latency can be hiden with ExtraOcc.
   unsigned latencyGain(unsigned TargetOccupancy, unsigned ExtraOcc) const;

From 3cb2c8d067cf8c106702ab2807ba749c4e47e848 Mon Sep 17 00:00:00 2001
From: Adam Yang
Date: Thu, 1 May 2025 12:45:37 -0700
Subject: [PATCH 07/11] Avoid duplicate shadow variable names

---
 .../AMDGPU/AMDGPUHotBlockRematerialize.cpp    | 34 +++++++++++--------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
index e165b83b18850..91371e0f5fe55 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -1247,21 +1247,24 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
   unsigned VLimit = Status.TargetVLimit;
   unsigned SLimit = Status.TargetSLimit;
 
-  int RematSCnt = Status.MaxSPressure - SLimit;
-  // when agressive sgpr remat, reserve some for allocation lost.
-  if (EnableAggressive)
-    RematSCnt += NearTargetRegLimit;
-
-  bool IsSGPRSpill = false;
-  if (RematSCnt > 0)
-    IsSGPRSpill = nearSgprSpill(Status.MaxSPressure, ST, MF);
-
-  const bool IsForceRematSgpr = IsSGPRSpill || Status.NotBalance;
-
-  // If bound by lds, skip.
-  if (Status.TargetOcc > ST->getOccupancyWithWorkGroupSizes(MF).second &&
-      !IsForceRematSgpr)
-    return false;
+  // Early check for
+  {
+    int InitialRematSCnt = Status.MaxSPressure - SLimit;
+    // when agressive sgpr remat, reserve some for allocation lost.
+    if (EnableAggressive)
+      InitialRematSCnt += NearTargetRegLimit;
+
+    bool InitialIsSGPRSpill = false;
+    if (InitialRematSCnt > 0)
+      InitialIsSGPRSpill = nearSgprSpill(Status.MaxSPressure, ST, MF);
+
+    const bool InitialIsForceRematSgpr = InitialIsSGPRSpill || Status.NotBalance;
+
+    // If bound by lds, skip.
+    if (Status.TargetOcc > ST->getOccupancyWithWorkGroupSizes(MF).second &&
+        !InitialIsForceRematSgpr)
+      return false;
+  }
 
   MachineBasicBlock *EntryMBB = &MF.front();
 
@@ -1277,6 +1280,7 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
     MachineBasicBlock *MBB = *It;
     auto &RP = Status.MBBPressureMap[MBB];
     // ignore block not hot.
+
     if (RP.getVGPRNum(ST->hasGFX90AInsts()) < Status.TargetVLimit &&
         (RP.getMaxSGPR() + RegForVCC + Status.InputPhysicalSPressure) <
             Status.TargetSLimit)

From 0775bb87d6739918a0b3d0cee85e7aefc0d1f220 Mon Sep 17 00:00:00 2001
From: Adam Yang
Date: Fri, 2 May 2025 15:48:48 -0700
Subject: [PATCH 08/11] Big cleanup to clarify the flow of data, the purpose of
 functions, etc

---
 .../AMDGPU/AMDGPUHotBlockRematerialize.cpp    | 241 ++++++++++--------
 1 file changed, 140 insertions(+), 101 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
index 91371e0f5fe55..9aa52ac1cf69e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -53,7 +53,7 @@ struct RematNode {
   RematNode(unsigned R, MachineInstr *MI, unsigned S)
       : Reg(R), DefMI(MI), InsertBlock(nullptr), InsertPointMI(nullptr),
         Kind(RematKind::Candidate), Size(S) {}
-  unsigned Reg;
+  Register Reg;
   MachineInstr *DefMI;
   MachineBasicBlock *InsertBlock;
   union {
@@ -61,7 +61,7 @@ struct RematNode {
     unsigned UserCount;
   };
   RematKind Kind;
-  unsigned Size;
+  unsigned Size; // This is actually the Gain of the candidate.
 };
 
 struct BlockLiveInfo {
@@ -152,7 +152,7 @@ MachineBasicBlock::iterator adjustInsertPointToAvoidSccSmash(
 }
 
 DenseMap reduceClonedMBBs(
-    unsigned Reg, BlockMap<SmallVector<MachineInstr *>> &UserBlocks,
+    Register Reg, BlockMap<SmallVector<MachineInstr *>> &UserBlocks,
     DenseSet<MachineBasicBlock *> &UserMBBSet,
     std::vector<BlockLiveInfo> &HotBlocks, MachineDominatorTree *DT) {
   // Collect hot blocks which Exp is live in.
@@ -217,7 +217,7 @@ DenseMap reduceClonedMBBs(
   return DomMap;
 }
 
-void updateUsers(unsigned Reg, unsigned NewReg, bool IsSubRegDef,
+void updateUsers(Register Reg, unsigned NewReg, bool IsSubRegDef,
                  SmallVector<MachineInstr *> &UserMIs) {
   for (MachineInstr *UseMI : UserMIs) {
     for (MachineOperand &MO : UseMI->operands()) {
@@ -237,20 +237,16 @@ void AMDGPUHotBlockRematerialize::applyCloneRemat(
     MachineDominatorTree *DT, MachineRegisterInfo &MRI,
     SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
     const SIInstrInfo *SIII, MachineFunction &MF) {
-  unsigned Reg = Node.Reg;
-
+  Register Reg = Node.Reg;
   MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
-  auto DefOp = DefMI->getOperand(0);
+
   const MCInstrDesc &Desc = DefMI->getDesc();
-  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
-  // When the unique def has subReg, just create newReg for the subReg part.
-  bool IsSubRegDef = false;
-  if (DefOp.getSubReg() != 0) {
-    RC = SIRI->getSubRegisterClass(RC, DefOp.getSubReg());
-    IsSubRegDef = true;
-  }
-  const DebugLoc DL = DefMI->getDebugLoc();
-  unsigned OpNum = DefMI->getNumOperands();
+  const TargetRegisterClass *RC =
+      SIRI->getAllocatableClass(SIII->getOpRegClass(*DefMI, 0));
+  const bool IsSubRegDef = DefMI->getOperand(0).getSubReg() != 0;
+
+  const DebugLoc &DL = DefMI->getDebugLoc();
+  const unsigned OpNum = DefMI->getNumOperands();
 
   Node.Kind = RematNode::RematKind::Clone;
 
@@ -550,7 +546,7 @@ RematStatus getRematStatus(MachineFunction &MF, MachineLoopInfo *MLI,
     const Register Reg = Livein.first;
     const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg);
     assert(Reg.isPhysical() && "input must be physical reg");
-    unsigned RegSize = RC->getLaneMask().getNumLanes();
+    Register RegSize = RC->getLaneMask().getNumLanes();
     if (SIRI->isVGPR(MRI, Reg)) {
       VInputPressure += RegSize;
     } else {
@@ -621,8 +617,11 @@ bool isImplicitDefUse(MachineInstr *DefMI, MachineInstr *UseMI) {
   return false;
 }
 
-// SGPR has alignment requirment, cannot get accurate reg number.
-const unsigned NearTargetRegLimit = 10;
+static unsigned AlignToSgprAllocationGranularity(const GCNSubtarget *ST,
+                                                 unsigned SgprCount) {
+  return llvm::alignTo(SgprCount, ST->getSGPRAllocGranule());
+}
+
 bool nearSgprSpill(unsigned MaxSPressure, const GCNSubtarget *ST,
                    MachineFunction &MF) {
   unsigned MaxSGPR = ST->getAddressableNumSGPRs();
@@ -638,13 +637,13 @@ bool nearSgprSpill(unsigned MaxSPressure, const GCNSubtarget *ST,
 }
 
 // Skip live reg remated to other block.
-void updateLiveInfo(MapVector<Register, RematNode> &RematMap,
-                    GCNRPTracker::LiveRegSet &LiveSet,
-                    const GCNRPTracker::LiveRegSet &InputLive,
-                    MachineBasicBlock *CurBB,
-                    DenseMap<MachineBasicBlock *, unsigned> &RPOTIndexMap) {
+void updateLiveInfo(
+    const MapVector<Register, RematNode> &RematMap,
+    GCNRPTracker::LiveRegSet &LiveSet,
+    const GCNRPTracker::LiveRegSet &InputLive, const MachineBasicBlock *CurBB,
+    DenseMap<const MachineBasicBlock *, unsigned> &RPOTIndexMap) {
   for (auto &It : RematMap) {
-    unsigned Reg = It.first;
+    Register Reg = It.first;
     // Skip reg not in live set.
     if (!LiveSet.count(Reg))
       continue;
@@ -669,8 +668,17 @@ void updateLiveInfo(MapVector &RematMap,
   }
 }
 
-int rematGain(MachineInstr *DefMI, unsigned Reg, const MachineRegisterInfo &MRI,
-              const SIRegisterInfo *SIRI, bool IsVGPR) {
+// Returns the actual register saving that would be achieved by moving or
+// cloning this instruction. It's essentially:
+//
+//   size(defs) - size(uses)
+//
+// Note if it is not safe to move/clone this instruction, this function returns
+// 0.
+//
+int rematGainInBits(MachineInstr *DefMI, Register Reg,
+                    const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+                    bool IsVGPR) {
   int RematSize = SIRI->getRegSizeInBits(*MRI.getRegClass(Reg));
   for (MachineOperand &MO : DefMI->operands()) {
     if (MO.isImm())
@@ -804,7 +812,7 @@ MachineBasicBlock *nearestCommonDominator(MachineDominatorTree *DT,
 }
 
 MachineBasicBlock *
-findInsertBlock(MachineInstr &DefMI, unsigned Reg, MachineDominatorTree *DT,
+findInsertBlock(MachineInstr &DefMI, Register Reg, MachineDominatorTree *DT,
                 MachinePostDominatorTree *PDT, MachineLoopInfo *MLI,
                 const MachineRegisterInfo &MRI, bool MemBound) {
 
@@ -869,14 +877,14 @@ bool isSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) {
   return true;
 }
 
-void addOneDefOneUseCandidate(RematNode &Node,
-                              std::vector<RematNode> &RematList,
-                              MachineRegisterInfo &MRI, int &RematCnt,
+void addOneDefOneUseCandidate(std::vector<RematNode> *OutRematList,
+                              int *OutRematCnt, const RematNode &Node,
+                              MachineRegisterInfo &MRI,
                               MachineDominatorTree *DT,
                               MachinePostDominatorTree *PDT,
                               MachineLoopInfo *MLI, bool IsVGPR,
                               bool MemBound) {
-  unsigned Reg = Node.Reg;
+  Register Reg = Node.Reg;
   MachineInstr *DefMI = Node.DefMI;
   unsigned Size = Node.Size;
 
@@ -918,24 +926,26 @@ void addOneDefOneUseCandidate(RematNode &Node,
     return;
   }
 
-  Node.InsertBlock = InsertBB;
-  Node.InsertPointMI = UseMI;
-  Node.Kind = RematNode::RematKind::OneDefOneUse;
-  RematList.emplace_back(Node);
-  RematCnt += Size;
+  RematNode FilteredNode = Node;
+  FilteredNode.InsertBlock = InsertBB;
+  FilteredNode.InsertPointMI = UseMI;
+  FilteredNode.Kind = RematNode::RematKind::OneDefOneUse;
+  OutRematList->emplace_back(FilteredNode);
+  *OutRematCnt += Size;
 }
 
-void buildRematCandiates(std::vector<RematNode> &Candidates,
+// Build remat candidates from the registers in `CandidateRegSet`.
+void buildRematCandiates(std::vector<RematNode> *OutCandidates,
+                         DenseSet<Register> *PinnedRegSet,
                          GCNRPTracker::LiveRegSet &CandidateRegSet,
-                         DenseSet<unsigned> &PinnedRegSet,
                          const MachineRegisterInfo &MRI, const SIInstrInfo *SIII,
                          const SIRegisterInfo *SIRI, bool IsVGPR) {
 
-  for (auto LiveRegIt : CandidateRegSet) {
-    unsigned Reg = LiveRegIt.first;
+  for (const auto &LiveRegIt : CandidateRegSet) {
+    Register Reg = LiveRegIt.first;
     // Skip unsafe reg.
-    if (PinnedRegSet.count(Reg))
+    if (PinnedRegSet->count(Reg))
       continue;
 
     if (SIRI->isVGPR(MRI, Reg) != IsVGPR)
@@ -966,32 +976,32 @@ void buildRematCandiates(std::vector &Candidates,
     }
 
     if (IsSafeCandidate) {
-      int Gain = rematGain(MI, Reg, MRI, SIRI, IsVGPR);
+      int Gain = rematGainInBits(MI, Reg, MRI, SIRI, IsVGPR);
       if (Gain > 0)
-        Candidates.emplace_back(RematNode(Reg, MI, Gain >> 5));
+        OutCandidates->emplace_back(RematNode(Reg, MI, Gain >> 5));
       else
         IsSafeCandidate = false;
     }
     // Save unsafe reg.
     if (!IsSafeCandidate)
-      PinnedRegSet.insert(Reg);
+      PinnedRegSet->insert(Reg);
   }
 
   // Sort by gain.
-  std::sort(Candidates.begin(), Candidates.end(),
+  std::sort(OutCandidates->begin(), OutCandidates->end(),
             [](RematNode &I, RematNode &J) { return I.Size > J.Size; });
 }
 
-void addCloneCandidate(std::vector<RematNode *> &CloneList,
-                       std::vector<RematNode> &RematList,
-                       DenseSet<unsigned> &PinnedRegSet,
-                       MachineRegisterInfo &MRI, int &RematCnt) {
+void addCloneCandidate(std::vector<RematNode> *OutRematList, int *OutRematCnt,
+                       DenseSet<Register> *OutPinnedRegSet,
+                       std::vector<RematNode *> &&CloneList,
+                       const MachineRegisterInfo &MRI) {
   // Group user in same blocks.
   std::vector<BlockSet> UserSetList(CloneList.size());
 
   for (size_t I = 0; I < CloneList.size(); I++) {
     auto *Node = CloneList[I];
-    unsigned Reg = Node->Reg;
+    Register Reg = Node->Reg;
     MachineInstr *DefMI = Node->DefMI;
     // Group user in same blocks.
     BlockSet &UserSet = UserSetList[I];
@@ -1008,7 +1018,7 @@ void addCloneCandidate(std::vector &CloneList,
         // Mark cannot remat for now.
         // TODO: try to split if is bigger than 4 and only used once per
        // channel.
-        PinnedRegSet.insert(Reg);
+        OutPinnedRegSet->insert(Reg);
         continue;
       }
     }
@@ -1029,31 +1039,38 @@ void addCloneCandidate(std::vector &CloneList,
   for (RematNode *Node : CloneList) {
     Node->Kind = RematNode::RematKind::Clone;
-    RematList.emplace_back(*Node);
-    RematCnt += Node->Size;
+    OutRematList->emplace_back(*Node);
+    *OutRematCnt += Node->Size;
   }
 }
 
-int filterRematCandiates(std::vector<RematNode> &Candidates,
-                         std::vector<RematNode> &RematList,
-                         DenseSet<unsigned> &PinnedRegSet,
+// Filter `Candidates` into `OutRematList` based on whether they are safe to
+// move, and decide the actual kind of each candidate (move vs. clone).
+//
+// Updates `OutPinnedRegSet` with registers that cannot/should not be moved.
+//
+// Returns the accumulated size of all filtered candidates.
+//
+int filterRematCandiates(std::vector<RematNode> *OutRematList,
+                         DenseSet<Register> *OutPinnedRegSet,
+                         std::vector<RematNode> &&Candidates,
                          MachineDominatorTree *DT,
                          MachinePostDominatorTree *PDT, MachineLoopInfo *MLI,
                          MachineRegisterInfo &MRI, bool IsVGPR, bool MemBound) {
   int RematCnt = 0;
   // Work one def one use first.
   for (auto &Node : Candidates) {
-    unsigned Reg = Node.Reg;
+    Register Reg = Node.Reg;
     if (!MRI.hasOneNonDBGUse(Reg))
       continue;
 
     MachineInstr *DefMI = Node.DefMI;
     if (!isSafeToMove(DefMI, MRI)) {
-      PinnedRegSet.insert(Reg);
+      OutPinnedRegSet->insert(Reg);
       continue;
     }
 
-    addOneDefOneUseCandidate(Node, RematList, MRI, RematCnt, DT, PDT, MLI,
+    addOneDefOneUseCandidate(OutRematList, &RematCnt, Node, MRI, DT, PDT, MLI,
                              IsVGPR, MemBound);
   }
 
@@ -1061,13 +1078,13 @@ int filterRematCandiates(std::vector &Candidates,
     std::vector<RematNode *> CloneList;
     // Try multi use case.
     for (auto &Node : Candidates) {
-      unsigned Reg = Node.Reg;
+      Register Reg = Node.Reg;
       if (MRI.hasOneNonDBGUse(Reg))
         continue;
 
       MachineInstr *DefMI = Node.DefMI;
       if (!isSafeToMove(DefMI, MRI)) {
-        PinnedRegSet.insert(Reg);
+        OutPinnedRegSet->insert(Reg);
         continue;
       }
 
      CloneList.emplace_back(&Node);
    }
 
-    addCloneCandidate(CloneList, RematList, PinnedRegSet, MRI, RematCnt);
+    addCloneCandidate(OutRematList, &RematCnt, OutPinnedRegSet,
+                      std::move(CloneList), MRI);
   }
 
   return RematCnt;
 }
 
-int getReducedSize(MapVector<Register, RematNode> &RematMap,
-                   GCNRPTracker::LiveRegSet &CanidateSet, InstSet &ReducedInsts,
-                   const MachineRegisterInfo &MRI, BlockLiveInfo &LiveInfo,
-                   DenseMap<MachineBasicBlock *, unsigned> &RPOTIndexMap) {
+// Calculate the reduced register pressure of RematMap w.r.t. the BB associated
+// with LiveInfo.
+// Returns the number of registers reduced, and collects the instructions
+// associated with the reduction nodes into `OutReducedInsts`.
+int getReducedSize(const MapVector<Register, RematNode> &RematMap,
+                   GCNRPTracker::LiveRegSet &CanidateSet,
+                   const MachineRegisterInfo &MRI,
+                   const BlockLiveInfo &LiveInfo,
+                   DenseMap<const MachineBasicBlock *, unsigned> &RPOTIndexMap,
+                   InstSet *OutReducedInsts) {
   int ReducedSize = 0;
-  for (auto &It : RematMap) {
+  for (const auto &It : RematMap) {
     Register Reg = It.first;
 
     if (!CanidateSet.count(Reg))
@@ -1115,7 +1139,7 @@ int getReducedSize(const MapVector &RematMap,
     }
     if (IsReduced) {
       ReducedSize += Node.Size;
-      ReducedInsts.insert(Node.DefMI);
+      OutReducedInsts->insert(Node.DefMI);
     }
 
     // Already in remat map, don't need to check again, remove from candidate.
@@ -1125,11 +1149,15 @@ int getReducedSize(const MapVector &RematMap,
   return ReducedSize;
 }
 
-int getSharedReducedSize(InstSet &ReducedInsts, bool IsVGPR,
+// Calculate the amount of OVERLAPPING register pressure among all
+// the instructions in `ReducedInsts`. E.g for:
+// x = COPY a:sgpr_32
+// y = COPY a:sgpr_32
+// This function would return 1.
+int getSharedReducedSize(const InstSet &ReducedInsts, bool IsVGPR,
                          const MachineRegisterInfo &MRI,
                          const SIRegisterInfo *SIRI) {
-  // Find shared operand in ReducedInsts.
 
   int SharedSize = 0;
   DenseMap SharedRegMaskMap;
   for (MachineInstr *DefMI : ReducedInsts) {
@@ -1156,6 +1184,7 @@ int getSharedReducedSize(const InstSet &ReducedInsts, bool IsVGPR,
       const TargetRegisterClass *OpRC = MRI.getRegClass(Reg);
       int MOSize = SIRI->getRegSizeInBits(*OpRC) >> 5;
       unsigned Mask;
+      // FIXME: Lane mask is now in the granularity of 16-bit lanes.
       if (unsigned SubIdx = MO.getSubReg()) {
         OpRC = SIRI->getSubRegisterClass(OpRC, SubIdx);
         int SubMOSize = SIRI->getRegSizeInBits(*OpRC) >> 5;
@@ -1219,6 +1248,9 @@ void dumpCandidates(std::vector &RematCandidates, int BlockIndex,
   dbgs() << "Total Size:" << TotalSize << "\n";
 }
 
+// A heuristic number for keeping the target SGPR number away from the limit.
+constexpr unsigned SgprLimitBias = 10;
+
 bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
                                                 MachineLoopInfo *MLI,
                                                 LiveIntervals *LIS,
                                                 MachineDominatorTree *DT,
                                                 MachinePostDominatorTree *PDT,
                                                 bool &IsNearTarget) {
@@ -1231,8 +1263,8 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
   const SIRegisterInfo *SIRI = ST->getRegisterInfo();
 
   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
-  DenseMap<MachineBasicBlock *, unsigned> RPOTIndexMap;
-  for (MachineBasicBlock *MBB : RPOT)
+  DenseMap<const MachineBasicBlock *, unsigned> RPOTIndexMap;
+  for (const MachineBasicBlock *MBB : RPOT)
     RPOTIndexMap[MBB] = RPOTIndexMap.size();
 
   auto &MRI = MF.getRegInfo();
@@ -1244,25 +1276,23 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
   if (Status.TargetOcc >= MaxOcc)
     return false;
 
-  unsigned VLimit = Status.TargetVLimit;
-  unsigned SLimit = Status.TargetSLimit;
-
   // Early check for
   {
-    int InitialRematSCnt = Status.MaxSPressure - SLimit;
+    int InitialRematSCnt = Status.MaxSPressure - Status.TargetSLimit;
     // when agressive sgpr remat, reserve some for allocation lost.
     if (EnableAggressive)
-      InitialRematSCnt += NearTargetRegLimit;
+      InitialRematSCnt += SgprLimitBias;
 
     bool InitialIsSGPRSpill = false;
     if (InitialRematSCnt > 0)
       InitialIsSGPRSpill = nearSgprSpill(Status.MaxSPressure, ST, MF);
 
-    const bool InitialIsForceRematSgpr = InitialIsSGPRSpill || Status.NotBalance;
+    const bool InitialIsForceRematSgpr =
+        InitialIsSGPRSpill || Status.NotBalance;
 
     // If bound by lds, skip.
     if (Status.TargetOcc > ST->getOccupancyWithWorkGroupSizes(MF).second &&
-        !InitialIsForceRematSgpr)
+        !InitialIsForceRematSgpr)
       return false;
   }
 
@@ -1274,7 +1304,7 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
   MapVector<Register, RematNode> VRematMap;
   MapVector<Register, RematNode> SRematMap;
   // Reg which cannot move around to remat.
-  DenseSet<unsigned> PinnedRegSet;
+  DenseSet<Register> PinnedRegSet;
   std::vector<BlockLiveInfo> HotBlocks;
   for (auto It = po_begin(EntryMBB); It != po_end(EntryMBB); It++) {
     MachineBasicBlock *MBB = *It;
@@ -1317,7 +1347,8 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
       MaxSPressure = SPressure;
     }
     MaxSPressure += RegForVCC + Status.InputPhysicalSPressure;
-    if (MaxVPressure <= VLimit && MaxSPressure <= SLimit)
+    if (MaxVPressure <= Status.TargetVLimit &&
+        MaxSPressure <= Status.TargetSLimit)
       continue;
 
     // Build block live info.
@@ -1333,14 +1364,14 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
     // Update reg pressure based on remat list.
     InstSet VReducedInsts;
     InstSet SReducedInsts;
-    int VReduced = getReducedSize(VRematMap, CandidateRegs, VReducedInsts, MRI,
-                                  LiveInfo, RPOTIndexMap);
-    int SReduced = getReducedSize(SRematMap, CandidateRegs, SReducedInsts, MRI,
-                                  LiveInfo, RPOTIndexMap);
+    int VReduced = getReducedSize(VRematMap, CandidateRegs, MRI, LiveInfo,
+                                  RPOTIndexMap, &VReducedInsts);
+    int SReduced = getReducedSize(SRematMap, CandidateRegs, MRI, LiveInfo,
+                                  RPOTIndexMap, &SReducedInsts);
 
-    // Calculate size need to be remat.
-    int RematVCnt = MaxVPressure - VReduced - VLimit;
-    int RematSCnt = MaxSPressure - SReduced - SLimit;
+    // Calculate size need to be remat for this BB.
+    const int RematVCnt = MaxVPressure - VReduced - Status.TargetVLimit;
+    const int RematSCnt = MaxSPressure - SReduced - Status.TargetSLimit;
 
     bool IsSGPRSpill = false;
     if (RematSCnt > 0)
@@ -1353,34 +1384,41 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
     if (RematSCnt > 0) {
       // Build candidate nodes.
       std::vector<RematNode> SRematCandidates;
-      buildRematCandiates(SRematCandidates, CandidateRegs, PinnedRegSet, MRI,
+      buildRematCandiates(&SRematCandidates, &PinnedRegSet, CandidateRegs, MRI,
                           SIII, SIRI, /*IsVGPR*/ false);
 
       LLVM_DEBUG(dumpCandidates(SRematCandidates, MBB->getNumber(), SIRI));
       std::vector<RematNode> SRematList;
       // Filter candidates.
-      NewRematSCnt = filterRematCandiates(SRematCandidates, SRematList,
-                                          PinnedRegSet, DT, PDT, MLI, MRI,
-                                          /*IsVGPR*/ false, Status.MemBound);
+      NewRematSCnt =
+          filterRematCandiates(&SRematList, &PinnedRegSet,
                               std::move(SRematCandidates), DT, PDT, MLI, MRI,
                               /*IsVGPR*/ false, Status.MemBound);
       if (NewRematSCnt > RematSCnt) {
         // Has enough remat node to cover rematCnt.
         int RematCnt = 0;
         for (RematNode &Node : SRematList) {
           SRematMap[Node.Reg] = Node;
           RematCnt += Node.Size;
+          // Stop if the size had reached the required amount, unless
+          // aggressive is set.
           if (RematCnt > RematSCnt && !EnableAggressive)
             break;
         }
         NewRematSCnt = 0;
       } else {
-
         for (RematNode &Node : SRematList) {
           SReducedInsts.insert(Node.DefMI);
         }
-        // Check shared size.
+        // Check shared size. These are reg uses that are shared among all the
+        // instructions. The overlap will not actually contribute to the
+        // pressure increase when an instruction is moved/cloned, so it can be
+        // treated as a gain.
         int SharedReducedSize =
             getSharedReducedSize(SReducedInsts, /*IsVGPR*/ false, MRI, SIRI);
+
+        int LocalGains = 0;
-        if (((NewRematSCnt + SharedReducedSize) + (int)NearTargetRegLimit) >=
+        if (((NewRematSCnt + SharedReducedSize) + (int)SgprLimitBias) >=
             RematSCnt) {
           for (RematNode &Node : SRematList)
             SRematMap[Node.Reg] = Node;
@@ -1408,8 +1446,8 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
             MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(Reg);
             if (UseMI.getParent() != MBB)
               continue;
-            int Gain = rematGain(&MI, Reg, MRI, SIRI,
-                                 /*IsVGPR*/ false);
+            int Gain = rematGainInBits(&MI, Reg, MRI, SIRI,
+                                       /*IsVGPR*/ false);
             if (Gain > 0) {
               // Skip case when DefMI has implicit define which used by UseMI.
               if (isImplicitDefUse(&MI, &UseMI))
                 continue;
               RematNode Node = {Reg, &MI, (unsigned)Gain >> 5};
               Node.InsertPointMI = &UseMI;
               Node.Kind = RematNode::RematKind::OneDefOneUse;
               SRematMap[Reg] = Node;
-              SharedReducedSize += Node.Size;
+              LocalGains += Node.Size;
             }
           }
         }
-        NewRematSCnt = RematSCnt - NewRematSCnt - SharedReducedSize;
+        NewRematSCnt =
+            RematSCnt - NewRematSCnt - SharedReducedSize - LocalGains;
       }
     }
     // If works, continue.
@@ -1458,7 +1497,7 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
     }
     // TODO: what to do when cannot reach target?
     if (NewRematSCnt > 0) {
-      if ((unsigned)NewRematSCnt <= NearTargetRegLimit) {
+      if ((unsigned)NewRematSCnt <= ST->getSGPRAllocGranule()) {
         IsNearTarget = true;
       } else {
         if (!IsSGPRSpill)

From f861409fb5484412179696ae0613be62334af17f Mon Sep 17 00:00:00 2001
From: Adam Yang
Date: Mon, 5 May 2025 09:34:31 -0700
Subject: [PATCH 09/11] Deleted the isLocal* functions from Utils

---
 .../AMDGPU/AMDGPUHotBlockRematerialize.cpp    |  4 +-
 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp     | 67 -------------------
 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h       |  8 ---
 3 files changed, 2 insertions(+), 77 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
index 9aa52ac1cf69e..2d6cc5f010bd5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -443,7 +443,7 @@ unsigned collectFnPressure(MachineFunction &MF, LiveIntervals *LIS,
       const auto &LI = LIS->getInterval(Reg);
 
       // Skip local live interval to make live input/ouput faster.
-      if (llvm::isLocalLiveInterval(LI, SlotIndexes))
+      if (LIS->intervalIsInOneMBB(LI))
         continue;
 
       for (auto InputIt : MBBInputSlotMap) {
@@ -1276,7 +1276,7 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
   if (Status.TargetOcc >= MaxOcc)
     return false;
 
-  // Early check for
+  // Early checks
   {
     int InitialRematSCnt = Status.MaxSPressure - Status.TargetSLimit;
     // when agressive sgpr remat, reserve some for allocation lost.

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
index afa1a8853938f..81395e1ab887c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
@@ -36,49 +36,6 @@ bool getNonDebugMBBEnd(MachineBasicBlock::reverse_iterator &BBEnd,
 } // namespace llvm
 
 namespace {
-bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes,
-                    SmallDenseSet<MachineBasicBlock *> &TouchedMBBSet) {
-  MachineInstr *StartMI = Indexes->getInstructionFromIndex(Seg->start);
-  MachineInstr *EndMI = Indexes->getInstructionFromIndex(Seg->end);
-  // Treat non inst as not local.
-  if (!StartMI || !EndMI)
-    return false;
-  // is local when parent MBB the same.
-  bool IsSameMBB = StartMI->getParent() == EndMI->getParent();
-  if (!IsSameMBB)
-    return false;
-  // Collect touched MBB.
-  MachineBasicBlock *MBB = StartMI->getParent();
-  TouchedMBBSet.insert(MBB);
-  return true;
-}
-
-bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes,
-                      SmallDenseSet<MachineBasicBlock *> &TouchedMBBSet) {
-  for (const LiveRange::Segment &Seg : Range->segments) {
-    if (!isLocalSegment(&Seg, Indexes, TouchedMBBSet))
-      return false;
-  }
-  return true;
-}
-
-bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes) {
-  MachineInstr *StartMI = Indexes->getInstructionFromIndex(Seg->start);
-  MachineInstr *EndMI = Indexes->getInstructionFromIndex(Seg->end);
-  // Treat non inst as not local.
-  if (!StartMI || !EndMI)
-    return false;
-  // is local when parent MBB the same.
-  return StartMI->getParent() == EndMI->getParent();
-}
-
-bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes) {
-  for (const LiveRange::Segment &Seg : Range->segments) {
-    if (!isLocalSegment(&Seg, Indexes))
-      return false;
-  }
-  return true;
-}
 
 // LoopInfo contains a mapping from basic block to the innermost loop. Find
 // the outermost loop in the loop nest that contains BB.
@@ -195,30 +152,6 @@ MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef(
   return MI;
 }
 
-// In case like float4 v, v.x used and defined in one block, v.y used and define
-// in another block, one live interval could touch more than one MBB.
-// TouchedMBBSet is used for scheduling where local live interval could cross
-// multiple regions, need to calculate livereg for each region inside touched
-// MBB.
-bool isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes,
-                         SmallDenseSet<MachineBasicBlock *> &TouchedMBBSet) {
-  if (LI.hasSubRanges()) {
-    for (const auto &S : LI.subranges())
-      if (!isLocalLiveRange(&S, Indexes, TouchedMBBSet))
-        return false;
-  }
-  return isLocalLiveRange(&LI, Indexes, TouchedMBBSet);
-}
-
-bool isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes) {
-  if (LI.hasSubRanges()) {
-    for (const auto &S : LI.subranges())
-      if (!isLocalLiveRange(&S, Indexes))
-        return false;
-  }
-  return isLocalLiveRange(&LI, Indexes);
-}
-
 void dumpLiveSet(const LiveSet &LiveSet, const SIRegisterInfo *SIRI) {
 
   dbgs() << "\n live set: \n";

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
index 2470e2bed482f..d9fa63ba2b5ee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
@@ -33,14 +33,6 @@ constexpr unsigned RegForVCC = 2;
 bool getNonDebugMBBEnd(llvm::MachineBasicBlock::reverse_iterator &BBEnd,
                        llvm::MachineBasicBlock &MBB);
 
-// Check if LI live cross basic blocks, save all touched basic block if is
-// local.
-bool isLocalLiveInterval(
-    const llvm::LiveInterval &LI, llvm::SlotIndexes *Indexes,
-    llvm::SmallDenseSet<llvm::MachineBasicBlock *> &TouchedMBBSet);
-bool isLocalLiveInterval(const llvm::LiveInterval &LI,
-                         llvm::SlotIndexes *Indexes);
-
 bool isSub0Sub1SingleDef(unsigned Reg, const llvm::MachineRegisterInfo &MRI);
 
 using LiveSet = llvm::DenseMap<unsigned, llvm::LaneBitmask>;

From eaaf6ddaf9858c4d1ea34beaac6dcf1694199a60 Mon Sep 17 00:00:00 2001
From: Adam Yang
Date: Tue, 6 May 2025 19:41:41 -0700
Subject: [PATCH 10/11] Fixed SCC, and updated and simplified tests

---
 .../AMDGPU/AMDGPUHotBlockRematerialize.cpp    |  69 +-
 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp     | 412 ++----------
 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h       |   6 +-
 llvm/test/CodeGen/AMDGPU/remat/phi.mir        | 607 ------------------
 .../CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir | 565 ----------------
 .../test/CodeGen/AMDGPU/remat/simple_sgpr.mir | 569 +++++-----------
 .../AMDGPU/remat/simple_sgpr_long_scc.mir     | 575 +++++++++++++++++
 .../AMDGPU/remat/simple_sgpr_no_scc.mir       | 564 ++++++++++++++++
 .../CodeGen/AMDGPU/remat/simple_sgpr_phi.mir  | 304 +++++++++
 .../CodeGen/AMDGPU/remat/simple_sgpr_scc.mir  | 564 ++++++++++++++++
 10 files changed, 2233 insertions(+), 2002 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/remat/phi.mir
 delete mode 100644 llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_long_scc.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_no_scc.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_phi.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_scc.mir

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
index 2d6cc5f010bd5..3a0fa5cad4c13 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -32,7 +32,7 @@
 using namespace llvm;
 
 static cl::opt<bool>
-    EnableAggressive("amdgpu-remat-enable-hot-block-remat-aggressive");
+    EnableAggressiveSgpr("amdgpu-remat-enable-hot-block-remat-aggressive-sgpr");
 static cl::opt<unsigned> TargetOccupancy("amdgpu-remat-target-occupancy");
 
 namespace {
@@ -114,12 +114,14 @@ class AMDGPUHotBlockRematerialize : public MachineFunctionPass {
   void applyCloneRemat(RematNode &Node, std::vector<BlockLiveInfo> &HotBlocks,
                        MachineDominatorTree *DT, MachineRegisterInfo &MRI,
                        SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
-                       const SIInstrInfo *SIII, MachineFunction &MF);
+                       const SIInstrInfo *SIII, LiveIntervals *LIS,
+                       MachineFunction &MF);
   void applyRemat(MapVector<Register, RematNode> &RematMap,
                   std::vector<BlockLiveInfo> &HotBlocks,
                   MachineDominatorTree *DT, llvm::SlotIndexes *SlotIndexes,
                   MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
-                  const SIInstrInfo *SIII, MachineFunction &MF);
+                  const SIInstrInfo *SIII, LiveIntervals *LIS,
+                  MachineFunction &MF);
   bool hotBlockRemat(MachineFunction &MF, MachineLoopInfo *MLI,
                      LiveIntervals *LIS, MachineDominatorTree *DT,
                      MachinePostDominatorTree *PDT, bool &IsNearTarget);
@@ -140,12 +142,12 @@ class AMDGPUHotBlockRematerialize : public MachineFunctionPass {
 MachineBasicBlock::iterator adjustInsertPointToAvoidSccSmash(
     MachineInstr *InstructionToMove, MachineBasicBlock *MBB,
     MachineBasicBlock::iterator CurrentInsertPoint, MachineRegisterInfo &MRI,
-    const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
+    const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, LiveIntervals *LIS) {
   const bool WillSmashScc =
       InstructionToMove->modifiesRegister(AMDGPU::SCC, SIRI);
   if (WillSmashScc) {
     CurrentInsertPoint = llvm::findOrCreateInsertionPointForSccDef(
-        MBB, CurrentInsertPoint, SIRI, SIII, &MRI);
+        MBB, CurrentInsertPoint, SIRI, SIII, &MRI, LIS);
   }
 
   return CurrentInsertPoint;
@@ -236,7 +238,7 @@ void AMDGPUHotBlockRematerialize::applyCloneRemat(
     RematNode &Node, std::vector<BlockLiveInfo> &HotBlocks,
     MachineDominatorTree *DT, MachineRegisterInfo &MRI,
     SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
-    const SIInstrInfo *SIII, MachineFunction &MF) {
+    const SIInstrInfo *SIII, LiveIntervals *LIS, MachineFunction &MF) {
   Register Reg = Node.Reg;
 
   MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
@@ -289,7 +291,7 @@ void AMDGPUHotBlockRematerialize::applyCloneRemat(
       }
 
       MachineBasicBlock::iterator InsertPoint = adjustInsertPointToAvoidSccSmash(
-          DefMI, InsertPointMI->getParent(), InsertPointMI, MRI, SIRI, SIII);
+          DefMI, InsertPointMI->getParent(), InsertPointMI, MRI, SIRI, SIII, LIS);
 
       for (MachineMemOperand *MO : DefMI->memoperands()) {
         NewDef->addMemOperand(MF, MO);
@@ -310,8 +312,6 @@ void AMDGPUHotBlockRematerialize::applyCloneRemat(
         updateUsers(Reg, NewReg, IsSubRegDef, UserMIs);
       }
     }
-
-    llvm::removeUnusedLanes(*NewDef.getInstr(), MRI, SIRI, SIII, SlotIndexes);
   }
   if (MRI.use_empty(Reg)) {
     SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI);
@@ -320,8 +320,8 @@ void AMDGPUHotBlockRematerialize::applyCloneRemat(
 
 void applyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI,
                             SlotIndexes *SlotIndexes,
-                            const SIRegisterInfo *SIRI,
-                            const SIInstrInfo *SIII) {
+                            const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
+                            LiveIntervals *LIS) {
   MachineInstr *DefMI = Node.DefMI;
   MachineInstr *InsertPointMI = Node.InsertPointMI;
   MachineBasicBlock *MBB = nullptr;
@@ -337,7 +337,7 @@ void applyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI,
   }
 
   InsertPoint = adjustInsertPointToAvoidSccSmash(DefMI, MBB, InsertPoint, MRI,
-                                                 SIRI, SIII);
+                                                 SIRI, SIII, LIS);
 
   // Move instruction to new location.
DefMI->removeFromParent(); @@ -352,7 +352,8 @@ void AMDGPUHotBlockRematerialize::applyRemat( MapVector &RematMap, std::vector &HotBlocks, MachineDominatorTree *DT, llvm::SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, MachineFunction &MF) { + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, LiveIntervals *LIS, + MachineFunction &MF) { std::vector UpdateList; for (auto &It : RematMap) UpdateList.emplace_back(It.second); @@ -368,9 +369,10 @@ void AMDGPUHotBlockRematerialize::applyRemat( for (RematNode &Node : UpdateList) { if (Node.Kind == RematNode::RematKind::OneDefOneUse) - applyOneDefOneUseRemat(Node, MRI, SlotIndexes, SIRI, SIII); + applyOneDefOneUseRemat(Node, MRI, SlotIndexes, SIRI, SIII, LIS); else if (Node.Kind == RematNode::RematKind::Clone) - applyCloneRemat(Node, HotBlocks, DT, MRI, SlotIndexes, SIRI, SIII, MF); + applyCloneRemat(Node, HotBlocks, DT, MRI, SlotIndexes, SIRI, SIII, LIS, + MF); } } @@ -617,11 +619,6 @@ bool isImplicitDefUse(MachineInstr *DefMI, MachineInstr *UseMI) { return false; } -static unsigned AlignToSgprAllocationGranularity(const GCNSubtarget *ST, - unsigned SgprCount) { - return llvm::alignTo(SgprCount, ST->getSGPRAllocGranule()); -} - bool nearSgprSpill(unsigned MaxSPressure, const GCNSubtarget *ST, MachineFunction &MF) { unsigned MaxSGPR = ST->getAddressableNumSGPRs(); @@ -720,7 +717,7 @@ int rematGainInBits(MachineInstr *DefMI, Register Reg, if (IsSingleDef) { // The reg might share with other candidates, check It here. // Count share reg in getReducedSize. - if (EnableAggressive) { + if (EnableAggressiveSgpr) { // In case of aggressive remat, treat multi use reg as shared reg and // ignore size of shared reg. if (!MRI.hasOneNonDBGUse(Reg)) @@ -858,7 +855,7 @@ bool isUsedByPhi(MachineInstr *DefMI, MachineRegisterInfo &MRI) { return false; } -bool isSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) { +bool isSafeToMoveOrClone(MachineInstr *DefMI, MachineRegisterInfo &MRI) { // Do not move PHI nodes if (isUsedByPhi(DefMI, MRI)) return false; @@ -869,7 +866,7 @@ bool isSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) { MachineOperand &Op = DefMI->getOperand(I); if (!Op.isReg()) continue; - if (!MRI.getUniqueVRegDef(Op.getReg()) && + if (!Op.getReg().isPhysical() && !MRI.getUniqueVRegDef(Op.getReg()) && !llvm::isSub0Sub1SingleDef(Op.getReg(), MRI)) { return false; } @@ -1065,7 +1062,7 @@ int filterRematCandiates(std::vector *OutRematList, continue; MachineInstr *DefMI = Node.DefMI; - if (!isSafeToMove(DefMI, MRI)) { + if (!isSafeToMoveOrClone(DefMI, MRI)) { OutPinnedRegSet->insert(Reg); continue; } @@ -1083,7 +1080,7 @@ int filterRematCandiates(std::vector *OutRematList, continue; MachineInstr *DefMI = Node.DefMI; - if (!isSafeToMove(DefMI, MRI)) { + if (!isSafeToMoveOrClone(DefMI, MRI)) { OutPinnedRegSet->insert(Reg); continue; } @@ -1149,6 +1146,12 @@ int getReducedSize(const MapVector &RematMap, return ReducedSize; } +static unsigned getNumLanesIn32BitReg(bool IsVgpr) { + const TargetRegisterClass *RC = + IsVgpr ? &AMDGPU::VGPR_32RegClass : &AMDGPU::SGPR_32RegClass; + return RC->LaneMask.getNumLanes(); +} + // Calculate the amount of OVERLAPPING register pressure among all // the instructions in `ReducedInsts`. 
E.g for: // x = COPY a:sgpr_32 @@ -1157,7 +1160,6 @@ int getReducedSize(const MapVector &RematMap, int getSharedReducedSize(const InstSet &ReducedInsts, bool IsVGPR, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) { - int SharedSize = 0; DenseMap SharedRegMaskMap; for (MachineInstr *DefMI : ReducedInsts) { @@ -1182,8 +1184,9 @@ int getSharedReducedSize(const InstSet &ReducedInsts, bool IsVGPR, continue; const TargetRegisterClass *OpRC = MRI.getRegClass(Reg); - int MOSize = SIRI->getRegSizeInBits(*OpRC) >> 5; - unsigned Mask; + const int MOSize = SIRI->getRegSizeInBits(*OpRC) >> 5; + + unsigned Mask = 0; // FIXME: Lane mask is now in the granularity of 16-bit lanes. if (unsigned SubIdx = MO.getSubReg()) { OpRC = SIRI->getSubRegisterClass(OpRC, SubIdx); @@ -1210,7 +1213,9 @@ int getSharedReducedSize(const InstSet &ReducedInsts, bool IsVGPR, } } } - return SharedSize; + + const unsigned NumLanesPerReg = getNumLanesIn32BitReg(IsVGPR); + return SharedSize / NumLanesPerReg; } void dumpRematMap(MapVector &RematMap, @@ -1280,7 +1285,7 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF, { int InitialRematSCnt = Status.MaxSPressure - Status.TargetSLimit; // when agressive sgpr remat, reserve some for allocation lost. - if (EnableAggressive) + if (EnableAggressiveSgpr) InitialRematSCnt += SgprLimitBias; bool InitialIsSGPRSpill = false; @@ -1402,7 +1407,7 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF, RematCnt += Node.Size; // Stop if the size had reached the required amount, unless // aggressive is set. - if (RematCnt > RematSCnt && !EnableAggressive) + if (RematCnt > RematSCnt && !EnableAggressiveSgpr) break; } NewRematSCnt = 0; @@ -1512,7 +1517,7 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF, if (!SRematMap.empty()) { IsUpdated = true; - applyRemat(SRematMap, HotBlocks, DT, SlotIndexes, MRI, SIRI, SIII, MF); + applyRemat(SRematMap, HotBlocks, DT, SlotIndexes, MRI, SIRI, SIII, LIS, MF); LLVM_DEBUG(llvm::dbgs() << "after hotremat"; MF.print(dbgs());); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp index 81395e1ab887c..4c55d172018d4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp @@ -17,6 +17,7 @@ #include "SIRegisterInfo.h" #include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -60,25 +61,14 @@ bool loopContainsBoth(const MachineLoopInfo *LI, const MachineBasicBlock *BB1, namespace llvm { -bool isSccLiveAt(llvm::MachineBasicBlock *MBB, - llvm::MachineBasicBlock::iterator MI) { - const TargetRegisterInfo *TRI = - MBB->getParent()->getRegInfo().getTargetRegisterInfo(); - for (auto It = MI; It != MBB->end(); ++It) { - const MachineInstr &CurMI = *It; - // Hit use of scc, it is live. - if (CurMI.readsRegister(AMDGPU::SCC, TRI)) - return true; - // Hit def of scc first, not live. - if (CurMI.definesRegister(AMDGPU::SCC, TRI)) - return false; - } - // Reach the end of MBB, check live-ins of MBB successors. 
-  for (const MachineBasicBlock *Succ : MBB->successors()) {
-    if (Succ->isLiveIn(AMDGPU::SCC))
-      return true;
-  }
-  return false;
+bool isSccLiveAt(const MachineInstr &MI, LiveIntervals *LIS) {
+  if (!LIS)
+    return true;
+  const TargetRegisterInfo *TRI = MI.getMF()->getSubtarget().getRegisterInfo();
+  LiveRange &LR =
+      LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
+  SlotIndex Idx = LIS->getInstructionIndex(MI);
+  return LR.liveAt(Idx);
 }
 
 //
@@ -95,21 +85,16 @@ bool isSccLiveAt(llvm::MachineBasicBlock *MBB,
 MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef(
     MachineBasicBlock *MBB, MachineBasicBlock::iterator MI,
     const TargetRegisterInfo *TRI, const SIInstrInfo *TII,
-    MachineRegisterInfo *MRI, SccDefInsertPointConstraintFlags Constraints) {
+    MachineRegisterInfo *MRI, LiveIntervals *LIS,
+    SccDefInsertPointConstraintFlags Constraints) {
   // If SCC is dead at MI when we can use MI as the insert point.
-  if (!llvm::isSccLiveAt(MBB, MI))
+  if (!llvm::isSccLiveAt(*MI, LIS))
     return MI;
 
   const bool CheckForExecWrite =
       Constraints & SccDefInsertPointConstraintFlags::NoExecWrite;
 
-  // Get the starting reverse iterator taking care to handle the MBB->end()
-  // case.
-  MachineBasicBlock::reverse_iterator Start;
-  if (MI == MBB->end())
-    Start = MBB->rbegin();
-  else
-    Start = MI.getReverse();
+  MachineBasicBlock::reverse_iterator Start = MI.getReverse();
 
   // Otherwise, walk backwards through the block looking for a location where
   // SCC is dead.
@@ -122,8 +107,7 @@ MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef(
     if (CheckForExecWrite && It->modifiesRegister(AMDGPU::EXEC, TRI))
       break;
 
-    if (It->modifiesRegister(AMDGPU::SCC, TRI) &&
-        !It->readsRegister(AMDGPU::SCC, TRI))
+    if (!llvm::isSccLiveAt(*It, LIS))
       return It->getIterator();
   }
 
@@ -134,20 +118,35 @@ MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef(
   //
   // The generated code will look like this;
   //
-  //      S_CSELECT_B32 %SavedSCC, -1, 0  # Save SCC
+  //      %SavedSCC = COPY $scc           # Save SCC
   //      <----- Newly created safe insert point.
   //      MI
-  //      S_CMP_LG_U32 %SavedSCC, 0       # Restore SCC
+  //      $scc = COPY %SavedSCC           # Restore SCC
   //
   Register TmpScc = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
   DebugLoc DL = MI->getDebugLoc();
-  BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), TmpScc)
-      .addImm(-1)
-      .addImm(0);
-  BuildMI(*MBB, std::next(MI->getIterator()), DL,
-          TII->get(AMDGPU::S_CMP_LG_U32))
-      .addReg(TmpScc, RegState::Kill)
-      .addImm(0);
+  auto CopyFrom =
+      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), TmpScc).addReg(AMDGPU::SCC);
+  auto CopyTo = BuildMI(*MBB, std::next(MI->getIterator()), DL,
+                        TII->get(AMDGPU::COPY), AMDGPU::SCC)
+                    .addReg(TmpScc);
+
+  // Cut the live segment.
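+  // The segment containing MI is split in two: the first half ends at the
+  // save copy and the second half starts at the restore copy, both keeping
+  // the original value number. This models SCC as dead between the two
+  // copies, so the def placed at MI can safely clobber it there.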
+ auto SlotIndexes = LIS->getSlotIndexes(); + SlotIndexes->insertMachineInstrInMaps(*CopyFrom); + SlotIndexes->insertMachineInstrInMaps(*CopyTo); + LiveRange &LR = + LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI)); + auto OldSegment = *LR.getSegmentContaining(LIS->getInstructionIndex(*MI)); + LiveRange::Segment NewSegA( + OldSegment.start, + SlotIndexes->getInstructionIndex(*CopyFrom).getRegSlot(), + OldSegment.valno); + LiveRange::Segment NewSegB(LIS->getInstructionIndex(*CopyTo).getRegSlot(), + OldSegment.end, OldSegment.valno); + LR.removeSegment(OldSegment); + LR.addSegment(NewSegA); + LR.addSegment(NewSegB); return MI; } @@ -164,341 +163,6 @@ void dumpLiveSet(const LiveSet &LiveSet, const SIRegisterInfo *SIRI) { } } -LaneBitmask getRegMask(const MachineOperand &MO, - const MachineRegisterInfo &MRI) { - // We don't rely on read-undef_ flag because in case of tentative schedule - // tracking it isn't set correctly yet. This works correctly however since - // use mask has been tracked before using LIS. - return MO.getSubReg() == 0 - ? MRI.getMaxLaneMaskForVReg(MO.getReg()) - : MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask( - MO.getSubReg()); -} - -struct Piece { - unsigned Reg; - unsigned Offset; - unsigned Size; - static SmallVector split(std::bitset<32> Mask) { - - SmallVector Pieces; - Piece Piece = {0, 0, 0}; - for (unsigned i = 0; i < 32; i++) { - if (Mask.test(i)) { - if (Piece.Size == 0) - Piece.Offset = i; - - Piece.Size++; - // Make sure no piece bigger than 8. - if (Piece.Size == 8) { - Pieces.emplace_back(Piece); - Piece.Size = 0; - } - } else { - if (Piece.Size == 0) { - continue; - } - Pieces.emplace_back(Piece); - Piece.Size = 0; - } - } - return Pieces; - } -}; - -static unsigned getNumLanesIn32BitReg(Register Reg, const SIRegisterInfo *SIRI, - const MachineRegisterInfo &MRI) { - const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg); - const TargetRegisterClass *SubregRC = - SIRI->getSubRegisterClass(RC, AMDGPU::sub0); - return SubregRC->LaneMask.getNumLanes(); -} - -static std::vector -getMinimalSpanningSubRegIdxSetForLaneMask(const TargetRegisterInfo *TRI, - const TargetRegisterClass *RC, - LaneBitmask Mask) { - // TODO: this could replace the code it was copied from in SplitKit.cpp - - // First pass: Try to find a perfectly matching subregister index. - // If none exists find the one covering the most lanemask bits. - SmallVector PossibleIndexes; - unsigned BestIdx = 0; - const LaneBitmask Avoid = ~Mask; - { - unsigned BestCover = 0; - for (unsigned Idx = 1, E = TRI->getNumSubRegIndices(); Idx < E; ++Idx) { - // Is this index even compatible with the given class? - if (TRI->getSubClassWithSubReg(RC, Idx) != RC) - continue; - LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx); - // Early exit if we found a perfect match. - if (SubRegMask == Mask) { - BestIdx = Idx; - break; - } - - // The index must not cover any lanes outside - if ((SubRegMask & Avoid).any()) - continue; - - unsigned PopCount = SubRegMask.getNumLanes(); - PossibleIndexes.push_back(Idx); - if (PopCount > BestCover) { - BestCover = PopCount; - BestIdx = Idx; - } - } - } - - // Abort if we cannot possibly implement the COPY with the given indexes. 
- if (BestIdx == 0) { - LLVM_DEBUG(dbgs() << "Unable to find minimal spanning sub register(s) for " - << TRI->getRegClassName(RC) << " mask " - << PrintLaneMask(Mask) << '\n'); - assert(false && "Impossible to span reg class"); - return std::vector(); - } - - std::vector Result; - Result.push_back(BestIdx); - - // Greedy heuristic: Keep iterating keeping the best covering subreg index - // each time. - Mask &= ~(TRI->getSubRegIndexLaneMask(BestIdx)); - while (Mask.any()) { - BestIdx = 0; - int BestCover = std::numeric_limits::min(); - for (unsigned Idx : PossibleIndexes) { - LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx); - // Early exit if we found a perfect match. - if (SubRegMask == Mask) { - BestIdx = Idx; - break; - } - - // Guaranteed above - assert((SubRegMask & Avoid).none()); - - // Try to cover as much of the remaining lanes as possible but as few of - // the already covered lanes as possible. - int Cover = (SubRegMask & Mask).getNumLanes() - - (SubRegMask & ~Mask).getNumLanes(); - if (Cover > BestCover) { - BestCover = Cover; - BestIdx = Idx; - } - } - - if (BestIdx == 0) { - LLVM_DEBUG( - dbgs() << "Unable to find minimal spanning sub register(s) for " - << TRI->getRegClassName(RC) << " mask " << PrintLaneMask(Mask) - << '\n'); - assert(false && "Impossible to span reg class"); - return std::vector(); - } - - Result.push_back(BestIdx); - Mask &= ~TRI->getSubRegIndexLaneMask(BestIdx); - } - - return Result; -} - -static void updateSubReg(MachineOperand &UseMO, - const llvm::TargetRegisterClass *NewRC, - unsigned Offset, const SIRegisterInfo *SIRI) { - unsigned Size = NewRC->getLaneMask().getNumLanes(); - if (Size == 1) { - UseMO.setSubReg(0); - } else { - const uint32_t SubReg = UseMO.getSubReg(); - LaneBitmask LaneMask = SIRI->getSubRegIndexLaneMask(SubReg); - - unsigned Mask = LaneMask.getAsInteger() >> Offset; - - unsigned NewSubReg = getMinimalSpanningSubRegIdxSetForLaneMask( - SIRI, NewRC, LaneBitmask(Mask)) - .front(); - - UseMO.setSubReg(NewSubReg); - } -} - -bool reduceChannel(unsigned Offset, MachineInstr &MI, const MCInstrDesc &Desc, - MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - const SIInstrInfo *SIII, SlotIndexes *SlotIndexes) { - MachineOperand &DstMO = MI.getOperand(0); - // Skip case when dst subReg not 0. - if (DstMO.getSubReg()) - return false; - Register Reg = DstMO.getReg(); - - SmallVector UseMOs; - for (MachineOperand &UseMO : MRI.use_nodbg_operands(Reg)) - UseMOs.emplace_back(&UseMO); - - const llvm::TargetRegisterClass *NewRC = - SIRI->getRegClass(Desc.operands().front().RegClass); - if (!NewRC->isAllocatable()) { - if (SIRI->isSGPRClass(NewRC)) - NewRC = SIRI->getSGPRClassForBitWidth(NewRC->MC->RegSizeInBits); - else if (SIRI->isVGPRClass(NewRC)) - NewRC = SIRI->getVGPRClassForBitWidth(NewRC->MC->RegSizeInBits); - else - return false; - - if (!NewRC->isAllocatable()) - return false; - } - - unsigned NumLanes = NewRC->getLaneMask().getNumLanes(); - if (Offset > 0) { - // Update offset operand in MI. 
- MachineOperand *OffsetOp = - SIII->getNamedOperand(MI, AMDGPU::OpName::offset); - - const uint32_t LaneSize = sizeof(uint32_t); - if (OffsetOp) { - if (OffsetOp->isImm()) { - assert(OffsetOp != nullptr); - int64_t Offset = OffsetOp->getImm(); - Offset += Offset * LaneSize; - if (!SIII->isLegalMUBUFImmOffset(Offset)) - return false; - OffsetOp->setImm(Offset); - } else { - return false; - } - } else { - OffsetOp = SIII->getNamedOperand(MI, AMDGPU::OpName::soffset); - if (OffsetOp) { - Register NewOffsetReg = - MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - auto OffsetAdd = BuildMI(*MI.getParent()->getParent(), MI.getDebugLoc(), - SIII->get(AMDGPU::S_ADD_U32)) - .addDef(NewOffsetReg) - .add(*OffsetOp) - .addImm(Offset * LaneSize); - MachineInstr *OffsetAddMI = OffsetAdd.getInstr(); - MachineBasicBlock::iterator InsertPoint = - llvm::findOrCreateInsertionPointForSccDef(MI.getParent(), MI, SIRI, - SIII, &MRI); - MI.getParent()->insert(InsertPoint, OffsetAddMI); - SIII->legalizeOperands(*OffsetAddMI); - OffsetOp->setReg(NewOffsetReg); - OffsetOp->setSubReg(0); - if (SlotIndexes) - SlotIndexes->insertMachineInstrInMaps(*OffsetAddMI); - } else { - return false; - } - } - // Update subReg for users. - for (MachineOperand *UseMO : UseMOs) - updateSubReg(*UseMO, NewRC, Offset, SIRI); - } else if (NumLanes == getNumLanesIn32BitReg(Reg, SIRI, MRI)) { - // Clear subReg when it's a single 32-bit reg. - for (MachineOperand *UseMO : UseMOs) - UseMO->setSubReg(0); - } - - MI.setDesc(Desc); - // Mutate reg class of Reg. - MRI.setRegClass(Reg, NewRC); - return true; -} - -bool removeUnusedLanes(llvm::MachineInstr &MI, MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, - SlotIndexes *SlotIndexes) { - bool IsImm = false; - switch (MI.getOpcode()) { - default: - break; - case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: - case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: - case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: - case AMDGPU::S_BUFFER_LOAD_DWORDX16_IMM: - IsImm = true; - LLVM_FALLTHROUGH; - case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR: - case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR: - case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR: - case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: { - Register Reg = MI.getOperand(0).getReg(); - if (!MRI.getUniqueVRegDef(Reg)) - return false; - LaneBitmask DstMask = getRegMask(MI.getOperand(0), MRI); - LaneBitmask UseMask; - for (MachineOperand &MO : MRI.use_operands(Reg)) - UseMask |= llvm::getRegMask(MO, MRI); - - const unsigned FullMask = DstMask.getAsInteger(); - unsigned Mask = UseMask.getAsInteger(); - if (Mask == FullMask) - return false; - // Split mask when there's gap. Then group mask to 2/4/8. - auto Pieces = Piece::split(std::bitset<32>(Mask)); - // Now only support 1 piece. - if (Pieces.size() != 1) - return false; - auto Piece = Pieces[0]; - if (Piece.Size > 8) - return false; - - // TODO: enable offset support when IsImm is true. - // Now if break different test when mul LaneSize or not mul for the offset. - if (IsImm && Piece.Offset != 0) - return false; - - const unsigned Num32BitLanes = - Piece.Size / getNumLanesIn32BitReg(Reg, SIRI, MRI); - - switch (Num32BitLanes) { - default: - return false; - case 1: - return reduceChannel(Piece.Offset, MI, - SIII->get(IsImm ? AMDGPU::S_BUFFER_LOAD_DWORD_IMM - : AMDGPU::S_BUFFER_LOAD_DWORD_SGPR), - MRI, SIRI, SIII, SlotIndexes); - case 2: - return reduceChannel(Piece.Offset, MI, - SIII->get(IsImm - ? 
AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM - : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR), - MRI, SIRI, SIII, SlotIndexes); - case 3: - if (FullMask == 0xff) - return false; - LLVM_FALLTHROUGH; - case 4: - return reduceChannel(Piece.Offset, MI, - SIII->get(IsImm - ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM - : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR), - MRI, SIRI, SIII, SlotIndexes); - case 5: - case 6: - case 7: - if (FullMask == 0xffff) - return false; - LLVM_FALLTHROUGH; - case 8: - return reduceChannel(Piece.Offset, MI, - SIII->get(IsImm - ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM - : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR), - MRI, SIRI, SIII, SlotIndexes); - } - - } break; - } - return false; -} - unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask, const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h index d9fa63ba2b5ee..14cd350398f4c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h @@ -21,6 +21,7 @@ namespace llvm { class LiveInterval; +class LiveIntervals; class SlotIndexes; class MachineRegisterInfo; class SIRegisterInfo; @@ -38,8 +39,7 @@ bool isSub0Sub1SingleDef(unsigned Reg, const llvm::MachineRegisterInfo &MRI); using LiveSet = llvm::DenseMap; void dumpLiveSet(const LiveSet &LiveSet, const llvm::SIRegisterInfo *SIRI); -bool isSccLiveAt(llvm::MachineBasicBlock *MBB, - llvm::MachineBasicBlock::iterator MI); +bool isSccLiveAt(const MachineInstr &MI, LiveIntervals *LIS); // An enum used to pass additional constraints to // `FindOrCreateInsertionPointForSccDef()`. This will further @@ -66,7 +66,7 @@ enum SccDefInsertPointConstraintFlags { llvm::MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef( llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator BeforeInst, const llvm::TargetRegisterInfo *TRI, const llvm::SIInstrInfo *TII, - llvm::MachineRegisterInfo *MRI, + llvm::MachineRegisterInfo *MRI, LiveIntervals *LIS, SccDefInsertPointConstraintFlags Constraints = SccDefInsertPointConstraintFlags::None); diff --git a/llvm/test/CodeGen/AMDGPU/remat/phi.mir b/llvm/test/CodeGen/AMDGPU/remat/phi.mir deleted file mode 100644 index 2d22e9fba2593..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/remat/phi.mir +++ /dev/null @@ -1,607 +0,0 @@ -# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -amdgpu-remat-enable-hot-block-remat-aggressive -run-pass=amdgpu-hot-block-remat -o - | FileCheck %s - -# This test simply checks that GCNDownwardRPTracker does not crash when PHIs are -# present. - -# CHECK: S_ENDPGM - ---- | - source_filename = ".\main.ll" - define amdgpu_ps void @main() #1 { - ret void - } - attributes #1 = { "target-cpu"="gfx1010" } - !llvm.ident = !{!0} - !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"} -... 
---- -name: main -tracksRegLiveness: true -liveins: - - { reg: '$sgpr0' } - - { reg: '$sgpr1' } - - { reg: '$sgpr2' } - - { reg: '$sgpr3' } - - { reg: '$sgpr4' } - - { reg: '$sgpr5' } - - { reg: '$sgpr6' } - - { reg: '$sgpr7' } - - { reg: '$sgpr8' } - - { reg: '$sgpr8' } - - { reg: '$vgpr0' } - - { reg: '$vgpr1' } -body: | - bb.0: - successors: %bb.1, %bb.2 - liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1 - - %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1 - ; undef %0.sub0:sgpr_64 = COPY $sgpr0 - ; undef %0.sub1:sgpr_64 = COPY $sgpr1 - - %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3 - ; undef %1.sub0:sgpr_128 = COPY $sgpr4 - ; undef %1.sub1:sgpr_128 = COPY $sgpr5 - ; undef %1.sub2:sgpr_128 = COPY $sgpr6 - ; undef %1.sub3:sgpr_128 = COPY $sgpr7 - - - %2000:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2001:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2002:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2003:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2004:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2005:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2006:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2007:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2008:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2009:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2010:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2011:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2012:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2013:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2014:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2015:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2016:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2017:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2018:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2019:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2020:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2021:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2022:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2023:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2024:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2025:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2026:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2027:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2028:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2029:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2030:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2031:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2032:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2033:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2034:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2035:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2036:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2037:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2038:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2039:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2040:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2041:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2042:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2043:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2044:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2045:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2046:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2047:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2048:vgpr_32 = 
V_MOV_B32_e32 $vgpr0, implicit $exec - %2049:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2050:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2051:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2052:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2053:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2054:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2055:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2056:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2057:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2058:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2059:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2060:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2061:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2062:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2063:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2064:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2065:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2066:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2067:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2068:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2069:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2070:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2071:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2072:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2073:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2074:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2075:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2076:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2077:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2078:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2079:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2080:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2081:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2082:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2083:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2084:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2085:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2086:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2087:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2088:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2089:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2090:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2091:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2092:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2093:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2094:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2095:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2096:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2097:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2098:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %2099:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec - %3000:sgpr_32 = S_MOV_B32 0 - %3001:sgpr_32 = S_MOV_B32 1 - %3002:sgpr_32 = S_MOV_B32 2 - %3003:sgpr_32 = S_MOV_B32 3 - %3004:sgpr_32 = S_MOV_B32 4 - %3005:sgpr_32 = S_MOV_B32 5 - %3006:sgpr_32 = S_MOV_B32 6 - %3007:sgpr_32 = S_MOV_B32 7 - %3008:sgpr_32 = S_MOV_B32 8 - %3009:sgpr_32 = S_MOV_B32 9 - %3010:sgpr_32 = S_MOV_B32 10 - %3011:sgpr_32 = S_MOV_B32 11 - %3012:sgpr_32 = S_MOV_B32 12 - %3013:sgpr_32 = S_MOV_B32 13 - %3014:sgpr_32 = S_MOV_B32 14 - %3015:sgpr_32 = S_MOV_B32 15 - %3016:sgpr_32 = S_MOV_B32 16 - %3017:sgpr_32 = S_MOV_B32 17 - %3018:sgpr_32 = S_MOV_B32 18 - %3019:sgpr_32 = S_MOV_B32 19 - %3020:sgpr_32 = S_MOV_B32 20 - %3021:sgpr_32 = S_MOV_B32 21 - %3022:sgpr_32 = S_MOV_B32 22 - 
%3023:sgpr_32 = S_MOV_B32 23 - %3024:sgpr_32 = S_MOV_B32 24 - %3025:sgpr_32 = S_MOV_B32 25 - %3026:sgpr_32 = S_MOV_B32 26 - %3027:sgpr_32 = S_MOV_B32 27 - %3028:sgpr_32 = S_MOV_B32 28 - %3029:sgpr_32 = S_MOV_B32 29 - %3030:sgpr_32 = S_MOV_B32 30 - %3031:sgpr_32 = S_MOV_B32 31 - %3032:sgpr_32 = S_MOV_B32 32 - %3033:sgpr_32 = S_MOV_B32 33 - %3034:sgpr_32 = S_MOV_B32 34 - %3035:sgpr_32 = S_MOV_B32 35 - %3036:sgpr_32 = S_MOV_B32 36 - %3037:sgpr_32 = S_MOV_B32 37 - %3038:sgpr_32 = S_MOV_B32 38 - %3039:sgpr_32 = S_MOV_B32 39 - %3040:sgpr_32 = S_MOV_B32 40 - %3041:sgpr_32 = S_MOV_B32 41 - %3042:sgpr_32 = S_MOV_B32 42 - %3043:sgpr_32 = S_MOV_B32 43 - %3044:sgpr_32 = S_MOV_B32 44 - %3045:sgpr_32 = S_MOV_B32 45 - %3046:sgpr_32 = S_MOV_B32 46 - %3047:sgpr_32 = S_MOV_B32 47 - %3048:sgpr_32 = S_MOV_B32 48 - %3049:sgpr_32 = S_MOV_B32 49 - %3050:sgpr_32 = S_MOV_B32 50 - %3051:sgpr_32 = S_MOV_B32 51 - %3052:sgpr_32 = S_MOV_B32 52 - %3053:sgpr_32 = S_MOV_B32 53 - %3054:sgpr_32 = S_MOV_B32 54 - %3055:sgpr_32 = S_MOV_B32 55 - %3056:sgpr_32 = S_MOV_B32 56 - %3057:sgpr_32 = S_MOV_B32 57 - %3058:sgpr_32 = S_MOV_B32 58 - %3059:sgpr_32 = S_MOV_B32 59 - %3060:sgpr_32 = S_MOV_B32 60 - %3061:sgpr_32 = S_MOV_B32 61 - %3062:sgpr_32 = S_MOV_B32 62 - %3063:sgpr_32 = S_MOV_B32 63 - %3064:sgpr_32 = S_MOV_B32 64 - %3065:sgpr_32 = S_MOV_B32 65 - %3066:sgpr_32 = S_MOV_B32 66 - %3067:sgpr_32 = S_MOV_B32 67 - %3068:sgpr_32 = S_MOV_B32 68 - %3069:sgpr_32 = S_MOV_B32 69 - %3070:sgpr_32 = S_MOV_B32 70 - %3071:sgpr_32 = S_MOV_B32 71 - %3072:sgpr_32 = S_MOV_B32 72 - %3073:sgpr_32 = S_MOV_B32 73 - %3074:sgpr_32 = S_MOV_B32 74 - %3075:sgpr_32 = S_MOV_B32 75 - %3076:sgpr_32 = S_MOV_B32 76 - %3077:sgpr_32 = S_MOV_B32 77 - %3078:sgpr_32 = S_MOV_B32 78 - %3079:sgpr_32 = S_MOV_B32 79 - %3080:sgpr_32 = S_MOV_B32 80 - %3081:sgpr_32 = S_MOV_B32 81 - %3082:sgpr_32 = S_MOV_B32 82 - %3083:sgpr_32 = S_MOV_B32 83 - %3084:sgpr_32 = S_MOV_B32 84 - %3085:sgpr_32 = S_MOV_B32 85 - %3086:sgpr_32 = S_MOV_B32 86 - %3087:sgpr_32 = S_MOV_B32 87 - %3088:sgpr_32 = S_MOV_B32 88 - %3089:sgpr_32 = S_MOV_B32 89 - %3090:sgpr_32 = S_MOV_B32 90 - %3091:sgpr_32 = S_MOV_B32 91 - %3092:sgpr_32 = S_MOV_B32 92 - %3093:sgpr_32 = S_MOV_B32 93 - %3094:sgpr_32 = S_MOV_B32 94 - %3095:sgpr_32 = S_MOV_B32 95 - %3096:sgpr_32 = S_MOV_B32 96 - %3097:sgpr_32 = S_MOV_B32 97 - %3098:sgpr_32 = S_MOV_B32 98 - %3099:sgpr_32 = S_MOV_B32 99 - - - %8000:vgpr_32 = IMPLICIT_DEF - %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode - $exec_lo = S_MOV_B32_term %116:sreg_32_xm0 - S_CBRANCH_EXECZ %bb.2, implicit $exec - S_BRANCH %bb.1 - - bb.1: - successors: %bb.2 - - %8001:vgpr_32 = COPY %8000 - %8002:vgpr_32 = COPY %8000 - %8003:vgpr_32 = COPY %8000 - %8004:vgpr_32 = COPY %8000 - %8005:vgpr_32 = COPY %8000 - %8006:vgpr_32 = COPY %8000 - %8007:vgpr_32 = COPY %8000 - %8008:vgpr_32 = COPY %8000 - %8009:vgpr_32 = COPY %8000 - %8010:vgpr_32 = COPY %8000 - %8011:vgpr_32 = COPY %8000 - %8012:vgpr_32 = COPY %8000 - %8013:vgpr_32 = COPY %8000 - %8014:vgpr_32 = COPY %8000 - %8015:vgpr_32 = COPY %8000 - %8016:vgpr_32 = COPY %8000 - %8017:vgpr_32 = COPY %8000 - - %9001:vgpr_32 = COPY %8001 - %9002:vgpr_32 = COPY %8002 - %9003:vgpr_32 = COPY %8003 - %9004:vgpr_32 = COPY %8004 - %9005:vgpr_32 = COPY %8005 - %9006:vgpr_32 = COPY %8006 - %9007:vgpr_32 = COPY %8007 - %9008:vgpr_32 = COPY %8008 - %9009:vgpr_32 = COPY %8009 - %9010:vgpr_32 = COPY %8010 - %9011:vgpr_32 = COPY %8011 - %9012:vgpr_32 = COPY %8012 - %9013:vgpr_32 = COPY %8013 
- %9014:vgpr_32 = COPY %8014 - %9015:vgpr_32 = COPY %8015 - %9016:vgpr_32 = COPY %8016 - %9017:vgpr_32 = COPY %8017 - - S_BRANCH %bb.2 - - bb.2: - %5000:sgpr_32 = PHI %3000, %bb.0, %8001, %bb.1 - %5001:sgpr_32 = PHI %3001, %bb.0, %8001, %bb.1 - %5002:sgpr_32 = PHI %3002, %bb.0, %8001, %bb.1 - %5003:sgpr_32 = PHI %3003, %bb.0, %8001, %bb.1 - %5004:sgpr_32 = PHI %3004, %bb.0, %8001, %bb.1 - %5005:sgpr_32 = PHI %3005, %bb.0, %8001, %bb.1 - %5006:sgpr_32 = PHI %3006, %bb.0, %8001, %bb.1 - %5007:sgpr_32 = PHI %3007, %bb.0, %8001, %bb.1 - %5008:sgpr_32 = PHI %3008, %bb.0, %8001, %bb.1 - %5009:sgpr_32 = PHI %3009, %bb.0, %8001, %bb.1 - %5010:sgpr_32 = PHI %3010, %bb.0, %8001, %bb.1 - %5011:sgpr_32 = PHI %3011, %bb.0, %8001, %bb.1 - %5012:sgpr_32 = PHI %3012, %bb.0, %8001, %bb.1 - %5013:sgpr_32 = PHI %3013, %bb.0, %8001, %bb.1 - %5014:sgpr_32 = PHI %3014, %bb.0, %8001, %bb.1 - %5015:sgpr_32 = PHI %3015, %bb.0, %8001, %bb.1 - %5016:sgpr_32 = PHI %3016, %bb.0, %8001, %bb.1 - %5017:sgpr_32 = PHI %3017, %bb.0, %8001, %bb.1 - %5018:sgpr_32 = PHI %3018, %bb.0, %8001, %bb.1 - %5019:sgpr_32 = PHI %3019, %bb.0, %8001, %bb.1 - %5020:sgpr_32 = PHI %3020, %bb.0, %8001, %bb.1 - %5021:sgpr_32 = PHI %3021, %bb.0, %8001, %bb.1 - %5022:sgpr_32 = PHI %3022, %bb.0, %8001, %bb.1 - %5023:sgpr_32 = PHI %3023, %bb.0, %8001, %bb.1 - %5024:sgpr_32 = PHI %3024, %bb.0, %8001, %bb.1 - %5025:sgpr_32 = PHI %3025, %bb.0, %8001, %bb.1 - %5026:sgpr_32 = PHI %3026, %bb.0, %8001, %bb.1 - %5027:sgpr_32 = PHI %3027, %bb.0, %8001, %bb.1 - %5028:sgpr_32 = PHI %3028, %bb.0, %8001, %bb.1 - %5029:sgpr_32 = PHI %3029, %bb.0, %8001, %bb.1 - %5030:sgpr_32 = PHI %3030, %bb.0, %8001, %bb.1 - %5031:sgpr_32 = PHI %3031, %bb.0, %8001, %bb.1 - %5032:sgpr_32 = PHI %3032, %bb.0, %8001, %bb.1 - %5033:sgpr_32 = PHI %3033, %bb.0, %8001, %bb.1 - %5034:sgpr_32 = PHI %3034, %bb.0, %8001, %bb.1 - %5035:sgpr_32 = PHI %3035, %bb.0, %8001, %bb.1 - %5036:sgpr_32 = PHI %3036, %bb.0, %8001, %bb.1 - %5037:sgpr_32 = PHI %3037, %bb.0, %8001, %bb.1 - %5038:sgpr_32 = PHI %3038, %bb.0, %8001, %bb.1 - %5039:sgpr_32 = PHI %3039, %bb.0, %8001, %bb.1 - %5040:sgpr_32 = PHI %3040, %bb.0, %8001, %bb.1 - %5041:sgpr_32 = PHI %3041, %bb.0, %8001, %bb.1 - %5042:sgpr_32 = PHI %3042, %bb.0, %8001, %bb.1 - %5043:sgpr_32 = PHI %3043, %bb.0, %8001, %bb.1 - %5044:sgpr_32 = PHI %3044, %bb.0, %8001, %bb.1 - %5045:sgpr_32 = PHI %3045, %bb.0, %8001, %bb.1 - %5046:sgpr_32 = PHI %3046, %bb.0, %8001, %bb.1 - %5047:sgpr_32 = PHI %3047, %bb.0, %8001, %bb.1 - %5048:sgpr_32 = PHI %3048, %bb.0, %8001, %bb.1 - %5049:sgpr_32 = PHI %3049, %bb.0, %8001, %bb.1 - %5050:sgpr_32 = PHI %3050, %bb.0, %8001, %bb.1 - %5051:sgpr_32 = PHI %3051, %bb.0, %8001, %bb.1 - %5052:sgpr_32 = PHI %3052, %bb.0, %8001, %bb.1 - %5053:sgpr_32 = PHI %3053, %bb.0, %8001, %bb.1 - %5054:sgpr_32 = PHI %3054, %bb.0, %8001, %bb.1 - %5055:sgpr_32 = PHI %3055, %bb.0, %8001, %bb.1 - %5056:sgpr_32 = PHI %3056, %bb.0, %8001, %bb.1 - %5057:sgpr_32 = PHI %3057, %bb.0, %8001, %bb.1 - %5058:sgpr_32 = PHI %3058, %bb.0, %8001, %bb.1 - %5059:sgpr_32 = PHI %3059, %bb.0, %8001, %bb.1 - %5060:sgpr_32 = PHI %3060, %bb.0, %8001, %bb.1 - %5061:sgpr_32 = PHI %3061, %bb.0, %8001, %bb.1 - %5062:sgpr_32 = PHI %3062, %bb.0, %8001, %bb.1 - %5063:sgpr_32 = PHI %3063, %bb.0, %8001, %bb.1 - %5064:sgpr_32 = PHI %3064, %bb.0, %8001, %bb.1 - %5065:sgpr_32 = PHI %3065, %bb.0, %8001, %bb.1 - %5066:sgpr_32 = PHI %3066, %bb.0, %8001, %bb.1 - %5067:sgpr_32 = PHI %3067, %bb.0, %8001, %bb.1 - %5068:sgpr_32 = PHI %3068, %bb.0, %8001, %bb.1 - %5069:sgpr_32 = PHI %3069, 
%bb.0, %8001, %bb.1 - %5070:sgpr_32 = PHI %3070, %bb.0, %8001, %bb.1 - %5071:sgpr_32 = PHI %3071, %bb.0, %8001, %bb.1 - %5072:sgpr_32 = PHI %3072, %bb.0, %8001, %bb.1 - %5073:sgpr_32 = PHI %3073, %bb.0, %8001, %bb.1 - %5074:sgpr_32 = PHI %3074, %bb.0, %8001, %bb.1 - %5075:sgpr_32 = PHI %3075, %bb.0, %8001, %bb.1 - %5076:sgpr_32 = PHI %3076, %bb.0, %8001, %bb.1 - %5077:sgpr_32 = PHI %3077, %bb.0, %8001, %bb.1 - %5078:sgpr_32 = PHI %3078, %bb.0, %8001, %bb.1 - %5079:sgpr_32 = PHI %3079, %bb.0, %8001, %bb.1 - %5080:sgpr_32 = PHI %3080, %bb.0, %8001, %bb.1 - %5081:sgpr_32 = PHI %3081, %bb.0, %8001, %bb.1 - %5082:sgpr_32 = PHI %3082, %bb.0, %8001, %bb.1 - %5083:sgpr_32 = PHI %3083, %bb.0, %8001, %bb.1 - %5084:sgpr_32 = PHI %3084, %bb.0, %8001, %bb.1 - %5085:sgpr_32 = PHI %3085, %bb.0, %8001, %bb.1 - %5086:sgpr_32 = PHI %3086, %bb.0, %8001, %bb.1 - %5087:sgpr_32 = PHI %3087, %bb.0, %8001, %bb.1 - %5088:sgpr_32 = PHI %3088, %bb.0, %8001, %bb.1 - %5089:sgpr_32 = PHI %3089, %bb.0, %8001, %bb.1 - %5090:sgpr_32 = PHI %3090, %bb.0, %8001, %bb.1 - %5091:sgpr_32 = PHI %3091, %bb.0, %8001, %bb.1 - %5092:sgpr_32 = PHI %3092, %bb.0, %8001, %bb.1 - %5093:sgpr_32 = PHI %3093, %bb.0, %8001, %bb.1 - %5094:sgpr_32 = PHI %3094, %bb.0, %8001, %bb.1 - %5095:sgpr_32 = PHI %3095, %bb.0, %8001, %bb.1 - %5096:sgpr_32 = PHI %3096, %bb.0, %8001, %bb.1 - %5097:sgpr_32 = PHI %3097, %bb.0, %8001, %bb.1 - %5098:sgpr_32 = PHI %3098, %bb.0, %8001, %bb.1 - %5099:sgpr_32 = PHI %3099, %bb.0, %8001, %bb.1 - - - %3:vgpr_32 = IMPLICIT_DEF - - %6000:vgpr_32 = V_MOV_B32_e32 %5000, implicit $exec - %6001:vgpr_32 = V_MOV_B32_e32 %5001, implicit $exec - %6002:vgpr_32 = V_MOV_B32_e32 %5002, implicit $exec - %6003:vgpr_32 = V_MOV_B32_e32 %5003, implicit $exec - %6004:vgpr_32 = V_MOV_B32_e32 %5004, implicit $exec - %6005:vgpr_32 = V_MOV_B32_e32 %5005, implicit $exec - %6006:vgpr_32 = V_MOV_B32_e32 %5006, implicit $exec - %6007:vgpr_32 = V_MOV_B32_e32 %5007, implicit $exec - %6008:vgpr_32 = V_MOV_B32_e32 %5008, implicit $exec - %6009:vgpr_32 = V_MOV_B32_e32 %5009, implicit $exec - %6010:vgpr_32 = V_MOV_B32_e32 %5010, implicit $exec - %6011:vgpr_32 = V_MOV_B32_e32 %5011, implicit $exec - %6012:vgpr_32 = V_MOV_B32_e32 %5012, implicit $exec - %6013:vgpr_32 = V_MOV_B32_e32 %5013, implicit $exec - %6014:vgpr_32 = V_MOV_B32_e32 %5014, implicit $exec - %6015:vgpr_32 = V_MOV_B32_e32 %5015, implicit $exec - %6016:vgpr_32 = V_MOV_B32_e32 %5016, implicit $exec - %6017:vgpr_32 = V_MOV_B32_e32 %5017, implicit $exec - %6018:vgpr_32 = V_MOV_B32_e32 %5018, implicit $exec - %6019:vgpr_32 = V_MOV_B32_e32 %5019, implicit $exec - %6020:vgpr_32 = V_MOV_B32_e32 %5020, implicit $exec - %6021:vgpr_32 = V_MOV_B32_e32 %5021, implicit $exec - %6022:vgpr_32 = V_MOV_B32_e32 %5022, implicit $exec - %6023:vgpr_32 = V_MOV_B32_e32 %5023, implicit $exec - %6024:vgpr_32 = V_MOV_B32_e32 %5024, implicit $exec - %6025:vgpr_32 = V_MOV_B32_e32 %5025, implicit $exec - %6026:vgpr_32 = V_MOV_B32_e32 %5026, implicit $exec - %6027:vgpr_32 = V_MOV_B32_e32 %5027, implicit $exec - %6028:vgpr_32 = V_MOV_B32_e32 %5028, implicit $exec - %6029:vgpr_32 = V_MOV_B32_e32 %5029, implicit $exec - %6030:vgpr_32 = V_MOV_B32_e32 %5030, implicit $exec - %6031:vgpr_32 = V_MOV_B32_e32 %5031, implicit $exec - %6032:vgpr_32 = V_MOV_B32_e32 %5032, implicit $exec - %6033:vgpr_32 = V_MOV_B32_e32 %5033, implicit $exec - %6034:vgpr_32 = V_MOV_B32_e32 %5034, implicit $exec - %6035:vgpr_32 = V_MOV_B32_e32 %5035, implicit $exec - %6036:vgpr_32 = V_MOV_B32_e32 %5036, implicit $exec - %6037:vgpr_32 = V_MOV_B32_e32 
%5037, implicit $exec - %6038:vgpr_32 = V_MOV_B32_e32 %5038, implicit $exec - %6039:vgpr_32 = V_MOV_B32_e32 %5039, implicit $exec - %6040:vgpr_32 = V_MOV_B32_e32 %5040, implicit $exec - %6041:vgpr_32 = V_MOV_B32_e32 %5041, implicit $exec - %6042:vgpr_32 = V_MOV_B32_e32 %5042, implicit $exec - %6043:vgpr_32 = V_MOV_B32_e32 %5043, implicit $exec - %6044:vgpr_32 = V_MOV_B32_e32 %5044, implicit $exec - %6045:vgpr_32 = V_MOV_B32_e32 %5045, implicit $exec - %6046:vgpr_32 = V_MOV_B32_e32 %5046, implicit $exec - %6047:vgpr_32 = V_MOV_B32_e32 %5047, implicit $exec - %6048:vgpr_32 = V_MOV_B32_e32 %5048, implicit $exec - %6049:vgpr_32 = V_MOV_B32_e32 %5049, implicit $exec - %6050:vgpr_32 = V_MOV_B32_e32 %5050, implicit $exec - %6051:vgpr_32 = V_MOV_B32_e32 %5051, implicit $exec - %6052:vgpr_32 = V_MOV_B32_e32 %5052, implicit $exec - %6053:vgpr_32 = V_MOV_B32_e32 %5053, implicit $exec - %6054:vgpr_32 = V_MOV_B32_e32 %5054, implicit $exec - %6055:vgpr_32 = V_MOV_B32_e32 %5055, implicit $exec - %6056:vgpr_32 = V_MOV_B32_e32 %5056, implicit $exec - %6057:vgpr_32 = V_MOV_B32_e32 %5057, implicit $exec - %6058:vgpr_32 = V_MOV_B32_e32 %5058, implicit $exec - %6059:vgpr_32 = V_MOV_B32_e32 %5059, implicit $exec - %6060:vgpr_32 = V_MOV_B32_e32 %5060, implicit $exec - %6061:vgpr_32 = V_MOV_B32_e32 %5061, implicit $exec - %6062:vgpr_32 = V_MOV_B32_e32 %5062, implicit $exec - %6063:vgpr_32 = V_MOV_B32_e32 %5063, implicit $exec - %6064:vgpr_32 = V_MOV_B32_e32 %5064, implicit $exec - %6065:vgpr_32 = V_MOV_B32_e32 %5065, implicit $exec - %6066:vgpr_32 = V_MOV_B32_e32 %5066, implicit $exec - %6067:vgpr_32 = V_MOV_B32_e32 %5067, implicit $exec - %6068:vgpr_32 = V_MOV_B32_e32 %5068, implicit $exec - %6069:vgpr_32 = V_MOV_B32_e32 %5069, implicit $exec - %6070:vgpr_32 = V_MOV_B32_e32 %5070, implicit $exec - %6071:vgpr_32 = V_MOV_B32_e32 %5071, implicit $exec - %6072:vgpr_32 = V_MOV_B32_e32 %5072, implicit $exec - %6073:vgpr_32 = V_MOV_B32_e32 %5073, implicit $exec - %6074:vgpr_32 = V_MOV_B32_e32 %5074, implicit $exec - %6075:vgpr_32 = V_MOV_B32_e32 %5075, implicit $exec - %6076:vgpr_32 = V_MOV_B32_e32 %5076, implicit $exec - %6077:vgpr_32 = V_MOV_B32_e32 %5077, implicit $exec - %6078:vgpr_32 = V_MOV_B32_e32 %5078, implicit $exec - %6079:vgpr_32 = V_MOV_B32_e32 %5079, implicit $exec - %6080:vgpr_32 = V_MOV_B32_e32 %5080, implicit $exec - %6081:vgpr_32 = V_MOV_B32_e32 %5081, implicit $exec - %6082:vgpr_32 = V_MOV_B32_e32 %5082, implicit $exec - %6083:vgpr_32 = V_MOV_B32_e32 %5083, implicit $exec - %6084:vgpr_32 = V_MOV_B32_e32 %5084, implicit $exec - %6085:vgpr_32 = V_MOV_B32_e32 %5085, implicit $exec - %6086:vgpr_32 = V_MOV_B32_e32 %5086, implicit $exec - %6087:vgpr_32 = V_MOV_B32_e32 %5087, implicit $exec - %6088:vgpr_32 = V_MOV_B32_e32 %5088, implicit $exec - %6089:vgpr_32 = V_MOV_B32_e32 %5089, implicit $exec - %6090:vgpr_32 = V_MOV_B32_e32 %5090, implicit $exec - %6091:vgpr_32 = V_MOV_B32_e32 %5091, implicit $exec - %6092:vgpr_32 = V_MOV_B32_e32 %5092, implicit $exec - %6093:vgpr_32 = V_MOV_B32_e32 %5093, implicit $exec - %6094:vgpr_32 = V_MOV_B32_e32 %5094, implicit $exec - %6095:vgpr_32 = V_MOV_B32_e32 %5095, implicit $exec - %6096:vgpr_32 = V_MOV_B32_e32 %5096, implicit $exec - %6097:vgpr_32 = V_MOV_B32_e32 %5097, implicit $exec - %6098:vgpr_32 = V_MOV_B32_e32 %5098, implicit $exec - %6099:vgpr_32 = V_MOV_B32_e32 %5099, implicit $exec - EXP 0, %6000, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6001, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6002, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6003, %3, 
%3, %3, -1, -1, 15, implicit $exec - EXP 0, %6004, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6005, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6006, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6007, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6008, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6009, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6010, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6011, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6012, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6013, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6014, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6015, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6016, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6017, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6018, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6019, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6020, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6021, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6022, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6023, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6024, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6025, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6026, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6027, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6028, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6029, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6030, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6031, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6032, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6033, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6034, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6035, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6036, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6037, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6038, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6039, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6040, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6041, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6042, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6043, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6044, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6045, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6046, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6047, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6048, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6049, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6050, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6051, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6052, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6053, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6054, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6055, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6056, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6057, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6058, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6059, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6060, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6061, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6062, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6063, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6064, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6065, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6066, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6067, %3, %3, %3, -1, -1, 15, implicit $exec 
- EXP 0, %6068, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6069, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6070, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6071, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6072, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6073, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6074, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6075, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6076, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6077, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6078, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6079, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6080, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6081, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6082, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6083, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6084, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6085, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6086, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6087, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6088, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6089, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6090, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6091, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6092, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6093, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6094, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6095, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6096, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6097, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6098, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, %6099, %3, %3, %3, -1, -1, 15, implicit $exec - - - S_ENDPGM 0 -... - diff --git a/llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir b/llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir deleted file mode 100644 index 02a9836313360..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir +++ /dev/null @@ -1,565 +0,0 @@ -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-remat-enable-hot-block-remat-aggressive -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat | FileCheck %s - -# Check that the buffer loads have been moved to the use and the lanes are reduced -# correctly. 
-# -# CHECK: bb.2: -#========================================================================== -# X4_IMM, Using .x -# CHECK: %[[#reg0:]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %{{.+}}, 0, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg0]], %{{.+}}, 0, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg0]], %{{.+}}, 4, 0 -# X4_IMM, Using .xy -# CHECK: %[[#reg1:]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM %{{.+}}, 16, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg1]].sub0, %{{.+}}, 16, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg1]].sub1, %{{.+}}, 20, 0 -# X4_IMM, Using .xyz -# CHECK: %[[#reg2:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 32, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg2]].sub0, %{{.+}}, 32, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg2]].sub1, %{{.+}}, 36, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg2]].sub2, %{{.+}}, 40, 0 -# X4_IMM, Using .yz -# CHECK: %[[#reg3:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 48, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg3]].sub1, %{{.+}}, 48, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg3]].sub2, %{{.+}}, 52, 0 -# X4_IMM, Using .yzw -# CHECK: %[[#reg4:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 64, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg4]].sub1, %{{.+}}, 64, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg4]].sub2, %{{.+}}, 68, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg4]].sub3, %{{.+}}, 72, 0 -#========================================================================== -# X8_IMM, Using .x -# CHECK: %[[#reg5:]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %{{.+}}, 80, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg5]], %{{.+}}, 80, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg5]], %{{.+}}, 84, 0 -# X8_IMM, Using .xy -# CHECK: %[[#reg6:]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM %{{.+}}, 96, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg6]].sub0, %{{.+}}, 96, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg6]].sub1, %{{.+}}, 100, 0 -# X8_IMM, Using .xyz -# CHECK: %[[#reg7:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 112, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg7]].sub0, %{{.+}}, 112, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg7]].sub1, %{{.+}}, 116, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg7]].sub2, %{{.+}}, 120, 0 -# X8_IMM, Using .xyzw -# CHECK: %[[#reg8:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 128, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub0, %{{.+}}, 128, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub1, %{{.+}}, 132, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub2, %{{.+}}, 136, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub3, %{{.+}}, 140, 0 -# X8_IMM, Using .xyzw + 5th dword -# CHECK: %[[#reg9:]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %{{.+}}, 144, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub0, %{{.+}}, 144, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub1, %{{.+}}, 148, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub2, %{{.+}}, 152, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub3, %{{.+}}, 156, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub4, %{{.+}}, 160, 0 -#========================================================================== -# X16_IMM, Using .xy and .zw -# CHECK: %[[#reg10:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 160, 0 -# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg10]].sub0_sub1, %{{.+}}, 160, 0 -# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg10]].sub2_sub3, %{{.+}}, 164, 0 -#========================================================================== -# X4_SGPR, Using .x -# CHECK: %[[#reg11:]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %{{.+}}, %{{.+}}, 0 
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg11]], %{{.+}}, 176, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg11]], %{{.+}}, 180, 0 -# X8_SGPR, Using .xy -# CHECK: %[[#reg12:]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_SGPR %{{.+}}, %{{.+}}, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg12]].sub0, %{{.+}}, 192, 0 -# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg12]].sub1, %{{.+}}, 196, 0 -# X16_SGPR, Using .xy + .zw -# CHECK: %[[#reg13:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR %{{.+}}, %{{.+}}, 0 -# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg13]].sub0_sub1, %{{.+}}, 208, 0 -# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg13]].sub2_sub3, %{{.+}}, 216, 0 -#========================================================================== -# -# -# CHECK: %[[#reg14:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 224, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg14]], %{{.+}}, 224, 0 -# CHECK: %[[#reg15:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 240, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg15]], %{{.+}}, 240, 0 -# CHECK: %[[#reg16:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 256, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg16]], %{{.+}}, 256, 0 -# CHECK: %[[#reg17:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 272, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg17]], %{{.+}}, 272, 0 -# CHECK: %[[#reg18:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 288, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg18]], %{{.+}}, 288, 0 -# CHECK: %[[#reg19:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 304, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg19]], %{{.+}}, 304, 0 -# CHECK: %[[#reg20:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 320, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg20]], %{{.+}}, 320, 0 -# CHECK: %[[#reg21:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 336, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg21]], %{{.+}}, 336, 0 -# CHECK: %[[#reg22:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 352, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg22]], %{{.+}}, 352, 0 -# CHECK: %[[#reg23:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 368, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg23]], %{{.+}}, 368, 0 -# CHECK: %[[#reg24:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 384, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg24]], %{{.+}}, 384, 0 -# CHECK: %[[#reg25:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 400, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg25]], %{{.+}}, 400, 0 -# CHECK: %[[#reg26:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 416, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg26]], %{{.+}}, 416, 0 -# CHECK: %[[#reg27:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 432, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg27]], %{{.+}}, 432, 0 -# CHECK: %[[#reg28:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 448, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg28]], %{{.+}}, 448, 0 -# CHECK: %[[#reg29:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 464, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg29]], %{{.+}}, 464, 0 -# CHECK: %[[#reg30:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 480, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg30]], %{{.+}}, 480, 0 -# CHECK: %[[#reg31:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 496, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg31]], %{{.+}}, 496, 0 -# CHECK: %[[#reg32:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 512, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg32]], %{{.+}}, 512, 0 -# CHECK: %[[#reg33:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 528, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg33]], %{{.+}}, 
528, 0 -# CHECK: %[[#reg34:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 544, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg34]], %{{.+}}, 544, 0 -# CHECK: %[[#reg35:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 560, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg35]], %{{.+}}, 560, 0 -# CHECK: %[[#reg36:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 576, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg36]], %{{.+}}, 576, 0 -# CHECK: %[[#reg37:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 592, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg37]], %{{.+}}, 592, 0 -# CHECK: %[[#reg38:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 608, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg38]], %{{.+}}, 608, 0 -# CHECK: %[[#reg39:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 624, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg39]], %{{.+}}, 624, 0 -# CHECK: %[[#reg40:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 640, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg40]], %{{.+}}, 640, 0 -# CHECK: %[[#reg41:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 656, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg41]], %{{.+}}, 656, 0 -# CHECK: %[[#reg42:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 672, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg42]], %{{.+}}, 672, 0 -# CHECK: %[[#reg43:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 688, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg43]], %{{.+}}, 688, 0 -# CHECK: %[[#reg44:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 704, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg44]], %{{.+}}, 704, 0 -# CHECK: %[[#reg45:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 720, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg45]], %{{.+}}, 720, 0 -# CHECK: %[[#reg46:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 736, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg46]], %{{.+}}, 736, 0 -# CHECK: %[[#reg47:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 752, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg47]], %{{.+}}, 752, 0 -# CHECK: %[[#reg48:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 768, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg48]], %{{.+}}, 768, 0 -# CHECK: %[[#reg49:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 784, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg49]], %{{.+}}, 784, 0 -# CHECK: %[[#reg50:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 800, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg50]], %{{.+}}, 800, 0 -# CHECK: %[[#reg51:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 816, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg51]], %{{.+}}, 816, 0 -# CHECK: %[[#reg52:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 832, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg52]], %{{.+}}, 832, 0 -# CHECK: %[[#reg53:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 848, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg53]], %{{.+}}, 848, 0 -# CHECK: %[[#reg54:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 864, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg54]], %{{.+}}, 864, 0 -# CHECK: %[[#reg55:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 880, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg55]], %{{.+}}, 880, 0 -# CHECK: %[[#reg56:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 896, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg56]], %{{.+}}, 896, 0 -# CHECK: %[[#reg57:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 912, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg57]], %{{.+}}, 912, 0 -# CHECK: %[[#reg58:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 928, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg58]], %{{.+}}, 928, 
0 -# CHECK: %[[#reg59:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 944, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg59]], %{{.+}}, 944, 0 -# CHECK: %[[#reg60:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 960, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg60]], %{{.+}}, 960, 0 -# CHECK: %[[#reg61:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 976, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg61]], %{{.+}}, 976, 0 -# CHECK: %[[#reg62:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 992, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg62]], %{{.+}}, 992, 0 -# CHECK: %[[#reg63:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 1008, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg63]], %{{.+}}, 1008, 0 - - ---- | - source_filename = ".\main.ll" - define amdgpu_ps void @main() #1 { - ret void - } - attributes #1 = { "target-cpu"="gfx1010" } - !llvm.ident = !{!0} - !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"} -... ---- -name: main -tracksRegLiveness: true -liveins: - - { reg: '$sgpr0' } - - { reg: '$sgpr1' } - - { reg: '$sgpr2' } - - { reg: '$sgpr3' } - - { reg: '$sgpr4' } - - { reg: '$sgpr5' } - - { reg: '$sgpr6' } - - { reg: '$sgpr7' } - - { reg: '$sgpr8' } - - { reg: '$sgpr8' } - - { reg: '$vgpr0' } - - { reg: '$vgpr1' } -body: | - bb.0: - successors: %bb.1, %bb.2 - liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $vgpr0, $vgpr1 - - %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1 - %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3 - %2:sgpr_128 = REG_SEQUENCE $sgpr8, %subreg.sub0, $sgpr9, %subreg.sub1, $sgpr10, %subreg.sub2, $sgpr11, %subreg.sub3 - - ; X4_IMM - %3000:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 0, 0 - %3001:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 16, 0 - %3002:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 32, 0 - %3003:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 48, 0 - %3004:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 64, 0 - - ; X8_IMM - %3005:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 80, 0 - %3006:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 96, 0 - %3007:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 112, 0 - %3008:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 128, 0 - %3009:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 144, 0 - - ; X16_IMM - %30010:sgpr_512 = S_BUFFER_LOAD_DWORDX16_IMM %2:sgpr_128, 160, 0 - - ; X4_SGPR - %50:sgpr_32 = COPY $sgpr0 - %30011:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR %2:sgpr_128, %50, 0 - - ; X8_SGPR - %51:sgpr_32 = COPY $sgpr1 - %30012:sgpr_256 = S_BUFFER_LOAD_DWORDX8_SGPR %2:sgpr_128, %51, 0 - - ; X16_SGPR - %52:sgpr_32 = COPY $sgpr2 - %30013:sgpr_512 = S_BUFFER_LOAD_DWORDX16_SGPR %2:sgpr_128, %52, 0 - - %30014:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 224, 0 - %30015:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 240, 0 - %30016:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 256, 0 - %30017:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 272, 0 - %30018:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 288, 0 - %30019:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 304, 0 - %30020:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 320, 0 - %30021:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 336, 0 - %30022:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 352, 0 - %30023:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 368, 0 - %30024:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 384, 0 - %30025:sgpr_128 = 
S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 400, 0 - %30026:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 416, 0 - %30027:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 432, 0 - %30028:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 448, 0 - %30029:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 464, 0 - %30030:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 480, 0 - %30031:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 496, 0 - %30032:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 512, 0 - %30033:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 528, 0 - %30034:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 544, 0 - %30035:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 560, 0 - %30036:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 576, 0 - %30037:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 592, 0 - %30038:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 608, 0 - %30039:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 624, 0 - %30040:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 640, 0 - %30041:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 656, 0 - %30042:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 672, 0 - %30043:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 688, 0 - %30044:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 704, 0 - %30045:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 720, 0 - %30046:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 736, 0 - %30047:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 752, 0 - %30048:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 768, 0 - %30049:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 784, 0 - %30050:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 800, 0 - %30051:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 816, 0 - %30052:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 832, 0 - %30053:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 848, 0 - %30054:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 864, 0 - %30055:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 880, 0 - %30056:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 896, 0 - %30057:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 912, 0 - %30058:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 928, 0 - %30059:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 944, 0 - %30060:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 960, 0 - %30061:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 976, 0 - %30062:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 992, 0 - %30063:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 1008, 0 - - %100:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %101:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %102:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %103:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %104:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %105:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %106:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %107:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %108:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %109:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1010:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1011:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1012:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1013:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1014:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1015:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1016:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1017:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1018:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1019:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - 
%1020:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1021:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1022:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1023:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1024:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1025:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1026:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1027:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1028:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1029:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1030:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1031:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1032:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1033:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1034:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1035:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1036:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1037:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1038:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1039:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1040:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1041:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1042:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1043:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1044:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1045:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1046:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1047:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1048:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1049:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1050:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1051:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1052:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1053:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1054:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1055:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1056:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1057:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1058:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1059:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1060:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1061:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1062:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1063:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - - - %8000:vgpr_32 = IMPLICIT_DEF - %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode - $exec_lo = S_MOV_B32_term %116:sreg_32_xm0 - S_CBRANCH_EXECZ %bb.2, implicit $exec - S_BRANCH %bb.1 - - bb.1: - successors: %bb.2 - %8001:vgpr_32 = COPY %8000 - S_BRANCH %bb.2 - - bb.2: - - %3:vgpr_32 = IMPLICIT_DEF - ;========================================================================== - ; X4_IMM, Using .x - S_BUFFER_STORE_DWORD_IMM %3000.sub0, %1:sgpr_128, 0, 0 - S_BUFFER_STORE_DWORD_IMM %3000.sub0, %1:sgpr_128, 4, 0 ; Do it a second time, since the lane reduction triggers on clone, and clone only happens when there are multiple uses. 
- - ; X4_IMM, Using .xy - S_BUFFER_STORE_DWORD_IMM %3001.sub0, %1:sgpr_128, 16, 0 - S_BUFFER_STORE_DWORD_IMM %3001.sub1, %1:sgpr_128, 20, 0 - - ; X4_IMM, Using .xyz - S_BUFFER_STORE_DWORD_IMM %3002.sub0, %1:sgpr_128, 32, 0 - S_BUFFER_STORE_DWORD_IMM %3002.sub1, %1:sgpr_128, 36, 0 - S_BUFFER_STORE_DWORD_IMM %3002.sub2, %1:sgpr_128, 40, 0 - - ; X4_IMM, Using .yz - S_BUFFER_STORE_DWORD_IMM %3003.sub1, %1:sgpr_128, 48, 0 - S_BUFFER_STORE_DWORD_IMM %3003.sub2, %1:sgpr_128, 52, 0 - - ; X4_IMM, Using .yzw - S_BUFFER_STORE_DWORD_IMM %3004.sub1, %1:sgpr_128, 64, 0 - S_BUFFER_STORE_DWORD_IMM %3004.sub2, %1:sgpr_128, 68, 0 - S_BUFFER_STORE_DWORD_IMM %3004.sub3, %1:sgpr_128, 72, 0 - - ;========================================================================== - ; X8_IMM, Using .x - S_BUFFER_STORE_DWORD_IMM %3005.sub0, %1:sgpr_128, 80, 0 - S_BUFFER_STORE_DWORD_IMM %3005.sub0, %1:sgpr_128, 84, 0 - - ; X8_IMM, Using .xy - S_BUFFER_STORE_DWORD_IMM %3006.sub0, %1:sgpr_128, 96, 0 - S_BUFFER_STORE_DWORD_IMM %3006.sub1, %1:sgpr_128, 100, 0 - - ; X8_IMM, Using .xyz - S_BUFFER_STORE_DWORD_IMM %3007.sub0, %1:sgpr_128, 112, 0 - S_BUFFER_STORE_DWORD_IMM %3007.sub1, %1:sgpr_128, 116, 0 - S_BUFFER_STORE_DWORD_IMM %3007.sub2, %1:sgpr_128, 120, 0 - - ; X8_IMM, Using .xyzw - S_BUFFER_STORE_DWORD_IMM %3008.sub0, %1:sgpr_128, 128, 0 - S_BUFFER_STORE_DWORD_IMM %3008.sub1, %1:sgpr_128, 132, 0 - S_BUFFER_STORE_DWORD_IMM %3008.sub2, %1:sgpr_128, 136, 0 - S_BUFFER_STORE_DWORD_IMM %3008.sub3, %1:sgpr_128, 140, 0 - - ; X8_IMM, Using .xyzw + 5th dword - S_BUFFER_STORE_DWORD_IMM %3009.sub0, %1:sgpr_128, 144, 0 - S_BUFFER_STORE_DWORD_IMM %3009.sub1, %1:sgpr_128, 148, 0 - S_BUFFER_STORE_DWORD_IMM %3009.sub2, %1:sgpr_128, 152, 0 - S_BUFFER_STORE_DWORD_IMM %3009.sub3, %1:sgpr_128, 156, 0 - S_BUFFER_STORE_DWORD_IMM %3009.sub4, %1:sgpr_128, 160, 0 - - ;========================================================================== - ; X16_IMM, Using .xy and .zw - S_BUFFER_STORE_DWORDX2_IMM %30010.sub0_sub1, %1:sgpr_128, 160, 0 - S_BUFFER_STORE_DWORDX2_IMM %30010.sub2_sub3, %1:sgpr_128, 164, 0 - - ;========================================================================== - ; X4_SGPR, Using .x - S_BUFFER_STORE_DWORD_IMM %30011.sub0, %1:sgpr_128, 176, 0 - S_BUFFER_STORE_DWORD_IMM %30011.sub0, %1:sgpr_128, 180, 0 - - ; X8_SGPR, Using .xy - S_BUFFER_STORE_DWORD_IMM %30012.sub0, %1:sgpr_128, 192, 0 - S_BUFFER_STORE_DWORD_IMM %30012.sub1, %1:sgpr_128, 196, 0 - - ; X16_SGPR, Using .xy + .zw - S_BUFFER_STORE_DWORDX2_IMM %30013.sub0_sub1, %1:sgpr_128, 208, 0 - S_BUFFER_STORE_DWORDX2_IMM %30013.sub2_sub3, %1:sgpr_128, 216, 0 - - ;========================================================================== - S_BUFFER_STORE_DWORDX4_IMM killed %30014:sgpr_128, %1:sgpr_128, 224, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30015:sgpr_128, %1:sgpr_128, 240, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30016:sgpr_128, %1:sgpr_128, 256, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30017:sgpr_128, %1:sgpr_128, 272, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30018:sgpr_128, %1:sgpr_128, 288, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30019:sgpr_128, %1:sgpr_128, 304, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30020:sgpr_128, %1:sgpr_128, 320, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30021:sgpr_128, %1:sgpr_128, 336, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30022:sgpr_128, %1:sgpr_128, 352, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30023:sgpr_128, %1:sgpr_128, 368, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30024:sgpr_128, %1:sgpr_128, 384, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30025:sgpr_128, 
%1:sgpr_128, 400, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30026:sgpr_128, %1:sgpr_128, 416, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30027:sgpr_128, %1:sgpr_128, 432, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30028:sgpr_128, %1:sgpr_128, 448, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30029:sgpr_128, %1:sgpr_128, 464, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30030:sgpr_128, %1:sgpr_128, 480, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30031:sgpr_128, %1:sgpr_128, 496, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30032:sgpr_128, %1:sgpr_128, 512, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30033:sgpr_128, %1:sgpr_128, 528, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30034:sgpr_128, %1:sgpr_128, 544, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30035:sgpr_128, %1:sgpr_128, 560, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30036:sgpr_128, %1:sgpr_128, 576, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30037:sgpr_128, %1:sgpr_128, 592, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30038:sgpr_128, %1:sgpr_128, 608, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30039:sgpr_128, %1:sgpr_128, 624, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30040:sgpr_128, %1:sgpr_128, 640, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30041:sgpr_128, %1:sgpr_128, 656, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30042:sgpr_128, %1:sgpr_128, 672, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30043:sgpr_128, %1:sgpr_128, 688, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30044:sgpr_128, %1:sgpr_128, 704, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30045:sgpr_128, %1:sgpr_128, 720, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30046:sgpr_128, %1:sgpr_128, 736, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30047:sgpr_128, %1:sgpr_128, 752, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30048:sgpr_128, %1:sgpr_128, 768, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30049:sgpr_128, %1:sgpr_128, 784, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30050:sgpr_128, %1:sgpr_128, 800, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30051:sgpr_128, %1:sgpr_128, 816, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30052:sgpr_128, %1:sgpr_128, 832, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30053:sgpr_128, %1:sgpr_128, 848, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30054:sgpr_128, %1:sgpr_128, 864, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30055:sgpr_128, %1:sgpr_128, 880, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30056:sgpr_128, %1:sgpr_128, 896, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30057:sgpr_128, %1:sgpr_128, 912, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30058:sgpr_128, %1:sgpr_128, 928, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30059:sgpr_128, %1:sgpr_128, 944, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30060:sgpr_128, %1:sgpr_128, 960, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30061:sgpr_128, %1:sgpr_128, 976, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30062:sgpr_128, %1:sgpr_128, 992, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30063:sgpr_128, %1:sgpr_128, 1008, 0 - - EXP 0, killed %100, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %101, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %102, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %103, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %104, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %105, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %106, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %107, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %108, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %109, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1010, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1011, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1012, 
%3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1013, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1014, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1015, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1016, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1017, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1018, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1019, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1020, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1021, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1022, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1023, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1024, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1025, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1026, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1027, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1028, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1029, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1030, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1031, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1032, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1033, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1034, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1035, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1036, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1037, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1038, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1039, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1040, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1041, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1042, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1043, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1044, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1045, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1046, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1047, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1048, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1049, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1050, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1051, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1052, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1053, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1054, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1055, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1056, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1057, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1058, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1059, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1060, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1061, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1062, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1063, %3, %3, %3, -1, -1, 15, implicit $exec - - - S_ENDPGM 0 -... 
- - - - diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir index 69875261b74e9..d6c6173cd523e 100644 --- a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir +++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir @@ -1,452 +1,179 @@ -# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-hot-block-remat-aggressive-sgpr | FileCheck %s # Check that the loads have been moved to the use +# CHECK: bb.0: +# CHECK-NOT: S_LOAD_DWORDX4_IMM # CHECK: bb.2: -# CHECK: %[[#reg0:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 0, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg0]], %{{.+}}, 0, 0 -# CHECK: %[[#reg1:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 16, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg1]], %{{.+}}, 16, 0 -# CHECK: %[[#reg2:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 32, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg2]], %{{.+}}, 32, 0 -# CHECK: %[[#reg3:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 48, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg3]], %{{.+}}, 48, 0 -# CHECK: %[[#reg4:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 64, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg4]], %{{.+}}, 64, 0 -# CHECK: %[[#reg5:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 80, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg5]], %{{.+}}, 80, 0 -# CHECK: %[[#reg6:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 96, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg6]], %{{.+}}, 96, 0 -# CHECK: %[[#reg7:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 112, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg7]], %{{.+}}, 112, 0 -# CHECK: %[[#reg8:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 128, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg8]], %{{.+}}, 128, 0 -# CHECK: %[[#reg9:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 144, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg9]], %{{.+}}, 144, 0 -# CHECK: %[[#reg10:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 160, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg10]], %{{.+}}, 160, 0 -# CHECK: %[[#reg11:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 176, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg11]], %{{.+}}, 176, 0 -# CHECK: %[[#reg12:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 192, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg12]], %{{.+}}, 192, 0 -# CHECK: %[[#reg13:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 208, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg13]], %{{.+}}, 208, 0 -# CHECK: %[[#reg14:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 224, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg14]], %{{.+}}, 224, 0 -# CHECK: %[[#reg15:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 240, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg15]], %{{.+}}, 240, 0 -# CHECK: %[[#reg16:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 256, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg16]], %{{.+}}, 256, 0 -# CHECK: %[[#reg17:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 272, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg17]], %{{.+}}, 272, 0 -# CHECK: %[[#reg18:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 288, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg18]], %{{.+}}, 288, 0 -# CHECK: %[[#reg19:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 304, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg19]], %{{.+}}, 304, 0 -# CHECK: %[[#reg20:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 320, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg20]], %{{.+}}, 320, 0 -# CHECK: 
%[[#reg21:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 336, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg21]], %{{.+}}, 336, 0 -# CHECK: %[[#reg22:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 352, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg22]], %{{.+}}, 352, 0 -# CHECK: %[[#reg23:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 368, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg23]], %{{.+}}, 368, 0 -# CHECK: %[[#reg24:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 384, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg24]], %{{.+}}, 384, 0 -# CHECK: %[[#reg25:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 400, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg25]], %{{.+}}, 400, 0 -# CHECK: %[[#reg26:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 416, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg26]], %{{.+}}, 416, 0 -# CHECK: %[[#reg27:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 432, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg27]], %{{.+}}, 432, 0 -# CHECK: %[[#reg28:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 448, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg28]], %{{.+}}, 448, 0 -# CHECK: %[[#reg29:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 464, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg29]], %{{.+}}, 464, 0 -# CHECK: %[[#reg30:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 480, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg30]], %{{.+}}, 480, 0 -# CHECK: %[[#reg31:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 496, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg31]], %{{.+}}, 496, 0 -# CHECK: %[[#reg32:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 512, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg32]], %{{.+}}, 512, 0 -# CHECK: %[[#reg33:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 528, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg33]], %{{.+}}, 528, 0 -# CHECK: %[[#reg34:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 544, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg34]], %{{.+}}, 544, 0 -# CHECK: %[[#reg35:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 560, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg35]], %{{.+}}, 560, 0 -# CHECK: %[[#reg36:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 576, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg36]], %{{.+}}, 576, 0 -# CHECK: %[[#reg37:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 592, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg37]], %{{.+}}, 592, 0 -# CHECK: %[[#reg38:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 608, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg38]], %{{.+}}, 608, 0 -# CHECK: %[[#reg39:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 624, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg39]], %{{.+}}, 624, 0 -# CHECK: %[[#reg40:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 640, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg40]], %{{.+}}, 640, 0 -# CHECK: %[[#reg41:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 656, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg41]], %{{.+}}, 656, 0 -# CHECK: %[[#reg42:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 672, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg42]], %{{.+}}, 672, 0 -# CHECK: %[[#reg43:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 688, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg43]], %{{.+}}, 688, 0 -# CHECK: %[[#reg44:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 704, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg44]], %{{.+}}, 704, 0 -# CHECK: %[[#reg45:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 720, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg45]], %{{.+}}, 720, 0 -# CHECK: %[[#reg46:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 736, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg46]], %{{.+}}, 736, 0 -# CHECK: %[[#reg47:]]:sgpr_128 = S_LOAD_DWORDX4_IMM 
%{{.+}}, 752, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg47]], %{{.+}}, 752, 0 -# CHECK: %[[#reg48:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 768, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg48]], %{{.+}}, 768, 0 -# CHECK: %[[#reg49:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 784, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg49]], %{{.+}}, 784, 0 -# CHECK: %[[#reg50:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 800, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg50]], %{{.+}}, 800, 0 -# CHECK: %[[#reg51:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 816, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg51]], %{{.+}}, 816, 0 -# CHECK: %[[#reg52:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 832, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg52]], %{{.+}}, 832, 0 -# CHECK: %[[#reg53:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 848, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg53]], %{{.+}}, 848, 0 -# CHECK: %[[#reg54:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 864, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg54]], %{{.+}}, 864, 0 -# CHECK: %[[#reg55:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 880, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg55]], %{{.+}}, 880, 0 -# CHECK: %[[#reg56:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 896, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg56]], %{{.+}}, 896, 0 -# CHECK: %[[#reg57:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 912, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg57]], %{{.+}}, 912, 0 -# CHECK: %[[#reg58:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 928, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg58]], %{{.+}}, 928, 0 -# CHECK: %[[#reg59:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 944, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg59]], %{{.+}}, 944, 0 -# CHECK: %[[#reg60:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 960, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg60]], %{{.+}}, 960, 0 -# CHECK: %[[#reg61:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 976, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg61]], %{{.+}}, 976, 0 -# CHECK: %[[#reg62:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 992, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg62]], %{{.+}}, 992, 0 -# CHECK: %[[#reg63:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 1008, 0 -# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg63]], %{{.+}}, 1008, 0 +# CHECK: %t0:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 0, 0 +# CHECK: KILL %t0 +# CHECK: %t2:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 16, 0 +# CHECK: KILL %t2 +# CHECK: %t4:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 32, 0 +# CHECK: KILL %t4 +# CHECK: %t6:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 48, 0 +# CHECK: KILL %t6 +# CHECK: %t8:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 64, 0 +# CHECK: KILL %t8 +# CHECK: %t10:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 80, 0 +# CHECK: KILL %t10 +# CHECK: %t12:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 96, 0 +# CHECK: KILL %t12 +# CHECK: %t14:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 112, 0 +# CHECK: KILL %t14 +# CHECK: %t16:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 128, 0 +# CHECK: KILL %t16 +# CHECK: %t18:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 144, 0 +# CHECK: KILL %t18 +# CHECK: %t20:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 160, 0 +# CHECK: KILL %t20 +# CHECK: %t22:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 176, 0 +# CHECK: KILL %t22 +# CHECK: %t24:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 192, 0 +# CHECK: KILL %t24 +# CHECK: %t26:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 208, 0 +# CHECK: KILL %t26 +# CHECK: %t28:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 224, 0 +# CHECK: KILL %t28 +# CHECK: %t30:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 240, 0 +# CHECK: KILL %t30 +# CHECK: %t32:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 256, 0 +# CHECK: KILL %t32 +# 
CHECK: %t34:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 272, 0 +# CHECK: KILL %t34 +# CHECK: %t36:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 288, 0 +# CHECK: KILL %t36 +# CHECK: %t38:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 304, 0 +# CHECK: KILL %t38 +# CHECK: %t40:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 320, 0 +# CHECK: KILL %t40 +# CHECK: %t42:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 336, 0 +# CHECK: KILL %t42 +# CHECK: %t44:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 352, 0 +# CHECK: KILL %t44 +# CHECK: %t46:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 368, 0 +# CHECK: KILL %t46 +# CHECK: %t48:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 384, 0 +# CHECK: KILL %t48 +# CHECK: %t50:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 400, 0 +# CHECK: KILL %t50 +# CHECK: %t52:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 416, 0 +# CHECK: KILL %t52 +# CHECK: %t54:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 432, 0 +# CHECK: KILL %t54 +# CHECK: %t56:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 448, 0 +# CHECK: KILL %t56 +# CHECK: %t58:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 464, 0 +# CHECK: KILL %t58 +# CHECK: %t60:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 480, 0 +# CHECK: KILL %t60 +# CHECK: %t62:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 496, 0 +# CHECK: KILL %t62 --- | - source_filename = ".\main.ll" - define amdgpu_ps void @main() #1 { + define amdgpu_ps void @main() { ret void } - attributes #1 = { "target-cpu"="gfx1010" } - !llvm.ident = !{!0} - !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"} ... --- name: main tracksRegLiveness: true -liveins: - - { reg: '$sgpr0' } - - { reg: '$sgpr1' } - - { reg: '$sgpr2' } - - { reg: '$sgpr3' } - - { reg: '$sgpr4' } - - { reg: '$sgpr5' } - - { reg: '$sgpr6' } - - { reg: '$sgpr7' } - - { reg: '$sgpr8' } - - { reg: '$sgpr8' } - - { reg: '$vgpr0' } - - { reg: '$vgpr1' } body: | bb.0: successors: %bb.1, %bb.2 - liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1 + ; To inflate vgpr count + %v0:vreg_1024 = IMPLICIT_DEF + %v1:vreg_1024 = IMPLICIT_DEF + %v2:vreg_1024 = IMPLICIT_DEF + %v3:vreg_1024 = IMPLICIT_DEF - %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1 - ; undef %0.sub0:sgpr_64 = COPY $sgpr0 - ; undef %0.sub1:sgpr_64 = COPY $sgpr1 + %ptr:sgpr_64 = IMPLICIT_DEF - %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3 - ; undef %1.sub0:sgpr_128 = COPY $sgpr4 - ; undef %1.sub1:sgpr_128 = COPY $sgpr5 - ; undef %1.sub2:sgpr_128 = COPY $sgpr6 - ; undef %1.sub3:sgpr_128 = COPY $sgpr7 + ; Defs + %t0:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 0, 0 + %t2:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 16, 0 + %t4:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 32, 0 + %t6:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 48, 0 + %t8:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 64, 0 + %t10:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 80, 0 + %t12:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 96, 0 + %t14:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 112, 0 + %t16:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 128, 0 + %t18:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 144, 0 + %t20:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 160, 0 + %t22:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 176, 0 + %t24:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 192, 0 + %t26:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 208, 0 + %t28:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 224, 0 + %t30:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 240, 0 + %t32:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 256, 0 + %t34:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 272, 0 + %t36:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 288, 0 + %t38:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 304, 0 + %t40:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 320, 0 + %t42:sgpr_128 = 
S_LOAD_DWORDX4_IMM %ptr, 336, 0 + %t44:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 352, 0 + %t46:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 368, 0 + %t48:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 384, 0 + %t50:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 400, 0 + %t52:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 416, 0 + %t54:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 432, 0 + %t56:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 448, 0 + %t58:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 464, 0 + %t60:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 480, 0 + %t62:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 496, 0 - %3000:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 0, 0 - %3001:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 16, 0 - %3002:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 32, 0 - %3003:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 48, 0 - %3004:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 64, 0 - %3005:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 80, 0 - %3006:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 96, 0 - %3007:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 112, 0 - %3008:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 128, 0 - %3009:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 144, 0 - %30010:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 160, 0 - %30011:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 176, 0 - %30012:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 192, 0 - %30013:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 208, 0 - %30014:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 224, 0 - %30015:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 240, 0 - %30016:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 256, 0 - %30017:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 272, 0 - %30018:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 288, 0 - %30019:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 304, 0 - %30020:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 320, 0 - %30021:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 336, 0 - %30022:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 352, 0 - %30023:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 368, 0 - %30024:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 384, 0 - %30025:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 400, 0 - %30026:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 416, 0 - %30027:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 432, 0 - %30028:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 448, 0 - %30029:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 464, 0 - %30030:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 480, 0 - %30031:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 496, 0 - %30032:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 512, 0 - %30033:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 528, 0 - %30034:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 544, 0 - %30035:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 560, 0 - %30036:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 576, 0 - %30037:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 592, 0 - %30038:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 608, 0 - %30039:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 624, 0 - %30040:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 640, 0 - %30041:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 656, 0 - %30042:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 672, 0 - %30043:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 688, 0 - %30044:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 704, 0 - %30045:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 720, 0 - %30046:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 736, 0 - %30047:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 752, 0 - %30048:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 768, 0 - %30049:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 784, 0 - %30050:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 800, 0 - %30051:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 816, 0 - %30052:sgpr_128 = 
S_LOAD_DWORDX4_IMM %0:sgpr_64, 832, 0 - %30053:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 848, 0 - %30054:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 864, 0 - %30055:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 880, 0 - %30056:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 896, 0 - %30057:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 912, 0 - %30058:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 928, 0 - %30059:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 944, 0 - %30060:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 960, 0 - %30061:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 976, 0 - %30062:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 992, 0 - %30063:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 1008, 0 - %100:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %101:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %102:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %103:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %104:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %105:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %106:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %107:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %108:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %109:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1010:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1011:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1012:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1013:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1014:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1015:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1016:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1017:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1018:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1019:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1020:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1021:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1022:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1023:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1024:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1025:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1026:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1027:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1028:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1029:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1030:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1031:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1032:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1033:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1034:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1035:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1036:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1037:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1038:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1039:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1040:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1041:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1042:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1043:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1044:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1045:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1046:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1047:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1048:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1049:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1050:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1051:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1052:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1053:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1054:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1055:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1056:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1057:vgpr_32 = V_MOV_B32_e32 0, 
implicit $exec - %1058:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1059:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1060:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1061:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1062:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1063:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - - - %8000:vgpr_32 = IMPLICIT_DEF - %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode - $exec_lo = S_MOV_B32_term %116:sreg_32_xm0 + ; Branch + %cmp:sreg_32_xm0 = V_CMP_GT_F32_e64 0, 0, 0, %v0.sub0, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %cmp:sreg_32_xm0 S_CBRANCH_EXECZ %bb.2, implicit $exec S_BRANCH %bb.1 bb.1: successors: %bb.2 - %8001:vgpr_32 = COPY %8000 S_BRANCH %bb.2 bb.2: - - %3:vgpr_32 = IMPLICIT_DEF - S_BUFFER_STORE_DWORDX4_IMM killed %3000:sgpr_128, %1:sgpr_128, 0, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %3001:sgpr_128, %1:sgpr_128, 16, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %3002:sgpr_128, %1:sgpr_128, 32, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %3003:sgpr_128, %1:sgpr_128, 48, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %3004:sgpr_128, %1:sgpr_128, 64, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %3005:sgpr_128, %1:sgpr_128, 80, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %3006:sgpr_128, %1:sgpr_128, 96, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %3007:sgpr_128, %1:sgpr_128, 112, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %3008:sgpr_128, %1:sgpr_128, 128, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %3009:sgpr_128, %1:sgpr_128, 144, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30010:sgpr_128, %1:sgpr_128, 160, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30011:sgpr_128, %1:sgpr_128, 176, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30012:sgpr_128, %1:sgpr_128, 192, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30013:sgpr_128, %1:sgpr_128, 208, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30014:sgpr_128, %1:sgpr_128, 224, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30015:sgpr_128, %1:sgpr_128, 240, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30016:sgpr_128, %1:sgpr_128, 256, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30017:sgpr_128, %1:sgpr_128, 272, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30018:sgpr_128, %1:sgpr_128, 288, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30019:sgpr_128, %1:sgpr_128, 304, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30020:sgpr_128, %1:sgpr_128, 320, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30021:sgpr_128, %1:sgpr_128, 336, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30022:sgpr_128, %1:sgpr_128, 352, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30023:sgpr_128, %1:sgpr_128, 368, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30024:sgpr_128, %1:sgpr_128, 384, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30025:sgpr_128, %1:sgpr_128, 400, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30026:sgpr_128, %1:sgpr_128, 416, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30027:sgpr_128, %1:sgpr_128, 432, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30028:sgpr_128, %1:sgpr_128, 448, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30029:sgpr_128, %1:sgpr_128, 464, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30030:sgpr_128, %1:sgpr_128, 480, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30031:sgpr_128, %1:sgpr_128, 496, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30032:sgpr_128, %1:sgpr_128, 512, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30033:sgpr_128, %1:sgpr_128, 528, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30034:sgpr_128, %1:sgpr_128, 544, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30035:sgpr_128, %1:sgpr_128, 560, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30036:sgpr_128, %1:sgpr_128, 576, 0 - S_BUFFER_STORE_DWORDX4_IMM killed 
%30037:sgpr_128, %1:sgpr_128, 592, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30038:sgpr_128, %1:sgpr_128, 608, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30039:sgpr_128, %1:sgpr_128, 624, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30040:sgpr_128, %1:sgpr_128, 640, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30041:sgpr_128, %1:sgpr_128, 656, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30042:sgpr_128, %1:sgpr_128, 672, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30043:sgpr_128, %1:sgpr_128, 688, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30044:sgpr_128, %1:sgpr_128, 704, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30045:sgpr_128, %1:sgpr_128, 720, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30046:sgpr_128, %1:sgpr_128, 736, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30047:sgpr_128, %1:sgpr_128, 752, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30048:sgpr_128, %1:sgpr_128, 768, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30049:sgpr_128, %1:sgpr_128, 784, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30050:sgpr_128, %1:sgpr_128, 800, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30051:sgpr_128, %1:sgpr_128, 816, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30052:sgpr_128, %1:sgpr_128, 832, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30053:sgpr_128, %1:sgpr_128, 848, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30054:sgpr_128, %1:sgpr_128, 864, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30055:sgpr_128, %1:sgpr_128, 880, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30056:sgpr_128, %1:sgpr_128, 896, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30057:sgpr_128, %1:sgpr_128, 912, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30058:sgpr_128, %1:sgpr_128, 928, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30059:sgpr_128, %1:sgpr_128, 944, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30060:sgpr_128, %1:sgpr_128, 960, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30061:sgpr_128, %1:sgpr_128, 976, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30062:sgpr_128, %1:sgpr_128, 992, 0 - S_BUFFER_STORE_DWORDX4_IMM killed %30063:sgpr_128, %1:sgpr_128, 1008, 0 - - EXP 0, killed %100, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %101, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %102, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %103, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %104, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %105, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %106, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %107, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %108, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %109, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1010, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1011, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1012, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1013, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1014, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1015, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1016, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1017, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1018, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1019, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1020, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1021, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1022, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1023, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1024, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1025, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, 
killed %1026, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1027, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1028, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1029, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1030, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1031, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1032, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1033, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1034, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1035, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1036, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1037, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1038, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1039, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1040, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1041, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1042, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1043, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1044, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1045, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1046, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1047, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1048, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1049, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1050, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1051, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1052, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1053, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1054, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1055, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1056, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1057, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1058, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1059, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1060, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1061, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1062, %3, %3, %3, -1, -1, 15, implicit $exec - EXP 0, killed %1063, %3, %3, %3, -1, -1, 15, implicit $exec + KILL %t0 + KILL %t2 + KILL %t4 + KILL %t6 + KILL %t8 + KILL %t10 + KILL %t12 + KILL %t14 + KILL %t16 + KILL %t18 + KILL %t20 + KILL %t22 + KILL %t24 + KILL %t26 + KILL %t28 + KILL %t30 + KILL %t32 + KILL %t34 + KILL %t36 + KILL %t38 + KILL %t40 + KILL %t42 + KILL %t44 + KILL %t46 + KILL %t48 + KILL %t50 + KILL %t52 + KILL %t54 + KILL %t56 + KILL %t58 + KILL %t60 + KILL %t62 + ; Some uses to inflate vgpr count + KILL %v0 + KILL %v1 + KILL %v2 + KILL %v3 S_ENDPGM 0 ... + \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_long_scc.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_long_scc.mir new file mode 100644 index 0000000000000..a4e9c69d53b7c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_long_scc.mir @@ -0,0 +1,575 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-hot-block-remat-aggressive-sgpr | FileCheck %s + +# This test checks that when there is no safe spot to clone/move instructions that +# modify $scc, a safe spot is created for them.
+ +# CHECK: bb.0: +# CHECK-NOT: S_NOT_B32 +# CHECK: bb.2: +# Save scc +# CHECK: %[[#scc0:]]:sreg_32_xm0 = COPY $scc +# CHECK: %t0:sgpr_32 = S_NOT_B32 0 +# CHECK: KILL %t0 +# All subsequent moves are placed within the safe spot created for the first one. +# CHECK: %t2:sgpr_32 = S_NOT_B32 1 +# CHECK: %t4:sgpr_32 = S_NOT_B32 2 +# CHECK: %t6:sgpr_32 = S_NOT_B32 3 +# CHECK: %t8:sgpr_32 = S_NOT_B32 4 +# CHECK: %t10:sgpr_32 = S_NOT_B32 5 +# CHECK: %t12:sgpr_32 = S_NOT_B32 6 +# CHECK: %t14:sgpr_32 = S_NOT_B32 7 +# CHECK: %t16:sgpr_32 = S_NOT_B32 8 +# CHECK: %t18:sgpr_32 = S_NOT_B32 9 +# CHECK: %t20:sgpr_32 = S_NOT_B32 10 +# CHECK: %t22:sgpr_32 = S_NOT_B32 11 +# CHECK: %t24:sgpr_32 = S_NOT_B32 12 +# CHECK: %t26:sgpr_32 = S_NOT_B32 13 +# CHECK: %t28:sgpr_32 = S_NOT_B32 14 +# CHECK: %t30:sgpr_32 = S_NOT_B32 15 +# CHECK: %t32:sgpr_32 = S_NOT_B32 16 +# CHECK: %t34:sgpr_32 = S_NOT_B32 17 +# CHECK: %t36:sgpr_32 = S_NOT_B32 18 +# CHECK: %t38:sgpr_32 = S_NOT_B32 19 +# CHECK: %t40:sgpr_32 = S_NOT_B32 20 +# CHECK: %t42:sgpr_32 = S_NOT_B32 21 +# CHECK: %t44:sgpr_32 = S_NOT_B32 22 +# CHECK: %t46:sgpr_32 = S_NOT_B32 23 +# CHECK: %t48:sgpr_32 = S_NOT_B32 24 +# CHECK: %t50:sgpr_32 = S_NOT_B32 25 +# CHECK: %t52:sgpr_32 = S_NOT_B32 26 +# CHECK: %t54:sgpr_32 = S_NOT_B32 27 +# CHECK: %t56:sgpr_32 = S_NOT_B32 28 +# CHECK: %t58:sgpr_32 = S_NOT_B32 29 +# CHECK: %t60:sgpr_32 = S_NOT_B32 30 +# CHECK: %t62:sgpr_32 = S_NOT_B32 31 +# CHECK: %t64:sgpr_32 = S_NOT_B32 32 +# CHECK: %t66:sgpr_32 = S_NOT_B32 33 +# CHECK: %t68:sgpr_32 = S_NOT_B32 34 +# CHECK: %t70:sgpr_32 = S_NOT_B32 35 +# CHECK: %t72:sgpr_32 = S_NOT_B32 36 +# CHECK: %t74:sgpr_32 = S_NOT_B32 37 +# CHECK: %t76:sgpr_32 = S_NOT_B32 38 +# CHECK: %t78:sgpr_32 = S_NOT_B32 39 +# CHECK: %t80:sgpr_32 = S_NOT_B32 40 +# CHECK: %t82:sgpr_32 = S_NOT_B32 41 +# CHECK: %t84:sgpr_32 = S_NOT_B32 42 +# CHECK: %t86:sgpr_32 = S_NOT_B32 43 +# CHECK: %t88:sgpr_32 = S_NOT_B32 44 +# CHECK: %t90:sgpr_32 = S_NOT_B32 45 +# CHECK: %t92:sgpr_32 = S_NOT_B32 46 +# CHECK: %t94:sgpr_32 = S_NOT_B32 47 +# CHECK: %t96:sgpr_32 = S_NOT_B32 48 +# CHECK: %t98:sgpr_32 = S_NOT_B32 49 +# CHECK: %t100:sgpr_32 = S_NOT_B32 50 +# CHECK: %t102:sgpr_32 = S_NOT_B32 51 +# CHECK: %t104:sgpr_32 = S_NOT_B32 52 +# CHECK: %t106:sgpr_32 = S_NOT_B32 53 +# CHECK: %t108:sgpr_32 = S_NOT_B32 54 +# CHECK: %t110:sgpr_32 = S_NOT_B32 55 +# CHECK: %t112:sgpr_32 = S_NOT_B32 56 +# CHECK: %t114:sgpr_32 = S_NOT_B32 57 +# CHECK: %t116:sgpr_32 = S_NOT_B32 58 +# CHECK: %t118:sgpr_32 = S_NOT_B32 59 +# CHECK: %t120:sgpr_32 = S_NOT_B32 60 +# CHECK: %t122:sgpr_32 = S_NOT_B32 61 +# CHECK: %t124:sgpr_32 = S_NOT_B32 62 +# CHECK: %t126:sgpr_32 = S_NOT_B32 63 +# CHECK: %t128:sgpr_32 = S_NOT_B32 64 +# CHECK: %t130:sgpr_32 = S_NOT_B32 65 +# CHECK: %t132:sgpr_32 = S_NOT_B32 66 +# CHECK: %t134:sgpr_32 = S_NOT_B32 67 +# CHECK: %t136:sgpr_32 = S_NOT_B32 68 +# CHECK: %t138:sgpr_32 = S_NOT_B32 69 +# CHECK: %t140:sgpr_32 = S_NOT_B32 70 +# CHECK: %t142:sgpr_32 = S_NOT_B32 71 +# CHECK: %t144:sgpr_32 = S_NOT_B32 72 +# CHECK: %t146:sgpr_32 = S_NOT_B32 73 +# CHECK: %t148:sgpr_32 = S_NOT_B32 74 +# CHECK: %t150:sgpr_32 = S_NOT_B32 75 +# CHECK: %t152:sgpr_32 = S_NOT_B32 76 +# CHECK: %t154:sgpr_32 = S_NOT_B32 77 +# CHECK: %t156:sgpr_32 = S_NOT_B32 78 +# CHECK: %t158:sgpr_32 = S_NOT_B32 79 +# CHECK: %t160:sgpr_32 = S_NOT_B32 80 +# CHECK: %t162:sgpr_32 = S_NOT_B32 81 +# CHECK: %t164:sgpr_32 = S_NOT_B32 82 +# CHECK: %t166:sgpr_32 = S_NOT_B32 83 +# CHECK: %t168:sgpr_32 = S_NOT_B32 84 +# CHECK: %t170:sgpr_32 = S_NOT_B32 85 +# CHECK: %t172:sgpr_32 = S_NOT_B32 86 +# 
CHECK: %t174:sgpr_32 = S_NOT_B32 87 +# CHECK: %t176:sgpr_32 = S_NOT_B32 88 +# CHECK: %t178:sgpr_32 = S_NOT_B32 89 +# CHECK: %t180:sgpr_32 = S_NOT_B32 90 +# CHECK: %t182:sgpr_32 = S_NOT_B32 91 +# CHECK: %t184:sgpr_32 = S_NOT_B32 92 +# CHECK: %t186:sgpr_32 = S_NOT_B32 93 +# CHECK: %t188:sgpr_32 = S_NOT_B32 94 +# CHECK: %t190:sgpr_32 = S_NOT_B32 95 +# CHECK: %t192:sgpr_32 = S_NOT_B32 96 +# CHECK: %t194:sgpr_32 = S_NOT_B32 97 +# CHECK: %t196:sgpr_32 = S_NOT_B32 98 +# CHECK: %t198:sgpr_32 = S_NOT_B32 99 +# CHECK: %t200:sgpr_32 = S_NOT_B32 100 +# CHECK: %t202:sgpr_32 = S_NOT_B32 101 +# CHECK: %t204:sgpr_32 = S_NOT_B32 102 +# CHECK: %t206:sgpr_32 = S_NOT_B32 103 +# CHECK: %t208:sgpr_32 = S_NOT_B32 104 +# CHECK: %t210:sgpr_32 = S_NOT_B32 105 +# CHECK: %t212:sgpr_32 = S_NOT_B32 106 +# CHECK: %t214:sgpr_32 = S_NOT_B32 107 +# CHECK: %t216:sgpr_32 = S_NOT_B32 108 +# CHECK: %t218:sgpr_32 = S_NOT_B32 109 +# CHECK: %t220:sgpr_32 = S_NOT_B32 110 +# CHECK: %t222:sgpr_32 = S_NOT_B32 111 +# CHECK: %t224:sgpr_32 = S_NOT_B32 112 +# CHECK: %t226:sgpr_32 = S_NOT_B32 113 +# CHECK: %t228:sgpr_32 = S_NOT_B32 114 +# CHECK: %t230:sgpr_32 = S_NOT_B32 115 +# CHECK: %t232:sgpr_32 = S_NOT_B32 116 +# CHECK: %t234:sgpr_32 = S_NOT_B32 117 +# CHECK: %t236:sgpr_32 = S_NOT_B32 118 +# CHECK: %t238:sgpr_32 = S_NOT_B32 119 +# CHECK: %t240:sgpr_32 = S_NOT_B32 120 +# CHECK: %t242:sgpr_32 = S_NOT_B32 121 +# CHECK: %t244:sgpr_32 = S_NOT_B32 122 +# CHECK: %t246:sgpr_32 = S_NOT_B32 123 +# CHECK: %t248:sgpr_32 = S_NOT_B32 124 +# CHECK: %t250:sgpr_32 = S_NOT_B32 125 +# CHECK: %t252:sgpr_32 = S_NOT_B32 126 +# CHECK: %t254:sgpr_32 = S_NOT_B32 127 +# Restore scc +# CHECK: $scc = COPY %[[#scc0]] +# CHECK: KILL %t2 +# CHECK: KILL %t4 +# CHECK: KILL %t6 +# CHECK: KILL %t8 +# CHECK: KILL %t10 +# CHECK: KILL %t12 +# CHECK: KILL %t14 +# CHECK: KILL %t16 +# CHECK: KILL %t18 +# CHECK: KILL %t20 +# CHECK: KILL %t22 +# CHECK: KILL %t24 +# CHECK: KILL %t26 +# CHECK: KILL %t28 +# CHECK: KILL %t30 +# CHECK: KILL %t32 +# CHECK: KILL %t34 +# CHECK: KILL %t36 +# CHECK: KILL %t38 +# CHECK: KILL %t40 +# CHECK: KILL %t42 +# CHECK: KILL %t44 +# CHECK: KILL %t46 +# CHECK: KILL %t48 +# CHECK: KILL %t50 +# CHECK: KILL %t52 +# CHECK: KILL %t54 +# CHECK: KILL %t56 +# CHECK: KILL %t58 +# CHECK: KILL %t60 +# CHECK: KILL %t62 +# CHECK: KILL %t64 +# CHECK: KILL %t66 +# CHECK: KILL %t68 +# CHECK: KILL %t70 +# CHECK: KILL %t72 +# CHECK: KILL %t74 +# CHECK: KILL %t76 +# CHECK: KILL %t78 +# CHECK: KILL %t80 +# CHECK: KILL %t82 +# CHECK: KILL %t84 +# CHECK: KILL %t86 +# CHECK: KILL %t88 +# CHECK: KILL %t90 +# CHECK: KILL %t92 +# CHECK: KILL %t94 +# CHECK: KILL %t96 +# CHECK: KILL %t98 +# CHECK: KILL %t100 +# CHECK: KILL %t102 +# CHECK: KILL %t104 +# CHECK: KILL %t106 +# CHECK: KILL %t108 +# CHECK: KILL %t110 +# CHECK: KILL %t112 +# CHECK: KILL %t114 +# CHECK: KILL %t116 +# CHECK: KILL %t118 +# CHECK: KILL %t120 +# CHECK: KILL %t122 +# CHECK: KILL %t124 +# CHECK: KILL %t126 +# CHECK: KILL %t128 +# CHECK: KILL %t130 +# CHECK: KILL %t132 +# CHECK: KILL %t134 +# CHECK: KILL %t136 +# CHECK: KILL %t138 +# CHECK: KILL %t140 +# CHECK: KILL %t142 +# CHECK: KILL %t144 +# CHECK: KILL %t146 +# CHECK: KILL %t148 +# CHECK: KILL %t150 +# CHECK: KILL %t152 +# CHECK: KILL %t154 +# CHECK: KILL %t156 +# CHECK: KILL %t158 +# CHECK: KILL %t160 +# CHECK: KILL %t162 +# CHECK: KILL %t164 +# CHECK: KILL %t166 +# CHECK: KILL %t168 +# CHECK: KILL %t170 +# CHECK: KILL %t172 +# CHECK: KILL %t174 +# CHECK: KILL %t176 +# CHECK: KILL %t178 +# CHECK: KILL %t180 +# CHECK: KILL %t182 +# CHECK: KILL %t184 +# 
CHECK: KILL %t186 +# CHECK: KILL %t188 +# CHECK: KILL %t190 +# CHECK: KILL %t192 +# CHECK: KILL %t194 +# CHECK: KILL %t196 +# CHECK: KILL %t198 +# CHECK: KILL %t200 +# CHECK: KILL %t202 +# CHECK: KILL %t204 +# CHECK: KILL %t206 +# CHECK: KILL %t208 +# CHECK: KILL %t210 +# CHECK: KILL %t212 +# CHECK: KILL %t214 +# CHECK: KILL %t216 +# CHECK: KILL %t218 +# CHECK: KILL %t220 +# CHECK: KILL %t222 +# CHECK: KILL %t224 +# CHECK: KILL %t226 +# CHECK: KILL %t228 +# CHECK: KILL %t230 +# CHECK: KILL %t232 +# CHECK: KILL %t234 +# CHECK: KILL %t236 +# CHECK: KILL %t238 +# CHECK: KILL %t240 +# CHECK: KILL %t242 +# CHECK: KILL %t244 +# CHECK: KILL %t246 +# CHECK: KILL %t248 +# CHECK: KILL %t250 +# CHECK: KILL %t252 +# CHECK: KILL %t254 + + +--- | + define amdgpu_ps void @main() { + ret void + } +... +--- +name: main +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + ; To inflate vgpr count + %v0:vreg_1024 = IMPLICIT_DEF + %v1:vreg_1024 = IMPLICIT_DEF + %v2:vreg_1024 = IMPLICIT_DEF + %v3:vreg_1024 = IMPLICIT_DEF + + ; Defs + %t0:sgpr_32 = S_NOT_B32 0, implicit-def $scc + %t2:sgpr_32 = S_NOT_B32 1, implicit-def $scc + %t4:sgpr_32 = S_NOT_B32 2, implicit-def $scc + %t6:sgpr_32 = S_NOT_B32 3, implicit-def $scc + %t8:sgpr_32 = S_NOT_B32 4, implicit-def $scc + %t10:sgpr_32 = S_NOT_B32 5, implicit-def $scc + %t12:sgpr_32 = S_NOT_B32 6, implicit-def $scc + %t14:sgpr_32 = S_NOT_B32 7, implicit-def $scc + %t16:sgpr_32 = S_NOT_B32 8, implicit-def $scc + %t18:sgpr_32 = S_NOT_B32 9, implicit-def $scc + %t20:sgpr_32 = S_NOT_B32 10, implicit-def $scc + %t22:sgpr_32 = S_NOT_B32 11, implicit-def $scc + %t24:sgpr_32 = S_NOT_B32 12, implicit-def $scc + %t26:sgpr_32 = S_NOT_B32 13, implicit-def $scc + %t28:sgpr_32 = S_NOT_B32 14, implicit-def $scc + %t30:sgpr_32 = S_NOT_B32 15, implicit-def $scc + %t32:sgpr_32 = S_NOT_B32 16, implicit-def $scc + %t34:sgpr_32 = S_NOT_B32 17, implicit-def $scc + %t36:sgpr_32 = S_NOT_B32 18, implicit-def $scc + %t38:sgpr_32 = S_NOT_B32 19, implicit-def $scc + %t40:sgpr_32 = S_NOT_B32 20, implicit-def $scc + %t42:sgpr_32 = S_NOT_B32 21, implicit-def $scc + %t44:sgpr_32 = S_NOT_B32 22, implicit-def $scc + %t46:sgpr_32 = S_NOT_B32 23, implicit-def $scc + %t48:sgpr_32 = S_NOT_B32 24, implicit-def $scc + %t50:sgpr_32 = S_NOT_B32 25, implicit-def $scc + %t52:sgpr_32 = S_NOT_B32 26, implicit-def $scc + %t54:sgpr_32 = S_NOT_B32 27, implicit-def $scc + %t56:sgpr_32 = S_NOT_B32 28, implicit-def $scc + %t58:sgpr_32 = S_NOT_B32 29, implicit-def $scc + %t60:sgpr_32 = S_NOT_B32 30, implicit-def $scc + %t62:sgpr_32 = S_NOT_B32 31, implicit-def $scc + %t64:sgpr_32 = S_NOT_B32 32, implicit-def $scc + %t66:sgpr_32 = S_NOT_B32 33, implicit-def $scc + %t68:sgpr_32 = S_NOT_B32 34, implicit-def $scc + %t70:sgpr_32 = S_NOT_B32 35, implicit-def $scc + %t72:sgpr_32 = S_NOT_B32 36, implicit-def $scc + %t74:sgpr_32 = S_NOT_B32 37, implicit-def $scc + %t76:sgpr_32 = S_NOT_B32 38, implicit-def $scc + %t78:sgpr_32 = S_NOT_B32 39, implicit-def $scc + %t80:sgpr_32 = S_NOT_B32 40, implicit-def $scc + %t82:sgpr_32 = S_NOT_B32 41, implicit-def $scc + %t84:sgpr_32 = S_NOT_B32 42, implicit-def $scc + %t86:sgpr_32 = S_NOT_B32 43, implicit-def $scc + %t88:sgpr_32 = S_NOT_B32 44, implicit-def $scc + %t90:sgpr_32 = S_NOT_B32 45, implicit-def $scc + %t92:sgpr_32 = S_NOT_B32 46, implicit-def $scc + %t94:sgpr_32 = S_NOT_B32 47, implicit-def $scc + %t96:sgpr_32 = S_NOT_B32 48, implicit-def $scc + %t98:sgpr_32 = S_NOT_B32 49, implicit-def $scc + %t100:sgpr_32 = S_NOT_B32 50, implicit-def $scc + %t102:sgpr_32 = 
S_NOT_B32 51, implicit-def $scc + %t104:sgpr_32 = S_NOT_B32 52, implicit-def $scc + %t106:sgpr_32 = S_NOT_B32 53, implicit-def $scc + %t108:sgpr_32 = S_NOT_B32 54, implicit-def $scc + %t110:sgpr_32 = S_NOT_B32 55, implicit-def $scc + %t112:sgpr_32 = S_NOT_B32 56, implicit-def $scc + %t114:sgpr_32 = S_NOT_B32 57, implicit-def $scc + %t116:sgpr_32 = S_NOT_B32 58, implicit-def $scc + %t118:sgpr_32 = S_NOT_B32 59, implicit-def $scc + %t120:sgpr_32 = S_NOT_B32 60, implicit-def $scc + %t122:sgpr_32 = S_NOT_B32 61, implicit-def $scc + %t124:sgpr_32 = S_NOT_B32 62, implicit-def $scc + %t126:sgpr_32 = S_NOT_B32 63, implicit-def $scc + %t128:sgpr_32 = S_NOT_B32 64, implicit-def $scc + %t130:sgpr_32 = S_NOT_B32 65, implicit-def $scc + %t132:sgpr_32 = S_NOT_B32 66, implicit-def $scc + %t134:sgpr_32 = S_NOT_B32 67, implicit-def $scc + %t136:sgpr_32 = S_NOT_B32 68, implicit-def $scc + %t138:sgpr_32 = S_NOT_B32 69, implicit-def $scc + %t140:sgpr_32 = S_NOT_B32 70, implicit-def $scc + %t142:sgpr_32 = S_NOT_B32 71, implicit-def $scc + %t144:sgpr_32 = S_NOT_B32 72, implicit-def $scc + %t146:sgpr_32 = S_NOT_B32 73, implicit-def $scc + %t148:sgpr_32 = S_NOT_B32 74, implicit-def $scc + %t150:sgpr_32 = S_NOT_B32 75, implicit-def $scc + %t152:sgpr_32 = S_NOT_B32 76, implicit-def $scc + %t154:sgpr_32 = S_NOT_B32 77, implicit-def $scc + %t156:sgpr_32 = S_NOT_B32 78, implicit-def $scc + %t158:sgpr_32 = S_NOT_B32 79, implicit-def $scc + %t160:sgpr_32 = S_NOT_B32 80, implicit-def $scc + %t162:sgpr_32 = S_NOT_B32 81, implicit-def $scc + %t164:sgpr_32 = S_NOT_B32 82, implicit-def $scc + %t166:sgpr_32 = S_NOT_B32 83, implicit-def $scc + %t168:sgpr_32 = S_NOT_B32 84, implicit-def $scc + %t170:sgpr_32 = S_NOT_B32 85, implicit-def $scc + %t172:sgpr_32 = S_NOT_B32 86, implicit-def $scc + %t174:sgpr_32 = S_NOT_B32 87, implicit-def $scc + %t176:sgpr_32 = S_NOT_B32 88, implicit-def $scc + %t178:sgpr_32 = S_NOT_B32 89, implicit-def $scc + %t180:sgpr_32 = S_NOT_B32 90, implicit-def $scc + %t182:sgpr_32 = S_NOT_B32 91, implicit-def $scc + %t184:sgpr_32 = S_NOT_B32 92, implicit-def $scc + %t186:sgpr_32 = S_NOT_B32 93, implicit-def $scc + %t188:sgpr_32 = S_NOT_B32 94, implicit-def $scc + %t190:sgpr_32 = S_NOT_B32 95, implicit-def $scc + %t192:sgpr_32 = S_NOT_B32 96, implicit-def $scc + %t194:sgpr_32 = S_NOT_B32 97, implicit-def $scc + %t196:sgpr_32 = S_NOT_B32 98, implicit-def $scc + %t198:sgpr_32 = S_NOT_B32 99, implicit-def $scc + %t200:sgpr_32 = S_NOT_B32 100, implicit-def $scc + %t202:sgpr_32 = S_NOT_B32 101, implicit-def $scc + %t204:sgpr_32 = S_NOT_B32 102, implicit-def $scc + %t206:sgpr_32 = S_NOT_B32 103, implicit-def $scc + %t208:sgpr_32 = S_NOT_B32 104, implicit-def $scc + %t210:sgpr_32 = S_NOT_B32 105, implicit-def $scc + %t212:sgpr_32 = S_NOT_B32 106, implicit-def $scc + %t214:sgpr_32 = S_NOT_B32 107, implicit-def $scc + %t216:sgpr_32 = S_NOT_B32 108, implicit-def $scc + %t218:sgpr_32 = S_NOT_B32 109, implicit-def $scc + %t220:sgpr_32 = S_NOT_B32 110, implicit-def $scc + %t222:sgpr_32 = S_NOT_B32 111, implicit-def $scc + %t224:sgpr_32 = S_NOT_B32 112, implicit-def $scc + %t226:sgpr_32 = S_NOT_B32 113, implicit-def $scc + %t228:sgpr_32 = S_NOT_B32 114, implicit-def $scc + %t230:sgpr_32 = S_NOT_B32 115, implicit-def $scc + %t232:sgpr_32 = S_NOT_B32 116, implicit-def $scc + %t234:sgpr_32 = S_NOT_B32 117, implicit-def $scc + %t236:sgpr_32 = S_NOT_B32 118, implicit-def $scc + %t238:sgpr_32 = S_NOT_B32 119, implicit-def $scc + %t240:sgpr_32 = S_NOT_B32 120, implicit-def $scc + %t242:sgpr_32 = S_NOT_B32 121, implicit-def $scc 
+ %t244:sgpr_32 = S_NOT_B32 122, implicit-def $scc + %t246:sgpr_32 = S_NOT_B32 123, implicit-def $scc + %t248:sgpr_32 = S_NOT_B32 124, implicit-def $scc + %t250:sgpr_32 = S_NOT_B32 125, implicit-def $scc + %t252:sgpr_32 = S_NOT_B32 126, implicit-def $scc + %t254:sgpr_32 = S_NOT_B32 127, implicit-def $scc + + + ; Def scc + $scc = IMPLICIT_DEF + + ; Branch + %cmp:sreg_32_xm0 = V_CMP_GT_F32_e64 0, 0, 0, %v0.sub0, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %cmp:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + liveins: $scc + successors: %bb.2 + S_BRANCH %bb.2 + + bb.2: + liveins: $scc + ; Uses + KILL %t0 + KILL %t2 + KILL %t4 + KILL %t6 + KILL %t8 + KILL %t10 + KILL %t12 + KILL %t14 + KILL %t16 + KILL %t18 + KILL %t20 + KILL %t22 + KILL %t24 + KILL %t26 + KILL %t28 + KILL %t30 + KILL %t32 + KILL %t34 + KILL %t36 + KILL %t38 + KILL %t40 + KILL %t42 + KILL %t44 + KILL %t46 + KILL %t48 + KILL %t50 + KILL %t52 + KILL %t54 + KILL %t56 + KILL %t58 + KILL %t60 + KILL %t62 + KILL %t64 + KILL %t66 + KILL %t68 + KILL %t70 + KILL %t72 + KILL %t74 + KILL %t76 + KILL %t78 + KILL %t80 + KILL %t82 + KILL %t84 + KILL %t86 + KILL %t88 + KILL %t90 + KILL %t92 + KILL %t94 + KILL %t96 + KILL %t98 + KILL %t100 + KILL %t102 + KILL %t104 + KILL %t106 + KILL %t108 + KILL %t110 + KILL %t112 + KILL %t114 + KILL %t116 + KILL %t118 + KILL %t120 + KILL %t122 + KILL %t124 + KILL %t126 + KILL %t128 + KILL %t130 + KILL %t132 + KILL %t134 + KILL %t136 + KILL %t138 + KILL %t140 + KILL %t142 + KILL %t144 + KILL %t146 + KILL %t148 + KILL %t150 + KILL %t152 + KILL %t154 + KILL %t156 + KILL %t158 + KILL %t160 + KILL %t162 + KILL %t164 + KILL %t166 + KILL %t168 + KILL %t170 + KILL %t172 + KILL %t174 + KILL %t176 + KILL %t178 + KILL %t180 + KILL %t182 + KILL %t184 + KILL %t186 + KILL %t188 + KILL %t190 + KILL %t192 + KILL %t194 + KILL %t196 + KILL %t198 + KILL %t200 + KILL %t202 + KILL %t204 + KILL %t206 + KILL %t208 + KILL %t210 + KILL %t212 + KILL %t214 + KILL %t216 + KILL %t218 + KILL %t220 + KILL %t222 + KILL %t224 + KILL %t226 + KILL %t228 + KILL %t230 + KILL %t232 + KILL %t234 + KILL %t236 + KILL %t238 + KILL %t240 + KILL %t242 + KILL %t244 + KILL %t246 + KILL %t248 + KILL %t250 + KILL %t252 + KILL %t254 + + KILL $scc + + ; Some uses to inflate vgpr count + KILL %v0 + KILL %v1 + KILL %v2 + KILL %v3 + S_ENDPGM 0 +... 
+ \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_no_scc.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_no_scc.mir new file mode 100644 index 0000000000000..39d21dbda3819 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_no_scc.mir @@ -0,0 +1,564 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-hot-block-remat-aggressive-sgpr | FileCheck %s + +# This test checks that scalar instructions that define $scc are not sunk into ranges where $scc is live +# CHECK: bb.0: +# CHECK-NOT: S_NOT_B32 +# CHECK: bb.2: +# CHECK: %t0:sgpr_32 = S_NOT_B32 0 +# CHECK: %t2:sgpr_32 = S_NOT_B32 1 +# CHECK: %t4:sgpr_32 = S_NOT_B32 2 +# CHECK: %t6:sgpr_32 = S_NOT_B32 3 +# CHECK: %t8:sgpr_32 = S_NOT_B32 4 +# CHECK: %t10:sgpr_32 = S_NOT_B32 5 +# CHECK: %t12:sgpr_32 = S_NOT_B32 6 +# CHECK: %t14:sgpr_32 = S_NOT_B32 7 +# CHECK: %t16:sgpr_32 = S_NOT_B32 8 +# CHECK: %t18:sgpr_32 = S_NOT_B32 9 +# CHECK: %t20:sgpr_32 = S_NOT_B32 10 +# CHECK: %t22:sgpr_32 = S_NOT_B32 11 +# CHECK: %t24:sgpr_32 = S_NOT_B32 12 +# CHECK: %t26:sgpr_32 = S_NOT_B32 13 +# CHECK: %t28:sgpr_32 = S_NOT_B32 14 +# CHECK: %t30:sgpr_32 = S_NOT_B32 15 +# CHECK: %t32:sgpr_32 = S_NOT_B32 16 +# CHECK: %t34:sgpr_32 = S_NOT_B32 17 +# CHECK: %t36:sgpr_32 = S_NOT_B32 18 +# CHECK: %t38:sgpr_32 = S_NOT_B32 19 +# CHECK: %t40:sgpr_32 = S_NOT_B32 20 +# CHECK: %t42:sgpr_32 = S_NOT_B32 21 +# CHECK: %t44:sgpr_32 = S_NOT_B32 22 +# CHECK: %t46:sgpr_32 = S_NOT_B32 23 +# CHECK: %t48:sgpr_32 = S_NOT_B32 24 +# CHECK: %t50:sgpr_32 = S_NOT_B32 25 +# CHECK: %t52:sgpr_32 = S_NOT_B32 26 +# CHECK: %t54:sgpr_32 = S_NOT_B32 27 +# CHECK: %t56:sgpr_32 = S_NOT_B32 28 +# CHECK: %t58:sgpr_32 = S_NOT_B32 29 +# CHECK: %t60:sgpr_32 = S_NOT_B32 30 +# CHECK: %t62:sgpr_32 = S_NOT_B32 31 +# CHECK: %t64:sgpr_32 = S_NOT_B32 32 +# CHECK: %t66:sgpr_32 = S_NOT_B32 33 +# CHECK: %t68:sgpr_32 = S_NOT_B32 34 +# CHECK: %t70:sgpr_32 = S_NOT_B32 35 +# CHECK: %t72:sgpr_32 = S_NOT_B32 36 +# CHECK: %t74:sgpr_32 = S_NOT_B32 37 +# CHECK: %t76:sgpr_32 = S_NOT_B32 38 +# CHECK: %t78:sgpr_32 = S_NOT_B32 39 +# CHECK: %t80:sgpr_32 = S_NOT_B32 40 +# CHECK: %t82:sgpr_32 = S_NOT_B32 41 +# CHECK: %t84:sgpr_32 = S_NOT_B32 42 +# CHECK: %t86:sgpr_32 = S_NOT_B32 43 +# CHECK: %t88:sgpr_32 = S_NOT_B32 44 +# CHECK: %t90:sgpr_32 = S_NOT_B32 45 +# CHECK: %t92:sgpr_32 = S_NOT_B32 46 +# CHECK: %t94:sgpr_32 = S_NOT_B32 47 +# CHECK: %t96:sgpr_32 = S_NOT_B32 48 +# CHECK: %t98:sgpr_32 = S_NOT_B32 49 +# CHECK: %t100:sgpr_32 = S_NOT_B32 50 +# CHECK: %t102:sgpr_32 = S_NOT_B32 51 +# CHECK: %t104:sgpr_32 = S_NOT_B32 52 +# CHECK: %t106:sgpr_32 = S_NOT_B32 53 +# CHECK: %t108:sgpr_32 = S_NOT_B32 54 +# CHECK: %t110:sgpr_32 = S_NOT_B32 55 +# CHECK: %t112:sgpr_32 = S_NOT_B32 56 +# CHECK: %t114:sgpr_32 = S_NOT_B32 57 +# CHECK: %t116:sgpr_32 = S_NOT_B32 58 +# CHECK: %t118:sgpr_32 = S_NOT_B32 59 +# CHECK: %t120:sgpr_32 = S_NOT_B32 60 +# CHECK: %t122:sgpr_32 = S_NOT_B32 61 +# CHECK: %t124:sgpr_32 = S_NOT_B32 62 +# CHECK: %t126:sgpr_32 = S_NOT_B32 63 +# CHECK: %t128:sgpr_32 = S_NOT_B32 64 +# CHECK: %t130:sgpr_32 = S_NOT_B32 65 +# CHECK: %t132:sgpr_32 = S_NOT_B32 66 +# CHECK: %t134:sgpr_32 = S_NOT_B32 67 +# CHECK: %t136:sgpr_32 = S_NOT_B32 68 +# CHECK: %t138:sgpr_32 = S_NOT_B32 69 +# CHECK: %t140:sgpr_32 = S_NOT_B32 70 +# CHECK: %t142:sgpr_32 = S_NOT_B32 71 +# CHECK: %t144:sgpr_32 = S_NOT_B32 72 +# CHECK: %t146:sgpr_32 = S_NOT_B32 73 +# CHECK: %t148:sgpr_32 = S_NOT_B32 74 +# CHECK: %t150:sgpr_32 = 
S_NOT_B32 75 +# CHECK: %t152:sgpr_32 = S_NOT_B32 76 +# CHECK: %t154:sgpr_32 = S_NOT_B32 77 +# CHECK: %t156:sgpr_32 = S_NOT_B32 78 +# CHECK: %t158:sgpr_32 = S_NOT_B32 79 +# CHECK: %t160:sgpr_32 = S_NOT_B32 80 +# CHECK: %t162:sgpr_32 = S_NOT_B32 81 +# CHECK: %t164:sgpr_32 = S_NOT_B32 82 +# CHECK: %t166:sgpr_32 = S_NOT_B32 83 +# CHECK: %t168:sgpr_32 = S_NOT_B32 84 +# CHECK: %t170:sgpr_32 = S_NOT_B32 85 +# CHECK: %t172:sgpr_32 = S_NOT_B32 86 +# CHECK: %t174:sgpr_32 = S_NOT_B32 87 +# CHECK: %t176:sgpr_32 = S_NOT_B32 88 +# CHECK: %t178:sgpr_32 = S_NOT_B32 89 +# CHECK: %t180:sgpr_32 = S_NOT_B32 90 +# CHECK: %t182:sgpr_32 = S_NOT_B32 91 +# CHECK: %t184:sgpr_32 = S_NOT_B32 92 +# CHECK: %t186:sgpr_32 = S_NOT_B32 93 +# CHECK: %t188:sgpr_32 = S_NOT_B32 94 +# CHECK: %t190:sgpr_32 = S_NOT_B32 95 +# CHECK: %t192:sgpr_32 = S_NOT_B32 96 +# CHECK: %t194:sgpr_32 = S_NOT_B32 97 +# CHECK: %t196:sgpr_32 = S_NOT_B32 98 +# CHECK: %t198:sgpr_32 = S_NOT_B32 99 +# CHECK: %t200:sgpr_32 = S_NOT_B32 100 +# CHECK: %t202:sgpr_32 = S_NOT_B32 101 +# CHECK: %t204:sgpr_32 = S_NOT_B32 102 +# CHECK: %t206:sgpr_32 = S_NOT_B32 103 +# CHECK: %t208:sgpr_32 = S_NOT_B32 104 +# CHECK: %t210:sgpr_32 = S_NOT_B32 105 +# CHECK: %t212:sgpr_32 = S_NOT_B32 106 +# CHECK: %t214:sgpr_32 = S_NOT_B32 107 +# CHECK: %t216:sgpr_32 = S_NOT_B32 108 +# CHECK: %t218:sgpr_32 = S_NOT_B32 109 +# CHECK: %t220:sgpr_32 = S_NOT_B32 110 +# CHECK: %t222:sgpr_32 = S_NOT_B32 111 +# CHECK: %t224:sgpr_32 = S_NOT_B32 112 +# CHECK: %t226:sgpr_32 = S_NOT_B32 113 +# CHECK: %t228:sgpr_32 = S_NOT_B32 114 +# CHECK: %t230:sgpr_32 = S_NOT_B32 115 +# CHECK: %t232:sgpr_32 = S_NOT_B32 116 +# CHECK: %t234:sgpr_32 = S_NOT_B32 117 +# CHECK: %t236:sgpr_32 = S_NOT_B32 118 +# CHECK: %t238:sgpr_32 = S_NOT_B32 119 +# CHECK: %t240:sgpr_32 = S_NOT_B32 120 +# CHECK: %t242:sgpr_32 = S_NOT_B32 121 +# CHECK: %t244:sgpr_32 = S_NOT_B32 122 +# CHECK: %t246:sgpr_32 = S_NOT_B32 123 +# CHECK: %t248:sgpr_32 = S_NOT_B32 124 +# CHECK: %t250:sgpr_32 = S_NOT_B32 125 +# CHECK: %t252:sgpr_32 = S_NOT_B32 126 +# CHECK: %t254:sgpr_32 = S_NOT_B32 127 +# CHECK: KILL %t0 +# CHECK: KILL %t2 +# CHECK: KILL %t4 +# CHECK: KILL %t6 +# CHECK: KILL %t8 +# CHECK: KILL %t10 +# CHECK: KILL %t12 +# CHECK: KILL %t14 +# CHECK: KILL %t16 +# CHECK: KILL %t18 +# CHECK: KILL %t20 +# CHECK: KILL %t22 +# CHECK: KILL %t24 +# CHECK: KILL %t26 +# CHECK: KILL %t28 +# CHECK: KILL %t30 +# CHECK: KILL %t32 +# CHECK: KILL %t34 +# CHECK: KILL %t36 +# CHECK: KILL %t38 +# CHECK: KILL %t40 +# CHECK: KILL %t42 +# CHECK: KILL %t44 +# CHECK: KILL %t46 +# CHECK: KILL %t48 +# CHECK: KILL %t50 +# CHECK: KILL %t52 +# CHECK: KILL %t54 +# CHECK: KILL %t56 +# CHECK: KILL %t58 +# CHECK: KILL %t60 +# CHECK: KILL %t62 +# CHECK: KILL %t64 +# CHECK: KILL %t66 +# CHECK: KILL %t68 +# CHECK: KILL %t70 +# CHECK: KILL %t72 +# CHECK: KILL %t74 +# CHECK: KILL %t76 +# CHECK: KILL %t78 +# CHECK: KILL %t80 +# CHECK: KILL %t82 +# CHECK: KILL %t84 +# CHECK: KILL %t86 +# CHECK: KILL %t88 +# CHECK: KILL %t90 +# CHECK: KILL %t92 +# CHECK: KILL %t94 +# CHECK: KILL %t96 +# CHECK: KILL %t98 +# CHECK: KILL %t100 +# CHECK: KILL %t102 +# CHECK: KILL %t104 +# CHECK: KILL %t106 +# CHECK: KILL %t108 +# CHECK: KILL %t110 +# CHECK: KILL %t112 +# CHECK: KILL %t114 +# CHECK: KILL %t116 +# CHECK: KILL %t118 +# CHECK: KILL %t120 +# CHECK: KILL %t122 +# CHECK: KILL %t124 +# CHECK: KILL %t126 +# CHECK: KILL %t128 +# CHECK: KILL %t130 +# CHECK: KILL %t132 +# CHECK: KILL %t134 +# CHECK: KILL %t136 +# CHECK: KILL %t138 +# CHECK: KILL %t140 +# CHECK: KILL %t142 +# CHECK: KILL %t144 +# 
CHECK: KILL %t146 +# CHECK: KILL %t148 +# CHECK: KILL %t150 +# CHECK: KILL %t152 +# CHECK: KILL %t154 +# CHECK: KILL %t156 +# CHECK: KILL %t158 +# CHECK: KILL %t160 +# CHECK: KILL %t162 +# CHECK: KILL %t164 +# CHECK: KILL %t166 +# CHECK: KILL %t168 +# CHECK: KILL %t170 +# CHECK: KILL %t172 +# CHECK: KILL %t174 +# CHECK: KILL %t176 +# CHECK: KILL %t178 +# CHECK: KILL %t180 +# CHECK: KILL %t182 +# CHECK: KILL %t184 +# CHECK: KILL %t186 +# CHECK: KILL %t188 +# CHECK: KILL %t190 +# CHECK: KILL %t192 +# CHECK: KILL %t194 +# CHECK: KILL %t196 +# CHECK: KILL %t198 +# CHECK: KILL %t200 +# CHECK: KILL %t202 +# CHECK: KILL %t204 +# CHECK: KILL %t206 +# CHECK: KILL %t208 +# CHECK: KILL %t210 +# CHECK: KILL %t212 +# CHECK: KILL %t214 +# CHECK: KILL %t216 +# CHECK: KILL %t218 +# CHECK: KILL %t220 +# CHECK: KILL %t222 +# CHECK: KILL %t224 +# CHECK: KILL %t226 +# CHECK: KILL %t228 +# CHECK: KILL %t230 +# CHECK: KILL %t232 +# CHECK: KILL %t234 +# CHECK: KILL %t236 +# CHECK: KILL %t238 +# CHECK: KILL %t240 +# CHECK: KILL %t242 +# CHECK: KILL %t244 +# CHECK: KILL %t246 +# CHECK: KILL %t248 +# CHECK: KILL %t250 +# CHECK: KILL %t252 +# CHECK: KILL %t254 + + +--- | + define amdgpu_ps void @main() { + ret void + } +... +--- +name: main +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + ; To inflate vgpr count + %v0:vreg_1024 = IMPLICIT_DEF + %v1:vreg_1024 = IMPLICIT_DEF + %v2:vreg_1024 = IMPLICIT_DEF + %v3:vreg_1024 = IMPLICIT_DEF + + ; Defs + %t0:sgpr_32 = S_NOT_B32 0, implicit-def $scc + %t2:sgpr_32 = S_NOT_B32 1, implicit-def $scc + %t4:sgpr_32 = S_NOT_B32 2, implicit-def $scc + %t6:sgpr_32 = S_NOT_B32 3, implicit-def $scc + %t8:sgpr_32 = S_NOT_B32 4, implicit-def $scc + %t10:sgpr_32 = S_NOT_B32 5, implicit-def $scc + %t12:sgpr_32 = S_NOT_B32 6, implicit-def $scc + %t14:sgpr_32 = S_NOT_B32 7, implicit-def $scc + %t16:sgpr_32 = S_NOT_B32 8, implicit-def $scc + %t18:sgpr_32 = S_NOT_B32 9, implicit-def $scc + %t20:sgpr_32 = S_NOT_B32 10, implicit-def $scc + %t22:sgpr_32 = S_NOT_B32 11, implicit-def $scc + %t24:sgpr_32 = S_NOT_B32 12, implicit-def $scc + %t26:sgpr_32 = S_NOT_B32 13, implicit-def $scc + %t28:sgpr_32 = S_NOT_B32 14, implicit-def $scc + %t30:sgpr_32 = S_NOT_B32 15, implicit-def $scc + %t32:sgpr_32 = S_NOT_B32 16, implicit-def $scc + %t34:sgpr_32 = S_NOT_B32 17, implicit-def $scc + %t36:sgpr_32 = S_NOT_B32 18, implicit-def $scc + %t38:sgpr_32 = S_NOT_B32 19, implicit-def $scc + %t40:sgpr_32 = S_NOT_B32 20, implicit-def $scc + %t42:sgpr_32 = S_NOT_B32 21, implicit-def $scc + %t44:sgpr_32 = S_NOT_B32 22, implicit-def $scc + %t46:sgpr_32 = S_NOT_B32 23, implicit-def $scc + %t48:sgpr_32 = S_NOT_B32 24, implicit-def $scc + %t50:sgpr_32 = S_NOT_B32 25, implicit-def $scc + %t52:sgpr_32 = S_NOT_B32 26, implicit-def $scc + %t54:sgpr_32 = S_NOT_B32 27, implicit-def $scc + %t56:sgpr_32 = S_NOT_B32 28, implicit-def $scc + %t58:sgpr_32 = S_NOT_B32 29, implicit-def $scc + %t60:sgpr_32 = S_NOT_B32 30, implicit-def $scc + %t62:sgpr_32 = S_NOT_B32 31, implicit-def $scc + %t64:sgpr_32 = S_NOT_B32 32, implicit-def $scc + %t66:sgpr_32 = S_NOT_B32 33, implicit-def $scc + %t68:sgpr_32 = S_NOT_B32 34, implicit-def $scc + %t70:sgpr_32 = S_NOT_B32 35, implicit-def $scc + %t72:sgpr_32 = S_NOT_B32 36, implicit-def $scc + %t74:sgpr_32 = S_NOT_B32 37, implicit-def $scc + %t76:sgpr_32 = S_NOT_B32 38, implicit-def $scc + %t78:sgpr_32 = S_NOT_B32 39, implicit-def $scc + %t80:sgpr_32 = S_NOT_B32 40, implicit-def $scc + %t82:sgpr_32 = S_NOT_B32 41, implicit-def $scc + %t84:sgpr_32 = S_NOT_B32 42, implicit-def 
$scc + %t86:sgpr_32 = S_NOT_B32 43, implicit-def $scc + %t88:sgpr_32 = S_NOT_B32 44, implicit-def $scc + %t90:sgpr_32 = S_NOT_B32 45, implicit-def $scc + %t92:sgpr_32 = S_NOT_B32 46, implicit-def $scc + %t94:sgpr_32 = S_NOT_B32 47, implicit-def $scc + %t96:sgpr_32 = S_NOT_B32 48, implicit-def $scc + %t98:sgpr_32 = S_NOT_B32 49, implicit-def $scc + %t100:sgpr_32 = S_NOT_B32 50, implicit-def $scc + %t102:sgpr_32 = S_NOT_B32 51, implicit-def $scc + %t104:sgpr_32 = S_NOT_B32 52, implicit-def $scc + %t106:sgpr_32 = S_NOT_B32 53, implicit-def $scc + %t108:sgpr_32 = S_NOT_B32 54, implicit-def $scc + %t110:sgpr_32 = S_NOT_B32 55, implicit-def $scc + %t112:sgpr_32 = S_NOT_B32 56, implicit-def $scc + %t114:sgpr_32 = S_NOT_B32 57, implicit-def $scc + %t116:sgpr_32 = S_NOT_B32 58, implicit-def $scc + %t118:sgpr_32 = S_NOT_B32 59, implicit-def $scc + %t120:sgpr_32 = S_NOT_B32 60, implicit-def $scc + %t122:sgpr_32 = S_NOT_B32 61, implicit-def $scc + %t124:sgpr_32 = S_NOT_B32 62, implicit-def $scc + %t126:sgpr_32 = S_NOT_B32 63, implicit-def $scc + %t128:sgpr_32 = S_NOT_B32 64, implicit-def $scc + %t130:sgpr_32 = S_NOT_B32 65, implicit-def $scc + %t132:sgpr_32 = S_NOT_B32 66, implicit-def $scc + %t134:sgpr_32 = S_NOT_B32 67, implicit-def $scc + %t136:sgpr_32 = S_NOT_B32 68, implicit-def $scc + %t138:sgpr_32 = S_NOT_B32 69, implicit-def $scc + %t140:sgpr_32 = S_NOT_B32 70, implicit-def $scc + %t142:sgpr_32 = S_NOT_B32 71, implicit-def $scc + %t144:sgpr_32 = S_NOT_B32 72, implicit-def $scc + %t146:sgpr_32 = S_NOT_B32 73, implicit-def $scc + %t148:sgpr_32 = S_NOT_B32 74, implicit-def $scc + %t150:sgpr_32 = S_NOT_B32 75, implicit-def $scc + %t152:sgpr_32 = S_NOT_B32 76, implicit-def $scc + %t154:sgpr_32 = S_NOT_B32 77, implicit-def $scc + %t156:sgpr_32 = S_NOT_B32 78, implicit-def $scc + %t158:sgpr_32 = S_NOT_B32 79, implicit-def $scc + %t160:sgpr_32 = S_NOT_B32 80, implicit-def $scc + %t162:sgpr_32 = S_NOT_B32 81, implicit-def $scc + %t164:sgpr_32 = S_NOT_B32 82, implicit-def $scc + %t166:sgpr_32 = S_NOT_B32 83, implicit-def $scc + %t168:sgpr_32 = S_NOT_B32 84, implicit-def $scc + %t170:sgpr_32 = S_NOT_B32 85, implicit-def $scc + %t172:sgpr_32 = S_NOT_B32 86, implicit-def $scc + %t174:sgpr_32 = S_NOT_B32 87, implicit-def $scc + %t176:sgpr_32 = S_NOT_B32 88, implicit-def $scc + %t178:sgpr_32 = S_NOT_B32 89, implicit-def $scc + %t180:sgpr_32 = S_NOT_B32 90, implicit-def $scc + %t182:sgpr_32 = S_NOT_B32 91, implicit-def $scc + %t184:sgpr_32 = S_NOT_B32 92, implicit-def $scc + %t186:sgpr_32 = S_NOT_B32 93, implicit-def $scc + %t188:sgpr_32 = S_NOT_B32 94, implicit-def $scc + %t190:sgpr_32 = S_NOT_B32 95, implicit-def $scc + %t192:sgpr_32 = S_NOT_B32 96, implicit-def $scc + %t194:sgpr_32 = S_NOT_B32 97, implicit-def $scc + %t196:sgpr_32 = S_NOT_B32 98, implicit-def $scc + %t198:sgpr_32 = S_NOT_B32 99, implicit-def $scc + %t200:sgpr_32 = S_NOT_B32 100, implicit-def $scc + %t202:sgpr_32 = S_NOT_B32 101, implicit-def $scc + %t204:sgpr_32 = S_NOT_B32 102, implicit-def $scc + %t206:sgpr_32 = S_NOT_B32 103, implicit-def $scc + %t208:sgpr_32 = S_NOT_B32 104, implicit-def $scc + %t210:sgpr_32 = S_NOT_B32 105, implicit-def $scc + %t212:sgpr_32 = S_NOT_B32 106, implicit-def $scc + %t214:sgpr_32 = S_NOT_B32 107, implicit-def $scc + %t216:sgpr_32 = S_NOT_B32 108, implicit-def $scc + %t218:sgpr_32 = S_NOT_B32 109, implicit-def $scc + %t220:sgpr_32 = S_NOT_B32 110, implicit-def $scc + %t222:sgpr_32 = S_NOT_B32 111, implicit-def $scc + %t224:sgpr_32 = S_NOT_B32 112, implicit-def $scc + %t226:sgpr_32 = S_NOT_B32 113, 
implicit-def $scc + %t228:sgpr_32 = S_NOT_B32 114, implicit-def $scc + %t230:sgpr_32 = S_NOT_B32 115, implicit-def $scc + %t232:sgpr_32 = S_NOT_B32 116, implicit-def $scc + %t234:sgpr_32 = S_NOT_B32 117, implicit-def $scc + %t236:sgpr_32 = S_NOT_B32 118, implicit-def $scc + %t238:sgpr_32 = S_NOT_B32 119, implicit-def $scc + %t240:sgpr_32 = S_NOT_B32 120, implicit-def $scc + %t242:sgpr_32 = S_NOT_B32 121, implicit-def $scc + %t244:sgpr_32 = S_NOT_B32 122, implicit-def $scc + %t246:sgpr_32 = S_NOT_B32 123, implicit-def $scc + %t248:sgpr_32 = S_NOT_B32 124, implicit-def $scc + %t250:sgpr_32 = S_NOT_B32 125, implicit-def $scc + %t252:sgpr_32 = S_NOT_B32 126, implicit-def $scc + %t254:sgpr_32 = S_NOT_B32 127, implicit-def $scc + + + ; Branch + %cmp:sreg_32_xm0 = V_CMP_GT_F32_e64 0, 0, 0, %v0.sub0, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %cmp:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + S_BRANCH %bb.2 + + bb.2: + $scc = IMPLICIT_DEF + ; Uses + KILL %t0 + KILL %t2 + KILL %t4 + KILL %t6 + KILL %t8 + KILL %t10 + KILL %t12 + KILL %t14 + KILL %t16 + KILL %t18 + KILL %t20 + KILL %t22 + KILL %t24 + KILL %t26 + KILL %t28 + KILL %t30 + KILL %t32 + KILL %t34 + KILL %t36 + KILL %t38 + KILL %t40 + KILL %t42 + KILL %t44 + KILL %t46 + KILL %t48 + KILL %t50 + KILL %t52 + KILL %t54 + KILL %t56 + KILL %t58 + KILL %t60 + KILL %t62 + KILL %t64 + KILL %t66 + KILL %t68 + KILL %t70 + KILL %t72 + KILL %t74 + KILL %t76 + KILL %t78 + KILL %t80 + KILL %t82 + KILL %t84 + KILL %t86 + KILL %t88 + KILL %t90 + KILL %t92 + KILL %t94 + KILL %t96 + KILL %t98 + KILL %t100 + KILL %t102 + KILL %t104 + KILL %t106 + KILL %t108 + KILL %t110 + KILL %t112 + KILL %t114 + KILL %t116 + KILL %t118 + KILL %t120 + KILL %t122 + KILL %t124 + KILL %t126 + KILL %t128 + KILL %t130 + KILL %t132 + KILL %t134 + KILL %t136 + KILL %t138 + KILL %t140 + KILL %t142 + KILL %t144 + KILL %t146 + KILL %t148 + KILL %t150 + KILL %t152 + KILL %t154 + KILL %t156 + KILL %t158 + KILL %t160 + KILL %t162 + KILL %t164 + KILL %t166 + KILL %t168 + KILL %t170 + KILL %t172 + KILL %t174 + KILL %t176 + KILL %t178 + KILL %t180 + KILL %t182 + KILL %t184 + KILL %t186 + KILL %t188 + KILL %t190 + KILL %t192 + KILL %t194 + KILL %t196 + KILL %t198 + KILL %t200 + KILL %t202 + KILL %t204 + KILL %t206 + KILL %t208 + KILL %t210 + KILL %t212 + KILL %t214 + KILL %t216 + KILL %t218 + KILL %t220 + KILL %t222 + KILL %t224 + KILL %t226 + KILL %t228 + KILL %t230 + KILL %t232 + KILL %t234 + KILL %t236 + KILL %t238 + KILL %t240 + KILL %t242 + KILL %t244 + KILL %t246 + KILL %t248 + KILL %t250 + KILL %t252 + KILL %t254 + + KILL $scc + + ; Some uses to inflate vgpr count + KILL %v0 + KILL %v1 + KILL %v2 + KILL %v3 + S_ENDPGM 0 +... + \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_phi.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_phi.mir new file mode 100644 index 0000000000000..305bf87a6120e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_phi.mir @@ -0,0 +1,304 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-hot-block-remat-aggressive-sgpr | FileCheck %s + +# This test simply checks that GCNDownwardRPTracker does not crash when PHIs are present +# CHECK: S_ENDPGM + + +--- | + define amdgpu_ps void @main() { + ret void + } +... 
+--- +name: main +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + ; To inflate vgpr count + %v0:vreg_1024 = IMPLICIT_DEF + %v1:vreg_1024 = IMPLICIT_DEF + %v2:vreg_1024 = IMPLICIT_DEF + %v3:vreg_1024 = IMPLICIT_DEF + + ; Defs + %t0:sgpr_32 = S_NOT_B32 0, implicit-def $scc + %t2:sgpr_32 = S_NOT_B32 1, implicit-def $scc + %t4:sgpr_32 = S_NOT_B32 2, implicit-def $scc + %t6:sgpr_32 = S_NOT_B32 3, implicit-def $scc + %t8:sgpr_32 = S_NOT_B32 4, implicit-def $scc + %t10:sgpr_32 = S_NOT_B32 5, implicit-def $scc + %t12:sgpr_32 = S_NOT_B32 6, implicit-def $scc + %t14:sgpr_32 = S_NOT_B32 7, implicit-def $scc + %t16:sgpr_32 = S_NOT_B32 8, implicit-def $scc + %t18:sgpr_32 = S_NOT_B32 9, implicit-def $scc + %t20:sgpr_32 = S_NOT_B32 10, implicit-def $scc + %t22:sgpr_32 = S_NOT_B32 11, implicit-def $scc + %t24:sgpr_32 = S_NOT_B32 12, implicit-def $scc + %t26:sgpr_32 = S_NOT_B32 13, implicit-def $scc + %t28:sgpr_32 = S_NOT_B32 14, implicit-def $scc + %t30:sgpr_32 = S_NOT_B32 15, implicit-def $scc + %t32:sgpr_32 = S_NOT_B32 16, implicit-def $scc + %t34:sgpr_32 = S_NOT_B32 17, implicit-def $scc + %t36:sgpr_32 = S_NOT_B32 18, implicit-def $scc + %t38:sgpr_32 = S_NOT_B32 19, implicit-def $scc + %t40:sgpr_32 = S_NOT_B32 20, implicit-def $scc + %t42:sgpr_32 = S_NOT_B32 21, implicit-def $scc + %t44:sgpr_32 = S_NOT_B32 22, implicit-def $scc + %t46:sgpr_32 = S_NOT_B32 23, implicit-def $scc + %t48:sgpr_32 = S_NOT_B32 24, implicit-def $scc + %t50:sgpr_32 = S_NOT_B32 25, implicit-def $scc + %t52:sgpr_32 = S_NOT_B32 26, implicit-def $scc + %t54:sgpr_32 = S_NOT_B32 27, implicit-def $scc + %t56:sgpr_32 = S_NOT_B32 28, implicit-def $scc + %t58:sgpr_32 = S_NOT_B32 29, implicit-def $scc + %t60:sgpr_32 = S_NOT_B32 30, implicit-def $scc + %t62:sgpr_32 = S_NOT_B32 31, implicit-def $scc + %t64:sgpr_32 = S_NOT_B32 32, implicit-def $scc + %t66:sgpr_32 = S_NOT_B32 33, implicit-def $scc + %t68:sgpr_32 = S_NOT_B32 34, implicit-def $scc + %t70:sgpr_32 = S_NOT_B32 35, implicit-def $scc + %t72:sgpr_32 = S_NOT_B32 36, implicit-def $scc + %t74:sgpr_32 = S_NOT_B32 37, implicit-def $scc + %t76:sgpr_32 = S_NOT_B32 38, implicit-def $scc + %t78:sgpr_32 = S_NOT_B32 39, implicit-def $scc + %t80:sgpr_32 = S_NOT_B32 40, implicit-def $scc + %t82:sgpr_32 = S_NOT_B32 41, implicit-def $scc + %t84:sgpr_32 = S_NOT_B32 42, implicit-def $scc + %t86:sgpr_32 = S_NOT_B32 43, implicit-def $scc + %t88:sgpr_32 = S_NOT_B32 44, implicit-def $scc + %t90:sgpr_32 = S_NOT_B32 45, implicit-def $scc + %t92:sgpr_32 = S_NOT_B32 46, implicit-def $scc + %t94:sgpr_32 = S_NOT_B32 47, implicit-def $scc + %t96:sgpr_32 = S_NOT_B32 48, implicit-def $scc + %t98:sgpr_32 = S_NOT_B32 49, implicit-def $scc + %t100:sgpr_32 = S_NOT_B32 50, implicit-def $scc + %t102:sgpr_32 = S_NOT_B32 51, implicit-def $scc + %t104:sgpr_32 = S_NOT_B32 52, implicit-def $scc + %t106:sgpr_32 = S_NOT_B32 53, implicit-def $scc + %t108:sgpr_32 = S_NOT_B32 54, implicit-def $scc + %t110:sgpr_32 = S_NOT_B32 55, implicit-def $scc + %t112:sgpr_32 = S_NOT_B32 56, implicit-def $scc + %t114:sgpr_32 = S_NOT_B32 57, implicit-def $scc + %t116:sgpr_32 = S_NOT_B32 58, implicit-def $scc + %t118:sgpr_32 = S_NOT_B32 59, implicit-def $scc + %t120:sgpr_32 = S_NOT_B32 60, implicit-def $scc + %t122:sgpr_32 = S_NOT_B32 61, implicit-def $scc + %t124:sgpr_32 = S_NOT_B32 62, implicit-def $scc + %t126:sgpr_32 = S_NOT_B32 63, implicit-def $scc + %t128:sgpr_32 = S_NOT_B32 64, implicit-def $scc + %t130:sgpr_32 = S_NOT_B32 65, implicit-def $scc + %t132:sgpr_32 = S_NOT_B32 66, implicit-def $scc + %t134:sgpr_32 = 
S_NOT_B32 67, implicit-def $scc + %t136:sgpr_32 = S_NOT_B32 68, implicit-def $scc + %t138:sgpr_32 = S_NOT_B32 69, implicit-def $scc + %t140:sgpr_32 = S_NOT_B32 70, implicit-def $scc + %t142:sgpr_32 = S_NOT_B32 71, implicit-def $scc + %t144:sgpr_32 = S_NOT_B32 72, implicit-def $scc + %t146:sgpr_32 = S_NOT_B32 73, implicit-def $scc + %t148:sgpr_32 = S_NOT_B32 74, implicit-def $scc + %t150:sgpr_32 = S_NOT_B32 75, implicit-def $scc + %t152:sgpr_32 = S_NOT_B32 76, implicit-def $scc + %t154:sgpr_32 = S_NOT_B32 77, implicit-def $scc + %t156:sgpr_32 = S_NOT_B32 78, implicit-def $scc + %t158:sgpr_32 = S_NOT_B32 79, implicit-def $scc + %t160:sgpr_32 = S_NOT_B32 80, implicit-def $scc + %t162:sgpr_32 = S_NOT_B32 81, implicit-def $scc + %t164:sgpr_32 = S_NOT_B32 82, implicit-def $scc + %t166:sgpr_32 = S_NOT_B32 83, implicit-def $scc + %t168:sgpr_32 = S_NOT_B32 84, implicit-def $scc + %t170:sgpr_32 = S_NOT_B32 85, implicit-def $scc + %t172:sgpr_32 = S_NOT_B32 86, implicit-def $scc + %t174:sgpr_32 = S_NOT_B32 87, implicit-def $scc + %t176:sgpr_32 = S_NOT_B32 88, implicit-def $scc + %t178:sgpr_32 = S_NOT_B32 89, implicit-def $scc + %t180:sgpr_32 = S_NOT_B32 90, implicit-def $scc + %t182:sgpr_32 = S_NOT_B32 91, implicit-def $scc + %t184:sgpr_32 = S_NOT_B32 92, implicit-def $scc + %t186:sgpr_32 = S_NOT_B32 93, implicit-def $scc + %t188:sgpr_32 = S_NOT_B32 94, implicit-def $scc + %t190:sgpr_32 = S_NOT_B32 95, implicit-def $scc + %t192:sgpr_32 = S_NOT_B32 96, implicit-def $scc + %t194:sgpr_32 = S_NOT_B32 97, implicit-def $scc + %t196:sgpr_32 = S_NOT_B32 98, implicit-def $scc + %t198:sgpr_32 = S_NOT_B32 99, implicit-def $scc + %t200:sgpr_32 = S_NOT_B32 100, implicit-def $scc + %t202:sgpr_32 = S_NOT_B32 101, implicit-def $scc + %t204:sgpr_32 = S_NOT_B32 102, implicit-def $scc + %t206:sgpr_32 = S_NOT_B32 103, implicit-def $scc + %t208:sgpr_32 = S_NOT_B32 104, implicit-def $scc + %t210:sgpr_32 = S_NOT_B32 105, implicit-def $scc + %t212:sgpr_32 = S_NOT_B32 106, implicit-def $scc + %t214:sgpr_32 = S_NOT_B32 107, implicit-def $scc + %t216:sgpr_32 = S_NOT_B32 108, implicit-def $scc + %t218:sgpr_32 = S_NOT_B32 109, implicit-def $scc + %t220:sgpr_32 = S_NOT_B32 110, implicit-def $scc + %t222:sgpr_32 = S_NOT_B32 111, implicit-def $scc + %t224:sgpr_32 = S_NOT_B32 112, implicit-def $scc + %t226:sgpr_32 = S_NOT_B32 113, implicit-def $scc + %t228:sgpr_32 = S_NOT_B32 114, implicit-def $scc + %t230:sgpr_32 = S_NOT_B32 115, implicit-def $scc + %t232:sgpr_32 = S_NOT_B32 116, implicit-def $scc + %t234:sgpr_32 = S_NOT_B32 117, implicit-def $scc + %t236:sgpr_32 = S_NOT_B32 118, implicit-def $scc + %t238:sgpr_32 = S_NOT_B32 119, implicit-def $scc + %t240:sgpr_32 = S_NOT_B32 120, implicit-def $scc + %t242:sgpr_32 = S_NOT_B32 121, implicit-def $scc + %t244:sgpr_32 = S_NOT_B32 122, implicit-def $scc + %t246:sgpr_32 = S_NOT_B32 123, implicit-def $scc + %t248:sgpr_32 = S_NOT_B32 124, implicit-def $scc + %t250:sgpr_32 = S_NOT_B32 125, implicit-def $scc + %t252:sgpr_32 = S_NOT_B32 126, implicit-def $scc + %t254:sgpr_32 = S_NOT_B32 127, implicit-def $scc + + + ; Branch + %cmp:sreg_32_xm0 = V_CMP_GT_F32_e64 0, 0, 0, %v0.sub0, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %cmp:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + %s0:sgpr_32 = IMPLICIT_DEF + S_BRANCH %bb.2 + + bb.2: + %phi0:sgpr_32 = PHI %t0, %bb.0, %s0, %bb.1 + %phi2:sgpr_32 = PHI %t2, %bb.0, %s0, %bb.1 + %phi4:sgpr_32 = PHI %t4, %bb.0, %s0, %bb.1 + %phi6:sgpr_32 = PHI %t6, %bb.0, %s0, %bb.1 + %phi8:sgpr_32 = 
PHI %t8, %bb.0, %s0, %bb.1 + %phi10:sgpr_32 = PHI %t10, %bb.0, %s0, %bb.1 + %phi12:sgpr_32 = PHI %t12, %bb.0, %s0, %bb.1 + %phi14:sgpr_32 = PHI %t14, %bb.0, %s0, %bb.1 + %phi16:sgpr_32 = PHI %t16, %bb.0, %s0, %bb.1 + %phi18:sgpr_32 = PHI %t18, %bb.0, %s0, %bb.1 + %phi20:sgpr_32 = PHI %t20, %bb.0, %s0, %bb.1 + %phi22:sgpr_32 = PHI %t22, %bb.0, %s0, %bb.1 + %phi24:sgpr_32 = PHI %t24, %bb.0, %s0, %bb.1 + %phi26:sgpr_32 = PHI %t26, %bb.0, %s0, %bb.1 + %phi28:sgpr_32 = PHI %t28, %bb.0, %s0, %bb.1 + %phi30:sgpr_32 = PHI %t30, %bb.0, %s0, %bb.1 + %phi32:sgpr_32 = PHI %t32, %bb.0, %s0, %bb.1 + %phi34:sgpr_32 = PHI %t34, %bb.0, %s0, %bb.1 + %phi36:sgpr_32 = PHI %t36, %bb.0, %s0, %bb.1 + %phi38:sgpr_32 = PHI %t38, %bb.0, %s0, %bb.1 + %phi40:sgpr_32 = PHI %t40, %bb.0, %s0, %bb.1 + %phi42:sgpr_32 = PHI %t42, %bb.0, %s0, %bb.1 + %phi44:sgpr_32 = PHI %t44, %bb.0, %s0, %bb.1 + %phi46:sgpr_32 = PHI %t46, %bb.0, %s0, %bb.1 + %phi48:sgpr_32 = PHI %t48, %bb.0, %s0, %bb.1 + %phi50:sgpr_32 = PHI %t50, %bb.0, %s0, %bb.1 + %phi52:sgpr_32 = PHI %t52, %bb.0, %s0, %bb.1 + %phi54:sgpr_32 = PHI %t54, %bb.0, %s0, %bb.1 + %phi56:sgpr_32 = PHI %t56, %bb.0, %s0, %bb.1 + %phi58:sgpr_32 = PHI %t58, %bb.0, %s0, %bb.1 + %phi60:sgpr_32 = PHI %t60, %bb.0, %s0, %bb.1 + %phi62:sgpr_32 = PHI %t62, %bb.0, %s0, %bb.1 + %phi64:sgpr_32 = PHI %t64, %bb.0, %s0, %bb.1 + %phi66:sgpr_32 = PHI %t66, %bb.0, %s0, %bb.1 + %phi68:sgpr_32 = PHI %t68, %bb.0, %s0, %bb.1 + %phi70:sgpr_32 = PHI %t70, %bb.0, %s0, %bb.1 + %phi72:sgpr_32 = PHI %t72, %bb.0, %s0, %bb.1 + %phi74:sgpr_32 = PHI %t74, %bb.0, %s0, %bb.1 + %phi76:sgpr_32 = PHI %t76, %bb.0, %s0, %bb.1 + %phi78:sgpr_32 = PHI %t78, %bb.0, %s0, %bb.1 + %phi80:sgpr_32 = PHI %t80, %bb.0, %s0, %bb.1 + %phi82:sgpr_32 = PHI %t82, %bb.0, %s0, %bb.1 + %phi84:sgpr_32 = PHI %t84, %bb.0, %s0, %bb.1 + %phi86:sgpr_32 = PHI %t86, %bb.0, %s0, %bb.1 + %phi88:sgpr_32 = PHI %t88, %bb.0, %s0, %bb.1 + %phi90:sgpr_32 = PHI %t90, %bb.0, %s0, %bb.1 + %phi92:sgpr_32 = PHI %t92, %bb.0, %s0, %bb.1 + %phi94:sgpr_32 = PHI %t94, %bb.0, %s0, %bb.1 + %phi96:sgpr_32 = PHI %t96, %bb.0, %s0, %bb.1 + %phi98:sgpr_32 = PHI %t98, %bb.0, %s0, %bb.1 + %phi100:sgpr_32 = PHI %t100, %bb.0, %s0, %bb.1 + %phi102:sgpr_32 = PHI %t102, %bb.0, %s0, %bb.1 + %phi104:sgpr_32 = PHI %t104, %bb.0, %s0, %bb.1 + %phi106:sgpr_32 = PHI %t106, %bb.0, %s0, %bb.1 + %phi108:sgpr_32 = PHI %t108, %bb.0, %s0, %bb.1 + %phi110:sgpr_32 = PHI %t110, %bb.0, %s0, %bb.1 + %phi112:sgpr_32 = PHI %t112, %bb.0, %s0, %bb.1 + %phi114:sgpr_32 = PHI %t114, %bb.0, %s0, %bb.1 + %phi116:sgpr_32 = PHI %t116, %bb.0, %s0, %bb.1 + %phi118:sgpr_32 = PHI %t118, %bb.0, %s0, %bb.1 + %phi120:sgpr_32 = PHI %t120, %bb.0, %s0, %bb.1 + %phi122:sgpr_32 = PHI %t122, %bb.0, %s0, %bb.1 + %phi124:sgpr_32 = PHI %t124, %bb.0, %s0, %bb.1 + %phi126:sgpr_32 = PHI %t126, %bb.0, %s0, %bb.1 + %phi128:sgpr_32 = PHI %t128, %bb.0, %s0, %bb.1 + %phi130:sgpr_32 = PHI %t130, %bb.0, %s0, %bb.1 + %phi132:sgpr_32 = PHI %t132, %bb.0, %s0, %bb.1 + %phi134:sgpr_32 = PHI %t134, %bb.0, %s0, %bb.1 + %phi136:sgpr_32 = PHI %t136, %bb.0, %s0, %bb.1 + %phi138:sgpr_32 = PHI %t138, %bb.0, %s0, %bb.1 + %phi140:sgpr_32 = PHI %t140, %bb.0, %s0, %bb.1 + %phi142:sgpr_32 = PHI %t142, %bb.0, %s0, %bb.1 + %phi144:sgpr_32 = PHI %t144, %bb.0, %s0, %bb.1 + %phi146:sgpr_32 = PHI %t146, %bb.0, %s0, %bb.1 + %phi148:sgpr_32 = PHI %t148, %bb.0, %s0, %bb.1 + %phi150:sgpr_32 = PHI %t150, %bb.0, %s0, %bb.1 + %phi152:sgpr_32 = PHI %t152, %bb.0, %s0, %bb.1 + %phi154:sgpr_32 = PHI %t154, %bb.0, %s0, %bb.1 + %phi156:sgpr_32 = PHI %t156, %bb.0, 
%s0, %bb.1 + %phi158:sgpr_32 = PHI %t158, %bb.0, %s0, %bb.1 + %phi160:sgpr_32 = PHI %t160, %bb.0, %s0, %bb.1 + %phi162:sgpr_32 = PHI %t162, %bb.0, %s0, %bb.1 + %phi164:sgpr_32 = PHI %t164, %bb.0, %s0, %bb.1 + %phi166:sgpr_32 = PHI %t166, %bb.0, %s0, %bb.1 + %phi168:sgpr_32 = PHI %t168, %bb.0, %s0, %bb.1 + %phi170:sgpr_32 = PHI %t170, %bb.0, %s0, %bb.1 + %phi172:sgpr_32 = PHI %t172, %bb.0, %s0, %bb.1 + %phi174:sgpr_32 = PHI %t174, %bb.0, %s0, %bb.1 + %phi176:sgpr_32 = PHI %t176, %bb.0, %s0, %bb.1 + %phi178:sgpr_32 = PHI %t178, %bb.0, %s0, %bb.1 + %phi180:sgpr_32 = PHI %t180, %bb.0, %s0, %bb.1 + %phi182:sgpr_32 = PHI %t182, %bb.0, %s0, %bb.1 + %phi184:sgpr_32 = PHI %t184, %bb.0, %s0, %bb.1 + %phi186:sgpr_32 = PHI %t186, %bb.0, %s0, %bb.1 + %phi188:sgpr_32 = PHI %t188, %bb.0, %s0, %bb.1 + %phi190:sgpr_32 = PHI %t190, %bb.0, %s0, %bb.1 + %phi192:sgpr_32 = PHI %t192, %bb.0, %s0, %bb.1 + %phi194:sgpr_32 = PHI %t194, %bb.0, %s0, %bb.1 + %phi196:sgpr_32 = PHI %t196, %bb.0, %s0, %bb.1 + %phi198:sgpr_32 = PHI %t198, %bb.0, %s0, %bb.1 + %phi200:sgpr_32 = PHI %t200, %bb.0, %s0, %bb.1 + %phi202:sgpr_32 = PHI %t202, %bb.0, %s0, %bb.1 + %phi204:sgpr_32 = PHI %t204, %bb.0, %s0, %bb.1 + %phi206:sgpr_32 = PHI %t206, %bb.0, %s0, %bb.1 + %phi208:sgpr_32 = PHI %t208, %bb.0, %s0, %bb.1 + %phi210:sgpr_32 = PHI %t210, %bb.0, %s0, %bb.1 + %phi212:sgpr_32 = PHI %t212, %bb.0, %s0, %bb.1 + %phi214:sgpr_32 = PHI %t214, %bb.0, %s0, %bb.1 + %phi216:sgpr_32 = PHI %t216, %bb.0, %s0, %bb.1 + %phi218:sgpr_32 = PHI %t218, %bb.0, %s0, %bb.1 + %phi220:sgpr_32 = PHI %t220, %bb.0, %s0, %bb.1 + %phi222:sgpr_32 = PHI %t222, %bb.0, %s0, %bb.1 + %phi224:sgpr_32 = PHI %t224, %bb.0, %s0, %bb.1 + %phi226:sgpr_32 = PHI %t226, %bb.0, %s0, %bb.1 + %phi228:sgpr_32 = PHI %t228, %bb.0, %s0, %bb.1 + %phi230:sgpr_32 = PHI %t230, %bb.0, %s0, %bb.1 + %phi232:sgpr_32 = PHI %t232, %bb.0, %s0, %bb.1 + %phi234:sgpr_32 = PHI %t234, %bb.0, %s0, %bb.1 + %phi236:sgpr_32 = PHI %t236, %bb.0, %s0, %bb.1 + %phi238:sgpr_32 = PHI %t238, %bb.0, %s0, %bb.1 + %phi240:sgpr_32 = PHI %t240, %bb.0, %s0, %bb.1 + %phi242:sgpr_32 = PHI %t242, %bb.0, %s0, %bb.1 + %phi244:sgpr_32 = PHI %t244, %bb.0, %s0, %bb.1 + %phi246:sgpr_32 = PHI %t246, %bb.0, %s0, %bb.1 + %phi248:sgpr_32 = PHI %t248, %bb.0, %s0, %bb.1 + %phi250:sgpr_32 = PHI %t250, %bb.0, %s0, %bb.1 + %phi252:sgpr_32 = PHI %t252, %bb.0, %s0, %bb.1 + %phi254:sgpr_32 = PHI %t254, %bb.0, %s0, %bb.1 + + + ; Some uses to inflate vgpr count + KILL %v0 + KILL %v1 + KILL %v2 + KILL %v3 + S_ENDPGM 0 +... 
+ \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_scc.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_scc.mir new file mode 100644 index 0000000000000..94e86a61c09d6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_scc.mir @@ -0,0 +1,564 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-hot-block-remat-aggressive-sgpr | FileCheck %s + +# This test checks that scalar instructions that define $scc are sunk to their users when $scc is not live across the sink +# CHECK: bb.0: +# CHECK-NOT: S_NOT_B32 +# CHECK: bb.2: +# CHECK: %t0:sgpr_32 = S_NOT_B32 0 +# CHECK: KILL %t0 +# CHECK: %t2:sgpr_32 = S_NOT_B32 1 +# CHECK: KILL %t2 +# CHECK: %t4:sgpr_32 = S_NOT_B32 2 +# CHECK: KILL %t4 +# CHECK: %t6:sgpr_32 = S_NOT_B32 3 +# CHECK: KILL %t6 +# CHECK: %t8:sgpr_32 = S_NOT_B32 4 +# CHECK: KILL %t8 +# CHECK: %t10:sgpr_32 = S_NOT_B32 5 +# CHECK: KILL %t10 +# CHECK: %t12:sgpr_32 = S_NOT_B32 6 +# CHECK: KILL %t12 +# CHECK: %t14:sgpr_32 = S_NOT_B32 7 +# CHECK: KILL %t14 +# CHECK: %t16:sgpr_32 = S_NOT_B32 8 +# CHECK: KILL %t16 +# CHECK: %t18:sgpr_32 = S_NOT_B32 9 +# CHECK: KILL %t18 +# CHECK: %t20:sgpr_32 = S_NOT_B32 10 +# CHECK: KILL %t20 +# CHECK: %t22:sgpr_32 = S_NOT_B32 11 +# CHECK: KILL %t22 +# CHECK: %t24:sgpr_32 = S_NOT_B32 12 +# CHECK: KILL %t24 +# CHECK: %t26:sgpr_32 = S_NOT_B32 13 +# CHECK: KILL %t26 +# CHECK: %t28:sgpr_32 = S_NOT_B32 14 +# CHECK: KILL %t28 +# CHECK: %t30:sgpr_32 = S_NOT_B32 15 +# CHECK: KILL %t30 +# CHECK: %t32:sgpr_32 = S_NOT_B32 16 +# CHECK: KILL %t32 +# CHECK: %t34:sgpr_32 = S_NOT_B32 17 +# CHECK: KILL %t34 +# CHECK: %t36:sgpr_32 = S_NOT_B32 18 +# CHECK: KILL %t36 +# CHECK: %t38:sgpr_32 = S_NOT_B32 19 +# CHECK: KILL %t38 +# CHECK: %t40:sgpr_32 = S_NOT_B32 20 +# CHECK: KILL %t40 +# CHECK: %t42:sgpr_32 = S_NOT_B32 21 +# CHECK: KILL %t42 +# CHECK: %t44:sgpr_32 = S_NOT_B32 22 +# CHECK: KILL %t44 +# CHECK: %t46:sgpr_32 = S_NOT_B32 23 +# CHECK: KILL %t46 +# CHECK: %t48:sgpr_32 = S_NOT_B32 24 +# CHECK: KILL %t48 +# CHECK: %t50:sgpr_32 = S_NOT_B32 25 +# CHECK: KILL %t50 +# CHECK: %t52:sgpr_32 = S_NOT_B32 26 +# CHECK: KILL %t52 +# CHECK: %t54:sgpr_32 = S_NOT_B32 27 +# CHECK: KILL %t54 +# CHECK: %t56:sgpr_32 = S_NOT_B32 28 +# CHECK: KILL %t56 +# CHECK: %t58:sgpr_32 = S_NOT_B32 29 +# CHECK: KILL %t58 +# CHECK: %t60:sgpr_32 = S_NOT_B32 30 +# CHECK: KILL %t60 +# CHECK: %t62:sgpr_32 = S_NOT_B32 31 +# CHECK: KILL %t62 +# CHECK: %t64:sgpr_32 = S_NOT_B32 32 +# CHECK: KILL %t64 +# CHECK: %t66:sgpr_32 = S_NOT_B32 33 +# CHECK: KILL %t66 +# CHECK: %t68:sgpr_32 = S_NOT_B32 34 +# CHECK: KILL %t68 +# CHECK: %t70:sgpr_32 = S_NOT_B32 35 +# CHECK: KILL %t70 +# CHECK: %t72:sgpr_32 = S_NOT_B32 36 +# CHECK: KILL %t72 +# CHECK: %t74:sgpr_32 = S_NOT_B32 37 +# CHECK: KILL %t74 +# CHECK: %t76:sgpr_32 = S_NOT_B32 38 +# CHECK: KILL %t76 +# CHECK: %t78:sgpr_32 = S_NOT_B32 39 +# CHECK: KILL %t78 +# CHECK: %t80:sgpr_32 = S_NOT_B32 40 +# CHECK: KILL %t80 +# CHECK: %t82:sgpr_32 = S_NOT_B32 41 +# CHECK: KILL %t82 +# CHECK: %t84:sgpr_32 = S_NOT_B32 42 +# CHECK: KILL %t84 +# CHECK: %t86:sgpr_32 = S_NOT_B32 43 +# CHECK: KILL %t86 +# CHECK: %t88:sgpr_32 = S_NOT_B32 44 +# CHECK: KILL %t88 +# CHECK: %t90:sgpr_32 = S_NOT_B32 45 +# CHECK: KILL %t90 +# CHECK: %t92:sgpr_32 = S_NOT_B32 46 +# CHECK: KILL %t92 +# CHECK: %t94:sgpr_32 = S_NOT_B32 47 +# CHECK: KILL %t94 +# CHECK: %t96:sgpr_32 = S_NOT_B32 48 +# CHECK: KILL %t96 +# CHECK: %t98:sgpr_32 = S_NOT_B32 49 +# CHECK: KILL %t98 +# CHECK: %t100:sgpr_32 = S_NOT_B32 50 +# CHECK: KILL 
%t100 +# CHECK: %t102:sgpr_32 = S_NOT_B32 51 +# CHECK: KILL %t102 +# CHECK: %t104:sgpr_32 = S_NOT_B32 52 +# CHECK: KILL %t104 +# CHECK: %t106:sgpr_32 = S_NOT_B32 53 +# CHECK: KILL %t106 +# CHECK: %t108:sgpr_32 = S_NOT_B32 54 +# CHECK: KILL %t108 +# CHECK: %t110:sgpr_32 = S_NOT_B32 55 +# CHECK: KILL %t110 +# CHECK: %t112:sgpr_32 = S_NOT_B32 56 +# CHECK: KILL %t112 +# CHECK: %t114:sgpr_32 = S_NOT_B32 57 +# CHECK: KILL %t114 +# CHECK: %t116:sgpr_32 = S_NOT_B32 58 +# CHECK: KILL %t116 +# CHECK: %t118:sgpr_32 = S_NOT_B32 59 +# CHECK: KILL %t118 +# CHECK: %t120:sgpr_32 = S_NOT_B32 60 +# CHECK: KILL %t120 +# CHECK: %t122:sgpr_32 = S_NOT_B32 61 +# CHECK: KILL %t122 +# CHECK: %t124:sgpr_32 = S_NOT_B32 62 +# CHECK: KILL %t124 +# CHECK: %t126:sgpr_32 = S_NOT_B32 63 +# CHECK: KILL %t126 +# CHECK: %t128:sgpr_32 = S_NOT_B32 64 +# CHECK: KILL %t128 +# CHECK: %t130:sgpr_32 = S_NOT_B32 65 +# CHECK: KILL %t130 +# CHECK: %t132:sgpr_32 = S_NOT_B32 66 +# CHECK: KILL %t132 +# CHECK: %t134:sgpr_32 = S_NOT_B32 67 +# CHECK: KILL %t134 +# CHECK: %t136:sgpr_32 = S_NOT_B32 68 +# CHECK: KILL %t136 +# CHECK: %t138:sgpr_32 = S_NOT_B32 69 +# CHECK: KILL %t138 +# CHECK: %t140:sgpr_32 = S_NOT_B32 70 +# CHECK: KILL %t140 +# CHECK: %t142:sgpr_32 = S_NOT_B32 71 +# CHECK: KILL %t142 +# CHECK: %t144:sgpr_32 = S_NOT_B32 72 +# CHECK: KILL %t144 +# CHECK: %t146:sgpr_32 = S_NOT_B32 73 +# CHECK: KILL %t146 +# CHECK: %t148:sgpr_32 = S_NOT_B32 74 +# CHECK: KILL %t148 +# CHECK: %t150:sgpr_32 = S_NOT_B32 75 +# CHECK: KILL %t150 +# CHECK: %t152:sgpr_32 = S_NOT_B32 76 +# CHECK: KILL %t152 +# CHECK: %t154:sgpr_32 = S_NOT_B32 77 +# CHECK: KILL %t154 +# CHECK: %t156:sgpr_32 = S_NOT_B32 78 +# CHECK: KILL %t156 +# CHECK: %t158:sgpr_32 = S_NOT_B32 79 +# CHECK: KILL %t158 +# CHECK: %t160:sgpr_32 = S_NOT_B32 80 +# CHECK: KILL %t160 +# CHECK: %t162:sgpr_32 = S_NOT_B32 81 +# CHECK: KILL %t162 +# CHECK: %t164:sgpr_32 = S_NOT_B32 82 +# CHECK: KILL %t164 +# CHECK: %t166:sgpr_32 = S_NOT_B32 83 +# CHECK: KILL %t166 +# CHECK: %t168:sgpr_32 = S_NOT_B32 84 +# CHECK: KILL %t168 +# CHECK: %t170:sgpr_32 = S_NOT_B32 85 +# CHECK: KILL %t170 +# CHECK: %t172:sgpr_32 = S_NOT_B32 86 +# CHECK: KILL %t172 +# CHECK: %t174:sgpr_32 = S_NOT_B32 87 +# CHECK: KILL %t174 +# CHECK: %t176:sgpr_32 = S_NOT_B32 88 +# CHECK: KILL %t176 +# CHECK: %t178:sgpr_32 = S_NOT_B32 89 +# CHECK: KILL %t178 +# CHECK: %t180:sgpr_32 = S_NOT_B32 90 +# CHECK: KILL %t180 +# CHECK: %t182:sgpr_32 = S_NOT_B32 91 +# CHECK: KILL %t182 +# CHECK: %t184:sgpr_32 = S_NOT_B32 92 +# CHECK: KILL %t184 +# CHECK: %t186:sgpr_32 = S_NOT_B32 93 +# CHECK: KILL %t186 +# CHECK: %t188:sgpr_32 = S_NOT_B32 94 +# CHECK: KILL %t188 +# CHECK: %t190:sgpr_32 = S_NOT_B32 95 +# CHECK: KILL %t190 +# CHECK: %t192:sgpr_32 = S_NOT_B32 96 +# CHECK: KILL %t192 +# CHECK: %t194:sgpr_32 = S_NOT_B32 97 +# CHECK: KILL %t194 +# CHECK: %t196:sgpr_32 = S_NOT_B32 98 +# CHECK: KILL %t196 +# CHECK: %t198:sgpr_32 = S_NOT_B32 99 +# CHECK: KILL %t198 +# CHECK: %t200:sgpr_32 = S_NOT_B32 100 +# CHECK: KILL %t200 +# CHECK: %t202:sgpr_32 = S_NOT_B32 101 +# CHECK: KILL %t202 +# CHECK: %t204:sgpr_32 = S_NOT_B32 102 +# CHECK: KILL %t204 +# CHECK: %t206:sgpr_32 = S_NOT_B32 103 +# CHECK: KILL %t206 +# CHECK: %t208:sgpr_32 = S_NOT_B32 104 +# CHECK: KILL %t208 +# CHECK: %t210:sgpr_32 = S_NOT_B32 105 +# CHECK: KILL %t210 +# CHECK: %t212:sgpr_32 = S_NOT_B32 106 +# CHECK: KILL %t212 +# CHECK: %t214:sgpr_32 = S_NOT_B32 107 +# CHECK: KILL %t214 +# CHECK: %t216:sgpr_32 = S_NOT_B32 108 +# CHECK: KILL %t216 +# CHECK: %t218:sgpr_32 = S_NOT_B32 109 +# CHECK: KILL 
%t218 +# CHECK: %t220:sgpr_32 = S_NOT_B32 110 +# CHECK: KILL %t220 +# CHECK: %t222:sgpr_32 = S_NOT_B32 111 +# CHECK: KILL %t222 +# CHECK: %t224:sgpr_32 = S_NOT_B32 112 +# CHECK: KILL %t224 +# CHECK: %t226:sgpr_32 = S_NOT_B32 113 +# CHECK: KILL %t226 +# CHECK: %t228:sgpr_32 = S_NOT_B32 114 +# CHECK: KILL %t228 +# CHECK: %t230:sgpr_32 = S_NOT_B32 115 +# CHECK: KILL %t230 +# CHECK: %t232:sgpr_32 = S_NOT_B32 116 +# CHECK: KILL %t232 +# CHECK: %t234:sgpr_32 = S_NOT_B32 117 +# CHECK: KILL %t234 +# CHECK: %t236:sgpr_32 = S_NOT_B32 118 +# CHECK: KILL %t236 +# CHECK: %t238:sgpr_32 = S_NOT_B32 119 +# CHECK: KILL %t238 +# CHECK: %t240:sgpr_32 = S_NOT_B32 120 +# CHECK: KILL %t240 +# CHECK: %t242:sgpr_32 = S_NOT_B32 121 +# CHECK: KILL %t242 +# CHECK: %t244:sgpr_32 = S_NOT_B32 122 +# CHECK: KILL %t244 +# CHECK: %t246:sgpr_32 = S_NOT_B32 123 +# CHECK: KILL %t246 +# CHECK: %t248:sgpr_32 = S_NOT_B32 124 +# CHECK: KILL %t248 +# CHECK: %t250:sgpr_32 = S_NOT_B32 125 +# CHECK: KILL %t250 +# CHECK: %t252:sgpr_32 = S_NOT_B32 126 +# CHECK: KILL %t252 +# CHECK: %t254:sgpr_32 = S_NOT_B32 127 +# CHECK: KILL %t254 + + +--- | + define amdgpu_ps void @main() { + ret void + } +... +--- +name: main +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + ; To inflate vgpr count + %v0:vreg_1024 = IMPLICIT_DEF + %v1:vreg_1024 = IMPLICIT_DEF + %v2:vreg_1024 = IMPLICIT_DEF + %v3:vreg_1024 = IMPLICIT_DEF + + ; Defs + %t0:sgpr_32 = S_NOT_B32 0, implicit-def $scc + %t2:sgpr_32 = S_NOT_B32 1, implicit-def $scc + %t4:sgpr_32 = S_NOT_B32 2, implicit-def $scc + %t6:sgpr_32 = S_NOT_B32 3, implicit-def $scc + %t8:sgpr_32 = S_NOT_B32 4, implicit-def $scc + %t10:sgpr_32 = S_NOT_B32 5, implicit-def $scc + %t12:sgpr_32 = S_NOT_B32 6, implicit-def $scc + %t14:sgpr_32 = S_NOT_B32 7, implicit-def $scc + %t16:sgpr_32 = S_NOT_B32 8, implicit-def $scc + %t18:sgpr_32 = S_NOT_B32 9, implicit-def $scc + %t20:sgpr_32 = S_NOT_B32 10, implicit-def $scc + %t22:sgpr_32 = S_NOT_B32 11, implicit-def $scc + %t24:sgpr_32 = S_NOT_B32 12, implicit-def $scc + %t26:sgpr_32 = S_NOT_B32 13, implicit-def $scc + %t28:sgpr_32 = S_NOT_B32 14, implicit-def $scc + %t30:sgpr_32 = S_NOT_B32 15, implicit-def $scc + %t32:sgpr_32 = S_NOT_B32 16, implicit-def $scc + %t34:sgpr_32 = S_NOT_B32 17, implicit-def $scc + %t36:sgpr_32 = S_NOT_B32 18, implicit-def $scc + %t38:sgpr_32 = S_NOT_B32 19, implicit-def $scc + %t40:sgpr_32 = S_NOT_B32 20, implicit-def $scc + %t42:sgpr_32 = S_NOT_B32 21, implicit-def $scc + %t44:sgpr_32 = S_NOT_B32 22, implicit-def $scc + %t46:sgpr_32 = S_NOT_B32 23, implicit-def $scc + %t48:sgpr_32 = S_NOT_B32 24, implicit-def $scc + %t50:sgpr_32 = S_NOT_B32 25, implicit-def $scc + %t52:sgpr_32 = S_NOT_B32 26, implicit-def $scc + %t54:sgpr_32 = S_NOT_B32 27, implicit-def $scc + %t56:sgpr_32 = S_NOT_B32 28, implicit-def $scc + %t58:sgpr_32 = S_NOT_B32 29, implicit-def $scc + %t60:sgpr_32 = S_NOT_B32 30, implicit-def $scc + %t62:sgpr_32 = S_NOT_B32 31, implicit-def $scc + %t64:sgpr_32 = S_NOT_B32 32, implicit-def $scc + %t66:sgpr_32 = S_NOT_B32 33, implicit-def $scc + %t68:sgpr_32 = S_NOT_B32 34, implicit-def $scc + %t70:sgpr_32 = S_NOT_B32 35, implicit-def $scc + %t72:sgpr_32 = S_NOT_B32 36, implicit-def $scc + %t74:sgpr_32 = S_NOT_B32 37, implicit-def $scc + %t76:sgpr_32 = S_NOT_B32 38, implicit-def $scc + %t78:sgpr_32 = S_NOT_B32 39, implicit-def $scc + %t80:sgpr_32 = S_NOT_B32 40, implicit-def $scc + %t82:sgpr_32 = S_NOT_B32 41, implicit-def $scc + %t84:sgpr_32 = S_NOT_B32 42, implicit-def $scc + %t86:sgpr_32 = S_NOT_B32 43, implicit-def 
$scc + %t88:sgpr_32 = S_NOT_B32 44, implicit-def $scc + %t90:sgpr_32 = S_NOT_B32 45, implicit-def $scc + %t92:sgpr_32 = S_NOT_B32 46, implicit-def $scc + %t94:sgpr_32 = S_NOT_B32 47, implicit-def $scc + %t96:sgpr_32 = S_NOT_B32 48, implicit-def $scc + %t98:sgpr_32 = S_NOT_B32 49, implicit-def $scc + %t100:sgpr_32 = S_NOT_B32 50, implicit-def $scc + %t102:sgpr_32 = S_NOT_B32 51, implicit-def $scc + %t104:sgpr_32 = S_NOT_B32 52, implicit-def $scc + %t106:sgpr_32 = S_NOT_B32 53, implicit-def $scc + %t108:sgpr_32 = S_NOT_B32 54, implicit-def $scc + %t110:sgpr_32 = S_NOT_B32 55, implicit-def $scc + %t112:sgpr_32 = S_NOT_B32 56, implicit-def $scc + %t114:sgpr_32 = S_NOT_B32 57, implicit-def $scc + %t116:sgpr_32 = S_NOT_B32 58, implicit-def $scc + %t118:sgpr_32 = S_NOT_B32 59, implicit-def $scc + %t120:sgpr_32 = S_NOT_B32 60, implicit-def $scc + %t122:sgpr_32 = S_NOT_B32 61, implicit-def $scc + %t124:sgpr_32 = S_NOT_B32 62, implicit-def $scc + %t126:sgpr_32 = S_NOT_B32 63, implicit-def $scc + %t128:sgpr_32 = S_NOT_B32 64, implicit-def $scc + %t130:sgpr_32 = S_NOT_B32 65, implicit-def $scc + %t132:sgpr_32 = S_NOT_B32 66, implicit-def $scc + %t134:sgpr_32 = S_NOT_B32 67, implicit-def $scc + %t136:sgpr_32 = S_NOT_B32 68, implicit-def $scc + %t138:sgpr_32 = S_NOT_B32 69, implicit-def $scc + %t140:sgpr_32 = S_NOT_B32 70, implicit-def $scc + %t142:sgpr_32 = S_NOT_B32 71, implicit-def $scc + %t144:sgpr_32 = S_NOT_B32 72, implicit-def $scc + %t146:sgpr_32 = S_NOT_B32 73, implicit-def $scc + %t148:sgpr_32 = S_NOT_B32 74, implicit-def $scc + %t150:sgpr_32 = S_NOT_B32 75, implicit-def $scc + %t152:sgpr_32 = S_NOT_B32 76, implicit-def $scc + %t154:sgpr_32 = S_NOT_B32 77, implicit-def $scc + %t156:sgpr_32 = S_NOT_B32 78, implicit-def $scc + %t158:sgpr_32 = S_NOT_B32 79, implicit-def $scc + %t160:sgpr_32 = S_NOT_B32 80, implicit-def $scc + %t162:sgpr_32 = S_NOT_B32 81, implicit-def $scc + %t164:sgpr_32 = S_NOT_B32 82, implicit-def $scc + %t166:sgpr_32 = S_NOT_B32 83, implicit-def $scc + %t168:sgpr_32 = S_NOT_B32 84, implicit-def $scc + %t170:sgpr_32 = S_NOT_B32 85, implicit-def $scc + %t172:sgpr_32 = S_NOT_B32 86, implicit-def $scc + %t174:sgpr_32 = S_NOT_B32 87, implicit-def $scc + %t176:sgpr_32 = S_NOT_B32 88, implicit-def $scc + %t178:sgpr_32 = S_NOT_B32 89, implicit-def $scc + %t180:sgpr_32 = S_NOT_B32 90, implicit-def $scc + %t182:sgpr_32 = S_NOT_B32 91, implicit-def $scc + %t184:sgpr_32 = S_NOT_B32 92, implicit-def $scc + %t186:sgpr_32 = S_NOT_B32 93, implicit-def $scc + %t188:sgpr_32 = S_NOT_B32 94, implicit-def $scc + %t190:sgpr_32 = S_NOT_B32 95, implicit-def $scc + %t192:sgpr_32 = S_NOT_B32 96, implicit-def $scc + %t194:sgpr_32 = S_NOT_B32 97, implicit-def $scc + %t196:sgpr_32 = S_NOT_B32 98, implicit-def $scc + %t198:sgpr_32 = S_NOT_B32 99, implicit-def $scc + %t200:sgpr_32 = S_NOT_B32 100, implicit-def $scc + %t202:sgpr_32 = S_NOT_B32 101, implicit-def $scc + %t204:sgpr_32 = S_NOT_B32 102, implicit-def $scc + %t206:sgpr_32 = S_NOT_B32 103, implicit-def $scc + %t208:sgpr_32 = S_NOT_B32 104, implicit-def $scc + %t210:sgpr_32 = S_NOT_B32 105, implicit-def $scc + %t212:sgpr_32 = S_NOT_B32 106, implicit-def $scc + %t214:sgpr_32 = S_NOT_B32 107, implicit-def $scc + %t216:sgpr_32 = S_NOT_B32 108, implicit-def $scc + %t218:sgpr_32 = S_NOT_B32 109, implicit-def $scc + %t220:sgpr_32 = S_NOT_B32 110, implicit-def $scc + %t222:sgpr_32 = S_NOT_B32 111, implicit-def $scc + %t224:sgpr_32 = S_NOT_B32 112, implicit-def $scc + %t226:sgpr_32 = S_NOT_B32 113, implicit-def $scc + %t228:sgpr_32 = S_NOT_B32 114, 
implicit-def $scc + %t230:sgpr_32 = S_NOT_B32 115, implicit-def $scc + %t232:sgpr_32 = S_NOT_B32 116, implicit-def $scc + %t234:sgpr_32 = S_NOT_B32 117, implicit-def $scc + %t236:sgpr_32 = S_NOT_B32 118, implicit-def $scc + %t238:sgpr_32 = S_NOT_B32 119, implicit-def $scc + %t240:sgpr_32 = S_NOT_B32 120, implicit-def $scc + %t242:sgpr_32 = S_NOT_B32 121, implicit-def $scc + %t244:sgpr_32 = S_NOT_B32 122, implicit-def $scc + %t246:sgpr_32 = S_NOT_B32 123, implicit-def $scc + %t248:sgpr_32 = S_NOT_B32 124, implicit-def $scc + %t250:sgpr_32 = S_NOT_B32 125, implicit-def $scc + %t252:sgpr_32 = S_NOT_B32 126, implicit-def $scc + %t254:sgpr_32 = S_NOT_B32 127, implicit-def $scc + + + ; Branch + %cmp:sreg_32_xm0 = V_CMP_GT_F32_e64 0, 0, 0, %v0.sub0, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %cmp:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + S_BRANCH %bb.2 + + bb.2: + + ; Uses + KILL %t0 + KILL %t2 + KILL %t4 + KILL %t6 + KILL %t8 + KILL %t10 + KILL %t12 + KILL %t14 + KILL %t16 + KILL %t18 + KILL %t20 + KILL %t22 + KILL %t24 + KILL %t26 + KILL %t28 + KILL %t30 + KILL %t32 + KILL %t34 + KILL %t36 + KILL %t38 + KILL %t40 + KILL %t42 + KILL %t44 + KILL %t46 + KILL %t48 + KILL %t50 + KILL %t52 + KILL %t54 + KILL %t56 + KILL %t58 + KILL %t60 + KILL %t62 + KILL %t64 + KILL %t66 + KILL %t68 + KILL %t70 + KILL %t72 + KILL %t74 + KILL %t76 + KILL %t78 + KILL %t80 + KILL %t82 + KILL %t84 + KILL %t86 + KILL %t88 + KILL %t90 + KILL %t92 + KILL %t94 + KILL %t96 + KILL %t98 + KILL %t100 + KILL %t102 + KILL %t104 + KILL %t106 + KILL %t108 + KILL %t110 + KILL %t112 + KILL %t114 + KILL %t116 + KILL %t118 + KILL %t120 + KILL %t122 + KILL %t124 + KILL %t126 + KILL %t128 + KILL %t130 + KILL %t132 + KILL %t134 + KILL %t136 + KILL %t138 + KILL %t140 + KILL %t142 + KILL %t144 + KILL %t146 + KILL %t148 + KILL %t150 + KILL %t152 + KILL %t154 + KILL %t156 + KILL %t158 + KILL %t160 + KILL %t162 + KILL %t164 + KILL %t166 + KILL %t168 + KILL %t170 + KILL %t172 + KILL %t174 + KILL %t176 + KILL %t178 + KILL %t180 + KILL %t182 + KILL %t184 + KILL %t186 + KILL %t188 + KILL %t190 + KILL %t192 + KILL %t194 + KILL %t196 + KILL %t198 + KILL %t200 + KILL %t202 + KILL %t204 + KILL %t206 + KILL %t208 + KILL %t210 + KILL %t212 + KILL %t214 + KILL %t216 + KILL %t218 + KILL %t220 + KILL %t222 + KILL %t224 + KILL %t226 + KILL %t228 + KILL %t230 + KILL %t232 + KILL %t234 + KILL %t236 + KILL %t238 + KILL %t240 + KILL %t242 + KILL %t244 + KILL %t246 + KILL %t248 + KILL %t250 + KILL %t252 + KILL %t254 + + + + ; Some uses to inflate vgpr count + KILL %v0 + KILL %v1 + KILL %v2 + KILL %v3 + S_ENDPGM 0 +... 
+ \ No newline at end of file From d755d527434295a157824fe51b0da601778cc14f Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Tue, 6 May 2025 20:18:42 -0700 Subject: [PATCH 11/11] call empty instead of size==0 --- llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index 3a0fa5cad4c13..b00d286c938f8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -817,7 +817,7 @@ findInsertBlock(MachineInstr &DefMI, Register Reg, MachineDominatorTree *DT, for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { BBSet.insert(UseMI.getParent()); } - if (BBSet.size() == 0) + if (BBSet.empty()) return nullptr; MachineBasicBlock *BB = *BBSet.begin();
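A minimal sketch of the emptiness-check idiom the final commit adopts, assuming an LLVM set-like container such as the DenseSet behind BBSet; the helper name hasNoUseBlocks is hypothetical and not part of the patch. empty() states the intent directly and is guaranteed constant-time, so it is preferred over comparing size() against zero:

  #include "llvm/ADT/DenseSet.h"

  // Hypothetical helper illustrating the idiom; not part of the patch.
  static bool hasNoUseBlocks(const llvm::DenseSet<unsigned> &BBSet) {
    // Same result as BBSet.size() == 0, but clearer and never slower.
    return BBSet.empty();
  }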