diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index f5c2b09c84806..24e9bb358d519 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -491,6 +491,10 @@ extern char &GCNRewritePartialRegUsesID;
 void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &);
 extern char &AMDGPUWaitSGPRHazardsLegacyID;
 
+void initializeAMDGPUHotBlockRematerializePass(llvm::PassRegistry &);
+FunctionPass *createAMDGPUHotBlockRematerializePass();
+extern char &AMDGPUHotBlockRematerializeID;
+
 namespace AMDGPU {
 enum TargetIndex {
   TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
new file mode 100644
index 0000000000000..7d2e1a6d81db8
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -0,0 +1,4593 @@
+//===- AMDGPUHotBlockRematerialize.cpp - AMDGPU Hot Block Rematerialize ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU hot block rematerialization.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUMIRUtils.h"
+#include "AMDGPUOccupancyAndLatencyHelper.h"
+#include "AMDGPUSubExpDag.h"
+#include "AMDGPUSubtarget.h"
+#include "GCNRegPressure.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+
+#include "llvm/CodeGen/MachineCycleAnalysis.h"
+#include "llvm/CodeGen/MachineUniformityAnalysis.h"
+
+#include 
+#define DEBUG_TYPE "amdgpu-hot-block-remat"
+
+using namespace llvm;
+
+static cl::opt<unsigned> TargetOccupancy("amdgpu-remat-target-occupancy");
+static cl::opt<bool>
+    EnableAggressive("amdgpu-remat-enable-hot-block-remat-aggressive");
+static cl::opt<bool>
+    EnableSubExpAggressive("amdgpu-remat-enable-sub-exp-remat-aggressive");
+static cl::opt<bool>
+    EnableSubExpClone("amdgpu-remat-enable-sub-exp-remat-clone");
+static cl::opt<bool> EnableVmemDegree("amdgpu-remat-enable-vmem-degree");
+static cl::opt<bool> EnableInBlockRemat("amdgpu-remat-enable-in-blk-remat");
+static cl::opt<bool> EnableSubExp("amdgpu-remat-enable-sub-exp-remat");
+static cl::opt<bool>
+    EnableUniformVectorToScalar("amdgpu-remat-enable-late-float-vtos");
+static cl::opt<bool>
+    EnableSubExpMinReg("amdgpu-remat-enable-sub-exp-remat-min-reg");
+
+namespace {
+typedef DenseSet<MachineInstr *> InstSet;
+typedef DenseSet<MachineBasicBlock *> BlockSet;
+template <typename T> using BlockMap = MapVector<MachineBasicBlock *, T>;
+
+// Rematerialize in a single pass instead of doing it in register allocation.
+// In register allocation, a failed rematerialization causes a spill.
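// Editor's note (illustrative sketch, not part of this patch): the AMDGPU.h
// hunk above only declares the initialize/create hooks and the pass ID for
// the class defined below. A legacy-PM machine pass of this shape is usually
// wired up roughly as follows; the exact INITIALIZE_PASS dependency list and
// the TargetPassConfig call site are assumptions and would live elsewhere in
// the real change.

char AMDGPUHotBlockRematerialize::ID = 0;
char &llvm::AMDGPUHotBlockRematerializeID = AMDGPUHotBlockRematerialize::ID;

INITIALIZE_PASS_BEGIN(AMDGPUHotBlockRematerialize, DEBUG_TYPE,
                      "AMDGPU hot block rematerialize", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUHotBlockRematerialize, DEBUG_TYPE,
                    "AMDGPU hot block rematerialize", false, false)

FunctionPass *llvm::createAMDGPUHotBlockRematerializePass() {
  return new AMDGPUHotBlockRematerialize();
}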
+class AMDGPUHotBlockRematerialize : public MachineFunctionPass { + +public: + static char ID; + + DenseSet TotalUniformInsts; + DenseSet SafeToRemoveInsts; + DenseSet DivergentInsts; + void removeInst(const MachineInstr *MI) { + TotalUniformInsts.erase(MI); + SafeToRemoveInsts.erase(MI); + DivergentInsts.erase(MI); + } + + AMDGPUHotBlockRematerialize() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "AMDGPU rematerialize"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +typedef AMDGPUHotBlockRematerialize Remat; + +} // end anonymous namespace + +// Util functions. +namespace { + +MachineBasicBlock *nearestCommonDominator(MachineDominatorTree *DT, + BlockSet &Blocks) { + auto I = Blocks.begin(), E = Blocks.end(); + + MachineBasicBlock *DomB = cast(*(I++)); + while (I != E) { + MachineBasicBlock *B = cast(*(I++)); + DomB = DT->findNearestCommonDominator(DomB, B); + if (DomB == nullptr) + return nullptr; + } + // For split block like: + // bb.42: + // %632.sub2:vreg_128 = V_MOV_B32_e32 %717.sub2:vreg_128, implicit $exec, + // // implicit $exec + // %130:sreg_64 = S_AND_SAVEEXEC_B64 %533:sreg_64, implicitdef $exec, + // implicitdef $scc, implicit $exec + // + // bb.68: + //; predecessors: %bb.42 + // successors: %bb.45(0x40000000), %bb.43(0x40000000); %bb.45(50.00%), + // %bb.43(50.00%) + // + // SI_MASK_BRANCH %bb.43, implicit $exec + // S_BRANCH %bb.45 + // which is from + // bb.42: + //%129:vgpr_32 = V_MOV_B32_e32 killed %548:vgpr_32, implicit $exec, implicit + //$exec %130:sreg_64 = S_AND_SAVEEXEC_B64 %533:sreg_64, implicitdef $exec, + // SI_MASK_BRANCH %bb.43, implicit $exec + // S_BRANCH %bb.45 + // The real common dom is bb.42. + // TODO: use _term version of exec update instructions so don't need this + // anymore. + if (DomB && DomB->pred_size() == 1 && !DomB->empty()) { + // Upstreaming note: This used to be SI_MASK_BRANCH + if (DomB->begin()->getOpcode() == AMDGPU::S_CBRANCH_EXECZ) { + MachineBasicBlock *Pred = *DomB->pred_begin(); + if (Pred->succ_size() == 1 && + (Pred->empty() || !Pred->back().isBranch())) { + DomB = Pred; + } + } + } + + return DomB; +} + +MachineBasicBlock *findNonLoopDominator(MachineBasicBlock *BB, + MachineDominatorTree *DT, + MachineLoopInfo *LI) { + while (LI->getLoopDepth(BB) > 0) { + MachineDomTreeNode *N = DT->getNode(BB); + if (N == nullptr) + return nullptr; + MachineDomTreeNode *IDom = N->getIDom(); + if (IDom == nullptr) + return nullptr; + + BB = IDom->getBlock(); + } + + return BB; +} + +MachineBasicBlock * +findInsertBlock(MachineInstr &DefMI, unsigned Reg, MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, MachineLoopInfo *MLI, + const MachineRegisterInfo &MRI, bool MemBound) { + + BlockSet BBSet; + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + BBSet.insert(UseMI.getParent()); + } + if (BBSet.size() == 0) + return nullptr; + + MachineBasicBlock *BB = *BBSet.begin(); + if (BBSet.size() > 1) { + MachineBasicBlock *BDom = nearestCommonDominator(DT, BBSet); + if (!BDom) + return nullptr; + BB = BDom; + } + // Try to find non loop dominator. + if (!MemBound) { + BB = findNonLoopDominator(BB, DT, MLI); + } + if (!BB) + return nullptr; + + // If BB is already a hot block, move to BB will not help. 
+ // hotBlockRemat will fail It when process BB. + + // Must reachable from DefMI. + if (!llvm::reach_block(DefMI.getParent(), DT, PDT, MLI, BB)) + return nullptr; + + return BB; +} + +// Maybe expensive to be called all over the place +bool isUsedByPhi(MachineInstr *DefMI, MachineRegisterInfo &MRI) { + for (auto &Def : DefMI->defs()) { + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Def.getReg())) { + if (UseMI.isPHI()) + return true; + } + } + return false; +} + +bool isSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) { + // Do not move PHI nodes + if (isUsedByPhi(DefMI, MRI)) + return false; + + unsigned OpNum = DefMI->getNumOperands(); + // Only move DefMI which all operand is unique def. + for (unsigned I = 0; I < OpNum; I++) { + MachineOperand &Op = DefMI->getOperand(I); + if (!Op.isReg()) + continue; + if (!MRI.getUniqueVRegDef(Op.getReg()) && + !llvm::isSub0Sub1SingleDef(Op.getReg(), MRI)) { + return false; + } + } + return true; +} + +// SGPR has alignment requirment, cannot get accurate reg number. +const unsigned NearTargetRegLimit = 10; +bool nearSgprSpill(unsigned MaxSPressure, const GCNSubtarget *ST, + MachineFunction &MF) { + unsigned MaxSGPR = ST->getAddressableNumSGPRs(); + const SIMachineFunctionInfo *MFI = MF.getInfo(); + Register ScratchRSrcReg = MFI->getScratchRSrcReg(); + if (ScratchRSrcReg) + MaxSGPR -= 4; + + const unsigned AlignmentDelta = 3; + MaxSGPR -= AlignmentDelta; + + return MaxSPressure > MaxSGPR; +} + +struct RematStatus { + unsigned TargetOcc; + unsigned TargetVLimit; + unsigned TargetSLimit; + unsigned MaxVPressure; + unsigned MaxSPressure; + unsigned InputPhysicalVPressure; + unsigned InputPhysicalSPressure; + // More occupancy can help more than latency cost to reach It. + bool MemBound; + // abs(VTargetOcc-STargetOcc) > 1. + bool NotBalance; + DenseMap MBBPressureMap; + DenseMap MBBInputLiveMap; + DenseMap MBBOutputLiveMap; + // Collect MBBs which has memory write. When move instructions cross MBB, skip + // mem inst if the MBB has memory write. To make things fast, just check + // mayStore and isBarrier. + DenseSet MemWriteMBBSet; +}; + +unsigned collectMBBPressure(MachineBasicBlock &MBB, LiveIntervals *LIS, + const GCNSubtarget *ST, unsigned &MaxVPressure, + unsigned &MaxSPressure, RematStatus &Status) { + // Skip processing current block if It has only debug instructions + if (MBB.getFirstNonDebugInstr() == MBB.end()) + return ST->getOccupancyWithNumVGPRs(0); + auto BBEnd = MBB.rbegin(); + GCNUpwardRPTracker RPTracker(*LIS); + // R.End doesn't point to the boundary instruction. + // Skip Debug instr. 
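// Editor's note (illustrative, not part of this patch): a worked example for
// nearSgprSpill() above. The real values come from GCNSubtarget and from the
// scratch-rsrc reservation; the numbers here are made up to show how the
// near-spill budget is derived.
static bool nearSgprSpillExample(unsigned MaxSPressure) {
  unsigned MaxSGPR = 102; // assume ST->getAddressableNumSGPRs() == 102
  MaxSGPR -= 4;           // a scratch rsrc descriptor is reserved
  MaxSGPR -= 3;           // AlignmentDelta: SGPR-tuple alignment slack
  return MaxSPressure > MaxSGPR; // e.g. 98 > 95 -> treat as near SGPR spill
}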
+ if (!llvm::getNonDebugMBBEnd(BBEnd, MBB)) + return ST->getOccupancyWithNumVGPRs(0); + + GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[&MBB]; + RPTracker.reset(*BBEnd, &OutputLive, true); + + for (auto I = MBB.rbegin(), B = MBB.rend(); I != B;) { + MachineInstr &MI = (*I++); + RPTracker.recede(MI); + if (MI.mayStore() || (MI.isBarrier() && MI.getOpcode() != AMDGPU::S_BRANCH)) + Status.MemWriteMBBSet.insert(&MBB); + } + + GCNRegPressure RP = RPTracker.getMaxPressureAndReset(); + unsigned SPressure = RP.getMaxSGPR(); + if (SPressure > MaxSPressure) { + MaxSPressure = SPressure; + } + if (RP.getVGPRNum(ST->hasGFX90AInsts()) > MaxVPressure) { + MaxVPressure = RP.getVGPRNum(ST->hasGFX90AInsts()); + } + Status.MBBPressureMap[&MBB] = RP; + return RP.getOccupancy(*ST); +} + +unsigned collectFnPressure(MachineFunction &MF, LiveIntervals *LIS, + const MachineRegisterInfo &MRI, + const GCNSubtarget *ST, unsigned &MaxVPressure, + unsigned &MaxSPressure, RematStatus &Status) { + unsigned TgtOcc = ST->getOccupancyWithWorkGroupSizes(MF).second; + // If only have one block, input/ouput virtual live set are empty. + if (MF.size() > 1) { + // Build input output live reg first. + auto *SlotIndexes = LIS->getSlotIndexes(); + DenseMap MBBInputSlotMap; + DenseMap MBBOutputSlotMap; + for (MachineBasicBlock &MBB : MF) { + auto BBBegin = MBB.getFirstNonDebugInstr(); + if (BBBegin != MBB.end()) { + auto SI = SlotIndexes->getInstructionIndex(*BBBegin); + MBBInputSlotMap[&MBB] = SI; + } + + auto BBEnd = MBB.rbegin(); + + // R.End doesn't point to the boundary instruction. + // Skip Debug instr. + if (llvm::getNonDebugMBBEnd(BBEnd, MBB)) { + auto SI = SlotIndexes->getInstructionIndex(*BBEnd); + MBBOutputSlotMap[&MBB] = SI; + } + } + + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + auto Reg = Register::index2VirtReg(I); + if (!LIS->hasInterval(Reg)) + continue; + + const auto &LI = LIS->getInterval(Reg); + + // Skip local live interval to make live input/ouput faster. 
+ if (llvm::isLocalLiveInterval(LI, SlotIndexes)) + continue; + + for (auto InputIt : MBBInputSlotMap) { + MachineBasicBlock *MBB = InputIt.first; + auto SI = InputIt.second; + + auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI); + if (LiveMask.any()) + Status.MBBInputLiveMap[MBB][Reg] |= LiveMask; + } + + for (auto OutputIt : MBBOutputSlotMap) { + MachineBasicBlock *MBB = OutputIt.first; + auto SI = OutputIt.second; + + auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI); + if (LiveMask.any()) + Status.MBBOutputLiveMap[MBB][Reg] |= LiveMask; + } + } + } + + LLVM_DEBUG( + const SIRegisterInfo *SIRI = ST->getRegisterInfo(); + dbgs() << "output live"; for (auto &It : Status.MBBOutputLiveMap) { + unsigned Idx = It.first->getNumber(); + auto LiveReg = It.second; + dbgs() << "MBB" << Idx << ":"; + llvm::dumpLiveSet(LiveReg, SIRI); + } dbgs() << "input live"; + for (auto &It : Status.MBBInputLiveMap) { + unsigned Idx = It.first->getNumber(); + auto LiveReg = It.second; + dbgs() << "MBB" << Idx << ":"; + llvm::dumpLiveSet(LiveReg, SIRI); + }); + + for (auto It = MF.begin(); It != MF.end(); ++It) { + MachineBasicBlock &MBB = *It; + unsigned Occ = + collectMBBPressure(MBB, LIS, ST, MaxVPressure, MaxSPressure, Status); + if (TgtOcc > Occ) + TgtOcc = Occ; + } + return TgtOcc; +} +RematStatus getRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, + LiveIntervals *LIS, const MachineRegisterInfo &MRI, + const GCNSubtarget *ST) { + unsigned MaxSPressure = 0; + unsigned MaxVPressure = 0; + RematStatus Status; + unsigned TgtOcc = + collectFnPressure(MF, LIS, MRI, ST, MaxVPressure, MaxSPressure, Status); + const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second; + if (TgtOcc >= MaxOcc) { + Status.TargetOcc = TgtOcc; + Status.TargetVLimit = 0; + Status.TargetSLimit = 0; + Status.MaxVPressure = 0; + Status.MaxSPressure = 0; + Status.InputPhysicalVPressure = 0; + Status.InputPhysicalSPressure = 0; + Status.MemBound = false; + Status.NotBalance = false; + return Status; + } + + MaxSPressure += RegForVCC; + MaxVPressure = std::min(MaxVPressure, ST->getMaxNumVGPRs(MF)); + unsigned STgtOcc = ST->getOccupancyWithNumSGPRs(MaxSPressure); + unsigned VTgtOcc = ST->getOccupancyWithNumVGPRs(MaxVPressure); + + llvm::SchedScore TotalScore = llvm::collectLatency(MF, *ST, MLI); + bool MemBound = + TotalScore.isMemBound(TgtOcc, std::max(STgtOcc, VTgtOcc) - TgtOcc); + + bool NotBalance = false; + + const unsigned MaxOccupancy = ST->AMDGPUSubtarget::getMaxWavesPerEU(); + // Currently, only sgpr bound can be fixed with remat. + if (STgtOcc < VTgtOcc) { + unsigned BigOcc = std::max(STgtOcc, VTgtOcc); + // Change TgtOcc to in case sgpr and vgpr is not balance. + if (BigOcc > TgtOcc) { + TgtOcc = BigOcc; + NotBalance = true; + if (TgtOcc >= MaxOccupancy) + TgtOcc = MaxOccupancy - 1; + } + } + + // Collect input physical pressure. + const SIRegisterInfo *SIRI = ST->getRegisterInfo(); + + unsigned VInputPressure = 0; + uint64_t SInputMask = 0; + for (const auto &Livein : MRI.liveins()) { + const Register Reg = Livein.first; + const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg); + assert(Reg.isPhysical() && "input must be physical reg"); + unsigned RegSize = RC->getLaneMask().getNumLanes(); + if (SIRI->isVGPR(MRI, Reg)) { + VInputPressure += RegSize; + } else { + unsigned RegIndex = SIRI->getHWRegIndex(Reg); + uint64_t Mask = ((1 << RegSize) - 1) << RegIndex; + SInputMask |= Mask; + } + } + // SGPR need to align to 4 for the 4dowrd/8dword descriptors which cause high + // pressure. 
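// Editor's note (illustrative, not part of this patch): the loop below counts
// live-in physical SGPRs in whole 4-dword groups, matching the comment above.
// For example, if s2, s3 and s4 are live in, SInputMask has bits 2-4 set; the
// groups s[0:3] and s[4:7] are both touched, so 8 SGPRs are counted even
// though only 3 are actually live.
static unsigned roundSgprInputToQuads(uint64_t SInputMask) { // hypothetical
  unsigned Pressure = 0;
  for (uint64_t Mask = 0xf; Mask != 0; Mask <<= 4)
    if (Mask & SInputMask)
      Pressure += 4; // the whole 4-dword group counts as occupied
  return Pressure;   // roundSgprInputToQuads(0b11100) == 8
}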
+ unsigned SInputPressure = 0; + uint64_t Mask = 0xf; + while (Mask != 0) { + if (Mask & SInputMask) { + SInputPressure += 4; + } + Mask = Mask << 4; + } + + // If balanced, try next occupancy. + TgtOcc = NotBalance ? TgtOcc : (TgtOcc + 1); + + auto CC = MF.getFunction().getCallingConv(); + bool IsPsCs = CC == CallingConv::AMDGPU_CS || CC == CallingConv::AMDGPU_PS; + // For shader profiles other than ps/cs, set target profile max as 4. + if (!IsPsCs) { + TgtOcc = TgtOcc > 4 ? 4 : TgtOcc; + } + if (TargetOccupancy) + TgtOcc = TargetOccupancy; + + unsigned SLimit = ST->getMaxNumSGPRs(TgtOcc, true); + unsigned VLimit = ST->getMaxNumVGPRs(TgtOcc); + + Status.TargetOcc = TgtOcc; + Status.TargetVLimit = VLimit; + Status.TargetSLimit = SLimit; + Status.MaxVPressure = MaxVPressure; + Status.MaxSPressure = MaxSPressure; + Status.InputPhysicalVPressure = VInputPressure; + Status.InputPhysicalSPressure = SInputPressure; + Status.MemBound = MemBound; + Status.NotBalance = NotBalance; + return Status; +} + +} // namespace + +// Remat. +namespace { + +struct RematNode { + enum class RematKind { + Candidate, // Not ready yet. + OneDefOneUse, + Clone, + }; + RematNode() + : Reg(0), DefMI(nullptr), InsertBlock(nullptr), InsertPointMI(nullptr), + Kind(RematKind::Candidate), Size(0) {} + RematNode(unsigned R, MachineInstr *MI, unsigned S) + : Reg(R), DefMI(MI), InsertBlock(nullptr), InsertPointMI(nullptr), + Kind(RematKind::Candidate), Size(S) {} + unsigned Reg; + MachineInstr *DefMI; + MachineBasicBlock *InsertBlock; + union { + MachineInstr *InsertPointMI; + unsigned UserCount; + }; + RematKind Kind; + unsigned Size; +}; + +struct BlockLiveInfo { + MachineBasicBlock *BB; + unsigned MaxSReg; + unsigned MaxVReg; + // Input live is the live reg which cross block. + const GCNRPTracker::LiveRegSet InputLive; +}; + +// Skip live reg remated to other block. +void updateLiveInfo(MapVector &RematMap, + GCNRPTracker::LiveRegSet &LiveSet, + const GCNRPTracker::LiveRegSet &InputLive, + MachineBasicBlock *CurBB, + DenseMap &RPOTIndexMap) { + for (auto &It : RematMap) { + unsigned Reg = It.first; + // Skip reg not in live set. + if (!LiveSet.count(Reg)) + continue; + // Skip reg already in input set. + // Input set will be taken care in getReducedSize. + if (InputLive.count(Reg)) + continue; + + auto &Node = It.second; + if (Node.Kind == RematNode::RematKind::OneDefOneUse) { + MachineBasicBlock *InsertBB = Node.InsertBlock; + // If LiveInfo.BB is after InsertBB in Reverse post order, the def is + // still before LiveInfo.BB, It is still live. + unsigned LiveBBIndex = RPOTIndexMap[CurBB]; + unsigned InsertBBIndex = RPOTIndexMap[InsertBB]; + if (LiveBBIndex > InsertBBIndex) { + continue; + } + } + // Already in remat map, don't need to check again, remove from + // candidate. + LiveSet.erase(Reg); + } +} + +int getSharedReducedSize(InstSet &ReducedInsts, bool IsVGPR, + const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI) { + + // Find shared operand in ReducedInsts. + int SharedSize = 0; + DenseMap SharedRegMaskMap; + for (MachineInstr *DefMI : ReducedInsts) { + for (MachineOperand &MO : DefMI->operands()) { + if (MO.isImm()) + continue; + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + if (MO.isTied()) + continue; + Register Reg = MO.getReg(); + + if (Reg == AMDGPU::EXEC) + continue; + if (!Reg.isVirtual()) + continue; + + if (IsVGPR != SIRI->isVGPR(MRI, MO.getReg())) { + // Not support mix of v and s when remat now. 
+ continue; + } + + const TargetRegisterClass *OpRC = MRI.getRegClass(Reg); + int MOSize = SIRI->getRegSizeInBits(*OpRC) >> 5; + unsigned Mask; + if (unsigned SubIdx = MO.getSubReg()) { + OpRC = SIRI->getSubRegisterClass(OpRC, SubIdx); + int SubMOSize = SIRI->getRegSizeInBits(*OpRC) >> 5; + Mask = (1 << SubMOSize) - 1; + } else { + Mask = (1 << MOSize) - 1; + } + auto SharedRegIt = SharedRegMaskMap.find(Reg); + if (SharedRegIt == SharedRegMaskMap.end()) { + SharedRegMaskMap[Reg] = LaneBitmask(Mask); + } else { + unsigned PrevMask = SharedRegIt->second.getAsInteger(); + if (unsigned SharedMask = (PrevMask & Mask)) { + // Some thing is shared. + for (int I = 0; I < MOSize; I++) { + if (SharedMask & (1 << I)) { + SharedSize += 1; + } + } + } + LaneBitmask MoMask = LaneBitmask(Mask | PrevMask); + SharedRegMaskMap[Reg] = MoMask; + } + } + } + return SharedSize; +} + +int getReducedSize(MapVector &RematMap, + GCNRPTracker::LiveRegSet &CanidateSet, InstSet &ReducedInsts, + const MachineRegisterInfo &MRI, BlockLiveInfo &LiveInfo, + DenseMap &RPOTIndexMap) { + int ReducedSize = 0; + for (auto &It : RematMap) { + Register Reg = It.first; + + if (!CanidateSet.count(Reg)) + continue; + + bool IsReduced = false; + auto &Node = It.second; + if (Node.Kind == RematNode::RematKind::OneDefOneUse) { + MachineBasicBlock *InsertBB = Node.InsertBlock; + // If LiveInfo.BB is before InsertBB in Reverse post order, the def is + // moved after LiveInfo.BB, It is not live anymore. + unsigned LiveBBIndex = RPOTIndexMap[LiveInfo.BB]; + unsigned InsertBBIndex = RPOTIndexMap[InsertBB]; + if (LiveBBIndex < InsertBBIndex) + IsReduced = true; + } else { + // Clone. + IsReduced = true; + // If has use in LiveInfo.BB, could not reduce from input live. + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + if (UseMI.getParent() == LiveInfo.BB) { + IsReduced = false; + break; + } + } + } + if (IsReduced) { + ReducedSize += Node.Size; + ReducedInsts.insert(Node.DefMI); + } + + // Already in remat map, don't need to check again, remove from candidate. + CanidateSet.erase(Reg); + } + + return ReducedSize; +} + +int rematGain(MachineInstr *DefMI, unsigned Reg, const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, bool IsVGPR) { + int RematSize = SIRI->getRegSizeInBits(*MRI.getRegClass(Reg)); + for (MachineOperand &MO : DefMI->operands()) { + if (MO.isImm()) + continue; + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + if (MO.isTied()) + continue; + + if (MO.getReg() == AMDGPU::EXEC) + continue; + + // Don't move user of VCC. + if (MO.getReg() == AMDGPU::VCC) { + RematSize = 0; + break; + } + Register Reg = MO.getReg(); + + // Don't move physical register use. + if (Reg.isPhysical()) { + RematSize = 0; + break; + } + + if (IsVGPR != SIRI->isVGPR(MRI, Reg)) { + // Not support mix of v and s when remat now. + // TODO: count possible pressure change here. + RematSize = 0; + break; + } + bool IsSingleDef = MRI.hasOneDef(Reg); + if (!IsSingleDef) { + IsSingleDef = llvm::isSub0Sub1SingleDef(Reg, MRI); + } + + if (IsSingleDef) { + // The reg might share with other candidates, check It here. + // Count share reg in getReducedSize. + if (EnableAggressive) { + // In case of aggressive remat, treat multi use reg as shared reg and + // ignore size of shared reg. 
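// Editor's note (illustrative, not part of this patch): a worked example of
// the value rematGain() returns. Sizes are in bits, as reported by
// getRegSizeInBits(); the caller converts the result to 32-bit registers
// with ">> 5".
static int rematGainExample() {
  int RematSize = 128;    // DefMI defines a vreg_128 -> 4 VGPRs live across
  int InputSize = 32;     // its only single-def input is one VGPR
  RematSize -= InputSize; // sinking DefMI keeps 1 reg live instead of 4
  return RematSize;       // 96 bits -> RematNode::Size = 96 >> 5 = 3
}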
+ if (!MRI.hasOneNonDBGUse(Reg)) + continue; + } + const TargetRegisterClass *OpRC = MRI.getRegClass(Reg); + if (unsigned SubIdx = MO.getSubReg()) { + if (OpRC) + OpRC = SIRI->getSubRegisterClass(OpRC, SubIdx); + } + int InputSize = SIRI->getRegSizeInBits(*OpRC); + // If input not live in hotspot, move It cross hotspot should have + // less reg then DefMi. + if (RematSize > InputSize) { + RematSize -= InputSize; + continue; + } + } + + RematSize = 0; + break; + } + return RematSize; +} + +void buildRematCandiates(std::vector &Candidates, + GCNRPTracker::LiveRegSet &CandidateRegSet, + DenseSet &PinnedRegSet, + const MachineRegisterInfo &MRI, + const SIInstrInfo *SIII, const SIRegisterInfo *SIRI, + bool IsVGPR) { + + for (auto LiveRegIt : CandidateRegSet) { + unsigned Reg = LiveRegIt.first; + // Skip unsafe reg. + if (PinnedRegSet.count(Reg)) + continue; + + if (SIRI->isVGPR(MRI, Reg) != IsVGPR) + continue; + bool IsSafeCandidate = true; + MachineInstr *MI = MRI.getUniqueVRegDef(Reg); + if (MI) { + if (IsVGPR) { + // Only remat valu now. + if (!SIII->isVALU(MI->getOpcode()) && MI->getOpcode() != AMDGPU::COPY) + IsSafeCandidate = false; + if (MI->getOpcode() == AMDGPU::COPY) { + // Make sure src is unique define. + if (MI->getOperand(1).isReg() && + nullptr == MRI.getUniqueVRegDef(MI->getOperand(1).getReg())) + IsSafeCandidate = false; + } else { + // Skip convergent valu. + if (MI->isConvergent()) + IsSafeCandidate = false; + } + } + // Skip inst has more than 1 def. + if (MI->getDesc().NumDefs > 1) + IsSafeCandidate = false; + } else { + IsSafeCandidate = false; + } + + if (IsSafeCandidate) { + int Gain = rematGain(MI, Reg, MRI, SIRI, IsVGPR); + if (Gain > 0) { + Candidates.emplace_back(RematNode(Reg, MI, Gain >> 5)); + } else { + IsSafeCandidate = false; + } + } + // Save unsafe reg. + if (!IsSafeCandidate) + PinnedRegSet.insert(Reg); + } + + // Sort by gain. + std::sort(Candidates.begin(), Candidates.end(), + [](RematNode &I, RematNode &J) { return I.Size > J.Size; }); +} + +// For case like +// %477:sreg_32_xm0 = S_AND_B32 %472.sub0:sreg_64_xexec, %304:sreg_32_xm0, +// implicit-def dead $scc; xb.uniform +// S_CMP_EQ_U32 %302:sreg_32_xm0, %475:sreg_32_xm0, implicit-def $scc; +// xb.uniform %2489:sreg_32_xm0 = S_CSELECT_B32 %477:sreg_32_xm0, 16, implicit +// killed $scc; xb.uniform +// Sink S_AND right before S_CSELECT will overwrite SCC. +// To avoid It, skip case when DefMI and UseMI has implicit define use. +bool isImplicitDefUse(MachineInstr *DefMI, MachineInstr *UseMI) { + if (DefMI->getDesc().NumImplicitDefs == 0) + return false; + + auto *TRI = DefMI->getMF()->getSubtarget().getRegisterInfo(); + for (MachineOperand &Def : DefMI->implicit_operands()) { + if (!Def.isReg()) + continue; + if (Def.isUse()) + continue; + Register Reg = Def.getReg(); + if (UseMI->readsRegister(Reg, TRI)) + return true; + } + return false; +} + +void addOneDefOneUseCandidate(RematNode &Node, + std::vector &RematList, + MachineRegisterInfo &MRI, int &RematCnt, + MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, + MachineLoopInfo *MLI, bool IsVGPR, + bool MemBound) { + unsigned Reg = Node.Reg; + MachineInstr *DefMI = Node.DefMI; + + unsigned Size = Node.Size; + MachineInstr *UseMI = &*MRI.use_nodbg_instructions(Reg).begin(); + MachineBasicBlock *InsertBB = UseMI->getParent(); + + // For VGPR, always move next to the only user to avoid wqm or exec issue. + // But doing this will cause issue when DefMI is in wqm user not in + // wqm. Disable VGPR remat for now. 
+ // TODO: make sure single user don't need wqm. + if (!IsVGPR) { + if (MachineBasicBlock *NewInsertBB = + findInsertBlock(*DefMI, Reg, DT, PDT, MLI, MRI, MemBound)) { + if (InsertBB != NewInsertBB) { + InsertBB = NewInsertBB; + // If can find a non-loop insert block, go to the insert block. + if (DefMI->getParent() != InsertBB) { + if (!InsertBB->empty()) { + auto It = InsertBB->getFirstNonPHI(); + It = skipDebugInstructionsForward(It, InsertBB->end()); + if (It == InsertBB->end()) + UseMI = nullptr; + else + UseMI = &*It; + } + } + } + } + } + + if (IsVGPR) { + // Don't count reg in same block for valu. + if (UseMI->getParent() == DefMI->getParent()) + return; + } + + // Skip case when DefMI has implicit define which used by UseMI. + if (isImplicitDefUse(DefMI, UseMI)) { + return; + } + + Node.InsertBlock = InsertBB; + Node.InsertPointMI = UseMI; + Node.Kind = RematNode::RematKind::OneDefOneUse; + RematList.emplace_back(Node); + RematCnt += Size; +} + +void addCloneCandidate(std::vector &CloneList, + std::vector &RematList, + DenseSet &PinnedRegSet, + MachineRegisterInfo &MRI, int &RematCnt) { + // Group user in same blocks. + std::vector UserSetList(CloneList.size()); + + for (size_t I = 0; I < CloneList.size(); I++) { + auto *Node = CloneList[I]; + unsigned Reg = Node->Reg; + MachineInstr *DefMI = Node->DefMI; + // Group user in same blocks. + BlockSet &UserSet = UserSetList[I]; + + for (auto UseIt = MRI.use_instr_nodbg_begin(Reg); + UseIt != MRI.use_instr_nodbg_end();) { + MachineInstr &UseMI = *(UseIt++); + UserSet.insert(UseMI.getParent()); + } + + if (UserSet.size() == 1) { + // All users are in same block with DefMI. + if (*UserSet.begin() == DefMI->getParent()) { + // Mark cannot remat for now. + // TODO: try to split if is bigger than 4 and only used once per + // channel. + PinnedRegSet.insert(Reg); + continue; + } + } + + int Size = Node->Size; + Size <<= 16; + // Pack userSet size to size. + Size |= UserSet.size(); + Node->UserCount = Size; + } + + std::sort(CloneList.begin(), CloneList.end(), + // Sort based on userSet size. + [](const RematNode *A, const RematNode *B) { + static constexpr int Mask = 0xffff; + return (A->UserCount & Mask) < (B->UserCount & Mask); + }); + + for (RematNode *Node : CloneList) { + Node->Kind = RematNode::RematKind::Clone; + RematList.emplace_back(*Node); + RematCnt += Node->Size; + } +} + +int filterRematCandiates(std::vector &Candidates, + std::vector &RematList, + DenseSet &PinnedRegSet, + MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, MachineLoopInfo *MLI, + MachineRegisterInfo &MRI, bool IsVGPR, bool MemBound) { + int RematCnt = 0; + // Work one def one use first. + for (auto &Node : Candidates) { + unsigned Reg = Node.Reg; + if (!MRI.hasOneNonDBGUse(Reg)) { + continue; + } + MachineInstr *DefMI = Node.DefMI; + if (!isSafeToMove(DefMI, MRI)) { + PinnedRegSet.insert(Reg); + continue; + } + + addOneDefOneUseCandidate(Node, RematList, MRI, RematCnt, DT, PDT, MLI, + IsVGPR, MemBound); + } + + if (!IsVGPR) { + std::vector CloneList; + // Try multi use case. + for (auto &Node : Candidates) { + unsigned Reg = Node.Reg; + if (MRI.hasOneNonDBGUse(Reg)) { + continue; + } + MachineInstr *DefMI = Node.DefMI; + if (!isSafeToMove(DefMI, MRI)) { + PinnedRegSet.insert(Reg); + continue; + } + + // Clone for each user. 
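// Editor's note (illustrative, not part of this patch): addCloneCandidate()
// above packs two values into RematNode::UserCount so that a single integer
// sort key orders clone candidates by how many blocks they must be cloned
// into.
static std::pair<unsigned, unsigned> unpackCloneSortKey(unsigned UserCount) {
  unsigned NumUserBlocks = UserCount & 0xffff; // low 16 bits: user block count
  unsigned SizeInRegs = UserCount >> 16;       // high bits: RematNode::Size
  return {SizeInRegs, NumUserBlocks}; // e.g. (4 << 16) | 3 -> size 4, 3 blocks
}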
+ CloneList.emplace_back(&Node); + } + + addCloneCandidate(CloneList, RematList, PinnedRegSet, MRI, RematCnt); + } + + return RematCnt; +} + +void updateUsers(unsigned Reg, unsigned NewReg, bool IsSubRegDef, + SmallVector &UserMIs) { + for (MachineInstr *UseMI : UserMIs) { + for (MachineOperand &MO : UseMI->operands()) { + if (!MO.isReg()) + continue; + if (MO.getReg() == Reg) { + MO.setReg(NewReg); + if (IsSubRegDef) + MO.setSubReg(0); + } + } + } +} + +DenseMap reduceClonedMBBs( + unsigned Reg, BlockMap> &UserBlocks, + DenseSet &UserMBBSet, + std::vector &HotBlocks, MachineDominatorTree *DT) { + // Collect hot blocks which Exp is live in. + DenseSet HotBlockSet; + for (BlockLiveInfo &HotBlock : HotBlocks) { + if (HotBlock.InputLive.count(Reg)) { + HotBlockSet.insert(HotBlock.BB); + } + } + + // For userBlocks which dominate all hotBlocks, don't need to clone because + // the value not cross hotBlocks when later blocks are cloned. + // For userBlocks which dominated by all hotBlocks, they could share clones + // because once after hot block, the pressure is OK. + DenseSet AfterHotRangeMBBs; + for (MachineBasicBlock *MBB : UserMBBSet) { + // Always clone in hot block. + if (HotBlockSet.count(MBB)) + continue; + + bool IsDomAllHotBlocks = true; + bool IsDomedByAllHotBlocks = true; + for (MachineBasicBlock *HotMBB : HotBlockSet) { + if (!DT->dominates(MBB, HotMBB)) { + IsDomAllHotBlocks = false; + } + if (!DT->dominates(HotMBB, MBB)) { + IsDomedByAllHotBlocks = false; + } + if (!IsDomAllHotBlocks && !IsDomedByAllHotBlocks) { + break; + } + } + if (IsDomAllHotBlocks) { + UserBlocks.erase(MBB); + } else if (IsDomedByAllHotBlocks) { + AfterHotRangeMBBs.insert(MBB); + } + } + + // Split after hotRange block set by domtree. + DenseMap DomMap; + if (!AfterHotRangeMBBs.empty()) { + for (MachineBasicBlock *MBB : AfterHotRangeMBBs) { + for (MachineBasicBlock *MBB2 : AfterHotRangeMBBs) { + if (MBB == MBB2) + continue; + if (DT->dominates(MBB, MBB2)) { + auto &Dom = DomMap[MBB]; + Dom.insert(MBB2); + auto &Dom2 = DomMap[MBB2]; + Dom.insert(Dom2.begin(), Dom2.end()); + } + } + } + for (MachineBasicBlock *MBB : AfterHotRangeMBBs) { + auto &Dom = DomMap[MBB]; + for (MachineBasicBlock *DomedMBB : Dom) { + // Remove domedMBB. + DomMap.erase(DomedMBB); + UserMBBSet.erase(DomedMBB); + } + } + } + + return DomMap; +} + +// Look for an earlier insert point if the InstructionToMove +// writes to scc and scc is live at the CurrentInsertPoint. +static MachineBasicBlock::iterator adjustInsertPointToAvoidSccSmash( + MachineInstr *InstructionToMove, MachineBasicBlock *MBB, + MachineBasicBlock::iterator CurrentInsertPoint, MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { + const bool WillSmashScc = + InstructionToMove->modifiesRegister(AMDGPU::SCC, SIRI); + if (WillSmashScc) { + CurrentInsertPoint = llvm::findOrCreateInsertionPointForSccDef( + MBB, CurrentInsertPoint, SIRI, SIII, &MRI); + } + + return CurrentInsertPoint; +} + +// Look for an earlier insert point if the SubExp +// writes to scc and scc is live at the CurrentInsertPoint. 
+static MachineBasicBlock::iterator adjustInsertPointForSubExpToAvoidSccSmash( + const SubExp &SubExpToMove, MachineBasicBlock *MBB, + MachineBasicBlock::iterator CurrentInsertPoint, MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { + const bool WillSmashScc = SubExpToMove.modifiesRegister(AMDGPU::SCC, SIRI); + if (WillSmashScc) { + CurrentInsertPoint = llvm::findOrCreateInsertionPointForSccDef( + MBB, CurrentInsertPoint, SIRI, SIII, &MRI); + } + + return CurrentInsertPoint; +} + +// Return trun if moving MI to Location will smash a live scc value. +static bool willSmashSccAtLocation(MachineInstr *MI, MachineBasicBlock *MBB, + MachineBasicBlock::iterator Location) { + // It is ok to pass nullptr to `modifiesRegister` for TRI here since + // SCC has no subreg/suprereg relationships. + return MI->modifiesRegister(AMDGPU::SCC, nullptr) && + llvm::isSccLiveAt(MBB, Location); +} + +void applyCloneRemat(Remat *Remat, RematNode &Node, + std::vector &HotBlocks, + MachineDominatorTree *DT, MachineRegisterInfo &MRI, + SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, MachineFunction &MF) { + unsigned Reg = Node.Reg; + + MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); + auto DefOp = DefMI->getOperand(0); + const MCInstrDesc &Desc = DefMI->getDesc(); + const TargetRegisterClass *RC = MRI.getRegClass(Reg); + // When the unique def has subReg, just create newReg for the subReg part. + bool IsSubRegDef = false; + if (DefOp.getSubReg() != 0) { + RC = SIRI->getSubRegisterClass(RC, DefOp.getSubReg()); + IsSubRegDef = true; + } + const DebugLoc DL = DefMI->getDebugLoc(); + unsigned OpNum = DefMI->getNumOperands(); + + Node.Kind = RematNode::RematKind::Clone; + + // Group user in same blocks. + BlockMap> UserMap; + DenseSet UserMBBSet; + for (auto UseIt = MRI.use_instr_nodbg_begin(Reg); + UseIt != MRI.use_instr_nodbg_end();) { + MachineInstr &UseMI = *(UseIt++); + UserMap[UseMI.getParent()].emplace_back(&UseMI); + UserMBBSet.insert(UseMI.getParent()); + } + + DenseMap DomMap = + reduceClonedMBBs(Reg, UserMap, UserMBBSet, HotBlocks, DT); + + for (auto UseIt : UserMap) { + MachineBasicBlock *MBB = UseIt.first; + // Skip same block uses. + if (MBB == DefMI->getParent()) { + continue; + } + // Skip MBB which share clone from other MBBs. + if (UserMBBSet.count(MBB) == 0) + continue; + + Register NewReg = MRI.createVirtualRegister(RC); + auto NewDef = BuildMI(MF, DL, Desc).addDef(NewReg); + for (unsigned I = 1; I < OpNum; I++) { + NewDef = NewDef.add(DefMI->getOperand(I)); + } + + MachineInstr *InsertPointMI = UseIt.second.front(); + SlotIndex LastSlot = SlotIndexes->getInstructionIndex(*InsertPointMI); + + for (MachineInstr *UseMI : UseIt.second) { + SlotIndex Slot = SlotIndexes->getInstructionIndex(*UseMI); + if (LastSlot > Slot) { + LastSlot = Slot; + InsertPointMI = UseMI; + } + } + + MachineBasicBlock::iterator InsertPoint = adjustInsertPointToAvoidSccSmash( + DefMI, InsertPointMI->getParent(), InsertPointMI, MRI, SIRI, SIII); + + for (MachineMemOperand *MO : DefMI->memoperands()) { + NewDef->addMemOperand(MF, MO); + } + + MBB->insert(InsertPoint, NewDef); + + SlotIndexes->insertMachineInstrInMaps(*NewDef); + + SmallVector &UserMIs = UseIt.second; + updateUsers(Reg, NewReg, IsSubRegDef, UserMIs); + + // update users in dom MBBs. 
+ auto DomMapIt = DomMap.find(MBB); + if (DomMapIt != DomMap.end()) { + for (MachineBasicBlock *UpdateMBB : DomMapIt->second) { + SmallVector &UserMIs = UserMap[UpdateMBB]; + updateUsers(Reg, NewReg, IsSubRegDef, UserMIs); + } + } + + llvm::removeUnusedLanes(*NewDef.getInstr(), MRI, SIRI, SIII, SlotIndexes); + } + if (MRI.use_empty(Reg)) { + SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI); + Remat->removeInst(DefMI); + DefMI->eraseFromParent(); + } +} + +void applyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI, + SlotIndexes *SlotIndexes, + const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII) { + MachineInstr *DefMI = Node.DefMI; + MachineInstr *InsertPointMI = Node.InsertPointMI; + MachineBasicBlock *MBB = nullptr; + + // Find a valid insert point. + MachineBasicBlock::iterator InsertPoint; + if (InsertPointMI) { + InsertPoint = InsertPointMI->getIterator(); + MBB = InsertPointMI->getParent(); + } else { + InsertPoint = Node.InsertBlock->getFirstTerminator(); + MBB = Node.InsertBlock; + } + + InsertPoint = adjustInsertPointToAvoidSccSmash(DefMI, MBB, InsertPoint, MRI, + SIRI, SIII); + + // Move instruction to new location. + DefMI->removeFromParent(); + InsertPoint->getParent()->insert(InsertPoint, DefMI); + + // Update slot index. + SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI); + SlotIndexes->insertMachineInstrInMaps(*DefMI); +} + +void applyRemat(Remat *Remat, MapVector &RematMap, + std::vector &HotBlocks, MachineDominatorTree *DT, + SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, + MachineFunction &MF) { + std::vector UpdateList; + for (auto &It : RematMap) { + UpdateList.emplace_back(It.second); + } + // Sort update list with slotIndex to make sure def moved before use. + // If use moved before def, It might not be the first use anymore. 
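// Editor's note (illustrative, not part of this patch): a MIR-flavoured
// sketch of what applyOneDefOneUseRemat() and applyCloneRemat() above do.
// Opcodes and register classes are placeholders.
//
//   Before:
//     bb.0:
//       %5:sreg_32 = S_MOV_B32 42   ; %5 stays live through hot bb.1/bb.2
//     bb.3:
//       $sgpr0 = COPY %5
//
//   After OneDefOneUse:
//     bb.3:
//       %5:sreg_32 = S_MOV_B32 42   ; def sunk next to its only use
//       $sgpr0 = COPY %5
//
// OneDefOneUse moves the single def next to its only use, so %5 stops being
// live through the hot blocks in between. Clone instead re-creates the def in
// every user block with a fresh vreg and erases the original once it has no
// remaining uses: more instructions, but no single dominating insert point is
// needed.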
+ std::sort(UpdateList.begin(), UpdateList.end(), + [&SlotIndexes](RematNode &I, RematNode &J) { + SlotIndex A = SlotIndexes->getInstructionIndex(*I.DefMI); + SlotIndex B = SlotIndexes->getInstructionIndex(*J.DefMI); + return A < B; + }); + + for (RematNode &Node : UpdateList) { + if (Node.Kind == RematNode::RematKind::OneDefOneUse) { + applyOneDefOneUseRemat(Node, MRI, SlotIndexes, SIRI, SIII); + } else if (Node.Kind == RematNode::RematKind::Clone) { + applyCloneRemat(Remat, Node, HotBlocks, DT, MRI, SlotIndexes, SIRI, SIII, + MF); + } + } +} + +void dumpRematMap(MapVector &RematMap, + const SIRegisterInfo *SIRI) { + dbgs() << "\n rematMap: \n"; + for (auto It : RematMap) { + int Reg = It.first; + dbgs() << printReg(Reg, SIRI); + dbgs() << "\n"; + } +} + +int DebugBlockIndex = 42; + +void dumpHotBlock(const GCNRPTracker::LiveRegSet &LiveSet, + MapVector &VRematMap, + MapVector &SRematMap, int BlockIndex, + const SIRegisterInfo *SIRI) { + if (DebugBlockIndex != BlockIndex) + return; + llvm::dumpLiveSet(LiveSet, SIRI); + dumpRematMap(VRematMap, SIRI); + dumpRematMap(SRematMap, SIRI); +} + +void dumpCandidates(std::vector &RematCandidates, int BlockIndex, + const SIRegisterInfo *SIRI) { + if (DebugBlockIndex != BlockIndex) + return; + dbgs() << "\n Candidates: \n"; + unsigned TotalSize = 0; + for (RematNode &Node : RematCandidates) { + dbgs() << printReg(Node.Reg, SIRI) << " size:" << Node.Size; + dbgs() << "\n"; + TotalSize += Node.Size; + } + dbgs() << "Total Size:" << TotalSize << "\n"; +} + +} // namespace + +bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, + LiveIntervals *LIS, MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, bool &IsNearTarget) { + const GCNSubtarget *ST = &MF.getSubtarget(); + + const SIInstrInfo *SIII = ST->getInstrInfo(); + const SIRegisterInfo *SIRI = ST->getRegisterInfo(); + + ReversePostOrderTraversal RPOT(&MF); + DenseMap RPOTIndexMap; + for (MachineBasicBlock *MBB : RPOT) { + RPOTIndexMap[MBB] = RPOTIndexMap.size(); + } + + auto &MRI = MF.getRegInfo(); + + bool IsUpdated = false; + RematStatus Status = getRematStatus(MF, MLI, LIS, MRI, ST); + + const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second; + if (Status.TargetOcc >= MaxOcc) + return false; + + unsigned VLimit = Status.TargetVLimit; + unsigned SLimit = Status.TargetSLimit; + + int RematSCnt = Status.MaxSPressure - SLimit; + // when agressive sgpr remat, reserve some for allocation lost. + if (EnableAggressive) + RematSCnt += NearTargetRegLimit; + + bool IsSGPRSpill = false; + if (RematSCnt > 0) { + IsSGPRSpill = nearSgprSpill(Status.MaxSPressure, ST, MF); + } + + const bool IsForceRematSgpr = IsSGPRSpill || Status.NotBalance; + + // If bound by lds, skip. + if (Status.TargetOcc > ST->getOccupancyWithWorkGroupSizes(MF).second && + !IsForceRematSgpr) + return false; + + MachineBasicBlock *EntryMBB = &MF.front(); + + auto *SlotIndexes = LIS->getSlotIndexes(); + + // Reg which already marked remat. + MapVector VRematMap; + MapVector SRematMap; + // Reg which cannot move around to remat. + DenseSet PinnedRegSet; + std::vector HotBlocks; + for (auto It = po_begin(EntryMBB); It != po_end(EntryMBB); It++) { + MachineBasicBlock *MBB = *It; + auto &RP = Status.MBBPressureMap[MBB]; + // ignore block not hot. + if (RP.getVGPRNum(ST->hasGFX90AInsts()) < Status.TargetVLimit && + (RP.getMaxSGPR() + RegForVCC + Status.InputPhysicalSPressure) < + Status.TargetSLimit) + continue; + // Collect reg pressure. 
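// Editor's note (illustrative, not part of this patch): made-up numbers for
// how the per-block scan below sizes the remat work of one hot block.
static int rematWorkExample() {
  int MaxSPressure = 60; // peak SGPR pressure measured in this block
  int SReduced = 6;      // pressure already freed by earlier remat decisions
  int SLimit = 48;       // Status.TargetSLimit for the target occupancy
  return MaxSPressure - SReduced - SLimit; // 6 SGPRs still to rematerialize
}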
+ unsigned MaxVPressure = 0; + unsigned MaxSPressure = 0; + const GCNRPTracker::LiveRegSet InputLive = Status.MBBInputLiveMap[MBB]; + + const GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[MBB]; + LLVM_DEBUG( + dumpHotBlock(InputLive, VRematMap, SRematMap, MBB->getNumber(), SIRI)); + + GCNDownwardRPTracker Tracker(*LIS); + + Tracker.reset(*MBB->begin(), &InputLive); + + for (MachineInstr &MI : *MBB) { + if (MI.isDebugInstr()) + continue; + Tracker.advance(); + auto LISLR = Tracker.getLiveRegs(); + // Update live set for things already remated. + updateLiveInfo(VRematMap, LISLR, InputLive, MBB, RPOTIndexMap); + updateLiveInfo(SRematMap, LISLR, InputLive, MBB, RPOTIndexMap); + + const GCNRPTracker::LiveRegSet &LiveSet = LISLR; + unsigned VPressure = 0; + unsigned SPressure = 0; + collectLiveSetPressure(LiveSet, MRI, SIRI, VPressure, SPressure); + if (MaxVPressure < VPressure) + MaxVPressure = VPressure; + if (MaxSPressure < SPressure) + MaxSPressure = SPressure; + } + MaxSPressure += RegForVCC + Status.InputPhysicalSPressure; + if (MaxVPressure <= VLimit && MaxSPressure <= SLimit) + continue; + + // Build block live info. + // Use outputLive for EntryMBB. + BlockLiveInfo LiveInfo = {MBB, MaxSPressure, MaxVPressure, + MBB != EntryMBB ? InputLive : OutputLive}; + // Skip entry block when save hotBlock to reduce clone because not clone in + // entry block. + if (MBB != EntryMBB) + HotBlocks.emplace_back(LiveInfo); + GCNRPTracker::LiveRegSet CandidateRegs = LiveInfo.InputLive; + + // Update reg pressure based on remat list. + InstSet VReducedInsts; + InstSet SReducedInsts; + int VReduced = getReducedSize(VRematMap, CandidateRegs, VReducedInsts, MRI, + LiveInfo, RPOTIndexMap); + int SReduced = getReducedSize(SRematMap, CandidateRegs, SReducedInsts, MRI, + LiveInfo, RPOTIndexMap); + + // Calculate size need to be remat. + int RematVCnt = MaxVPressure - VReduced - VLimit; + int RematSCnt = MaxSPressure - SReduced - SLimit; + + bool IsSGPRSpill = false; + if (RematSCnt > 0) { + IsSGPRSpill = nearSgprSpill(MaxSPressure, ST, MF); + } + bool IsForceRematSgpr = IsSGPRSpill || Status.NotBalance; + // Try to add candidates into remat list. + + int NewRematSCnt = 0; + if (RematSCnt > 0) { + // Build candidate nodes. + std::vector SRematCandidates; + buildRematCandiates(SRematCandidates, CandidateRegs, PinnedRegSet, MRI, + SIII, SIRI, /*IsVGPR*/ false); + + LLVM_DEBUG(dumpCandidates(SRematCandidates, MBB->getNumber(), SIRI)); + std::vector SRematList; + // Filter candidates. + NewRematSCnt = filterRematCandiates(SRematCandidates, SRematList, + PinnedRegSet, DT, PDT, MLI, MRI, + /*IsVGPR*/ false, Status.MemBound); + if (NewRematSCnt > RematSCnt) { + // Has enough remat node to cover rematCnt. + int RematCnt = 0; + for (RematNode &Node : SRematList) { + SRematMap[Node.Reg] = Node; + RematCnt += Node.Size; + if (RematCnt > RematSCnt && !EnableAggressive) + break; + } + NewRematSCnt = 0; + } else { + + for (RematNode &Node : SRematList) { + SReducedInsts.insert(Node.DefMI); + } + // Check shared size. + int SharedReducedSize = + getSharedReducedSize(SReducedInsts, /*IsVGPR*/ false, MRI, SIRI); + if (((NewRematSCnt + SharedReducedSize) + (int)NearTargetRegLimit) >= + RematSCnt) { + for (RematNode &Node : SRematList) { + SRematMap[Node.Reg] = Node; + } + } else { + if (!IsForceRematSgpr) + return false; + for (RematNode &Node : SRematList) { + SRematMap[Node.Reg] = Node; + } + // Find local one def one use candidates. 
+ for (MachineInstr &MI : *MBB) { + if (MI.isDebugInstr()) + continue; + if (MI.getDesc().NumDefs != 1) + continue; + MachineOperand &DstMO = MI.getOperand(0); + Register Reg = DstMO.getReg(); + if (!SIRI->isSGPRReg(MRI, Reg)) + continue; + if (!MRI.hasOneNonDBGUse(Reg)) + continue; + if (!MRI.hasOneDef(Reg)) + continue; + if (Reg.isPhysical()) + continue; + MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(Reg); + if (UseMI.getParent() != MBB) + continue; + int Gain = rematGain(&MI, Reg, MRI, SIRI, + /*IsVGPR*/ false); + if (Gain > 0) { + // Skip case when DefMI has implicit define which used by UseMI. + if (isImplicitDefUse(&MI, &UseMI)) { + continue; + } + RematNode Node = {Reg, &MI, (unsigned)Gain >> 5}; + Node.InsertPointMI = &UseMI; + Node.Kind = RematNode::RematKind::OneDefOneUse; + SRematMap[Reg] = Node; + SharedReducedSize += Node.Size; + } + } + } + NewRematSCnt = RematSCnt - NewRematSCnt - SharedReducedSize; + } + } + // If works, continue. + + // Collect live range from hot inst. + // find common live range in hot insts. + // Remat these common live range. + // Apply the remat. + + int NewRematVCnt = 0; + if (RematVCnt > 0) { + // TODO: V remat. + } + + bool NeedSRemat = RematSCnt > 0; + bool NeedVRemat = RematVCnt > 0; + // If sgpr spill, always do remat. + bool IsSRematOK = + (NewRematSCnt <= 0 && !SRematMap.empty()) || IsForceRematSgpr; + bool IsVRematOK = + (Status.NotBalance || NewRematVCnt <= 0) && !VRematMap.empty(); + if (NeedSRemat && NeedVRemat) { + if (IsVRematOK && IsSRematOK) { + IsUpdated = true; + } else if (IsSGPRSpill) { + IsUpdated = true; + } + } else if (NeedSRemat) { + if (IsSRematOK) { + IsUpdated = true; + } + } else if (NeedVRemat) { + if (IsVRematOK) { + IsUpdated = true; + } + } + // TODO: what to do when cannot reach target? + if (NewRematSCnt > 0) { + if ((unsigned)NewRematSCnt <= NearTargetRegLimit) { + IsNearTarget = true; + } else { + if (!IsSGPRSpill) + return false; + } + } + } + + if (SRematMap.empty() && VRematMap.empty()) { + return IsUpdated; + } + + if (!SRematMap.empty()) { + IsUpdated = true; + applyRemat(Remat, SRematMap, HotBlocks, DT, SlotIndexes, MRI, SIRI, SIII, + MF); + LLVM_DEBUG(llvm::dbgs() << "after hotremat"; MF.print(dbgs());); + } + + // Balance between vector and scalar if possible. + return IsUpdated; +} + +namespace { +bool isPhyRegUniqueDef(unsigned Reg, const MachineRegisterInfo &MRI) { + DenseSet DefMIs; + for (MachineInstr &DefMI : MRI.def_instructions(Reg)) { + // skip implicit def. + if (DefMI.getOpcode() == AMDGPU::IMPLICIT_DEF) + continue; + DefMIs.insert(&DefMI); + } + return DefMIs.size() == 1; +} + +static bool isImplicitUseOfReg(const MachineOperand &MO, unsigned Reg) { + if (!MO.isImplicit() || !MO.isUse() || !MO.isReg()) { + return false; + } + + return MO.getReg() == Reg; +} + +static bool isSafeRematCandidateUser(const MachineInstr *UseMI, + const SIInstrInfo *SIII) { + // Make sure UseMI is not wqm like sample. + if (SIII->isWQM(UseMI->getOpcode())) + return false; + if (UseMI->getOpcode() == AMDGPU::PHI) + return false; + + return true; +} + +static bool isConvergent(Remat *Remat, const MachineInstr &MI) { + return MI.isConvergent() && + // This flag is set on readfirstlane's to indicate that they + // are redundant (the value being read is already uniform). + // Normally, readfirstlanes are convergent, because different exec + // will cause a different value to be read; a known uniform + // readfirstlane is safe to move or clone and not actually convergent. 
+ !Remat->TotalUniformInsts.count(&MI); +} + +bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, + bool IsSink) { + if (Reg.isPhysical()) + return false; + bool IsVGPR = SIRI->isVGPR(MRI, Reg); + + MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); + if (!DefMI) + return false; + if (DefMI->getOpcode() == AMDGPU::PHI) + return false; + + // Skip convergent. + if (isConvergent(Remat, *DefMI)) + return false; + + // Skip inst has more than 1 def. + if (DefMI->getDesc().NumDefs > 1) + return false; + + unsigned OpNum = DefMI->getNumOperands(); + + // Only move DefMI which all operand is unique def. + for (unsigned I = 0; I < OpNum; I++) { + MachineOperand &Op = DefMI->getOperand(I); + if (!Op.isReg()) + continue; + Register OpReg = Op.getReg(); + if (isImplicitUseOfReg(Op, AMDGPU::EXEC) || + isImplicitUseOfReg(Op, AMDGPU::EXEC_LO)) + continue; + if (isImplicitUseOfReg(Op, AMDGPU::MODE)) + continue; + if (isImplicitUseOfReg(Op, AMDGPU::M0) && isPhyRegUniqueDef(OpReg, MRI)) + continue; + // Alow unused scc define. + if (Op.isImplicit() && Op.isDead() && Op.isDef()) + continue; + if (OpReg.isPhysical()) + return false; + if (!MRI.getUniqueVRegDef(OpReg) && + !llvm::isSub0Sub1SingleDef(OpReg, MRI)) { + return false; + } + } + + if (IsVGPR && IsSink) { + // Skip mem related inst. + if (DefMI->mayLoadOrStore()) { + return false; + } + + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + if (!isSafeRematCandidateUser(&UseMI, SIII)) + return false; + } + } + + return true; +} + +std::vector buildSubExpFromCandidates( + Remat *Remat, GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, + const MachineRegisterInfo &MRI, SlotIndexes *SlotIndexes, + GCNRPTracker::LiveRegSet &UnusedPassThrus, bool AllowPartialUseInSubExp) { + InstSet CandidateDefs; + DenseSet RemovedCandidates; + std::vector CandidateRegs; + CandidateRegs.reserve(Candidates.size()); + for (auto It : Candidates) { + unsigned Reg = It.first; + CandidateRegs.emplace_back(Reg); + } + // Sort candidate by defMI order to make sure defMI has dependent check after + // all its dependent node. + std::sort(CandidateRegs.begin(), CandidateRegs.end(), + [&MRI, &SlotIndexes](const unsigned A, unsigned B) { + MachineInstr *MIa = MRI.getUniqueVRegDef(A); + + MachineInstr *MIb = MRI.getUniqueVRegDef(B); + // Later instr first. + return !SlotIndex::isEarlierInstr( + SlotIndexes->getInstructionIndex(*MIa), + SlotIndexes->getInstructionIndex(*MIb)); + }); + + // If Candidate def has user in MBB, add It when allow partial candidates. + // And the subExp has the define could only be clone, cannot move cross blocks + // because user in MBB. + DenseSet PartialCandidates; + LLVM_DEBUG(dbgs() << "\nCandidate Defs:\n";); + for (unsigned Reg : CandidateRegs) { + MachineInstr *MI = MRI.getUniqueVRegDef(Reg); + bool IsHasNoCandidatesSameBlockUser = false; + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + if (UseMI.getParent() == MI->getParent()) { + if (UseMI.getNumExplicitDefs() == 1) { + // Skip user which already in Candidates. 
+ Register UserDefReg = UseMI.getOperand(0).getReg(); + if (Candidates.count(UserDefReg) > 0 && + RemovedCandidates.count(UserDefReg) == 0) + continue; + } + if (!AllowPartialUseInSubExp) + IsHasNoCandidatesSameBlockUser = true; + else + PartialCandidates.insert(MI); + break; + } + } + if (IsHasNoCandidatesSameBlockUser) { + RemovedCandidates.insert(Reg); + continue; + } + LLVM_DEBUG(MI->dump()); + CandidateDefs.insert(MI); + } + LLVM_DEBUG(dbgs() << "\nCandidate Defs End\n";); + + if (CandidateDefs.empty()) + return std::vector(); + for (unsigned Reg : RemovedCandidates) { + UnusedPassThrus[Reg] = Candidates[Reg]; + Candidates.erase(Reg); + } + + // iterate MBB backward. + // add inst which only used for candidate defines. + for (auto It = MBB->rbegin(); It != MBB->rend(); It++) { + MachineInstr &MI = *It; + if (CandidateDefs.count(&MI) > 0) { + continue; + } + + if (isConvergent(Remat, MI)) + continue; + // Skip if MI is not safe to move. + if (MI.getNumDefs() != 1) { + // allow to move unused implicit def. + bool IsDeadImplictDef = false; + for (MachineOperand &MO : MI.implicit_operands()) { + if (!MO.isReg()) + continue; + if (!MO.isDef()) + continue; + IsDeadImplictDef = MO.isDead(); + } + if (!IsDeadImplictDef) + continue; + } + + unsigned Reg = -1; + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + if (!MO.isDef()) + continue; + Reg = MO.getReg(); + break; + } + + if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*IsSink*/ true)) + continue; + + // If all users of MI are in candidate defs, add MI into candidate defs. + // If part of user of MI is in candidate defs, add MI into candidate defs + // when allow partialUse. + bool IsAllUserInCandidate = true; + bool IsHasCandidateUser = false; + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + if (CandidateDefs.count(&UseMI) == 0) + IsAllUserInCandidate = false; + else + IsHasCandidateUser = true; + } + if (!IsHasCandidateUser) + continue; + if (!IsAllUserInCandidate) { + if (!AllowPartialUseInSubExp) + continue; + PartialCandidates.insert(&MI); + } + + CandidateDefs.insert(&MI); + } + + // Collect input for CandidateDefs. + GCNRPTracker::LiveRegSet CandidateInput; + for (MachineInstr *MI : CandidateDefs) { + for (MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + Register Reg = MO.getReg(); + if (MO.isImplicit() && Reg.isPhysical()) + continue; + + MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); + assert((DefMI || llvm::isSub0Sub1SingleDef(Reg, MRI)) && + "UseMI should be safe to move"); + if (DefMI && CandidateDefs.count(DefMI) > 0) + continue; + // Add to input. + CandidateInput[Reg] |= llvm::getRegMask(MO, MRI); + } + } + + // Build defs in order. + std::vector Defs; + Defs.reserve(CandidateDefs.size()); + for (MachineInstr &MI : *MBB) { + if (CandidateDefs.count(&MI) == 0) + continue; + Defs.emplace_back(&MI); + } + + LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; + for (MachineInstr *MI : Defs) { MI->dump(); } dbgs() + << "\nFinished Candidate Defs End\n";); + + // Build SubExp with CandidateDefs as Nodes, CandidateInput as input + // Candidates as output. 
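// Editor's note (illustrative, not part of this patch): ExpDag and SubExp are
// declared in AMDGPUSubExpDag.h, which this patch adds but which is not shown
// in this hunk; the field meanings below are inferred from how they are used
// in this file. Each SubExp is a self-contained slice of the block: SUnits are
// its instructions in program order, its inputs are block live-ins and its
// outputs are candidate registers, so the slice can be moved from FromBB
// toward ToBB or cloned as one unit.
static void walkSubExps(ExpDag &Dag) { // hypothetical helper
  for (SubExp &Exp : Dag.SubExps) {
    // IsCloneOnly means some def still has a user inside FromBB, so this
    // slice may only be duplicated, never moved across blocks.
    bool MustClone = Exp.IsCloneOnly;
    (void)MustClone;
    for (MachineInstr *MI : Exp.SUnits)
      (void)MI; // the instructions making up the slice
  }
}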
+ ExpDag Dag(MRI, SIRI, SIII, /*IsJoinInput*/ true); + Dag.build(CandidateInput, Candidates, Defs); + if (AllowPartialUseInSubExp) { + for (auto &SubExp : Dag.SubExps) { + for (auto *MI : SubExp.SUnits) { + if (PartialCandidates.count(MI)) { + SubExp.IsCloneOnly = true; + break; + } + } + } + } + return Dag.SubExps; +} + +std::vector buildSubExpFromCandidatesTopBottom( + Remat *Remat, GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, + const MachineRegisterInfo &MRI) { + InstSet CandidateDefs; + + LLVM_DEBUG(dbgs() << "\nCandidate Defs:\n";); + for (auto It : Candidates) { + unsigned Reg = It.first; + MachineInstr *MI = MRI.getUniqueVRegDef(Reg); + + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + if (isConvergent(Remat, UseMI)) + continue; + MachineBasicBlock *UseMBB = UseMI.getParent(); + if (UseMBB == MI->getParent()) + continue; + assert(UseMBB == MBB && "block mismatch"); + // If all operands in CandidateRegs, add to candidateDefs. + bool IsHasOpRegNotInCandidates = false; + for (MachineOperand &MO : UseMI.operands()) { + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + Register OpReg = MO.getReg(); + if (MO.isImplicit() && OpReg.isPhysical()) + continue; + if (Candidates.count(OpReg) == 0) { + IsHasOpRegNotInCandidates = true; + break; + } + } + if (IsHasOpRegNotInCandidates) + continue; + + LLVM_DEBUG(UseMI.dump()); + CandidateDefs.insert(&UseMI); + } + } + LLVM_DEBUG(dbgs() << "\nCandidate Defs End\n";); + + if (CandidateDefs.empty()) + return std::vector(); + + // iterate MBB. + GCNRPTracker::LiveRegSet LocalCandidates = Candidates; + // add inst which only used by candidate defines. + for (auto It = MBB->begin(); It != MBB->end(); It++) { + MachineInstr &MI = *It; + if (CandidateDefs.count(&MI) > 0) { + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + if (!MO.isDef()) + continue; + Register Reg = MO.getReg(); + if (Reg.isPhysical()) + continue; + LocalCandidates[Reg]; + } + continue; + } + + // Skip if MI is not safe to move. + if (isConvergent(Remat, MI)) + continue; + + if (MI.getNumDefs() != 1) + continue; + + if (MI.mayLoadOrStore()) { + continue; + } + + unsigned Reg = -1; + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + if (!MO.isDef()) + continue; + Reg = MO.getReg(); + break; + } + + // Still use bsink to skip mem load/store. + // if (!isSafeCandidate(Reg, MRI, SIRI, SIII, /*IsSink*/true)) + // continue; + + // If all user of MI is in candidate defs, add MI into candidate defs. + bool IsAllOperandInCandidate = true; + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + Register OpReg = MO.getReg(); + if (LocalCandidates.count(OpReg)) + continue; + + if (MO.isImplicit() && + (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO)) + continue; + if (OpReg.isPhysical()) { + IsAllOperandInCandidate = false; + break; + } + MachineInstr *OpMI = MRI.getUniqueVRegDef(OpReg); + if (!OpMI) { + IsAllOperandInCandidate = false; + break; + } + if (CandidateDefs.count(OpMI) == 0) { + IsAllOperandInCandidate = false; + break; + } + if (MO.isTied()) + continue; + } + if (!IsAllOperandInCandidate) + continue; + LLVM_DEBUG(llvm::dbgs() << "Add local candidates:"; + pressure::print_reg(Reg, MRI, SIRI, llvm::dbgs());); + LocalCandidates[Reg]; + CandidateDefs.insert(&MI); + } + + // Collect input for CandidateDefs. 
+ GCNRPTracker::LiveRegSet CandidateInput; + for (MachineInstr *MI : CandidateDefs) { + for (MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + Register Reg = MO.getReg(); + if (MO.isImplicit() && (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)) + continue; + if (Reg.isPhysical()) + continue; + MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); + if (!DefMI) { + // Skip local def which is not unique. + if (MO.isTied()) + continue; + if (Candidates.count(Reg) == 0 && LocalCandidates.count(Reg) != 0) + continue; + } + assert((DefMI || llvm::isSub0Sub1SingleDef(Reg, MRI)) && + "UseMI should be safe to move"); + if (DefMI && CandidateDefs.count(DefMI) > 0) + continue; + // Add to input. + CandidateInput[Reg] = llvm::getRegMask(MO, MRI); + } + } + + // Build defs in order. + std::vector Defs; + Defs.reserve(CandidateDefs.size()); + for (MachineInstr &MI : *MBB) { + if (CandidateDefs.count(&MI) == 0) + continue; + Defs.emplace_back(&MI); + } + + LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; + for (MachineInstr *MI : Defs) { MI->dump(); } dbgs() + << "\nFinished Candidate Defs End\n";); + + LLVM_DEBUG(dbgs() << "\nLocalCandidates:\n"; for (auto It : LocalCandidates) { + pressure::print_reg(It.first, MRI, SIRI, llvm::dbgs()); + } dbgs() << "\nLocalCandidates End\n";); + // Make sure all input reg are uniqueDef. + // Input is Candidates, output is? + // Build SubExp with CandidateDefs as Nodes, CandidateInput as input + // Candidates as output. + ExpDag Dag(MRI, SIRI, SIII, /*IsJoinInput*/ true); + Dag.build(Candidates, LocalCandidates, Defs); + return Dag.SubExps; +} + +void printVreg(Register Reg, const MachineRegisterInfo &MRI) { + if (Reg.isVirtual()) { + StringRef Name = MRI.getVRegName(Reg); + if (Name != "") { + dbgs() << '%' << Name; + } else { + dbgs() << '%' << Reg.virtRegIndex(); + } + } +} + +MachineBasicBlock *findTargetBlock(unsigned Reg, MachineBasicBlock *FromBB, + const MachineRegisterInfo &MRI, + MachineDominatorTree *DT) { + BlockSet UserBlocks; + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + MachineBasicBlock *UserBB = UseMI.getParent(); + // Skip current BB. + if (UserBB != FromBB) + UserBlocks.insert(UserBB); + else + // When has user in FromBB, userBlock will be FromBB. + return nullptr; + } + if (UserBlocks.empty()) + return nullptr; + MachineBasicBlock *UserBlock = nearestCommonDominator(DT, UserBlocks); + if (!DT->dominates(FromBB, UserBlock)) { + return nullptr; + } + if (UserBlock == FromBB) + return nullptr; + return UserBlock; +} + +void applySubExpMoveNearUser(SubExp &Exp, const MachineRegisterInfo &MRI, + MachineDominatorTree *DT, + SlotIndexes *SlotIndexes) { + // Move from bottom. + MachineBasicBlock *FromBB = Exp.FromBB; + for (auto It = Exp.SUnits.rbegin(); It != Exp.SUnits.rend(); It++) { + MachineInstr *DefMI = *It; + if (DefMI->getNumExplicitDefs() != 1) + continue; + + Register Reg = DefMI->getOperand(0).getReg(); + MachineBasicBlock *ToBB = findTargetBlock(Reg, FromBB, MRI, DT); + if (!ToBB) + continue; + + // Do not overwrite a live scc. + MachineBasicBlock::iterator InsertPoint = + ToBB->SkipPHIsAndLabels(ToBB->begin()); + if (willSmashSccAtLocation(DefMI, ToBB, InsertPoint)) + continue; + + DefMI->removeFromParent(); + assert(!llvm::isExecUpdateForControlFlow(*InsertPoint) && + "invalid insert point"); + ToBB->insert(InsertPoint, DefMI); + // Debug insts don't need slot index. + if (DefMI->isDebugInstr()) + continue; + // Update slot index. 
+ SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI); + SlotIndexes->insertMachineInstrInMaps(*DefMI); + } +} + +void applySubExpMoveNearDefine(SubExp &Exp, MachineRegisterInfo &MRI, + SlotIndexes *SlotIndexes, + const SIInstrInfo *SIII, + const SIRegisterInfo *SIRI) { + // Move from top. + // Find lowest input def. + MachineBasicBlock *ToBB = Exp.ToBB; + assert(!ToBB->empty() && "ToBB have instructions for define of input nodes"); + auto Terminator = ToBB->getFirstTerminator(); + if (Terminator == ToBB->end() && ToBB->succ_size() == 1) { + MachineInstr &EndMI = *ToBB->rbegin(); + if (SIII->isSchedulingBoundary(EndMI, ToBB, *ToBB->getParent())) + // Insert before the scheduling boundary instruction. + Terminator = EndMI.getIterator(); + else + // No boundary so just insert inst at the end of the block. + Terminator = ToBB->end(); + } + + Terminator = adjustInsertPointForSubExpToAvoidSccSmash(Exp, ToBB, Terminator, + MRI, SIRI, SIII); + + for (auto It = Exp.SUnits.begin(); It != Exp.SUnits.end(); It++) { + MachineInstr *DefMI = *It; + if (DefMI->getNumExplicitDefs() != 1) + continue; + if (SIII->isEXP(DefMI->getOpcode())) + continue; + if (DefMI->mayStore()) + continue; + // Find def for DefMI operands as insert point. + DefMI->removeFromParent(); + ToBB->insert(Terminator, DefMI); + + // Debug insts don't need slot index. + if (DefMI->isDebugInstr()) + continue; + // Update slot index. + SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI); + SlotIndexes->insertMachineInstrInMaps(*DefMI); + } +} + +DenseSet buildCloneSet(ExpDag &Dag, + DenseSet &DagBottoms, + GCNRPTracker::LiveRegSet &UsedOutput) { + DenseSet CopySet; + for (auto It = Dag.SUnits.rbegin(); It != Dag.SUnits.rend(); It++) { + SUnit &SU = *It; + // Skip non-inst node. + if (!SU.isInstr()) + continue; + MachineInstr *MI = SU.getInstr(); + if (DagBottoms.find(&SU) != DagBottoms.end()) { + bool IsUsed = false; + // For bottom SU, if in usedOutput, add to copySet; + for (MachineOperand &DefMO : MI->defs()) { + if (!DefMO.isReg()) + continue; + Register Reg = DefMO.getReg(); + if (UsedOutput.count(Reg) > 0) { + IsUsed = true; + break; + } + } + if (IsUsed) { + CopySet.insert(MI); + continue; + } + // bottom SU may still have succNode when It used both inExp and outExp. + // So continue check succNode. + } + + // If any SuccNode is in copySet, add to copySet. + bool IsSuccCopied = false; + for (SDep &SucDep : SU.Succs) { + SUnit *SucSU = SucDep.getSUnit(); + MachineInstr *SuccMI = SucSU->getInstr(); + if (CopySet.count(SuccMI) > 0) { + IsSuccCopied = true; + break; + } + } + if (IsSuccCopied) + CopySet.insert(MI); + } + return CopySet; +} + +void updateUsers(SmallVector &UserMIs, + DenseMap &RegMap) { + + for (MachineInstr *UserMI : UserMIs) { + for (MachineOperand &MO : UserMI->uses()) { + if (!MO.isReg()) + continue; + Register Reg = MO.getReg(); + auto It = RegMap.find(Reg); + if (It == RegMap.end()) + continue; + unsigned NewReg = It->second; + MO.setReg(NewReg); + } + } +} + +struct HotBlock { + MachineBasicBlock *MBB = nullptr; + GCNRPTracker::LiveRegSet InputLive; + std::pair MaxPressures; + // Info about vmemLd. + int VmemLdInputSize; + int VmemLdOutputSize; +}; + +DenseMap reduceClonedMBBs( + SubExp &Exp, + MapVector> &UserBlocks, + DenseMap &UserBlocksLiveRegs, + std::vector &HotBlocks, MachineDominatorTree *DT) { + // Collect hot blocks which Exp is live in. 
+ DenseSet HotBlockSet; + for (HotBlock &HotBlock : HotBlocks) { + for (unsigned Reg : Exp.BottomRegs) { + if (HotBlock.InputLive.count(Reg)) { + HotBlockSet.insert(HotBlock.MBB); + break; + } + } + } + + // For userBlocks which dominate all hotBlocks, don't need to clone because + // the value not cross hotBlocks when later blocks are cloned. + // For userBlocks which dominated by all hotBlocks, they could share clones + // because once after hot block, the pressure is OK. + DenseSet AfterHotRangeMBBs; + for (auto It : UserBlocksLiveRegs) { + MachineBasicBlock *MBB = It.first; + // Always clone in hot block. + if (HotBlockSet.count(MBB)) + continue; + + bool IsDomAllHotBlocks = true; + bool IsDomedByAllHotBlocks = true; + for (MachineBasicBlock *HotMBB : HotBlockSet) { + if (!DT->dominates(MBB, HotMBB)) { + IsDomAllHotBlocks = false; + } + if (!DT->dominates(HotMBB, MBB)) { + IsDomedByAllHotBlocks = false; + } + if (!IsDomAllHotBlocks && !IsDomedByAllHotBlocks) { + break; + } + } + if (IsDomAllHotBlocks) { + UserBlocks.erase(MBB); + } else if (IsDomedByAllHotBlocks) { + AfterHotRangeMBBs.insert(MBB); + } + } + + // Split after hotRange block set by domtree. + DenseMap DomMap; + if (!AfterHotRangeMBBs.empty()) { + for (MachineBasicBlock *MBB : AfterHotRangeMBBs) { + for (MachineBasicBlock *MBB2 : AfterHotRangeMBBs) { + if (MBB == MBB2) + continue; + if (DT->dominates(MBB, MBB2)) { + auto &Dom = DomMap[MBB]; + Dom.insert(MBB2); + auto &Dom2 = DomMap[MBB2]; + Dom.insert(Dom2.begin(), Dom2.end()); + } + } + } + for (MachineBasicBlock *MBB : AfterHotRangeMBBs) { + auto &UsedOutput = UserBlocksLiveRegs[MBB]; + auto &Dom = DomMap[MBB]; + for (MachineBasicBlock *DomedMBB : Dom) { + // Merge domed use to MBB use. + mergeLiveRegSet(UsedOutput, UserBlocksLiveRegs[DomedMBB]); + // Remove domedMBB. + DomMap.erase(DomedMBB); + UserBlocksLiveRegs.erase(DomedMBB); + } + } + } + + return DomMap; +} + +void applySubExpCloneNearUser(SubExp &Exp, std::vector &HotBlocks, + MachineDominatorTree *DT, + MachineRegisterInfo &MRI, + SlotIndexes *SlotIndexes, const SIInstrInfo *SIII, + const SIRegisterInfo *SIRI) { + MapVector> UserBlocks; + DenseMap UserBlocksLiveRegs; + for (unsigned Reg : Exp.BottomRegs) { + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + MachineBasicBlock *UserBB = UseMI.getParent(); + // Skip current BB. + if (UserBB == Exp.FromBB) + continue; + + UserBlocks[UserBB].emplace_back(&UseMI); + auto &UserLives = UserBlocksLiveRegs[UserBB]; + for (MachineOperand &MO : UseMI.uses()) { + if (!MO.isReg()) + continue; + Register UseReg = MO.getReg(); + if (Reg != UseReg) + continue; + UserLives[Reg] |= getRegMask(MO, MRI); + } + } + } + // Build dag for SubExp to help remove unused inst when clone. + ExpDag Dag(MRI, SIRI, SIII, /*IsJoinInput*/ true); + Dag.build(Exp.InputLive, Exp.OutputLive, Exp.SUnits); + DenseSet DagBottoms; + for (SUnit &SU : Dag.SUnits) { + if (!SU.isInstr()) + continue; + if (SU.NumSuccs == 0) { + DagBottoms.insert(&SU); + } else { + MachineInstr *MI = SU.getInstr(); + // Add SU which def value in Exp.outputLive. + for (MachineOperand &DefMO : MI->defs()) { + if (!DefMO.isReg()) + continue; + Register Reg = DefMO.getReg(); + if (Exp.BottomRegs.count(Reg) > 0) { + DagBottoms.insert(&SU); + break; + } + } + } + } + + // For userBlocks which dominate all hotBlocks, don't need to clone because + // the value not cross hotBlocks when later blocks are cloned. 
+ // For userBlocks which dominated by all hotBlocks, they could share clones + // because once after hot block, the pressure is OK. + DenseMap DomMap = + reduceClonedMBBs(Exp, UserBlocks, UserBlocksLiveRegs, HotBlocks, DT); + + // Sort to make stable order. + std::sort( + UserBlocks.begin(), UserBlocks.end(), + [](std::pair> &It0, + std::pair> &It1) { + return It0.first->getNumber() < It1.first->getNumber(); + }); + + const bool IsModifiesScc = Exp.modifiesRegister(AMDGPU::SCC, SIRI); + + // Clone for each userBlocks. Not share clone thru dom tree which cannot help + // reg pressure. + for (auto It : UserBlocks) { + MachineBasicBlock *MBB = It.first; + // Skip MBB which share clone from other MBBs. + if (UserBlocksLiveRegs.count(MBB) == 0) + continue; + auto &UsedOutput = UserBlocksLiveRegs[MBB]; + auto CopySet = buildCloneSet(Dag, DagBottoms, UsedOutput); + // Clone to MBB. + // Create new regs first. + DenseMap RegMap; + auto InsertPtr = MBB->getFirstNonPHI(); + // If Exp has scc read/write, make sure MBB not have scc in liveins. + if (IsModifiesScc && llvm::isSccLiveAt(MBB, InsertPtr)) + continue; + MachineFunction *MF = MBB->getParent(); + for (auto It = Exp.SUnits.begin(); It != Exp.SUnits.end(); It++) { + MachineInstr *DefMI = *It; + // Not clone if already in MBB. + if (DefMI->getParent() == MBB) + continue; + // Not clone if not used for MBB. + if (CopySet.count(DefMI) == 0) + continue; + + auto ClonedMI = + BuildMI(*MBB, InsertPtr, DefMI->getDebugLoc(), DefMI->getDesc()); + + for (MachineOperand &Def : DefMI->defs()) { + Register Reg = Def.getReg(); + if (Reg.isPhysical()) { + if (Def.isImplicit()) + continue; + ClonedMI.addDef(Reg, 0, Def.getSubReg()); + } else { + Register NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg)); + RegMap[Reg] = NewReg; + ClonedMI.addDef(NewReg, 0, Def.getSubReg()); + } + } + + for (MachineOperand &MO : DefMI->uses()) { + if (MO.isReg()) { + Register Reg = MO.getReg(); + if (Reg.isPhysical()) { + if (MO.isImplicit()) + continue; + ClonedMI.addReg(Reg, 0, MO.getSubReg()); + } else { + auto It = RegMap.find(Reg); + if (It == RegMap.end()) { + ClonedMI.addReg(Reg, 0, MO.getSubReg()); + } else { + ClonedMI.addReg(It->second, 0, MO.getSubReg()); + } + } + } else { + ClonedMI.add(MO); + } + } + + MachineInstr *NewDef = ClonedMI.getInstr(); + SlotIndexes->insertMachineInstrInMaps(*NewDef); + // Set mem operand + for (MachineMemOperand *MO : DefMI->memoperands()) { + NewDef->addMemOperand(*MF, MO); + } + } + + // update users in MBB. + SmallVector &UserMIs = It.second; + updateUsers(UserMIs, RegMap); + + // update users in dom MBBs. + auto DomMapIt = DomMap.find(MBB); + if (DomMapIt != DomMap.end()) { + for (MachineBasicBlock *UpdateMBB : DomMapIt->second) { + SmallVector &UserMIs = UserBlocks[UpdateMBB]; + updateUsers(UserMIs, RegMap); + } + } + } +} + +void applySubExpCloneNearUserInBlock( + SubExp &Exp, + DenseMap &InBlockHotVInstMap, + DenseMap &InBlockHotSInstMap, + MachineRegisterInfo &MRI, SlotIndexes *SlotIndexes, + const SIRegisterInfo *SIRI) { + MachineBasicBlock *MBB = Exp.FromBB; + MachineFunction *MF = MBB->getParent(); + MachineInstr *HotVMI = InBlockHotVInstMap[MBB]; + MachineInstr *HotSMI = InBlockHotSInstMap[MBB]; + // Exp is build with hotVMI or hotSMI, cannot mix. 
+ assert(!(HotVMI && HotSMI) && "cannot mix hot MI"); + MachineInstr *HotMI = HotVMI; + if (!HotMI) { + HotMI = HotSMI; + } + + SlotIndex HotSlot = SlotIndexes->getInstructionIndex(*HotMI).getBaseIndex(); + const bool IsModifiesScc = Exp.modifiesRegister(AMDGPU::SCC, SIRI); + + for (unsigned Reg : Exp.BottomRegs) { + + SmallVector UseMIs; + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + MachineBasicBlock *UserBB = UseMI.getParent(); + // Skip current BB. + if (UserBB != Exp.FromBB) + continue; + // Skip inst in Exp. + if (Exp.BottomRoots.find(&UseMI) != Exp.BottomRoots.end()) + continue; + SlotIndex UseSlot = + SlotIndexes->getInstructionIndex(UseMI).getBaseIndex(); + // Only clone for use after hot slot. + if (UseSlot < HotSlot) + continue; + + // Do not overwrite a live scc. + if (IsModifiesScc && llvm::isSccLiveAt(UserBB, &UseMI)) + continue; + + UseMIs.emplace_back(&UseMI); + } + if (UseMIs.empty()) + continue; + DenseMap RegMap; + + std::sort(UseMIs.begin(), UseMIs.end(), + [&SlotIndexes](const MachineInstr *MIa, const MachineInstr *MIb) { + return SlotIndexes->getInstructionIndex(*MIa).getBaseIndex() < + SlotIndexes->getInstructionIndex(*MIb).getBaseIndex(); + }); + auto InsertPtr = UseMIs.front()->getIterator(); + + for (auto It = Exp.SUnits.begin(); It != Exp.SUnits.end(); It++) { + MachineInstr *DefMI = *It; + auto ClonedMI = + BuildMI(*MBB, InsertPtr, DefMI->getDebugLoc(), DefMI->getDesc()); + + for (MachineOperand &Def : DefMI->defs()) { + Register Reg = Def.getReg(); + if (Reg.isPhysical()) { + ClonedMI.addDef(Reg, 0, Def.getSubReg()); + } else { + Register NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg)); + RegMap[Reg] = NewReg; + ClonedMI.addDef(NewReg, 0, Def.getSubReg()); + } + } + + for (MachineOperand &MO : DefMI->uses()) { + if (MO.isReg()) { + if (MO.isImplicit()) { + continue; + } + Register Reg = MO.getReg(); + if (Reg.isPhysical()) { + ClonedMI.addReg(Reg, 0, MO.getSubReg()); + } else { + auto It = RegMap.find(Reg); + if (It == RegMap.end()) { + ClonedMI.addReg(Reg, 0, MO.getSubReg()); + } else { + ClonedMI.addReg(It->second, 0, MO.getSubReg()); + } + } + } else { + ClonedMI.add(MO); + } + } + + MachineInstr *NewDef = ClonedMI.getInstr(); + SlotIndexes->insertMachineInstrInMaps(*NewDef); + // Set mem operand + for (MachineMemOperand *MO : DefMI->memoperands()) { + NewDef->addMemOperand(*MF, MO); + } + } + // TODO: only clone to cross hot range. 
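+ // Rewire the uses collected after the hot slot to the cloned defs;
+ // RegMap holds the original vreg -> cloned vreg mapping built above.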
+ for (MachineInstr *UseMI : UseMIs) { + for (MachineOperand &MO : UseMI->uses()) { + if (!MO.isReg()) + continue; + Register Reg = MO.getReg(); + auto It = RegMap.find(Reg); + if (It == RegMap.end()) + continue; + Register NewReg = It->second; + MO.setReg(NewReg); + } + } + } +} + +bool isInLiveSet(unsigned Reg, LaneBitmask Mask, + const GCNRPTracker::LiveRegSet &Live) { + auto It = Live.find(Reg); + if (It == Live.end()) + return false; + + LaneBitmask LiveMask = It->second; + return (LiveMask | Mask) == LiveMask; +} + +unsigned getPacifistLevel(unsigned Reg, + DenseMap &PacifistLevels, + const MachineRegisterInfo &MRI) { + unsigned Level = 0; + for (MachineInstr &MI : MRI.def_instructions(Reg)) { + auto It = PacifistLevels.find(&MI); + if (It == PacifistLevels.end()) + continue; + Level = It->second; + } + return Level; +} + +bool hasInBlockDef(unsigned Reg, MachineBasicBlock *MBB, + const MachineRegisterInfo &MRI) { + for (MachineInstr &Def : MRI.def_instructions(Reg)) { + if (Def.getParent() != MBB) + continue; + return true; + } + return false; +} + +MachineInstr *getInBlockUniqueDef(unsigned Reg, MachineBasicBlock *MBB, + const GCNRPTracker::LiveRegSet &InputLive, + const MachineRegisterInfo &MRI) { + MachineInstr *DefMI = nullptr; + // If live as input for MBB, cannot be unique def. + if (InputLive.count(Reg)) + return DefMI; + for (MachineInstr &Def : MRI.def_instructions(Reg)) { + if (Def.getParent() != MBB) + continue; + if (DefMI) { + // Not unique. + DefMI = nullptr; + break; + } + DefMI = &Def; + } + return DefMI; +} + +bool isPassThru(unsigned Reg, const GCNRPTracker::LiveRegSet &InputLive, + const GCNRPTracker::LiveRegSet &OutputLive) { + return InputLive.count(Reg) && OutputLive.count(Reg); +} + +// Instructions which only use imm/passThru reg/output only reg will not kill +// any live reg, so name them pacifist here. +bool collectPacifist(MachineInstr &MI, + const GCNRPTracker::LiveRegSet &InputLive, + const GCNRPTracker::LiveRegSet &OutputLive, + const MachineRegisterInfo &MRI) { + // If has implicit def, not move. + if (MI.getDesc().NumImplicitDefs != 0) + return false; + + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + + Register Reg = MO.getReg(); + if (MO.isImplicit() && + (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO || Reg == AMDGPU::MODE)) + continue; + if (Reg.isPhysical()) + return false; + // The def for reg must be unique def in block or pass thru which not has + // def in block. If not, It is not safe to move. + if (!(nullptr != getInBlockUniqueDef(Reg, MI.getParent(), InputLive, MRI) || + (isPassThru(Reg, InputLive, OutputLive) && + !hasInBlockDef(Reg, MI.getParent(), MRI)))) + return false; + + LaneBitmask Mask = llvm::getRegMask(MO, MRI); + + if (isInLiveSet(Reg, Mask, OutputLive)) + continue; + + return false; + } + bool IsHasDef = false; + for (MachineOperand &MO : MI.defs()) { + Register Reg = MO.getReg(); + + if (Reg.isPhysical()) + return false; + + if (nullptr == getInBlockUniqueDef(Reg, MI.getParent(), InputLive, MRI)) + return false; + + IsHasDef = true; + } + // If no def, It will not increase pressure, don't mark It. 
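+ // Illustrative example (hypothetical registers): an instruction such as
+ //   %v = V_MUL_F32 %passthru0, %passthru1
+ // where both inputs stay live across the block and %v is uniquely defined
+ // here counts as a pacifist: holding it back until its first user can only
+ // lower the pressure seen earlier in the block, never raise it.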
+ return IsHasDef; +} + +static MachineInstr *findFirstAliasingLoadOrStoreInMBB(MachineInstr &MI, + MachineBasicBlock &MBB, + AliasAnalysis *AA) { + if (MI.mayLoadOrStore()) { + for (MachineBasicBlock::iterator I = MI.getIterator(), E = MBB.end(); + I != E; ++I) { + const bool UseTBAA = false; + if (MI.mayAlias(AA, *I, UseTBAA)) { + return &*I; + } + } + } + + return nullptr; +} + +static MachineInstr *findPacifistInsertPoint(MachineInstr &MI, + MachineBasicBlock &MBB, + MachineRegisterInfo &MRI, + AliasAnalysis *AA, + SlotIndexes *SlotIndexes) { + + SmallVector Users; + + // We cannot move the pacifist instruction past any memory + // op with which It aliases. Find the first instruction + // that aliases the pacifist MI (if any) and add It to the list + // of users. The sort() below will select the earliest user instruction. + if (MachineInstr *AliasMI = findFirstAliasingLoadOrStoreInMBB(MI, MBB, AA)) { + Users.push_back(AliasMI); + } + + for (MachineOperand &MO : MI.defs()) { + Register Reg = MO.getReg(); + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + if (&MBB != UseMI.getParent()) + continue; + Users.emplace_back(&UseMI); + } + } + if (Users.empty()) + return nullptr; + + std::sort(Users.begin(), Users.end(), + [&SlotIndexes](const MachineInstr *MIa, MachineInstr *MIb) { + // Early instr first. + return SlotIndex::isEarlierInstr( + SlotIndexes->getInstructionIndex(*MIa), + SlotIndexes->getInstructionIndex(*MIb)); + }); + return Users.front(); +} + +// Pacifist inst will only add pressure since they don't kill. +// Try to hold them as late as possible in a MBB to help pressure. +bool tryHoldPacifist(MachineBasicBlock &MBB, LiveIntervals *LIS, + MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + AliasAnalysis *AA, RematStatus &Status) { + const GCNRPTracker::LiveRegSet InputLive = Status.MBBInputLiveMap[&MBB]; + const GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[&MBB]; + + SmallVector PacifistList; + LLVM_DEBUG(dbgs() << "pacifist begin\n"); + for (MachineInstr &MI : MBB) { + if (MI.isDebugInstr() || MI.isPHI()) + continue; + if (collectPacifist(MI, InputLive, OutputLive, MRI)) { + PacifistList.emplace_back(&MI); + LLVM_DEBUG(MI.dump()); + } + } + LLVM_DEBUG(dbgs() << "pacifist end\n"); + + SlotIndexes *SlotIndexes = LIS->getSlotIndexes(); + bool IsUpdated = false; + + // Move pacifist to its first user. + // for (MachineInstr *MI : pacifistList) { + for (auto It = PacifistList.rbegin(); It != PacifistList.rend(); It++) { + MachineInstr *MI = *It; + MachineInstr *FirstUser = + findPacifistInsertPoint(*MI, MBB, MRI, AA, SlotIndexes); + if (FirstUser == MI) + continue; + if (FirstUser == MI->getNextNode()) + continue; + + auto InsertPoint = MBB.getFirstInstrTerminator(); + if (FirstUser) { + InsertPoint = FirstUser->getIterator(); + } else { + // When there's no terminator. + if (InsertPoint == MBB.end()) + InsertPoint--; + else + // BRANCH may have exec update before It. + InsertPoint--; + + InsertPoint = + llvm::skipDebugInstructionsBackward(InsertPoint, MBB.instr_begin()); + + while ((InsertPoint->definesRegister(AMDGPU::EXEC, SIRI) || + InsertPoint->definesRegister(AMDGPU::EXEC_LO, SIRI)) && + InsertPoint != MI->getIterator()) { + InsertPoint--; + InsertPoint = + llvm::skipDebugInstructionsBackward(InsertPoint, MBB.instr_begin()); + } + if (InsertPoint == MI->getIterator()) + continue; + } + // Do not overwrite a live scc. 
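+ // i.e. skip the move when this pacifist writes SCC and SCC is live at the
+ // chosen insertion point, since clobbering it there would break a later
+ // SCC reader.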
+ if (willSmashSccAtLocation(MI, &MBB, InsertPoint)) + continue; + MI->removeFromParent(); + MBB.insert(InsertPoint, MI); + + LIS->handleMove(*MI); + IsUpdated = true; + } + + return IsUpdated; +} + +DenseMap +collectUniformVgprs(Remat *Remat, MachineFunction &MF, MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI) { + DenseMap UniformMap; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (MI.isDebugInstr()) + continue; + if (!Remat->TotalUniformInsts.count(&MI)) + continue; + if (MI.getNumDefs() != 1) + continue; + unsigned DstIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst); + if (DstIdx == (unsigned)-1) + continue; + MachineOperand &DstMO = MI.getOperand(DstIdx); + if (DstMO.getSubReg() != 0) + continue; + if (DstMO.isTied()) + continue; + Register Reg = DstMO.getReg(); + if (MRI.getUniqueVRegDef(Reg) == nullptr) + continue; + + auto *VRC = SIRI->getRegClassForReg(MRI, Reg); + if (SIRI->isSGPRClass(VRC)) + continue; + // TODO: Support more reg class. + if (VRC != &AMDGPU::VGPR_32RegClass) + continue; + + UniformMap[Reg] = &MI; + } + } + return UniformMap; +} + +// Try insert readfirstlane on uniform vgpr to turn It in sgpr and save vgpr +// pressure. +bool collectVToSCrossHotSpot( + MachineBasicBlock &MBB, RematStatus &Status, + DenseMap &UniformMap, + SmallMapVector &VToSMap, LiveIntervals *LIS) { + unsigned VLimit = Status.TargetVLimit; + unsigned SLimit = Status.TargetSLimit; + auto &ST = MBB.getParent()->getSubtarget(); + + GCNDownwardRPTracker Tracker(*LIS); + + bool IsUpdated = false; + const auto InputLive = Status.MBBInputLiveMap[&MBB]; + Tracker.reset(*MBB.begin(), &InputLive); + for (MachineInstr &MI : MBB) { + if (MI.isDebugInstr()) { + continue; + } + + unsigned VPressure = Tracker.getPressure().getVGPRNum(ST.hasGFX90AInsts()); + unsigned SPressure = Tracker.getPressure().getMaxSGPR(); + + SPressure += RegForVCC; + + Tracker.advance(); + // Sgpr bound, vtos cannot help. + if (SPressure > SLimit) + return false; + + if (VPressure <= VLimit) { + continue; + } + + // Try to make all possible vtos to reduce vpressure. + const GCNRPTracker::LiveRegSet &CurLives = Tracker.getLiveRegs(); + for (auto It : CurLives) { + unsigned Reg = It.first; + auto UniformIt = UniformMap.find(Reg); + if (UniformIt == UniformMap.end()) + continue; + VToSMap[UniformIt->first] = UniformIt->second; + IsUpdated = true; + } + } + return IsUpdated; +} + +// Return true if the user is outside of the def's loop. +static bool isCrossLoopUse(MachineInstr *Def, MachineInstr *User, + MachineLoopInfo *MLI) { + MachineLoop *L = MLI->getLoopFor(Def->getParent()); + return L && !L->contains(User->getParent()); +} + +bool rematUniformVgprToSgpr(Remat *Remat, MachineFunction &MF, + RematStatus &Status, + std::vector &HotBlocks, + LiveIntervals *LIS, MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, + MachineLoopInfo *MLI) { + DenseMap UniformVgprMap = + collectUniformVgprs(Remat, MF, MRI, SIRI); + + SmallMapVector VToSMap; + + for (auto &HotBlock : HotBlocks) { + MachineBasicBlock &MBB = *HotBlock.MBB; + collectVToSCrossHotSpot(MBB, Status, UniformVgprMap, VToSMap, LIS); + } + + if (VToSMap.empty()) + return false; + SlotIndexes *SlotIndexes = LIS->getSlotIndexes(); + const MCInstrDesc &ReadFirstLaneDesc = SIII->get(AMDGPU::V_READFIRSTLANE_B32); + for (auto It : VToSMap) { + unsigned Reg = It.first; + MachineInstr *MI = It.second; + + auto *VRC = SIRI->getRegClassForReg(MRI, Reg); + // TODO: support bigger vgpr to sgpr. 
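+ // (a wider class would presumably need one V_READFIRSTLANE_B32 per 32-bit
+ // component plus a REG_SEQUENCE to rebuild the SGPR tuple; only the 32-bit
+ // case is handled for now)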
+ if (VRC != &AMDGPU::VGPR_32RegClass) + continue; + auto *NewRC = SIRI->getEquivalentSGPRClass(VRC); + Register NewDst = MRI.createVirtualRegister(NewRC); + + auto ReadFirstLane = + BuildMI(MF, MI->getDebugLoc(), ReadFirstLaneDesc, NewDst); + SmallVector UserMIs; + for (MachineInstr &UserMI : MRI.use_nodbg_instructions(Reg)) { + // Do not replace v->s across loops. Even if the value is uniform + // branch divergence can cause a uniform value in a loop to be + // non-uniform when used outside a loop. + if (isSafeRematCandidateUser(&UserMI, SIII) && + !isCrossLoopUse(MI, &UserMI, MLI)) + UserMIs.emplace_back(&UserMI); + } + + // Finish readfirstlane + ReadFirstLane.addReg(Reg); + MachineInstr *VToSMI = ReadFirstLane.getInstr(); + Remat->TotalUniformInsts.insert(VToSMI); + Remat->SafeToRemoveInsts.insert(VToSMI); + MachineBasicBlock *MBB = MI->getParent(); + MBB->insertAfter(MI->getIterator(), VToSMI); + SlotIndexes->insertMachineInstrInMaps(*VToSMI); + + for (MachineInstr *UserMI : UserMIs) { + const auto &Desc = UserMI->getDesc(); + bool IsIllegal = false; + for (unsigned I = 0; I < UserMI->getNumOperands(); I++) { + MachineOperand &MO = UserMI->getOperand(I); + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + if (MO.getReg() != Reg) + continue; + if (I >= Desc.getNumOperands()) { + IsIllegal = true; + break; + } + + MO.setReg(NewDst); + if (UserMI->getDesc().operands()[I].RegClass != -1) { + if (!SIII->isOperandLegal(*UserMI, I, &MO)) { + SIII->legalizeOperands(*UserMI); + // In case legalizeOperands not help, just legalize with mov. + if (UserMI->getDesc().operands()[I].RegClass != -1 && + !SIII->isOperandLegal(*UserMI, I)) { + SIII->legalizeOpWithMove(*UserMI, I); + } + } + } else { + // consider not have limit on reg class. + } + } + if (IsIllegal) + continue; + + auto RIt = UserMI->getReverseIterator(); + RIt++; + auto EndIt = UserMI->getParent()->rend(); + while (RIt != EndIt && !RIt->isDebugInstr() && + !SlotIndexes->hasIndex(*RIt)) + SlotIndexes->insertMachineInstrInMaps(*(RIt++)); + } + } + + return true; +} + +bool collectRematableHotReg( + MachineInstr &MI, const GCNRPTracker::LiveRegSet &HotLive, + GCNRPTracker::LiveRegSet &PureHotRematSet, + DenseMap &PureHotRematLevels, unsigned &DefReg, + const GCNRPTracker::LiveRegSet &InputLive, const MachineRegisterInfo &MRI) { + // Ignore inst not have def or more than 1 def. + if (MI.getDesc().getNumDefs() != 1) + return false; + + DefReg = MI.defs().begin()->getReg(); + + unsigned Level = 0; + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + + Register Reg = MO.getReg(); + + // If user is in same MI like + // %4:vgpr_32 = V_MAD_LEGACY_F32 %2:vgpr_32, %3:vgpr_32, %4:vgpr_32 + // remat It will not help. 
+ if (Reg == DefReg) { + return false; + } + + if (MO.isImplicit() && (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)) + continue; + if (Reg.isPhysical()) + return false; + + if (nullptr == getInBlockUniqueDef(Reg, MI.getParent(), InputLive, MRI)) + return false; + + LaneBitmask Mask = llvm::getRegMask(MO, MRI); + + if (isInLiveSet(Reg, Mask, HotLive)) + continue; + + if (isInLiveSet(Reg, Mask, PureHotRematSet)) { + unsigned RegLevel = getPacifistLevel(Reg, PureHotRematLevels, MRI); + Level = std::max(Level, RegLevel); + continue; + } + + return false; + } + + for (MachineOperand &MO : MI.defs()) { + Register Reg = MO.getReg(); + + if (Reg.isPhysical()) + return false; + + if (nullptr == getInBlockUniqueDef(Reg, MI.getParent(), InputLive, MRI)) + return false; + + LaneBitmask Mask = llvm::getRegMask(MO, MRI); + PureHotRematSet[Reg] |= Mask; + } + + PureHotRematLevels[&MI] = Level + 1; + // If no def, It will not increase pressure, don't mark It. + return true; +} + +bool tryRemat(MachineBasicBlock &MBB, MachineInstr *HotMi, + std::vector &InBlockCloneSubExps, bool IsVGPR, + const GCNRPTracker::LiveRegSet &InputLive, + DenseSet &HotSet, int VDistance, int SDistance, + unsigned VLimit, unsigned SLimit, + const DenseSet &MemWriteMBBSet, + LiveIntervals *LIS, const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { + auto &ST = MBB.getParent()->getSubtarget(); + const auto &SI = LIS->getInstructionIndex(*HotMi).getBaseIndex(); + const auto LISLR = llvm::getLiveRegs(SI, *LIS, MRI); + + GCNRPTracker::LiveRegSet HotLive = LISLR; + + GCNRPTracker::LiveRegSet PureHotRematSet; + std::vector PureHotRematList; + DenseMap PureHotRematLevels; + + GCNRPTracker::LiveRegSet OutputSet; + LLVM_DEBUG(dbgs() << "pure hot remat begin\n"); + // Find reg which could remat from other reg in liveSet. + const unsigned KMaxRematLevel = 6; + GCNDownwardRPTracker Tracker(*LIS); + Tracker.reset(*MBB.begin(), &InputLive); + for (auto It = MBB.begin(); It != MBB.end(); It++) { + MachineInstr &MI = *It; + const GCNRegPressure &RP = Tracker.getPressure(); + + if (MI.isDebugInstr()) + continue; + + // Igonre inst in hot range. + if (RP.getVGPRNum(ST.hasGFX90AInsts()) > VLimit || + RP.getMaxSGPR() > SLimit) { + Tracker.advance(); + continue; + } + + // Stop at hotMI. + if (&MI == HotMi) + break; + + Tracker.advance(); + + unsigned DefReg = 0; + if (collectRematableHotReg(MI, HotLive, PureHotRematSet, PureHotRematLevels, + DefReg, InputLive, MRI)) { + unsigned Level = PureHotRematLevels[&MI]; + if (Level >= KMaxRematLevel) + continue; + + // If the def reg is in hot reg. + // Add to output. + if (HotLive.find(DefReg) != HotLive.end()) { + bool IsUserIsHot = false; + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(DefReg)) { + if (UseMI.getParent() != &MBB) + continue; + if (0 == HotSet.count(&UseMI)) + continue; + + const auto &UseSI = LIS->getInstructionIndex(UseMI).getBaseIndex(); + // When has a hot user after hotMI, remat It may not help. + if (UseSI > SI) { + IsUserIsHot = true; + break; + } + } + + if (IsUserIsHot) + continue; + OutputSet[DefReg]; + LLVM_DEBUG(dbgs() << "hotRemat:"); + LLVM_DEBUG(MI.getOperand(0).dump()); + // remove It from hotLive to avoid It as input when build dag. + HotLive.erase(DefReg); + } + PureHotRematList.emplace_back(&MI); + LLVM_DEBUG(dbgs() << "level:" << Level); + LLVM_DEBUG(MI.dump()); + } + } + + LLVM_DEBUG(dbgs() << "pure hot remat end\n"); + + // Create input/output for pure hot remat. 
+ // Input is things hot reg in level 1 and output is things level > 1. + // Build SubExp with pureHotRematList as Nodes, hotLive as input + // rematHot as output. + // Not join input when build ExpDag to get small subExps. + ExpDag Dag(MRI, SIRI, SIII, /*IsJoinInput*/ false); + Dag.build(HotLive, OutputSet, PureHotRematList); + // Find best subExp add to inBlockCloneSubExps. + // Sort by size of subExp. + std::sort(Dag.SubExps.begin(), Dag.SubExps.end(), + [](const SubExp &A, const SubExp &B) { + return A.SUnits.size() < B.SUnits.size(); + }); + std::vector CloneSubExps; + int Distance = IsVGPR ? VDistance : SDistance; + for (SubExp &SubExp : Dag.SubExps) { + if (SubExp.IsNotSafeToCopy) + continue; + if (IsVGPR) { + if (SubExp.VOutputSize == 0) + continue; + } else { + if (SubExp.SOutputSize == 0) + continue; + } + if (!SubExp.isSafeToMove(MRI)) + continue; + // Not clone . + if (SubExp.SUnits.size() > 10) + continue; + // Do not allow remat in the block when the expression has a memory op and + // the block has a write. We could allow this in some cases with better + // analysis. + if (SubExp.IsHasMemInst && MemWriteMBBSet.count(&MBB)) + continue; + if (IsVGPR) { + Distance -= SubExp.VOutputSize; + } else { + Distance -= SubExp.SOutputSize; + } + CloneSubExps.emplace_back(SubExp); + if (Distance <= 0) + break; + } + if (Distance <= 0) { + InBlockCloneSubExps.insert(InBlockCloneSubExps.end(), CloneSubExps.begin(), + CloneSubExps.end()); + } + return Distance <= 0; +} + +// Try to remat live reg in hot spot from other live reg in hot spot. +// +bool tryRematInHotSpot( + MachineBasicBlock &MBB, RematStatus &Status, int VDistance, int SDistance, + int VSaved, int SSaved, std::vector &InBlockCloneSubExps, + DenseMap &InBlockHotVInstMap, + DenseMap &InBlockHotSInstMap, + LiveIntervals *LIS, const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { + unsigned VLimit = Status.TargetVLimit; + unsigned SLimit = Status.TargetSLimit; + + auto &ST = MBB.getParent()->getSubtarget(); + const GCNRPTracker::LiveRegSet InputLive = Status.MBBInputLiveMap[&MBB]; + + const GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[&MBB]; + + // Collect reg pressure. + unsigned MaxLocalVPressure = 0; + unsigned MaxLocalSPressure = 0; + // Build a DAG or only on demand? + MachineInstr *HotVMI = nullptr; + MachineInstr *HotSMI = nullptr; + DenseSet HotSet; + + GCNDownwardRPTracker Tracker(*LIS); + + Tracker.reset(*MBB.begin(), &InputLive); + for (auto It = MBB.begin(); It != MBB.end(); It++) { + MachineInstr &MI = *It; + if (MI.isDebugInstr()) { + continue; + } + + unsigned VPressure = Tracker.getPressure().getVGPRNum(ST.hasGFX90AInsts()); + unsigned SPressure = Tracker.getPressure().getMaxSGPR(); + + SPressure += RegForVCC; + + VPressure -= VSaved; + SPressure -= SSaved; + Tracker.advance(); + + if (VPressure <= VLimit && SPressure <= SLimit) { + continue; + } + HotSet.insert(&MI); + if (MaxLocalVPressure < VPressure) { + MaxLocalVPressure = VPressure; + HotVMI = &MI; + } + if (MaxLocalSPressure < SPressure) { + MaxLocalSPressure = SPressure; + HotSMI = &MI; + } + } + + InBlockHotVInstMap[&MBB] = HotVMI; + InBlockHotSInstMap[&MBB] = HotSMI; + if (VDistance > 0 && HotVMI) { + // Use hotVMI when apply. 
+ InBlockHotSInstMap[&MBB] = nullptr; + if (tryRemat(MBB, HotVMI, InBlockCloneSubExps, /*IsVGPR*/ true, InputLive, + HotSet, VDistance, SDistance, VLimit, SLimit, + Status.MemWriteMBBSet, LIS, MRI, SIRI, SIII)) + return true; + } + + if (SDistance > 0 && HotSMI) { + // Use hotSMI when apply. + InBlockHotSInstMap[&MBB] = HotSMI; + InBlockHotVInstMap[&MBB] = nullptr; + return tryRemat(MBB, HotSMI, InBlockCloneSubExps, /*IsVGPR*/ false, + InputLive, HotSet, VDistance, VDistance, VLimit, SLimit, + Status.MemWriteMBBSet, LIS, MRI, SIRI, SIII); + } + return false; +} +// Sort subExpCandidates to make sure deeper subExp apply first. +// If subExp0 use result of subExp1, subExp0 is deeper than subExp1. +// When apply subExp1 before subExp0, new clone of subExp0 which use result of +// subExp1 will have old reg of subExp1. And reg pressure will not be reduced. +void sortSubExpCandidates(std::vector &SubExpCandidates) { + MapVector> InputMap; + MapVector> OutputMap; + struct SortNode { + SubExp Exp; + unsigned Depth; + bool IsDepthDirty; + SmallDenseSet Preds; + SmallDenseSet Succs; + }; + + { + SmallVector RegSortStorage; + for (SubExp &Exp : SubExpCandidates) { + RegSortStorage.assign(Exp.TopRegs.begin(), Exp.TopRegs.end()); + std::sort(RegSortStorage.begin(), RegSortStorage.end()); + for (auto It : RegSortStorage) { + unsigned Reg = It; + InputMap[Reg].insert(&Exp); + } + + RegSortStorage.assign(Exp.BottomRegs.begin(), Exp.BottomRegs.end()); + std::sort(RegSortStorage.begin(), RegSortStorage.end()); + for (auto It : RegSortStorage) { + unsigned Reg = It; + OutputMap[Reg].insert(&Exp); + } + } + } + + MapVector SortMap; + for (auto It : InputMap) { + unsigned Reg = It.first; + MapVector>::iterator OutIt = + OutputMap.find(Reg); + if (OutIt == OutputMap.end()) + continue; + auto &InExps = It.second; + auto &OutExps = OutIt->second; + for (SubExp *InExp : InExps) { + for (SubExp *OutExp : OutExps) { + if (InExp->IsHoist != OutExp->IsHoist) { + // Different direction. + // If output (def) move up, input(use) move down, nothing happens. + if (OutExp->IsHoist) + continue; + // Canot input(use) move up, output(def) move down. + // Choose the exp which save more. + int InExpGain = InExp->VOutputSize - InExp->VInputSize; + int OutExpGain = OutExp->VInputSize - InExp->VOutputSize; + if (InExpGain >= OutExpGain) { + OutExp->SUnits.clear(); + } else { + InExp->SUnits.clear(); + } + continue; + } + // Link outExp to inExp. + if (InExp->IsHoist) { + SortMap[OutExp].Preds.insert(InExp); + SortMap[InExp].Succs.insert(OutExp); + } else { + SortMap[InExp].Preds.insert(OutExp); + SortMap[OutExp].Succs.insert(InExp); + } + } + } + } + + if (SortMap.empty()) + return; + + SmallVector WorkList; + for (SubExp &Exp : SubExpCandidates) { + SortNode &Node = SortMap[&Exp]; + Node.Depth = 0; + Node.Exp = Exp; + Node.IsDepthDirty = !Node.Preds.empty(); + if (!Node.IsDepthDirty) + WorkList.emplace_back(&Exp); + } + // Calc depth. 
+ while (!WorkList.empty()) { + SubExp *Exp = WorkList.pop_back_val(); + SortNode &Node = SortMap[Exp]; + for (SubExp *Succ : Node.Succs) { + SortNode &SuccNode = SortMap[Succ]; + SuccNode.Depth = std::max(SuccNode.Depth, Node.Depth + 1); + bool IsAllPrevClean = true; + for (SubExp *Prev : SuccNode.Preds) { + SortNode &PrevNode = SortMap[Prev]; + if (PrevNode.IsDepthDirty) { + IsAllPrevClean = false; + break; + } + } + if (IsAllPrevClean) { + SuccNode.IsDepthDirty = false; + WorkList.push_back(Succ); + } + } + } + + std::vector Nodes; + for (auto &It : SortMap) { + SortNode &Node = It.second; + Nodes.emplace_back(&Node); + } + + struct Sorter { + bool operator()(const SortNode *A, const SortNode *B) { + return A->Depth > B->Depth; + } + }; + + // subExp deeper should be apply first. + std::sort(Nodes.begin(), Nodes.end(), Sorter()); + + SubExpCandidates.clear(); + for (auto &Node : Nodes) { + SubExpCandidates.emplace_back(Node->Exp); + } +} + +// Compare pressure, return ture if maxV0/maxS0 pressure is higher than +// maxV1/maxS1. +bool pressureHigher(unsigned MaxV0, unsigned MaxS0, unsigned MaxV1, + unsigned MaxS1, const GCNSubtarget *ST) { + unsigned VTgtOcc0 = ST->getOccupancyWithNumVGPRs(MaxV0); + unsigned VTgtOcc1 = ST->getOccupancyWithNumVGPRs(MaxV1); + unsigned STgtOcc0 = ST->getOccupancyWithNumSGPRs(MaxS0); + unsigned STgtOcc1 = ST->getOccupancyWithNumSGPRs(MaxS1); + unsigned Occ0 = std::min(VTgtOcc0, STgtOcc0); + unsigned Occ1 = std::min(VTgtOcc1, STgtOcc1); + // is low pressure. + if (Occ0 > Occ1) + return false; + if (Occ0 < Occ1) + return true; + // When sgpr bound, is high pressure. + if (VTgtOcc0 > STgtOcc0 && VTgtOcc1 > STgtOcc1) { + return MaxS0 > MaxS1; + } + // When vgpr bound or mix, vgpr higher is higher pressure. + return MaxV0 > MaxV1; +} + +// Return true if the subExp can help pressure for passThrus. +bool canHelpPressureWhenSink(SubExp &SubExp, + const GCNRPTracker::LiveRegSet &PassThrus, + const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, + const MachineLoopInfo *MLI, + MachineDominatorTree *DT, bool IsCanClone, + bool IsSgprBound) { + LLVM_DEBUG(SubExp.dump(MRI, SIRI)); + if (!SubExp.isSafeToMove(MRI)) + return false; + + // Update input size to ignore lives in which already in + // passThrus. + for (auto It : SubExp.InputLive) { + unsigned Reg = It.first; + if (PassThrus.count(Reg) == 0) + continue; + unsigned Size = getRegSize(Reg, It.second, MRI, SIRI); + if (SIRI->isVGPR(MRI, Reg)) { + SubExp.VInputSize -= Size; + } else { + SubExp.SInputSize -= Size; + } + } + + if (SubExp.VInputSize > SubExp.VOutputSize) + return false; + + if (SubExp.SInputSize > SubExp.SOutputSize && IsSgprBound) + return false; + + if (SubExp.SInputSize >= SubExp.SOutputSize && + SubExp.VInputSize == SubExp.VOutputSize) + return false; + + // Try to find a Insert Block. + // Skip multi def output sub exp. + // Collect user blocks, find common dom. + BlockSet UserBlocks; + for (unsigned Reg : SubExp.BottomRegs) { + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + MachineBasicBlock *UserBB = UseMI.getParent(); + // Skip current BB. + if (UserBB != SubExp.FromBB) + UserBlocks.insert(UserBB); + } + } + if (UserBlocks.empty()) + return false; + MachineBasicBlock *UserBlock = nearestCommonDominator(DT, UserBlocks); + if (!DT->dominates(SubExp.FromBB, UserBlock)) { + return false; + } + if (UserBlock == SubExp.FromBB && + // When allow clone, could go clone path if cannot move subExp. 
+ !IsCanClone) + return false; + + SubExp.ToBB = UserBlock; + if (auto *ToLoop = MLI->getLoopFor(UserBlock)) { + auto *FromLoop = MLI->getLoopFor(SubExp.FromBB); + if (!FromLoop || FromLoop->getLoopDepth() < ToLoop->getLoopDepth()) + SubExp.IsMoveIntoLoop = true; + } else if (auto *FromLoop = MLI->getLoopFor(SubExp.FromBB)) { + auto *ToLoop = MLI->getLoopFor(UserBlock); + // not safe to move out of loop. + if (!ToLoop || FromLoop->getLoopDepth() > ToLoop->getLoopDepth() || + ToLoop != FromLoop) + return false; + } + return true; +} + +bool canHelpPressureWhenHoist(SubExp &SubExp, const MachineRegisterInfo &MRI, + const MachineLoopInfo *MLI, bool IsSgprBound) { + if (!SubExp.isSafeToMove(MRI)) + return false; + if (SubExp.VInputSize < SubExp.VOutputSize) + return false; + if (SubExp.SInputSize < SubExp.SOutputSize && IsSgprBound) + return false; + + if (SubExp.SInputSize <= SubExp.SOutputSize && + SubExp.VInputSize == SubExp.VOutputSize) + return false; + + // Try to find a Insert Block. + // Skip multi def output sub exp. + // Collect user blocks, find common dom. + BlockSet DefBlocks; + for (unsigned Reg : SubExp.TopRegs) { + MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); + if (!DefMI) + continue; + DefBlocks.insert(DefMI->getParent()); + } + if (DefBlocks.size() != 1) + return false; + MachineBasicBlock *DefBlock = *DefBlocks.begin(); + SubExp.ToBB = DefBlock; + // Not do same block hoist. + if (SubExp.ToBB == SubExp.FromBB) + return false; + + if (auto *ToLoop = MLI->getLoopFor(DefBlock)) { + auto *FromLoop = MLI->getLoopFor(SubExp.FromBB); + // TODO: enable move into loop when hoist. + if (!FromLoop || FromLoop->getLoopDepth() < ToLoop->getLoopDepth()) + return false; + } else if (auto *FromLoop = MLI->getLoopFor(SubExp.FromBB)) { + auto *ToLoop = MLI->getLoopFor(DefBlock); + // not safe to move out of loop. + if (!ToLoop || FromLoop->getLoopDepth() > ToLoop->getLoopDepth() || + ToLoop != FromLoop) + return false; + } + return true; +} + +SmallVector> +groupPassThruByDefBlock(Remat *Remat, const GCNRPTracker::LiveRegSet &PassThrus, + GCNRPTracker::LiveRegSet &UsedPassThrus, + MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII) { + MapVector Candidates; + + // Group safe candidates by define block. + for (auto It : PassThrus) { + Register Reg = It.first; + // Skip used pass thru reg to avoid count It twice for different hot block. + if (UsedPassThrus.count(Reg)) + continue; + LLVM_DEBUG(printVreg(Reg, MRI)); + LLVM_DEBUG(if (SIRI->isSGPRReg(MRI, Reg)) dbgs() << " sgpr "; + else dbgs() << " vgpr ";); + if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*IsSink*/ true)) { + LLVM_DEBUG(dbgs() << " is not safe\n"); + continue; + } + LLVM_DEBUG(dbgs() << " is safe\n"); + // DefMI is already checked in isSafeCandidate. 
+ MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); + + GCNRPTracker::LiveRegSet &DefInMBB = Candidates[DefMI->getParent()]; + DefInMBB[Reg] = It.second; + } + + llvm::SmallVector> + Result = Candidates.takeVector(); + + LLVM_DEBUG( + llvm::dbgs() << "Before sort candidates\n"; for (auto It : Result) { + MachineBasicBlock *MBB = It.first; + auto &defInMBB = It.second; + MBB->dump(); + llvm::dumpLiveSet(defInMBB, SIRI); + } llvm::dbgs() << "end of candidates\n";); + + std::sort(Result.begin(), Result.end(), + [](std::pair &It0, + std::pair &It1) { + return It0.first->getNumber() < It1.first->getNumber(); + }); + + LLVM_DEBUG(llvm::dbgs() << "After sort candidates\n"; for (auto It : Result) { + MachineBasicBlock *MBB = It.first; + auto &defInMBB = It.second; + MBB->dump(); + llvm::dumpLiveSet(defInMBB, SIRI); + } llvm::dbgs() << "end of candidates\n";); + + return Result; +} + +// collect pass thru regs of MBB. +GCNRPTracker::LiveRegSet +collectPassThrus(MachineBasicBlock *MBB, + const GCNRPTracker::LiveRegSet &InputLive, + const GCNRPTracker::LiveRegSet &OutputLive, + const GCNRPTracker::LiveRegSet &LiveRegCandidates, + MachineRegisterInfo &MRI, bool IsCanClone) { + GCNRPTracker::LiveRegSet PassThrus; + llvm::mergeLiveRegSet(PassThrus, InputLive); + llvm::andLiveRegSet(PassThrus, OutputLive); + + // Remove reg which not in liveRegCandidates. + GCNRPTracker::LiveRegSet TmpPassThrus = PassThrus; + for (auto It : TmpPassThrus) { + unsigned Reg = It.first; + if (!LiveRegCandidates.count(Reg)) { + PassThrus.erase(Reg); + } + } + TmpPassThrus = PassThrus; + // Remove reg which has read/write in MBB. + for (auto It : TmpPassThrus) { + unsigned Reg = It.first; + DenseSet DefMBBs; + for (MachineInstr &DefMI : MRI.def_instructions(Reg)) { + MachineBasicBlock *MBB = DefMI.getParent(); + DefMBBs.insert(MBB); + } + DenseSet UseMBBs; + // Allow use for pass thru if clone is OK. + if (!IsCanClone) { + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + MachineBasicBlock *UserMBB = UseMI.getParent(); + UseMBBs.insert(UserMBB); + } + } + bool IsW = DefMBBs.count(MBB) > 0; + bool IsR = UseMBBs.count(MBB) > 0; + + bool IsPassThru = !IsW && !IsR; + if (!IsPassThru) + PassThrus.erase(Reg); + } + return PassThrus; +} +// Try to build a free subExp which all input is passThrus. +SubExp buildFreeSubExp(SubExp &Exp, GCNRPTracker::LiveRegSet &PassThrus, + MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) { + SubExp FreeExp; + // Try to split the subExp to find a help case. + // Scan all inst in subExp, propagate free inst which input is from + // passThrus. + SmallDenseSet FreeRegs; + SmallDenseSet FreeInstUseRegs; + SmallVector FreeInsts; + for (MachineInstr *MI : Exp.SUnits) { + bool IsFree = true; + // Check all use regs are free. + for (MachineOperand &MO : MI->uses()) { + if (!MO.isReg()) + continue; + Register Reg = MO.getReg(); + if (MO.isImplicit() && Reg == AMDGPU::EXEC) + continue; + if (MRI.getUniqueVRegDef(Reg) == nullptr) { + IsFree = false; + break; + } + // Skip local pass thrus unless It is free. + if (PassThrus.count(Reg) && Exp.TopRegs.count(Reg)) + continue; + if (FreeRegs.count(Reg)) + continue; + IsFree = false; + break; + } + // Check def is unique. + for (MachineOperand &MO : MI->defs()) { + Register Reg = MO.getReg(); + if (MRI.getUniqueVRegDef(Reg) == nullptr) { + IsFree = false; + break; + } + } + if (!IsFree) + continue; + // Save inst as free inst. + FreeInsts.emplace_back(MI); + // Save def as free reg. 
+ for (MachineOperand &MO : MI->defs()) { + Register Reg = MO.getReg(); + FreeRegs.insert(Reg); + } + // Save use regs as free use reg. + for (MachineOperand &MO : MI->uses()) { + if (!MO.isReg()) + continue; + Register Reg = MO.getReg(); + + FreeInstUseRegs.insert(Reg); + } + } + // Then remove local inst has no output use. + for (MachineInstr *MI : FreeInsts) { + bool IsFreeUsed = false; + for (MachineOperand &MO : MI->defs()) { + Register Reg = MO.getReg(); + // Used as freeInst or output. + IsFreeUsed |= FreeInstUseRegs.count(Reg) > 0 || Exp.BottomRegs.count(Reg); + } + if (!IsFreeUsed) + continue; + FreeExp.SUnits.emplace_back(MI); + } + if (FreeExp.SUnits.empty()) { + // mark has terminator to make It unsafe. + FreeExp.IsHasTerminatorInst = true; + return FreeExp; + } + // Build BottomRegs and TopRegs for freeExp. + // BottomRegs is freeRegs in subExp.BottomRegs. + for (Register FreeReg : FreeRegs) { + if (Exp.BottomRegs.count(FreeReg)) + FreeExp.BottomRegs.insert(FreeReg); + } + // TopRegs is freeInstUseRegs in subExp.TopRegs. + for (Register FreeInstUseReg : FreeInstUseRegs) { + if (Exp.TopRegs.count(FreeInstUseReg)) + FreeExp.TopRegs.insert(FreeInstUseReg); + } + FreeExp.FromBB = Exp.FromBB; + FreeExp.ToBB = Exp.ToBB; + // must be clone since is partial of subExp. + FreeExp.IsCloneOnly = true; + + // Calc reg for freeExp. + for (unsigned Reg : FreeExp.TopRegs) { + FreeExp.InputLive[Reg]; + } + + for (unsigned Reg : FreeExp.BottomRegs) { + FreeExp.OutputLive[Reg]; + } + + collectLiveSetPressure(FreeExp.InputLive, MRI, SIRI, FreeExp.VInputSize, + FreeExp.SInputSize); + collectLiveSetPressure(FreeExp.OutputLive, MRI, SIRI, FreeExp.VOutputSize, + FreeExp.SOutputSize); + return FreeExp; +} + +std::vector buildSubExpCandidates( + Remat *Remat, + SmallVector> + &Candidates, + GCNRPTracker::LiveRegSet &PassThrus, MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, + const MachineLoopInfo *MLI, SlotIndexes *SlotIndexes, + MachineDominatorTree *DT, bool IsCanClone, bool IsSgprBound, + GCNRPTracker::LiveRegSet &UnusedPassThrus, + DenseSet &MemWriteMBBSet, + bool AllowPartialUseInSubExp) { + std::vector SubExpCandidates; + // Build exp dag on define blocks. + // Save profit candidates into list. + for (auto &It : Candidates) { + MachineBasicBlock *DefMBB = It.first; + // Try to remove out reg def sub exp from DefMBB. + GCNRPTracker::LiveRegSet &DefInMBB = It.second; + // Go up on the dag until reach share node. + auto SubExps = buildSubExpFromCandidates( + Remat, DefInMBB, DefMBB, SIRI, SIII, MRI, SlotIndexes, UnusedPassThrus, + AllowPartialUseInSubExp); + for (SubExp &Exp : SubExps) { + if (Exp.IsHasMemInst) { + // Skip when memory ld/st inst need to cross MBB which write memory. + // TODO: check all MBBs in between FromBB and ToBB not write memory. + // Currently just skip when any memory write exist. 
+ if (!MemWriteMBBSet.empty()) { + MachineBasicBlock *FromBB = Exp.FromBB; + MachineBasicBlock *ToBB = Exp.ToBB; + if (Exp.IsHoist) { + FromBB = Exp.ToBB; + ToBB = Exp.FromBB; + } + bool IsCrossMemWriteMBB = false; + for (MachineBasicBlock *MemMBB : MemWriteMBBSet) { + if (DT->dominates(ToBB, MemMBB)) + continue; + if (DT->dominates(MemMBB, FromBB)) + continue; + IsCrossMemWriteMBB = true; + break; + } + if (IsCrossMemWriteMBB) + continue; + } + } + if (!canHelpPressureWhenSink(Exp, PassThrus, MRI, SIRI, MLI, DT, + IsCanClone, IsSgprBound)) { + if (AllowPartialUseInSubExp && Exp.isSafeToMove(MRI)) { + SubExp FreeSubExp = buildFreeSubExp(Exp, PassThrus, MRI, SIRI); + if (canHelpPressureWhenSink(FreeSubExp, PassThrus, MRI, SIRI, MLI, DT, + IsCanClone, IsSgprBound)) { + SubExpCandidates.emplace_back(FreeSubExp); + } + } + continue; + } + + SubExpCandidates.emplace_back(Exp); + } + } + return SubExpCandidates; +} + +std::pair +calculateSaving(HotBlock &HotBb, std::vector &SubExpCandidates, + GCNRPTracker::LiveRegSet &InputLive, + GCNRPTracker::LiveRegSet &OutputLive, bool IsVOutBound, + bool IsSOutBound, bool IsCanClone, MachineDominatorTree *DT, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) { + int Vgpr = 0; + int Sgpr = 0; + MachineBasicBlock *MBB = HotBb.MBB; + // Sink saving. + for (SubExp &Exp : SubExpCandidates) { + if (Exp.IsHoist) { + // ToMBB -> MBB -> FromMBB. + // If ToMBB not dom hot block, reg will not live in MBB. + if (!DT->dominates(Exp.ToBB, MBB)) + continue; + } else { + // If FromBB not dom hot block, reg will not live in MBB. + if (!DT->dominates(Exp.FromBB, MBB)) + continue; + // When subExp is from hotBB, check output instead of input. + if (Exp.FromBB == MBB) { + if (IsVOutBound && Exp.VOutputSize < Exp.VInputSize) + continue; + if (IsSOutBound && Exp.SOutputSize < Exp.SInputSize) + continue; + Vgpr += Exp.VInputSize; + Vgpr -= Exp.VOutputSize; + Sgpr += Exp.SInputSize; + Sgpr -= Exp.SOutputSize; + continue; + } + } + int VgprDiff = 0; + int SgprDiff = 0; + MachineBasicBlock *ToMBB = Exp.ToBB; + // If subExp is to hotBB, It is crossing output instead of input. + GCNRPTracker::LiveRegSet &CrossLive = MBB == ToMBB ? OutputLive : InputLive; + + bool IsClone = false; + GCNRPTracker::LiveRegSet NewInput; + if (!Exp.IsMoveIntoLoop) { + if (Exp.IsHoist) { + // If FromBB dom hot block, It will not change live for MBB. + if (Exp.FromBB != MBB && DT->dominates(Exp.FromBB, MBB)) + continue; + } else { + // If ToBB dom hot block, It will not change live for MBB. + if (ToMBB != MBB && DT->dominates(ToMBB, MBB)) { + if (IsCanClone && !Exp.IsNotSafeToCopy) { + IsClone = true; + } else { + continue; + } + } + } + + for (auto OutIt : Exp.OutputLive) { + Register Reg = OutIt.first; + LaneBitmask OutMask = OutIt.second; + LaneBitmask MBBBeginMask; + if (CrossLive.find(Reg) != CrossLive.end()) + MBBBeginMask = CrossLive[Reg]; + // Check mask which live in both BeginSlot and exp output when sink to + // kill the output. Check mask which not live in BeginSlot in + // exp output when hoist to live the output. + LaneBitmask ProfitMask = Exp.IsHoist ? (OutMask & (~MBBBeginMask)) + : (OutMask & MBBBeginMask); + if (MBBBeginMask.any()) { + unsigned Size = getRegSize(Reg, ProfitMask, MRI, SIRI); + LLVM_DEBUG(std::string MovStr = + Exp.IsHoist ? "output hoist:" : "output sink:"; + dbgs() << MovStr << Reg.virtRegIndex() << " " << Size); + // Exp out live at block input. + // It will descrease live for MBB when sink and increase when hoist. 
+ if (SIRI->isVGPR(MRI, Reg)) { + LLVM_DEBUG(dbgs() << "v\n"); + if (Exp.IsHoist) + VgprDiff += Size; + else + VgprDiff -= Size; + } else { + LLVM_DEBUG(dbgs() << "s\n"); + if (Exp.IsHoist) + SgprDiff += Size; + else + SgprDiff -= Size; + } + } + } + + for (auto InIt : Exp.InputLive) { + Register Reg = InIt.first; + LaneBitmask InMask = InIt.second; + LaneBitmask MBBBeginMask; + if (CrossLive.find(Reg) != CrossLive.end()) + MBBBeginMask = CrossLive[Reg]; + // Check mask which not live in BeginSlot in exp input when + // sink to live the input. Check mask which live in both BeginSlot and + // exp output when hoist to kill the input. + LaneBitmask ProfitMask = + Exp.IsHoist ? (InMask & MBBBeginMask) : (InMask & (~MBBBeginMask)); + if (ProfitMask.any()) { + // Update input live to avoid count same input more than once. + NewInput[Reg] |= InMask; + // Exp in not live at block input. + // It will increase live for MBB. + unsigned Size = getRegSize(Reg, ProfitMask, MRI, SIRI); + + LLVM_DEBUG(std::string MovStr = + Exp.IsHoist ? "input hoist:" : "input sink:"; + dbgs() << MovStr << Reg.virtRegIndex() << " " << Size); + if (SIRI->isVGPR(MRI, Reg)) { + LLVM_DEBUG(dbgs() << "v\n"); + if (Exp.IsHoist) + VgprDiff -= Size; + else + VgprDiff += Size; + } else { + LLVM_DEBUG(dbgs() << "s\n"); + if (Exp.IsHoist) + SgprDiff -= Size; + else + SgprDiff += Size; + } + } + } + } else { + // When sink into loop, the input will live for every block inside loop. + // The output will only lived between to blocks and the use blocks. + // If MBB dominate any user of output live reg, It will still live in + // MBB. So cannot count that output live reg as profit. + // Hoist into loop is not supported now. + for (auto OutIt : Exp.OutputLive) { + Register Reg = OutIt.first; + bool IsDomUser = false; + for (MachineInstr &MI : MRI.use_nodbg_instructions(Reg)) { + MachineBasicBlock *UserMBB = MI.getParent(); + if (DT->dominates(MBB, UserMBB)) { + IsDomUser = true; + break; + } + } + if (IsDomUser) + continue; + + LaneBitmask OutMask = OutIt.second; + LaneBitmask MBBBeginMask; + if (InputLive.find(Reg) != InputLive.end()) + MBBBeginMask = InputLive[Reg]; + LaneBitmask ProfitMask = OutMask & MBBBeginMask; + if (MBBBeginMask.any()) { + unsigned Size = getRegSize(Reg, ProfitMask, MRI, SIRI); + LLVM_DEBUG(dbgs() << "move:" << Reg.virtRegIndex() << " " << Size); + // Exp out live at block input. + // It will descrease live for MBB. + if (SIRI->isVGPR(MRI, Reg)) { + LLVM_DEBUG(dbgs() << "v\n"); + VgprDiff -= Size; + } else { + LLVM_DEBUG(dbgs() << "s\n"); + SgprDiff -= Size; + } + } + } + + for (auto InIt : Exp.InputLive) { + Register Reg = InIt.first; + LaneBitmask InMask = InIt.second; + LaneBitmask MBBBeginMask; + if (InputLive.find(Reg) != InputLive.end()) + MBBBeginMask = InputLive[Reg]; + // Check mask which not live in BeginSlot in exp input. + LaneBitmask ProfitMask = InMask & (~MBBBeginMask); + if (ProfitMask.any()) { + // Update input live to avoid count same input more than once. + NewInput[Reg] |= InMask; + // Exp in not live at block input. + // It will increase live for MBB. 
+ unsigned Size = getRegSize(Reg, ProfitMask, MRI, SIRI); + + LLVM_DEBUG(dbgs() << "add:" << Reg.virtRegIndex() << " " << Size); + if (SIRI->isVGPR(MRI, Reg)) { + LLVM_DEBUG(dbgs() << "v\n"); + VgprDiff += Size; + } else { + LLVM_DEBUG(dbgs() << "s\n"); + SgprDiff += Size; + } + } + } + } + + if (IsVOutBound && VgprDiff > 0) + continue; + + if (IsSOutBound && SgprDiff > 0) + continue; + llvm::mergeLiveRegSet(CrossLive, NewInput); + Vgpr += VgprDiff; + Sgpr += SgprDiff; + if (IsClone) + Exp.IsCloneOnly = true; + } + + return std::make_pair(Vgpr, Sgpr); +} + +void addExpCandidates(std::vector &SubExpCandidates, + std::vector &SubExps, + GCNRPTracker::LiveRegSet &UsedRegs) { + SubExpCandidates.insert(SubExpCandidates.end(), SubExps.begin(), + SubExps.end()); + for (auto &Exp : SubExps) { + if (Exp.IsHoist) { + for (auto &Reg : Exp.TopRegs) { + UsedRegs[Reg]; + } + } else { + for (auto &Reg : Exp.BottomRegs) { + UsedRegs[Reg]; + } + } + } +} + +bool tryToAddSubExps( + Remat *Remat, HotBlock &HotBB, RematStatus &Status, + std::vector &SubExpCandidates, + std::vector &InBlockCloneSubExps, + DenseMap &InBlockHotVInstMap, + DenseMap &InBlockHotSInstMap, + SmallVector> + Candidates, + int Vgpr, int Sgpr, const GCNRPTracker::LiveRegSet &SavingInputLive, + const GCNRPTracker::LiveRegSet &SavingOutputLive, + GCNRPTracker::LiveRegSet &PassThrus, GCNRPTracker::LiveRegSet &UsedRegs, + MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, const MachineLoopInfo *MLI, SlotIndexes *SI, + LiveIntervals *LIS, MachineDominatorTree *DT, bool IsCanClone, + bool IsVOutBound, bool IsSOutBound, + GCNRPTracker::LiveRegSet &UnusedPassThrus, bool AllowPartialUseInSubExp) { + std::vector PartialSubExps = + buildSubExpCandidates(Remat, Candidates, PassThrus, MRI, SIRI, SIII, MLI, + SI, DT, IsCanClone, IsSOutBound, UnusedPassThrus, + Status.MemWriteMBBSet, AllowPartialUseInSubExp); + + GCNRPTracker::LiveRegSet TmpSavingInputLive = SavingInputLive; + GCNRPTracker::LiveRegSet TmpSavingOutputLive = SavingOutputLive; + std::pair CurSaving = calculateSaving( + HotBB, PartialSubExps, TmpSavingInputLive, TmpSavingOutputLive, + IsVOutBound, IsSOutBound, IsCanClone, DT, MRI, SIRI); + const int VLimit = Status.TargetVLimit; + const int SLimit = Status.TargetSLimit; + + Vgpr += CurSaving.first; + Sgpr += CurSaving.second; + + if (Vgpr <= VLimit && Sgpr <= SLimit) { + // nrmSubExps can help reach target occupancy, add It to + // subExpCandidates. + addExpCandidates(SubExpCandidates, PartialSubExps, UsedRegs); + return true; + } + + if (EnableSubExpAggressive) { + // Build candidates from passThrus used in partialSubExps. + GCNRPTracker::LiveRegSet SinkUsedRegs; + for (auto &Exp : PartialSubExps) { + for (auto &Reg : Exp.BottomRegs) { + SinkUsedRegs[Reg]; + } + } + MapVector HoistCandidates; + for (auto &It : HotBB.InputLive) { + unsigned Reg = It.first; + // Skip reg which already used for sink exp. + if (SinkUsedRegs.count(Reg)) + continue; + if (UsedRegs.count(Reg)) + continue; + // Skip unsafe reg. + if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*IsSink*/ false)) { + LLVM_DEBUG(dbgs() << " is not safe to hoist\n"); + continue; + } + // DefMI is already checked in isSafeCandidate. + MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); + MachineBasicBlock *DefMBB = DefMI->getParent(); + DenseSet UseMBBSet; + // Make sure all uses not in Def block are in same block. 
+      for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+        MachineBasicBlock *UseMBB = UseMI.getParent();
+        if (UseMBB == DefMBB)
+          continue;
+        UseMBBSet.insert(UseMBB);
+      }
+
+      if (UseMBBSet.size() != 1)
+        continue;
+      MachineBasicBlock *UseMBB = *UseMBBSet.begin();
+      GCNRPTracker::LiveRegSet &UseInMBB = HoistCandidates[UseMBB];
+      UseInMBB[Reg] = getRegMask(DefMI->getOperand(0), MRI);
+    }
+
+    // Build exp dag on define blocks.
+    std::vector<SubExp> HoistSubExpCandidates;
+    // Save profitable candidates into the list.
+    for (auto It : HoistCandidates) {
+      MachineBasicBlock *UseMBB = It.first;
+      // Try to remove out-reg def subExps from DefMBB.
+      GCNRPTracker::LiveRegSet &UseInMBB = It.second;
+      // Go up the dag until a shared node is reached.
+      auto SubExps = buildSubExpFromCandidatesTopBottom(Remat, UseInMBB, UseMBB,
+                                                        SIRI, SIII, MRI);
+      for (SubExp &SubExp : SubExps) {
+        if (!canHelpPressureWhenHoist(SubExp, MRI, MLI, IsSOutBound))
+          continue;
+        SubExp.IsHoist = true;
+        HoistSubExpCandidates.emplace_back(SubExp);
+      }
+    }
+
+    std::pair<int, int> HoistSaving = calculateSaving(
+        HotBB, HoistSubExpCandidates, TmpSavingInputLive, TmpSavingOutputLive,
+        IsVOutBound, IsSOutBound, IsCanClone, DT, MRI, SIRI);
+
+    int HoistVgpr = Vgpr + HoistSaving.first;
+    int HoistSgpr = Sgpr + HoistSaving.second;
+
+    if ((HoistVgpr <= VLimit && HoistSgpr <= SLimit) ||
+        // If the status is not balanced, do the remat even if the target
+        // cannot be reached.
+        // TODO: check whether the result fails to help even one occupancy
+        // level.
+        (!HoistSubExpCandidates.empty() && !Status.NotBalance &&
+         TargetOccupancy != 0)) {
+      // The new subExps can help reach the target occupancy; add them to
+      // subExpCandidates.
+      addExpCandidates(SubExpCandidates, PartialSubExps, UsedRegs);
+      addExpCandidates(SubExpCandidates, HoistSubExpCandidates, UsedRegs);
+
+      return true;
+    }
+  }
+
+  if (EnableVmemDegree &&
+      // Only consider vmem on the last tryToAddSubExps call.
+      // Otherwise AllowPartialUseInSubExp would have no chance to be true.
+      (AllowPartialUseInSubExp || !EnableSubExpAggressive)) {
+    // Assume vmemLdSize could be saved by not keeping the loads in flight in
+    // parallel.
+    if (((Vgpr - HotBB.VmemLdInputSize) <= VLimit ||
+         (Vgpr - HotBB.VmemLdOutputSize) <= VLimit) &&
+        Sgpr <= SLimit) {
+      // The new subExps can help reach the target occupancy; add them to
+      // subExpCandidates.
+      addExpCandidates(SubExpCandidates, PartialSubExps, UsedRegs);
+      return true;
+    }
+  }
+
+  int VDistance = Vgpr - (int)VLimit;
+  int SDistance = Status.TargetOcc > 4 ? (Sgpr - (int)SLimit) : 0;
+  int VSaved = HotBB.MaxPressures.first - Vgpr;
+  int SSaved = HotBB.MaxPressures.second - Sgpr;
+  // Try to add inBlockCloneSubExps.
+  if (!tryRematInHotSpot(*HotBB.MBB, Status, VDistance, SDistance, VSaved,
+                         SSaved, InBlockCloneSubExps, InBlockHotVInstMap,
+                         InBlockHotSInstMap, LIS, MRI, SIRI, SIII)) {
+    // Always return false when partialUseInSubExp is not allowed; the caller
+    // will try again with partialUseInSubExp enabled.
+    if (!AllowPartialUseInSubExp)
+      return false;
+    // If the status is not balanced, do the remat even if the target cannot
+    // be reached.
+    // TODO: check whether the result fails to help even one occupancy level.
+    if (!Status.NotBalance && TargetOccupancy == 0)
+      return false;
+  }
+  // The new subExps can help reach the target occupancy; add them to
+  // subExpCandidates.
+  addExpCandidates(SubExpCandidates, PartialSubExps, UsedRegs);
+  return true;
+}
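The driver below works on pass-through registers. As a rough illustration of what that means here (a minimal sketch, not a helper from this patch; the name isPassThroughReg and its exact checks are for exposition only, the patch uses its own collectPassThrus): a register qualifies when it is live across a hot block without being defined or used in it, so its defining subExp can be sunk below the block, or its uses hoisted above it, to shrink the block's live set.

// Sketch only: true if Reg merely passes through MBB, i.e. it is live into
// and out of the block but neither defined nor used inside it.
static bool isPassThroughReg(Register Reg, const MachineBasicBlock &MBB,
                             const GCNRPTracker::LiveRegSet &InputLive,
                             const GCNRPTracker::LiveRegSet &OutputLive,
                             const MachineRegisterInfo &MRI) {
  // Must be live into and out of the block.
  if (!InputLive.count(Reg) || !OutputLive.count(Reg))
    return false;
  // Must have no def or use inside the block itself.
  for (const MachineInstr &MI : MRI.reg_nodbg_instructions(Reg))
    if (MI.getParent() == &MBB)
      return false;
  return true;
}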
+
+// Remat passthru regs per hot block.
+// The reason to do it per block is to make sure passthru reuse is precise.
+// If remat were tried on all hot blocks together, a passthru reg of one block
+// might be reused in another block where the reg is not passthru.
+bool perBlockPassthruRemat(Remat *Remat, std::vector<HotBlock> &HotBlocks,
+                           RematStatus &Status,
+                           GCNRPTracker::LiveRegSet &LiveRegCandidates,
+                           const GCNSubtarget *ST, LiveIntervals *LIS,
+                           const MachineLoopInfo *MLI, MachineDominatorTree *DT,
+                           MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+                           const SIInstrInfo *SIII) {
+  bool IsUpdated = false;
+  bool IsCanClone = EnableSubExpClone || EnableSubExpAggressive;
+
+  SlotIndexes *SlotIndexes = LIS->getSlotIndexes();
+  // Sort hot blocks by pressure first.
+  // The hot block with higher pressure is more likely to fail.
+  // If it fails, fail fast. If it works, save the subExpCandidates; they may
+  // help other hot blocks.
+  std::sort(HotBlocks.begin(), HotBlocks.end(),
+            [&ST](const HotBlock &A, const HotBlock &B) {
+              return pressureHigher(A.MaxPressures.first, A.MaxPressures.second,
+                                    B.MaxPressures.first, B.MaxPressures.second,
+                                    ST);
+            });
+
+  std::vector<SubExp> SubExpCandidates;
+  // For inBlock remat clone.
+  std::vector<SubExp> InBlockCloneSubExps;
+  DenseMap InBlockHotVInstMap;
+  DenseMap InBlockHotSInstMap;
+
+  // Save used passThrus to avoid using the same reg on different MBBs.
+  GCNRPTracker::LiveRegSet UsedPassThrus;
+  // Save moved regs to avoid using the same reg for both hoist and sink.
+  GCNRPTracker::LiveRegSet UsedRegs;
+
+  const int VLimit = Status.TargetVLimit;
+  const int SLimit = Status.TargetSLimit;
+  // Collect passthrus for each hot block and try remat on them.
+  for (auto &It : HotBlocks) {
+    MachineBasicBlock *MBB = It.MBB;
+
+    const GCNRPTracker::LiveRegSet InputLive = Status.MBBInputLiveMap[MBB];
+    const GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[MBB];
+
+    It.InputLive = InputLive;
+
+    // Add 1 to the pressure to account for a spill to vgpr.
+    const int PressureDelta = -1;
+    int Vgpr = It.MaxPressures.first - PressureDelta;
+    int Sgpr = It.MaxPressures.second;
+    bool IsVOutBound = Vgpr > VLimit;
+    bool IsSOutBound = Sgpr > SLimit;
+    // SavingInputLive is used to calculate the saving; it is modified to
+    // avoid counting the same input multiple times.
+    GCNRPTracker::LiveRegSet SavingInputLive = InputLive;
+    GCNRPTracker::LiveRegSet SavingOutputLive = OutputLive;
+    std::pair<int, int> CurSaving =
+        calculateSaving(It, SubExpCandidates, SavingInputLive, SavingOutputLive,
+                        IsVOutBound, IsSOutBound, IsCanClone, DT, MRI, SIRI);
+
+    Vgpr += CurSaving.first;
+    Sgpr += CurSaving.second;
+
+    if (Vgpr <= VLimit && Sgpr <= SLimit)
+      continue;
+
+    // Collect pass thru regs.
+    GCNRPTracker::LiveRegSet PassThrus = collectPassThrus(
+        MBB, InputLive, OutputLive, LiveRegCandidates, MRI, IsCanClone);
+
+    // Group pass thru regs by def MBB.
+    SmallVector>
+        Candidates = groupPassThruByDefBlock(Remat, PassThrus, UsedPassThrus,
+                                             MRI, SIRI, SIII);
+    // UnusedPassThrus collects the passThrus that are skipped when building
+    // subExps.
+    GCNRPTracker::LiveRegSet UnusedPassThrus;
+    // Build exp dag on define blocks.
+    bool AllowPartialUseInSubExp = false;
+    if (tryToAddSubExps(
+            Remat, It, Status, SubExpCandidates, InBlockCloneSubExps,
+            InBlockHotVInstMap, InBlockHotSInstMap, Candidates, Vgpr, Sgpr,
+            SavingInputLive, SavingOutputLive, PassThrus, UsedRegs, MRI, SIRI,
+            SIII, MLI, SlotIndexes, LIS, DT, IsCanClone, IsVOutBound,
+            IsSOutBound, UnusedPassThrus, AllowPartialUseInSubExp)) {
+      // Remove unusedPassThrus from passThrus first.
+      llvm::andNotLiveRegSet(PassThrus, UnusedPassThrus);
+      llvm::mergeLiveRegSet(UsedPassThrus, PassThrus);
+      continue;
+    }
+    // If cloning is not possible, there is no need to try partialUseInSubExp,
+    // which requires cloning.
+    if (!IsCanClone)
+      return false;
+
+    // Partial-use subExps may increase cost because of the cloning involved.
+    // Only try them when aggressive remat is enabled.
+    if (!EnableSubExpAggressive)
+      return false;
+
+    AllowPartialUseInSubExp = true;
+    if (!tryToAddSubExps(
+            Remat, It, Status, SubExpCandidates, InBlockCloneSubExps,
+            InBlockHotVInstMap, InBlockHotSInstMap, Candidates, Vgpr, Sgpr,
+            SavingInputLive, SavingOutputLive, PassThrus, UsedRegs, MRI, SIRI,
+            SIII, MLI, SlotIndexes, LIS, DT, IsCanClone, IsVOutBound,
+            IsSOutBound, UnusedPassThrus, AllowPartialUseInSubExp)) {
+      return false;
+    }
+    // Just merge all passThrus after tryToAddSubExps with partialUseInSubExp
+    // allowed.
+    llvm::mergeLiveRegSet(UsedPassThrus, PassThrus);
+  }
+
+  // Apply changes.
+  {
+    // Sort subExpCandidates to make sure input uses are applied before output
+    // uses when a reg is both an input and an output of subExps.
+    LLVM_DEBUG(for (SubExp &Exp : SubExpCandidates) { Exp.dump(MRI, SIRI); });
+    sortSubExpCandidates(SubExpCandidates);
+
+    for (SubExp &Exp : SubExpCandidates) {
+      // Skip exps that were cleared during sorting because of a hoist/sink
+      // conflict.
+      if (Exp.SUnits.empty())
+        continue;
+      LLVM_DEBUG(Exp.dump(MRI, SIRI));
+      if (Exp.IsHoist) {
+        applySubExpMoveNearDefine(Exp, MRI, SlotIndexes, SIII, SIRI);
+      } else {
+        if (Exp.IsCloneOnly)
+          applySubExpCloneNearUser(Exp, HotBlocks, DT, MRI, SlotIndexes, SIII,
+                                   SIRI);
+        else
+          applySubExpMoveNearUser(Exp, MRI, DT, SlotIndexes);
+      }
+    }
+
+    for (SubExp &Exp : InBlockCloneSubExps) {
+      applySubExpCloneNearUserInBlock(
+          Exp, InBlockHotVInstMap, InBlockHotSInstMap, MRI, SlotIndexes, SIRI);
+    }
+    // See what occupancy could possibly be reached, then decide a target.
+    // Apply remat.
+    IsUpdated = !SubExpCandidates.empty();
+  }
+
+  return IsUpdated;
+}
+
+int getVMemLdSize(MachineBasicBlock &MBB, const SIInstrInfo *SIII,
+                  const SIRegisterInfo *SIRI, const MachineRegisterInfo &MRI) {
+  int VmemLdSize = 0;
+  // Collect vmem loads when split is enabled.
+  for (MachineInstr &MI : MBB) {
+    bool IsHighLatency = SIII->isHighLatencyInstruction(MI);
+    if (!IsHighLatency)
+      continue;
+    if (!(MI.mayLoad() &&
+          // Skip cases like atomics which do not return a value.
+          MI.getNumDefs() > 0))
+      continue;
+    // A vmem load.
+    MachineOperand &Dst = MI.getOperand(0);
+    LaneBitmask Mask = llvm::getRegMask(Dst, MRI);
+    unsigned Size = llvm::getRegSize(Dst.getReg(), Mask, MRI, SIRI);
+    VmemLdSize += Size;
+  }
+  return VmemLdSize;
+}
+
+} // namespace
+
+bool groupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI,
+                LiveIntervals *LIS, MachineDominatorTree *DT,
+                MachinePostDominatorTree *PDT, AliasAnalysis *AA) {
+  if (MF.size() < 2)
+    return false;
+  const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+
+  const SIInstrInfo *SIII = ST->getInstrInfo();
+  const SIRegisterInfo *SIRI = ST->getRegisterInfo();
+
+  auto &MRI = MF.getRegInfo();
+
+  RematStatus Status = getRematStatus(MF, MLI, LIS, MRI, ST);
+
+  const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second;
+  if (Status.TargetOcc >= MaxOcc)
+    return false;
+
+  unsigned VLimit = Status.TargetVLimit;
+  unsigned SLimit = Status.TargetSLimit;
+
+  int RematVCnt = Status.MaxVPressure - VLimit;
+  int RematSCnt = Status.MaxSPressure - SLimit;
+
+  bool IsSGPRSpill = false;
+  if (RematSCnt > 0) {
+    IsSGPRSpill = nearSgprSpill(Status.MaxSPressure, ST, MF);
+  }
+
+  // If bound by lds, skip.
+  if ((Status.TargetOcc + 1) > ST->getOccupancyWithWorkGroupSizes(MF).second &&
+      !IsSGPRSpill)
+    return false;
+
+  bool IsBothOutLimit = RematVCnt > 0 && RematSCnt > 0;
+  // TODO: use the WQM check and support vreg remat.
+ bool IsCheckWQM = MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS; + RematVCnt = IsCheckWQM & false; + + // Remat on every hot block. + + // Collect all hot blocks. + std::vector HotBlocks; + for (MachineBasicBlock &MBB : MF) { + // Collect reg pressure. + auto &RP = Status.MBBPressureMap[&MBB]; + unsigned MaxLocalVPressure = RP.getVGPRNum(ST->hasGFX90AInsts()); + unsigned MaxLocalSPressure = RP.getMaxSGPR(); + + MaxLocalSPressure += RegForVCC; + + if (!EnableInBlockRemat) { + if (MaxLocalVPressure <= VLimit && MaxLocalSPressure <= SLimit) + continue; + } + + // Move inst which input is imm/pass thru reg/out reg to help pressure. + if (tryHoldPacifist(MBB, LIS, MRI, SIRI, AA, Status)) { + MaxLocalVPressure = 0; + MaxLocalSPressure = 0; + collectMBBPressure(MBB, LIS, ST, MaxLocalVPressure, MaxLocalSPressure, + Status); + + MaxLocalSPressure += RegForVCC; + } + if (MaxLocalVPressure <= VLimit && MaxLocalSPressure <= SLimit) + continue; + + // When both vgpr sgpr out limit, only help vgpr. + if (IsBothOutLimit && MaxLocalVPressure <= VLimit) + continue; + GCNRPTracker::LiveRegSet LiveSet; + HotBlocks.push_back({&MBB, LiveSet, + std::make_pair(MaxLocalVPressure, MaxLocalSPressure), + 0, 0}); + } + // Collect vmemLdInput/OutputSize. + if (EnableVmemDegree) { + DenseMap OutputVMemLdSizeMap; + for (auto It : HotBlocks) { + MachineBasicBlock *MBB = It.MBB; + // Collect vmemLd when enable split. + int VmemLdSize = getVMemLdSize(*MBB, SIII, SIRI, MRI); + if (VmemLdSize) { + OutputVMemLdSizeMap[MBB] = VmemLdSize; + } + } + for (auto &It : HotBlocks) { + MachineBasicBlock *MBB = It.MBB; + + auto OIt = OutputVMemLdSizeMap.find(MBB); + if (OIt != OutputVMemLdSizeMap.end()) + It.VmemLdOutputSize = OIt->second; + + if (MBB->pred_size() != 1) + continue; + + MachineBasicBlock *Pred = *MBB->pred_begin(); + OIt = OutputVMemLdSizeMap.find(Pred); + if (OIt != OutputVMemLdSizeMap.end()) { + It.VmemLdInputSize = OIt->second; + } else { + if (Pred->getFirstTerminator() != Pred->end()) + continue; + if (Pred->empty()) + continue; + bool IsHighLatency = SIII->isHighLatencyInstruction(Pred->back()); + if (!IsHighLatency) + continue; + int VmemLdSize = getVMemLdSize(*Pred, SIII, SIRI, MRI); + It.VmemLdInputSize = VmemLdSize; + } + } + } + + if (EnableUniformVectorToScalar) { + if (rematUniformVgprToSgpr(Remat, MF, Status, HotBlocks, LIS, MRI, SIRI, + SIII, MLI)) { + // Rebuild LIS. + LIS->reanalyze(MF); + Status = getRematStatus(MF, MLI, LIS, MRI, ST); + bool IsSgprSpilled = nearSgprSpill(Status.MaxSPressure, ST, MF); + if (IsSgprSpilled) { + bool IsNearTarget = false; + hotBlockRemat(Remat, MF, MLI, LIS, DT, PDT, IsNearTarget); + // Rebuild LIS. + LIS->reanalyze(MF); + Status = getRematStatus(MF, MLI, LIS, MRI, ST); + } + + for (auto &It : HotBlocks) { + MachineBasicBlock *MBB = It.MBB; + + // Update pressure. + auto &RP = Status.MBBPressureMap[MBB]; + unsigned MaxLocalVPressure = RP.getVGPRNum(ST->hasGFX90AInsts()); + unsigned MaxLocalSPressure = RP.getMaxSGPR(); + + MaxLocalSPressure += RegForVCC; + It.MaxPressures.first = MaxLocalVPressure; + It.MaxPressures.second = MaxLocalSPressure; + } + } + } + + // Collect all live reg which cross hot blocks. 
+ GCNRPTracker::LiveRegSet LiveRegCandidates; + for (auto It : HotBlocks) { + MachineBasicBlock *MBB = It.MBB; + + const GCNRPTracker::LiveRegSet InputLive = Status.MBBInputLiveMap[MBB]; + + const GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[MBB]; + + llvm::mergeLiveRegSet(LiveRegCandidates, InputLive); + llvm::mergeLiveRegSet(LiveRegCandidates, OutputLive); + } + + // Check min VGPR bound. + BlockSet PressureUnderLimitSet; + if (EnableSubExpMinReg) { + for (auto &It : HotBlocks) { + MachineBasicBlock *MBB = It.MBB; + unsigned MaxLocalVGPR = 0; + unsigned MaxLocalSGPR = 0; + llvm::getRegBound(MBB, MRI, SIRI, SIII, LIS, MaxLocalVGPR, MaxLocalSGPR); + + if (MaxLocalVGPR < VLimit && MaxLocalSGPR < SLimit) { + PressureUnderLimitSet.insert(MBB); + } else { + if (MaxLocalVGPR < It.MaxPressures.first) + It.MaxPressures = + std::make_pair(MaxLocalVGPR, It.MaxPressures.second); + if (MaxLocalSGPR < It.MaxPressures.second) + It.MaxPressures = std::make_pair(It.MaxPressures.first, MaxLocalSGPR); + } + } + } + + bool IsUpdated = + perBlockPassthruRemat(Remat, HotBlocks, Status, LiveRegCandidates, ST, + LIS, MLI, DT, MRI, SIRI, SIII); + + return IsUpdated; +} + +bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) { + if (MF.size() < 2) + return false; + LiveIntervals *LIS = &getAnalysis().getLIS(); + MachineDominatorTree *DT = + &getAnalysis().getDomTree(); + MachinePostDominatorTree *PDT = + &getAnalysis().getPostDomTree(); + MachineLoopInfo *MLI = &getAnalysis().getLI(); + AliasAnalysis *AA = &getAnalysis().getAAResults(); + + { + MachineCycleInfo CI; + CI.compute(MF); + auto TTI = MF.getTarget().getTargetTransformInfo(MF.getFunction()); + MachineUniformityInfo MachineUniformity = + llvm::computeMachineUniformityInfo(MF, CI, *DT, + /*HasBranchDivergence*/ true); + TotalUniformInsts.clear(); + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (MachineUniformity.isUniform(&MI)) { + TotalUniformInsts.insert(&MI); + } + } + } + } + + // LLVM_DEBUG(pressure::write_pressure(MF, LIS, R"(D:\Temp\d.json)")); + // For non-cs/ps, set target occ as 4. + bool IsNearTarget = false; + bool IsFinalUpdated = false; + bool IsUpdated = hotBlockRemat(this, MF, MLI, LIS, DT, PDT, IsNearTarget); + IsFinalUpdated |= IsUpdated; + if (EnableSubExp) { + if (IsUpdated) { + // Rebuild LIS. 
+ LIS->reanalyze(MF); + } + + IsUpdated = groupRemat(this, MF, MLI, LIS, DT, PDT, AA); + + IsFinalUpdated |= IsUpdated; + } + return IsFinalUpdated; +} + +INITIALIZE_PASS_BEGIN(AMDGPUHotBlockRematerialize, DEBUG_TYPE, + "AMDGPU rematerialize", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) +INITIALIZE_PASS_END(AMDGPUHotBlockRematerialize, DEBUG_TYPE, + "AMDGPU rematerialize", false, false) + +char AMDGPUHotBlockRematerialize::ID = 0; +char &llvm::AMDGPUHotBlockRematerializeID = AMDGPUHotBlockRematerialize::ID; + +FunctionPass *llvm::createAMDGPUHotBlockRematerializePass() { + return new AMDGPUHotBlockRematerialize(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp new file mode 100644 index 0000000000000..beace3a501a19 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp @@ -0,0 +1,2290 @@ +//===------- AMDGPUMIRUtils.cpp - Helpers for MIR passes ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Helper functions for MIR passes. +// +//===----------------------------------------------------------------------===// + +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/SlotIndexes.h" + +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/raw_ostream.h" + +#include "llvm/ADT/IntEqClasses.h" +#include "llvm/Support/GraphWriter.h" + +#include "llvm/Support/Debug.h" + +#include "AMDGPUMIRUtils.h" +#include "AMDGPUSubExpDag.h" +#include "GCNRegPressure.h" +#include + +#define DEBUG_TYPE "xb-mir-util" +using namespace llvm; +namespace { +class CFGWithPhi { +public: + CFGWithPhi(MachineFunction &F) : F(F) { + // Collect phi and phi related insts. + MachineRegisterInfo &MRI = F.getRegInfo(); + + for (MachineBasicBlock &BB : F) { + auto &PhiInsts = BlockToPhiInstsMap[&BB]; + for (MachineInstr &I : BB) { + if (!I.isPHI()) + break; + PhiInsts.insert(&I); + Register Reg = I.getOperand(0).getReg(); + // Add incoming values. + for (unsigned i = 1; i < I.getNumOperands(); i += 2) { + MachineOperand &MO = I.getOperand(i); + if (!MO.isReg()) + continue; + MachineInstr *DefMI = MRI.getUniqueVRegDef(MO.getReg()); + if (!DefMI) + continue; + BlockToPhiInstsMap[DefMI->getParent()].insert(DefMI); + } + // Add users. + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + BlockToPhiInstsMap[UseMI.getParent()].insert(&UseMI); + } + } + } + } /// Adds custom features for a visualization of the ScheduleDAG. 
+ void addCustomGraphFeatures(llvm::GraphWriter &) const {} + MachineFunction &F; + DenseMap> + BlockToPhiInstsMap; + void dump(); +}; + +void CFGWithPhi::dump() { +#ifndef NDEBUG + for (MachineBasicBlock &BB : F) { + dbgs() << BB.getName() << "\n"; + auto &PhiInsts = BlockToPhiInstsMap[&BB]; + for (MachineInstr *I : PhiInsts) { + if (!I->isPHI()) + continue; + I->dump(); + } + for (MachineInstr *I : PhiInsts) { + if (I->isPHI()) + continue; + I->dump(); + } + } +#endif +} + +} // namespace + +// CFGWithPhi dump. +namespace llvm { + +template <> struct DOTGraphTraits : public DefaultDOTGraphTraits { + + DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} + + static std::string getGraphName(const CFGWithPhi *) { + return "CFG with Phi graph"; + } + + static std::string getNodeIdentifierLabel(const MachineBasicBlock *Node, + const CFGWithPhi *) { + std::string R; + raw_string_ostream OS(R); + OS << static_cast(Node); + return R; + } + + static std::string getNodeLabel(const MachineBasicBlock *BB, + const CFGWithPhi *G) { + enum { MaxColumns = 8000 }; + std::string Str; + raw_string_ostream OS(Str); + + OS << "BB:" << BB->getName(); + auto It = G->BlockToPhiInstsMap.find(BB); + if (It != G->BlockToPhiInstsMap.end()) { + + auto &PhiInsts = It->second; + for (MachineInstr *I : PhiInsts) { + if (!I->isPHI()) + continue; + I->print(OS); + OS << "\n"; + } + for (MachineInstr *I : PhiInsts) { + if (I->isPHI()) + continue; + I->print(OS); + OS << "\n"; + } + } + std::string OutStr = OS.str(); + if (OutStr[0] == '\n') + OutStr.erase(OutStr.begin()); + + // Process string output to make it nicer... + unsigned ColNum = 0; + unsigned LastSpace = 0; + for (unsigned i = 0; i != OutStr.length(); ++i) { + if (OutStr[i] == '\n') { // Left justify + OutStr[i] = '\\'; + OutStr.insert(OutStr.begin() + i + 1, 'l'); + ColNum = 0; + LastSpace = 0; + } else if (OutStr[i] == ';') { // Delete comments! + unsigned Idx = OutStr.find('\n', i + 1); // Find end of line + OutStr.erase(OutStr.begin() + i, OutStr.begin() + Idx); + --i; + } else if (ColNum == MaxColumns) { // Wrap lines. + // Wrap very long names even though we can't find a space. + if (!LastSpace) + LastSpace = i; + OutStr.insert(LastSpace, "\\l..."); + ColNum = i - LastSpace; + LastSpace = 0; + i += 3; // The loop will advance 'i' again. 
+ } else + ++ColNum; + if (OutStr[i] == ' ') + LastSpace = i; + } + return OutStr; + } + static std::string getNodeDescription(const MachineBasicBlock *SU, + const CFGWithPhi *) { + return SU->getName().str(); + } + + static void addCustomGraphFeatures(CFGWithPhi *G, + GraphWriter &GW) { + return G->addCustomGraphFeatures(GW); + } +}; + +template <> struct GraphTraits { + using NodeRef = MachineBasicBlock *; + using ChildIteratorType = MachineBasicBlock::succ_iterator; + using nodes_iterator = pointer_iterator; + + // static NodeRef getEntryNode(const CFGWithPhi *G) { + // return G->F.getFunctionEntry(); + //} + + static ChildIteratorType child_begin(const NodeRef N) { + return N->succ_begin(); + } + + static ChildIteratorType child_end(const NodeRef N) { return N->succ_end(); } + + static nodes_iterator nodes_begin(const CFGWithPhi *G) { + return nodes_iterator(G->F.begin()); + } + + static nodes_iterator nodes_end(const CFGWithPhi *G) { + return nodes_iterator(G->F.end()); + } +}; + +} // namespace llvm + +namespace llvm { + +unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask, + const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI) { + unsigned Size = SIRI->getRegSizeInBits(*MRI.getRegClass(Reg)); + Size >>= 5; + if (Mask.any()) { + if (unsigned MaskSize = Mask.getNumLanes()) { + if (MaskSize < Size) + Size = MaskSize; + } + } + return Size; +} + +void collectLiveSetPressure(const LiveSet &LiveSet, + const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, unsigned &VPressure, + unsigned &SPressure) { + VPressure = 0; + SPressure = 0; + for (auto LiveIt : LiveSet) { + unsigned Reg = LiveIt.first; + unsigned Size = getRegSize(Reg, LiveIt.second, MRI, SIRI); + if (SIRI->isVGPR(MRI, Reg)) { + VPressure += Size; + } else { + SPressure += Size; + } + } +} + +bool isExecUpdateForControlFlow(llvm::MachineInstr &MI) { + bool IsExecUpdate = false; + unsigned Opcode = MI.getOpcode(); + if (Opcode == AMDGPU::S_MOV_B64 || Opcode == AMDGPU::S_MOV_B32 || + Opcode == AMDGPU::S_OR_B64_term || Opcode == AMDGPU::S_OR_B32_term || + Opcode == AMDGPU::S_OR_SAVEEXEC_B64 || + Opcode == AMDGPU::S_OR_SAVEEXEC_B32 || Opcode == AMDGPU::S_AND_B64 || + Opcode == AMDGPU::S_AND_B32 || Opcode == AMDGPU::S_ANDN2_B64 || + Opcode == AMDGPU::S_ANDN2_B32) { + MachineOperand &Dst = MI.getOperand(0); + if (Dst.getReg() == AMDGPU::EXEC || Dst.getReg() == AMDGPU::EXEC_LO) { + IsExecUpdate = true; + } + } + return IsExecUpdate; +} + +bool isSub0Sub1SingleDef(unsigned Reg, const MachineRegisterInfo &MRI) { + // Support multi def for pattern of pointer: + // undef_ %808.sub0:sgpr_64 = COPY killed %795:sgpr_32 + // %808.sub1:sgpr_64 = S_MOV_B32 0 + bool HasSub0 = false; + bool HasSub1 = false; + for (MachineOperand &UserDefMO : MRI.def_operands(Reg)) { + if (unsigned SubReg = UserDefMO.getSubReg()) { + bool IsSingleSubReg = false; + switch (SubReg) { + default: + break; + case AMDGPU::sub0: + if (!HasSub0) { + HasSub0 = true; + IsSingleSubReg = true; + } + break; + case AMDGPU::sub1: + if (!HasSub1) { + HasSub1 = true; + IsSingleSubReg = true; + } + break; + } + if (!IsSingleSubReg) { + HasSub0 = false; + break; + } + } else { + HasSub0 = false; + break; + } + } + + return (HasSub0 && HasSub1); +} + +LaneBitmask getRegMask(const MachineOperand &MO, + const MachineRegisterInfo &MRI) { + // We don't rely on read-undef_ flag because in case of tentative schedule + // tracking it isn't set correctly yet. This works correctly however since + // use mask has been tracked before using LIS. 
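+  // For example, an operand with no sub-register index yields the full lane
+  // mask of the virtual register, while an operand like %X.sub1 yields only
+  // the lanes covered by sub1.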
+ return MO.getSubReg() == 0 + ? MRI.getMaxLaneMaskForVReg(MO.getReg()) + : MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask( + MO.getSubReg()); +} + +void mergeLiveRegSet(LiveSet &TargetSet, const LiveSet &InputSet) { + for (auto It : InputSet) { + Register Reg = It.first; + LaneBitmask Mask = It.second; + auto TargetReg = TargetSet.find(Reg); + if (TargetReg != TargetSet.end()) { + LaneBitmask TargetMask = TargetReg->second; + Mask |= TargetMask; + } + TargetSet[Reg] = Mask; + } +} + +void andLiveRegSet(LiveSet &TargetSet, const LiveSet &InputSet) { + GCNRPTracker::LiveRegSet AndSet; + for (auto It : InputSet) { + Register Reg = It.first; + LaneBitmask Mask = It.second; + auto TargetReg = TargetSet.find(Reg); + if (TargetReg != TargetSet.end()) { + LaneBitmask TargetMask = TargetReg->second; + Mask &= TargetMask; + AndSet[Reg] = Mask; + } + } + + TargetSet = AndSet; +} + +void andNotLiveRegSet(LiveSet &TargetSet, const LiveSet &InputSet) { + for (auto It : InputSet) { + unsigned Reg = It.first; + LaneBitmask Mask = It.second; + auto TargetReg = TargetSet.find(Reg); + if (TargetReg != TargetSet.end()) { + LaneBitmask TargetMask = TargetReg->second; + if ((TargetMask | Mask) == Mask) + TargetSet.erase(Reg); + else + TargetSet[Reg] = TargetMask & (~Mask); + } + } +} + +MachineBasicBlock *split(MachineInstr *Inst) { + + // Create the fall-through block. + MachineBasicBlock *MBB = Inst->getParent(); + MachineFunction *MF = MBB->getParent(); + MachineBasicBlock *SuccMBB = MF->CreateMachineBasicBlock(); + auto MBBIter = ++(MBB->getIterator()); + MF->insert(MBBIter, SuccMBB); + SuccMBB->transferSuccessorsAndUpdatePHIs(MBB); + MBB->addSuccessor(SuccMBB); + + // Splice the code over. + SuccMBB->splice(SuccMBB->end(), MBB, ++Inst->getIterator(), MBB->end()); + + return SuccMBB; +} + +struct Piece { + unsigned Reg; + unsigned Offset; + unsigned Size; + static SmallVector split(std::bitset<32> Mask) { + + SmallVector Pieces; + Piece Piece = {0, 0, 0}; + for (unsigned i = 0; i < 32; i++) { + if (Mask.test(i)) { + if (Piece.Size == 0) + Piece.Offset = i; + + Piece.Size++; + // Make sure no piece bigger than 8. + if (Piece.Size == 8) { + Pieces.emplace_back(Piece); + Piece.Size = 0; + } + } else { + if (Piece.Size == 0) { + continue; + } + Pieces.emplace_back(Piece); + Piece.Size = 0; + } + } + return Pieces; + } +}; + +static std::vector +getMinimalSpanningSubRegIdxSetForLaneMask(const TargetRegisterInfo *TRI, + const TargetRegisterClass *RC, + LaneBitmask Mask) { + // TODO: this could replace the code it was copied from in SplitKit.cpp + + // First pass: Try to find a perfectly matching subregister index. + // If none exists find the one covering the most lanemask bits. + SmallVector PossibleIndexes; + unsigned BestIdx = 0; + const LaneBitmask Avoid = ~Mask; + { + unsigned BestCover = 0; + for (unsigned Idx = 1, E = TRI->getNumSubRegIndices(); Idx < E; ++Idx) { + // Is this index even compatible with the given class? + if (TRI->getSubClassWithSubReg(RC, Idx) != RC) + continue; + LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx); + // Early exit if we found a perfect match. + if (SubRegMask == Mask) { + BestIdx = Idx; + break; + } + + // The index must not cover any lanes outside + if ((SubRegMask & Avoid).any()) + continue; + + unsigned PopCount = SubRegMask.getNumLanes(); + PossibleIndexes.push_back(Idx); + if (PopCount > BestCover) { + BestCover = PopCount; + BestIdx = Idx; + } + } + } + + // Abort if we cannot possibly implement the COPY with the given indexes. 
+ if (BestIdx == 0) { + LLVM_DEBUG(dbgs() << "Unable to find minimal spanning sub register(s) for " + << TRI->getRegClassName(RC) << " mask " + << PrintLaneMask(Mask) << '\n'); + assert(false && "Impossible to span reg class"); + return std::vector(); + } + + std::vector Result; + Result.push_back(BestIdx); + + // Greedy heuristic: Keep iterating keeping the best covering subreg index + // each time. + Mask &= ~(TRI->getSubRegIndexLaneMask(BestIdx)); + while (Mask.any()) { + BestIdx = 0; + int BestCover = std::numeric_limits::min(); + for (unsigned Idx : PossibleIndexes) { + LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx); + // Early exit if we found a perfect match. + if (SubRegMask == Mask) { + BestIdx = Idx; + break; + } + + // Guaranteed above + assert((SubRegMask & Avoid).none()); + + // Try to cover as much of the remaining lanes as possible but as few of + // the already covered lanes as possible. + int Cover = (SubRegMask & Mask).getNumLanes() - + (SubRegMask & ~Mask).getNumLanes(); + if (Cover > BestCover) { + BestCover = Cover; + BestIdx = Idx; + } + } + + if (BestIdx == 0) { + LLVM_DEBUG( + dbgs() << "Unable to find minimal spanning sub register(s) for " + << TRI->getRegClassName(RC) << " mask " << PrintLaneMask(Mask) + << '\n'); + assert(false && "Impossible to span reg class"); + return std::vector(); + } + + Result.push_back(BestIdx); + Mask &= ~TRI->getSubRegIndexLaneMask(BestIdx); + } + + return Result; +} + +static void updateSubReg(MachineOperand &UseMO, + const llvm::TargetRegisterClass *NewRC, + unsigned Offset, const SIRegisterInfo *SIRI) { + unsigned Size = NewRC->getLaneMask().getNumLanes(); + if (Size == 1) { + UseMO.setSubReg(0); + } else { + const uint32_t SubReg = UseMO.getSubReg(); + LaneBitmask LaneMask = SIRI->getSubRegIndexLaneMask(SubReg); + + unsigned Mask = LaneMask.getAsInteger() >> Offset; + + unsigned NewSubReg = getMinimalSpanningSubRegIdxSetForLaneMask( + SIRI, NewRC, LaneBitmask(Mask)) + .front(); + + UseMO.setSubReg(NewSubReg); + } +} + +static unsigned getNumLanesIn32BitReg(Register Reg, const SIRegisterInfo *SIRI, + const MachineRegisterInfo &MRI) { + const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg); + const TargetRegisterClass *SubregRC = + SIRI->getSubRegisterClass(RC, AMDGPU::sub0); + return SubregRC->LaneMask.getNumLanes(); +} + +bool reduceChannel(unsigned Offset, MachineInstr &MI, const MCInstrDesc &Desc, + MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, SlotIndexes *SlotIndexes) { + MachineOperand &DstMO = MI.getOperand(0); + // Skip case when dst subReg not 0. + if (DstMO.getSubReg()) { + return false; + } + Register Reg = DstMO.getReg(); + + SmallVector UseMOs; + for (MachineOperand &UseMO : MRI.use_nodbg_operands(Reg)) { + UseMOs.emplace_back(&UseMO); + } + + const llvm::TargetRegisterClass *NewRC = + SIRI->getRegClass(Desc.operands().front().RegClass); + if (!NewRC->isAllocatable()) { + if (SIRI->isSGPRClass(NewRC)) + NewRC = SIRI->getSGPRClassForBitWidth(NewRC->MC->RegSizeInBits); + else if (SIRI->isVGPRClass(NewRC)) + NewRC = SIRI->getVGPRClassForBitWidth(NewRC->MC->RegSizeInBits); + else + return false; + + if (!NewRC->isAllocatable()) + return false; + } + + unsigned NumLanes = NewRC->getLaneMask().getNumLanes(); + if (Offset > 0) { + // Update offset operand in MI. 
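+    // Offset is a lane offset into the original wide result; it is scaled by
+    // LaneSize below to form the byte offset that is folded into the
+    // immediate or added to the soffset operand.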
+ MachineOperand *OffsetOp = + SIII->getNamedOperand(MI, AMDGPU::OpName::offset); + + const uint32_t LaneSize = sizeof(uint32_t); + if (OffsetOp) { + if (OffsetOp->isImm()) { + assert(OffsetOp != nullptr); + int64_t Offset = OffsetOp->getImm(); + Offset += Offset * LaneSize; + if (!SIII->isLegalMUBUFImmOffset(Offset)) { + return false; + } + OffsetOp->setImm(Offset); + } else { + return false; + } + } else { + OffsetOp = SIII->getNamedOperand(MI, AMDGPU::OpName::soffset); + if (OffsetOp) { + Register NewOffsetReg = + MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + auto OffsetAdd = BuildMI(*MI.getParent()->getParent(), MI.getDebugLoc(), + SIII->get(AMDGPU::S_ADD_U32)) + .addDef(NewOffsetReg) + .add(*OffsetOp) + .addImm(Offset * LaneSize); + MachineInstr *OffsetAddMI = OffsetAdd.getInstr(); + MachineBasicBlock::iterator InsertPoint = + llvm::findOrCreateInsertionPointForSccDef(MI.getParent(), MI, SIRI, + SIII, &MRI); + MI.getParent()->insert(InsertPoint, OffsetAddMI); + SIII->legalizeOperands(*OffsetAddMI); + OffsetOp->setReg(NewOffsetReg); + OffsetOp->setSubReg(0); + if (SlotIndexes) + SlotIndexes->insertMachineInstrInMaps(*OffsetAddMI); + } else { + return false; + } + } + // Update subReg for users. + for (MachineOperand *UseMO : UseMOs) { + updateSubReg(*UseMO, NewRC, Offset, SIRI); + } + } else if (NumLanes == getNumLanesIn32BitReg(Reg, SIRI, MRI)) { + // Clear subReg when it's a single 32-bit reg. + for (MachineOperand *UseMO : UseMOs) { + UseMO->setSubReg(0); + } + } + + MI.setDesc(Desc); + // Mutate reg class of Reg. + MRI.setRegClass(Reg, NewRC); + return true; +} + +bool removeUnusedLanes(llvm::MachineInstr &MI, MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, + SlotIndexes *SlotIndexes) { + bool IsImm = false; + switch (MI.getOpcode()) { + default: + break; + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX16_IMM: + IsImm = true; + LLVM_FALLTHROUGH; + case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR: + case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR: + case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: { + Register Reg = MI.getOperand(0).getReg(); + if (!MRI.getUniqueVRegDef(Reg)) + return false; + LaneBitmask DstMask = getRegMask(MI.getOperand(0), MRI); + LaneBitmask UseMask; + for (MachineOperand &MO : MRI.use_operands(Reg)) { + UseMask |= llvm::getRegMask(MO, MRI); + } + + const unsigned FullMask = DstMask.getAsInteger(); + unsigned Mask = UseMask.getAsInteger(); + if (Mask == FullMask) + return false; + // Split mask when there's gap. Then group mask to 2/4/8. + auto Pieces = Piece::split(std::bitset<32>(Mask)); + // Now only support 1 piece. + if (Pieces.size() != 1) + return false; + auto Piece = Pieces[0]; + if (Piece.Size > 8) + return false; + + // TODO: enable offset support when IsImm is true. + // Now if break different test when mul LaneSize or not mul for the offset. + if (IsImm && Piece.Offset != 0) + return false; + + const unsigned Num32BitLanes = + Piece.Size / getNumLanesIn32BitReg(Reg, SIRI, MRI); + + switch (Num32BitLanes) { + default: + return false; + case 1: + return reduceChannel(Piece.Offset, MI, + SIII->get(IsImm ? AMDGPU::S_BUFFER_LOAD_DWORD_IMM + : AMDGPU::S_BUFFER_LOAD_DWORD_SGPR), + MRI, SIRI, SIII, SlotIndexes); + case 2: + return reduceChannel(Piece.Offset, MI, + SIII->get(IsImm + ? 
AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM + : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR), + MRI, SIRI, SIII, SlotIndexes); + case 3: + if (FullMask == 0xff) + return false; + LLVM_FALLTHROUGH; + case 4: + return reduceChannel(Piece.Offset, MI, + SIII->get(IsImm + ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM + : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR), + MRI, SIRI, SIII, SlotIndexes); + case 5: + case 6: + case 7: + if (FullMask == 0xffff) + return false; + LLVM_FALLTHROUGH; + case 8: + return reduceChannel(Piece.Offset, MI, + SIII->get(IsImm + ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM + : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR), + MRI, SIRI, SIII, SlotIndexes); + } + + } break; + } + return false; +} + +// LoopInfo contains a mapping from basic block to the innermost loop. Find +// the outermost loop in the loop nest that contains BB. +const MachineLoop *getOutermostLoop(const MachineLoopInfo *LI, + const MachineBasicBlock *BB) { + const MachineLoop *L = LI->getLoopFor(BB); + if (L) { + while (const MachineLoop *Parent = L->getParentLoop()) + L = Parent; + } + return L; +} + +// True if there is a loop which contains both BB1 and BB2. +bool loopContainsBoth(const MachineLoopInfo *LI, const MachineBasicBlock *BB1, + const MachineBasicBlock *BB2) { + const MachineLoop *L1 = getOutermostLoop(LI, BB1); + const MachineLoop *L2 = getOutermostLoop(LI, BB2); + return L1 != nullptr && L1 == L2; +} + +bool reach_block(MachineBasicBlock *FromBB, MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, MachineLoopInfo *LI, + MachineBasicBlock *ToBB) { + if (FromBB == ToBB) { + return true; + } + + if (DT->dominates(FromBB, ToBB)) { + return true; + } + + if (PDT->dominates(ToBB, FromBB)) { + return true; + } + + if (loopContainsBoth(LI, ToBB, FromBB)) { + return true; + } + // TODO: cover case hotBB in loop, + // one block in that loop dom BB or + // BB post dom one block in that loop. + return false; +} + +// If BB can reach hotMBBs. +bool reach_blocks(MachineBasicBlock *BB, MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, MachineLoopInfo *LI, + DenseSet &HotMBBs) { + bool Cross = false; + for (MachineBasicBlock *HotBB : HotMBBs) { + if (reach_block(BB, DT, PDT, LI, HotBB)) { + Cross = true; + break; + } + } + return Cross; +} + +} // namespace llvm + +namespace llvm { +void viewCFGWithPhi(llvm::MachineFunction &F) { +#ifdef DBG + CFGWithPhi G(F); + ViewGraph(const_cast(&G), F.getName(), false, F.getName()); + G.dump(); +#endif +} +} // namespace llvm + +namespace llvm { +bool getNonDebugMBBEnd(MachineBasicBlock::reverse_iterator &BBEnd, + MachineBasicBlock &MBB) { + // R.End doesn't point to the boundary instruction. + // Skip Debug instr. + while (BBEnd != MBB.rend() && BBEnd->isDebugInstr()) + BBEnd++; + return BBEnd != MBB.rend(); +} +} // namespace llvm + +// Helper functions to Write jason. 
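The writers in this section emit JSON by hand through raw_ostream rather than going through llvm::json, composing small name/value helpers with named lambdas. A minimal usage sketch of that pattern follows; only json_pair and json_array are from the patch (they live in an anonymous namespace, so the sketch only makes sense inside this file), and the enclosing code and values are hypothetical:

// Sketch: produces {"kind":"example","ids":[1,2,3]} into Buf.
std::string Buf;
llvm::raw_string_ostream OS(Buf);
OS << "{";
auto WriteKind = [&OS]() { OS << "example"; }; // json_pair adds the quotes.
json_pair("kind", WriteKind, OS);
OS << ",";
auto WriteIds = [&OS]() { OS << "1,2,3"; };    // json_array adds the brackets.
json_array("ids", WriteIds, OS);
OS << "}";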
+namespace { +void json_name(StringRef Val, raw_ostream &OS) { OS << "\"" << Val << "\":"; } + +template +void json_pair(StringRef Val, write_fn &Fn, raw_ostream &OS) { + json_name(Val, OS); + OS << "\""; + Fn(); + OS << "\""; +} + +template +void json_obj_pair(StringRef Val, write_fn &Fn, raw_ostream &OS) { + json_name(Val, OS); + + Fn(); +} + +template +void json_array(StringRef Val, write_fn &Fn, raw_ostream &OS) { + json_name(Val, OS); + OS << "["; + Fn(); + OS << "]"; +} +} // namespace + +namespace llvm { +namespace pressure { + +void write_inst(MachineInstr &MI, const SlotIndexes *SlotIndexes, + const SIInstrInfo *SIII, raw_ostream &OS) { + OS << "{"; + SlotIndex Slot = SlotIndexes->getInstructionIndex(MI); + auto WriteSlot = [&Slot, &OS]() { Slot.print(OS); }; + + json_pair("slot_index", WriteSlot, OS); + + OS << ","; + + auto WriteOpcode = [&MI, &SIII, &OS]() { + OS << SIII->getName(MI.getOpcode()); + }; + + json_pair("opcode", WriteOpcode, OS); + + OS << ","; + + auto WriteAsm = [&MI, &SIII, &OS]() { + MI.print(OS, /*IsStandalone*/ true, /*SkipOpers*/ false, + /*SkipDebugLoc*/ true, /*AddNewLine*/ false, SIII); + }; + json_pair("asm", WriteAsm, OS); + + OS << "}"; +} + +void print_reg(Register Reg, const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, raw_ostream &OS) { + if (Reg.isVirtual()) { + StringRef Name = MRI.getVRegName(Reg); + if (Name != "") { + OS << '%' << Name; + } else { + OS << '%' << Reg.virtRegIndex(); + } + } else if (Reg < SIRI->getNumRegs()) { + OS << '$'; + printLowerCase(SIRI->getName(Reg), OS); + } else { + llvm_unreachable("invalid reg"); + } +} + +void write_reg(unsigned Reg, unsigned SubReg, const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, raw_ostream &OS) { + OS << "{"; + + auto WriteReg = [&MRI, &SIRI, &Reg, &OS]() { print_reg(Reg, MRI, SIRI, OS); }; + json_pair("reg", WriteReg, OS); + + OS << ","; + + auto WriteSubReg = [&SubReg, &OS]() { OS << SubReg; }; + + json_pair("sub_reg", WriteSubReg, OS); + + OS << ","; + auto WriteIsSgpr = [&Reg, &MRI, &SIRI, &OS]() { + if (SIRI->isSGPRReg(MRI, Reg)) + OS << "true"; + else + OS << "false"; + }; + json_obj_pair("is_sgpr", WriteIsSgpr, OS); + OS << "}"; +} + +unsigned get_reg_size(unsigned Reg, const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI) { + return SIRI->getRegClassForReg(MRI, Reg)->getLaneMask().getNumLanes(); +} + +void write_live(unsigned Reg, LaneBitmask Mask, const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, raw_ostream &OS) { + if (Mask.none()) { + unsigned Size = get_reg_size(Reg, MRI, SIRI); + Mask = LaneBitmask((1 << Size) - 1); + } + unsigned IntMask = Mask.getAsInteger(); + for (unsigned i = 0; i <= Mask.getHighestLane(); i++) { + if (IntMask & (1 << i)) { + write_reg(Reg, i, MRI, SIRI, OS); + OS << ",\n"; + } + } +} + +void write_dag_input_node(unsigned ID, unsigned Reg, unsigned Mask, + const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, raw_ostream &OS) { + OS << "{"; + auto WriteID = [&ID, &OS]() { OS << ID; }; + + json_pair("ID", WriteID, OS); + + OS << ","; + + auto WriteReg = [&Reg, &MRI, &SIRI, &OS]() { print_reg(Reg, MRI, SIRI, OS); }; + + json_pair("reg", WriteReg, OS); + + OS << ","; + + auto WriteMask = [&Mask, &OS]() { OS << Mask; }; + + json_pair("mask", WriteMask, OS); + + OS << "},\n"; +} + +void write_dag_inst_node(unsigned ID, SlotIndex Slot, + GCNRPTracker::LiveRegSet LiveReg, + const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, SUnit *SU, + raw_ostream &OS) { + OS << "{"; + auto WriteID = [&ID, &OS]() { OS 
<< ID; }; + + json_pair("ID", WriteID, OS); + + OS << ","; + + auto WriteSlot = [&Slot, &OS]() { Slot.print(OS); }; + + json_pair("slot_index", WriteSlot, OS); + + OS << ","; + + auto WriteRegs = [&LiveReg, &MRI, &SIRI, &OS]() { + for (auto It : LiveReg) { + unsigned Reg = It.first; + LaneBitmask Mask = It.second; + write_live(Reg, Mask, MRI, SIRI, OS); + } + }; + json_array("regs", WriteRegs, OS); + + OS << ","; + + auto WritePreds = [&SU, &OS]() { + for (auto &Pred : SU->Preds) { + + OS << Pred.getSUnit()->NodeNum << ","; + } + }; + + json_array("preds", WritePreds, OS); + + OS << "},\n"; +} + +void write_block(MachineBasicBlock &Blk, LiveIntervals *LIS, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, raw_ostream &OS) { + OS << "{\n"; + auto WriteName = [&Blk, &OS]() { OS << Blk.getName(); }; + json_pair("name", WriteName, OS); + + OS << ","; + + auto WriteIndex = [&Blk, &OS]() { OS << Blk.getNumber(); }; + json_pair("id", WriteIndex, OS); + + OS << ","; + + const SlotIndexes *SlotIndexes = LIS->getSlotIndexes(); + + SlotIndex BeginSlot = SlotIndexes->getMBBStartIdx(&Blk); + auto WriteSlot = [&BeginSlot, &OS]() { BeginSlot.print(OS); }; + json_pair("begin_slot", WriteSlot, OS); + + OS << ","; + + SlotIndex EndSlot = SlotIndexes->getMBBEndIdx(&Blk); + auto WriteEndSlot = [&EndSlot, &OS]() { EndSlot.print(OS); }; + json_pair("end_slot", WriteEndSlot, OS); + + OS << ","; + + auto WriteInsts = [&Blk, &SlotIndexes, &SIII, &OS]() { + for (MachineInstr &MI : Blk) { + if (MI.isDebugInstr()) + continue; + write_inst(MI, SlotIndexes, SIII, OS); + OS << ",\n"; + } + }; + + json_array("instructions", WriteInsts, OS); + + OS << ","; + + BlockExpDag Dag(&Blk, LIS, MRI, SIRI, SIII); + Dag.buildWithPressure(); + + const auto StartLiveReg = llvm::getLiveRegs(BeginSlot, *Dag.LIS, Dag.MRI); + auto WriteInputs = [&StartLiveReg, &Dag, &OS]() { + for (auto It : StartLiveReg) { + unsigned Reg = It.first; + LaneBitmask Mask = It.second; + SUnit *SU = Dag.InputSUnitMap[Reg]; + // Write Reg and mask to the nodes. + write_dag_input_node(SU->NodeNum, Reg, Mask.getAsInteger(), Dag.MRI, + Dag.SIRI, OS); + } + }; + + json_array("input_nodes", WriteInputs, OS); + + OS << ","; + + auto WriteNodes = [&SlotIndexes, &Dag, &OS]() { + for (auto It : Dag.MISUnitMap) { + MachineInstr *MI = It.first; + SUnit *SU = It.second; + // Use SlotIndex of MI. + SlotIndex SlotIndex; + if (!MI->isDebugInstr()) + SlotIndex = SlotIndexes->getInstructionIndex(*MI); + GCNRPTracker::LiveRegSet LiveReg = Dag.DagPressureMap[SU]; + // Write slot, live to the nodes. 
+ write_dag_inst_node(SU->NodeNum, SlotIndex, LiveReg, Dag.MRI, Dag.SIRI, + SU, OS); + } + }; + + json_array("inst_nodes", WriteNodes, OS); + + OS << ","; + + auto WritePreds = [&Blk, &OS]() { + for (MachineBasicBlock *Pred : Blk.predecessors()) { + OS << Pred->getNumber() << ","; + } + }; + + json_array("preds", WritePreds, OS); + + OS << ","; + + auto WriteSuccs = [&Blk, &OS]() { + for (MachineBasicBlock *Succ : Blk.successors()) { + OS << Succ->getNumber() << ","; + } + }; + + json_array("succs", WriteSuccs, OS); + + OS << "}"; +} + +void write_define(SlotIndex &Slot, unsigned Reg, unsigned SubReg, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + raw_ostream &OS) { + OS << "{"; + auto WriteSlot = [&Slot, &OS]() { Slot.print(OS); }; + + json_pair("slot_index", WriteSlot, OS); + + OS << ","; + + auto WriteReg = [&MRI, &SIRI, &Reg, &SubReg, &OS]() { + write_reg(Reg, SubReg, MRI, SIRI, OS); + }; + json_obj_pair("reg", WriteReg, OS); + + OS << "}\n"; + + OS << ","; +} + +void write_define(MachineOperand &MO, const SlotIndexes *SlotIndexes, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + raw_ostream &OS) { + // Split subReg? MO.getSubReg(); + Register Reg = MO.getReg(); + unsigned SubReg = MO.getSubReg(); + MachineInstr *MI = MO.getParent(); + SlotIndex Slot = SlotIndexes->getInstructionIndex(*MI); + if (SubReg == 0) { + unsigned Size = get_reg_size(Reg, MRI, SIRI); + for (unsigned i = 0; i < Size; i++) { + write_define(Slot, Reg, i, MRI, SIRI, OS); + } + } else { + switch (SubReg) { + default: + assert(0 && "SubReg not supported yet."); + write_define(Slot, Reg, SubReg, MRI, SIRI, OS); + break; + case AMDGPU::sub0: + write_define(Slot, Reg, 0, MRI, SIRI, OS); + break; + case AMDGPU::sub1: + write_define(Slot, Reg, 1, MRI, SIRI, OS); + break; + case AMDGPU::sub2: + write_define(Slot, Reg, 2, MRI, SIRI, OS); + break; + case AMDGPU::sub3: + write_define(Slot, Reg, 3, MRI, SIRI, OS); + break; + case AMDGPU::sub4: + write_define(Slot, Reg, 4, MRI, SIRI, OS); + break; + case AMDGPU::sub5: + write_define(Slot, Reg, 5, MRI, SIRI, OS); + break; + case AMDGPU::sub6: + write_define(Slot, Reg, 6, MRI, SIRI, OS); + break; + case AMDGPU::sub7: + write_define(Slot, Reg, 7, MRI, SIRI, OS); + break; + case AMDGPU::sub8: + write_define(Slot, Reg, 8, MRI, SIRI, OS); + break; + case AMDGPU::sub9: + write_define(Slot, Reg, 9, MRI, SIRI, OS); + break; + case AMDGPU::sub10: + write_define(Slot, Reg, 10, MRI, SIRI, OS); + break; + case AMDGPU::sub11: + write_define(Slot, Reg, 11, MRI, SIRI, OS); + break; + case AMDGPU::sub12: + write_define(Slot, Reg, 12, MRI, SIRI, OS); + break; + case AMDGPU::sub13: + write_define(Slot, Reg, 13, MRI, SIRI, OS); + break; + case AMDGPU::sub14: + write_define(Slot, Reg, 14, MRI, SIRI, OS); + break; + case AMDGPU::sub15: + write_define(Slot, Reg, 15, MRI, SIRI, OS); + break; + case AMDGPU::sub0_sub1: + write_define(Slot, Reg, 0, MRI, SIRI, OS); + write_define(Slot, Reg, 1, MRI, SIRI, OS); + break; + case AMDGPU::sub2_sub3: + write_define(Slot, Reg, 2, MRI, SIRI, OS); + write_define(Slot, Reg, 3, MRI, SIRI, OS); + break; + case AMDGPU::sub4_sub5: + write_define(Slot, Reg, 4, MRI, SIRI, OS); + write_define(Slot, Reg, 5, MRI, SIRI, OS); + break; + case AMDGPU::sub1_sub2: + write_define(Slot, Reg, 1, MRI, SIRI, OS); + write_define(Slot, Reg, 2, MRI, SIRI, OS); + break; + case AMDGPU::sub0_sub1_sub2: + write_define(Slot, Reg, 0, MRI, SIRI, OS); + write_define(Slot, Reg, 1, MRI, SIRI, OS); + write_define(Slot, Reg, 2, MRI, SIRI, OS); + break; + case 
AMDGPU::sub0_sub1_sub2_sub3: + write_define(Slot, Reg, 0, MRI, SIRI, OS); + write_define(Slot, Reg, 1, MRI, SIRI, OS); + write_define(Slot, Reg, 2, MRI, SIRI, OS); + write_define(Slot, Reg, 3, MRI, SIRI, OS); + break; + case AMDGPU::sub2_sub3_sub4_sub5: + write_define(Slot, Reg, 2, MRI, SIRI, OS); + write_define(Slot, Reg, 3, MRI, SIRI, OS); + write_define(Slot, Reg, 4, MRI, SIRI, OS); + write_define(Slot, Reg, 5, MRI, SIRI, OS); + break; + case AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7: + write_define(Slot, Reg, 0, MRI, SIRI, OS); + write_define(Slot, Reg, 1, MRI, SIRI, OS); + write_define(Slot, Reg, 2, MRI, SIRI, OS); + write_define(Slot, Reg, 3, MRI, SIRI, OS); + write_define(Slot, Reg, 4, MRI, SIRI, OS); + write_define(Slot, Reg, 5, MRI, SIRI, OS); + write_define(Slot, Reg, 6, MRI, SIRI, OS); + write_define(Slot, Reg, 7, MRI, SIRI, OS); + break; + } + } +} + +void write_defines(MachineFunction &MF, const SlotIndexes *SlotIndexes, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + raw_ostream &OS) { + + for (unsigned i = 0; i < MRI.getNumVirtRegs(); i++) { + auto Reg = Register::index2VirtReg(i); + + for (MachineOperand &MO : MRI.def_operands(Reg)) { + write_define(MO, SlotIndexes, MRI, SIRI, OS); + } + } +} + +void write_uses(MachineFunction &MF, const SlotIndexes *SlotIndexes, + + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + raw_ostream &OS) { + + for (unsigned i = 0; i < MRI.getNumVirtRegs(); i++) { + auto Reg = Register::index2VirtReg(i); + + for (MachineOperand &MO : MRI.use_nodbg_operands(Reg)) { + // TODO: create write_use if use has more info. + write_define(MO, SlotIndexes, MRI, SIRI, OS); + } + } +} + +void write_liveness(SlotIndex Slot, GCNRPTracker::LiveRegSet &LiveSet, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + raw_ostream &OS) { + OS << "{"; + auto WriteSlot = [&Slot, &OS]() { Slot.print(OS); }; + + json_pair("slot_index", WriteSlot, OS); + + OS << ","; + + auto WriteRegs = [&LiveSet, &MRI, &SIRI, &OS]() { + for (auto it : LiveSet) { + unsigned Reg = it.first; + LaneBitmask Mask = it.second; + write_live(Reg, Mask, MRI, SIRI, OS); + } + }; + json_array("regs", WriteRegs, OS); + OS << "\n},\n"; +} + +void write_segment(const LiveInterval::Segment &S, raw_ostream &OS) { + OS << "{"; + auto WriteBegin = [&S, &OS]() { S.start.print(OS); }; + + json_pair("begin", WriteBegin, OS); + + OS << ","; + + auto WriteEnd = [&S, &OS]() { S.end.print(OS); }; + + json_pair("end", WriteEnd, OS); + + OS << ","; + + auto WriteValNum = [&S, &OS]() { + if (S.valno) + OS << S.valno->id; + else + OS << 0xFFFFFFFF; + }; + + json_pair("val_num", WriteValNum, OS); + + OS << "},\n"; +} + +void write_subrange(const LiveInterval::SubRange &SR, raw_ostream &OS) { + OS << "{\n"; + auto WriteMask = [&SR, &OS]() { OS << SR.LaneMask.getAsInteger(); }; + + json_pair("mask", WriteMask, OS); + + OS << ","; + + // Segments. 
+ auto WriteSegments = [&SR, &OS]() { + for (auto &S : SR.segments) { + write_segment(S, OS); + } + }; + + json_array("segments", WriteSegments, OS); + + OS << "\n},\n"; +} + +void write_live_interval(LiveInterval &LI, const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, raw_ostream &OS) { + OS << "{\n"; + + auto WriteReg = [&LI, &MRI, &SIRI, &OS]() { + write_reg(LI.reg(), 0, MRI, SIRI, OS); + }; + + json_obj_pair("reg", WriteReg, OS); + + OS << ","; + + auto WriteSegments = [&LI, &OS]() { + for (auto &S : LI.segments) { + write_segment(S, OS); + } + }; + + json_array("segments", WriteSegments, OS); + + OS << ","; + + auto WriteSubRanges = [&LI, &OS]() { + for (auto &SR : LI.subranges()) { + write_subrange(SR, OS); + } + }; + + json_array("subranges", WriteSubRanges, OS); + + OS << "},\n"; +} + +std::string get_legal_str(const MDString *MDStr) { + std::string Str; + raw_string_ostream Stream(Str); + MDStr->print(Stream); + Stream.flush(); + // Remove !. + Str = Str.substr(1); + // Remove "" + Str = Str.substr(1); + Str.pop_back(); + std::replace(Str.begin(), Str.end(), '\\', '#'); + return Str; +} + +void write_file(const MDNode *FileNode, raw_ostream &OS) { + const MDString *FileName = cast(FileNode->getOperand(0).get()); + StringRef FileNameStr = FileName->getString(); + if (FileNameStr.find("__AMDGPU_GPUMAP_") == 0) + return; + if (FileNameStr.find("__AMDGPU_DWARF_") == 0) + return; + + OS << "{"; + + std::string Str0 = get_legal_str(FileName); + auto WriteName = [&Str0, &OS]() { OS << Str0; }; + json_pair("filename", WriteName, OS); + + OS << ",\n"; + + const MDString *Content = cast(FileNode->getOperand(1).get()); + std::string Str = get_legal_str(Content); + auto WriteContent = [&Str, &OS]() { OS << Str; }; + json_pair("content", WriteContent, OS); + OS << "\n},\n"; +} + +void write_DIFile(const DIFile *File, raw_ostream &OS) { + if (File) { + std::string Name = get_legal_str(File->getRawFilename()); + std::string Dir = ""; + if (MDString *MDDir = File->getRawDirectory()) + Dir = get_legal_str(MDDir); + OS << Dir << Name; + } else { + OS << "ArtificialFile"; + } +} + +void write_line_mapping(SlotIndex Slot, DebugLoc DL, raw_ostream &OS) { + OS << "{"; + + auto WriteSlot = [&Slot, &OS]() { Slot.print(OS); }; + + json_pair("slot_index", WriteSlot, OS); + + OS << ",\n"; + + MDNode *Scope = DL.getScope(); + unsigned Line = DL.getLine(); + unsigned Col = DL.getCol(); + + auto WriteLine = [&Line, &OS]() { OS << Line; }; + json_pair("line", WriteLine, OS); + + OS << ",\n"; + + auto WriteCol = [&Col, &OS]() { OS << Col; }; + json_pair("col", WriteCol, OS); + + OS << ",\n"; + + auto WriteFile = [&Scope, &OS]() { + const DIFile *File = cast(Scope)->getFile(); + write_DIFile(File, OS); + }; + json_pair("file", WriteFile, OS); + + if (DILocation *InlineDL = DL.getInlinedAt()) { + OS << ",\n"; + unsigned InlineLine = InlineDL->getLine(); + auto WriteLine = [&InlineLine, &OS]() { OS << InlineLine; }; + json_pair("inline_line", WriteLine, OS); + + OS << ",\n"; + + unsigned InlineCol = InlineDL->getColumn(); + auto WriteCol = [&InlineCol, &OS]() { OS << InlineCol; }; + json_pair("inline_col", WriteCol, OS); + + OS << ",\n"; + + const MDNode *InlineScope = DL.getInlinedAtScope(); + auto WriteFile = [&InlineScope, &OS]() { + const DIFile *File = cast(InlineScope)->getFile(); + write_DIFile(File, OS); + }; + json_pair("inline_file", WriteFile, OS); + } + + OS << "\n},\n"; +} + +void write_dbg_val(unsigned Reg, const DIVariable *V, const DIExpression *Exp, + const MachineRegisterInfo &MRI, 
const SIRegisterInfo *SIRI, + raw_ostream &OS) { + OS << "{"; + + auto WriteReg = [&MRI, &SIRI, &Reg, &OS]() { + const unsigned SubReg = 0; + write_reg(Reg, SubReg, MRI, SIRI, OS); + }; + json_obj_pair("reg", WriteReg, OS); + + OS << ",\n"; + + if (V) { + auto WriteName = [&V, &OS]() { OS << V->getName(); }; + json_pair("debug_val_name", WriteName, OS); + OS << ",\n"; + + auto WriteFile = [&V, &OS]() { + const DIFile *File = V->getFile(); + write_DIFile(File, OS); + }; + json_pair("debug_val_file", WriteFile, OS); + OS << ",\n"; + + auto WriteLine = [&V, &OS]() { OS << V->getLine(); }; + json_pair("debug_val_line", WriteLine, OS); + } + + if (Exp->isValid() && Exp->getNumElements()) { + OS << ",\n"; + auto WriteV = [&Exp, &OS]() { + OS << '['; + bool NeedSep = false; + for (auto Op : Exp->expr_ops()) { + if (NeedSep) + OS << ", "; + else + NeedSep = true; + OS << dwarf::OperationEncodingString(Op.getOp()); + for (unsigned I = 0; I < Op.getNumArgs(); ++I) + OS << ' ' << Op.getArg(I); + } + OS << "] "; + }; + json_pair("debug_exp", WriteV, OS); + } + OS << "\n},\n"; +} + +void write_dbg_info(MachineFunction &MF, LiveIntervals *LIS, + const MachineRegisterInfo &MRI, const SIInstrInfo *SIII, + const SIRegisterInfo *SIRI, const SlotIndexes *SlotIndexes, + const NamedMDNode *SourceMD, raw_ostream &OS) { + OS << ",\n"; + + auto WriteFiles = [&SourceMD, &OS]() { + for (const MDNode *FileNode : SourceMD->operands()) { + write_file(FileNode, OS); + } + }; + + json_array("files", WriteFiles, OS); + + OS << ",\n"; + + auto WriteLineMapping = [&MF, &SlotIndexes, &OS]() { + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (MI.isDebugInstr()) { + continue; + } + const DebugLoc DL = MI.getDebugLoc(); + if (!DL) + continue; + SlotIndex Slot = SlotIndexes->getInstructionIndex(MI); + write_line_mapping(Slot, DL, OS); + } + } + }; + + json_array("line_mapping", WriteLineMapping, OS); + + OS << ",\n"; + + auto WriteDebugVals = [&MF, &MRI, &SIRI, &OS]() { + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (!MI.isDebugValue()) + continue; + + MachineOperand &Reg = MI.getOperand(0); + if (!Reg.isReg()) + continue; + + if (Reg.getReg() == 0) + continue; + + const DIVariable *V = MI.getDebugVariable(); + const DIExpression *Exp = MI.getDebugExpression(); + write_dbg_val(Reg.getReg(), V, Exp, MRI, SIRI, OS); + } + } + }; + + json_array("debug_vals", WriteDebugVals, OS); +} + +void write_function(MachineFunction &MF, LiveIntervals *LIS, + const MachineRegisterInfo &MRI, const SIInstrInfo *SIII, + const SIRegisterInfo *SIRI, raw_ostream &OS) { + const SlotIndexes *SlotIndexes = LIS->getSlotIndexes(); + + OS << "{\n"; + auto WriteName = [&MF, &OS]() { OS << MF.getName(); }; + json_pair("name", WriteName, OS); + + OS << ",\n"; + + auto WriteBlocks = [&MF, &LIS, &MRI, &SIRI, &SIII, &OS]() { + for (MachineBasicBlock &MBB : MF) { + write_block(MBB, LIS, MRI, SIRI, SIII, OS); + OS << ",\n"; + } + }; + + json_array("blocks", WriteBlocks, OS); + + OS << ",\n"; + + auto WriteDefines = [&MF, &SlotIndexes, &MRI, &SIRI, &OS]() { + write_defines(MF, SlotIndexes, MRI, SIRI, OS); + }; + + json_array("defines", WriteDefines, OS); + + OS << ",\n"; + + auto WriteUses = [&MF, &SlotIndexes, &MRI, &SIRI, &OS]() { + write_uses(MF, SlotIndexes, MRI, SIRI, OS); + }; + + json_array("uses", WriteUses, OS); + + OS << ",\n"; + + auto WriteLiveness = [&MF, &LIS, &MRI, &SIRI, &OS]() { + for (MachineBasicBlock &MBB : MF) + for (MachineInstr &MI : MBB) { + if (MI.isDebugInstr()) + continue; + const 
SlotIndex &SI = LIS->getInstructionIndex(MI).getBaseIndex(); + GCNRPTracker::LiveRegSet LISLR = llvm::getLiveRegs(SI, *LIS, MRI); + write_liveness(SI, LISLR, MRI, SIRI, OS); + } + }; + + json_array("liveness", WriteLiveness, OS); + + OS << ",\n"; + + auto WriteLiveIntervals = [&MRI, &SIRI, &LIS, &OS]() { + for (unsigned i = 0; i < MRI.getNumVirtRegs(); i++) { + auto Reg = Register::index2VirtReg(i); + if (!LIS->hasInterval(Reg)) + continue; + auto &LI = LIS->getInterval(Reg); + write_live_interval(LI, MRI, SIRI, OS); + } + }; + + json_array("live_intervals", WriteLiveIntervals, OS); + + // Check debug info. + const Function &F = MF.getFunction(); + const Module *M = F.getParent(); + const NamedMDNode *SourceMD = M->getNamedMetadata("dx.source.contents"); + if (SourceMD) { + write_dbg_info(MF, LIS, MRI, SIII, SIRI, SlotIndexes, SourceMD, OS); + } + + OS << "\n}"; +} + +void write_pressure(MachineFunction &MF, LiveIntervals *LIS, + const char *Filename) { + int FD = -1; + SmallString<128> TmpFilename(Filename); + std::error_code EC = sys::fs::createUniqueFile(TmpFilename, FD, TmpFilename); + if (EC) { + errs() << "Error: " << EC.message() << "\n"; + return; + } + + raw_fd_ostream O(FD, /*shouldClose=*/true); + + const GCNSubtarget *ST = &MF.getSubtarget(); + const auto *SIII = ST->getInstrInfo(); + const auto *SIRI = ST->getRegisterInfo(); + auto &MRI = MF.getRegInfo(); + write_function(MF, LIS, MRI, SIII, SIRI, O); + O.flush(); + O.close(); +} + +void write_pressure(MachineFunction &MF, LiveIntervals *LIS, raw_ostream &OS) { + const GCNSubtarget *ST = &MF.getSubtarget(); + const auto *SIII = ST->getInstrInfo(); + const auto *SIRI = ST->getRegisterInfo(); + auto &MRI = MF.getRegInfo(); + write_function(MF, LIS, MRI, SIII, SIRI, OS); + OS.flush(); +} + +} // namespace pressure +} // namespace llvm + +namespace { +class ContributionList { +public: + ContributionList(MachineFunction &MF) : MF(MF) {}; + void build(); + bool propagateContribution(); + MachineFunction &MF; + DenseMap MIIndexMap; + // Set of inst which contribute to build the key MachineInstr. + DenseMap> MIContributorMap; + // Set of inst which been contributed by the key MachineInstr. 
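  // Illustrative example of the two maps (made-up MIR, not from a real run):
  // given
  //   %a:vgpr_32 = V_MUL_F32_e32 %x, %y, implicit $exec
  //   %b:vgpr_32 = V_ADD_F32_e32 %a, %z, implicit $exec
  // the V_MUL is a contributor of the V_ADD, so MIContributorMap[V_ADD]
  // contains the V_MUL and MIContributedToMap[V_MUL] contains the V_ADD.
  // After propagation the relation is transitive, so the defs of %x and %y
  // also end up in MIContributorMap[V_ADD].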
+ DenseMap> MIContributedToMap; + void writeInst(MachineInstr &MI, const SIInstrInfo *SIII, raw_ostream &OS); + void writeBlock(MachineBasicBlock &MBB, const SIInstrInfo *SIII, + raw_ostream &OS); + void write(raw_ostream &OS); +}; + +void buildMIContribution(MachineInstr &MI, + DenseSet &ContributorSet, + DenseSet &ContributedSet, + MachineRegisterInfo &MRI) { + for (MachineOperand &UseMO : MI.uses()) { + if (!UseMO.isReg()) + continue; + Register Reg = UseMO.getReg(); + if (Reg.isPhysical()) + continue; + if (UseMO.isImplicit()) { + // if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO || + // Reg == AMDGPU::SCC) + continue; + } + for (MachineInstr &DefMI : MRI.def_instructions(Reg)) { + ContributorSet.insert(&DefMI); + } + } + + for (MachineOperand &DstMO : MI.defs()) { + if (!DstMO.isReg()) + continue; + if (DstMO.isImplicit()) + continue; + Register Reg = DstMO.getReg(); + if (Reg.isPhysical()) + continue; + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + ContributedSet.insert(&UseMI); + } + } +} + +bool ContributionList::propagateContribution() { + bool IsUpdated = false; + ReversePostOrderTraversal RPOT(&MF); + for (auto *MBB : RPOT) { + for (auto &MI : *MBB) { + auto &Contributors = MIContributorMap[&MI]; + unsigned Size = Contributors.size(); + DenseSet ParentContributors; + for (auto *CMI : Contributors) { + auto &Contributors = MIContributorMap[CMI]; + ParentContributors.insert(Contributors.begin(), Contributors.end()); + } + Contributors.insert(ParentContributors.begin(), ParentContributors.end()); + IsUpdated |= Size < Contributors.size(); + } + } + return IsUpdated; +} + +void ContributionList::build() { + // Build contribution. + auto &MRI = MF.getRegInfo(); + for (auto &MBB : MF) { + for (auto &MI : MBB) { + auto &Contributors = MIContributorMap[&MI]; + auto &Contributed = MIContributedToMap[&MI]; + buildMIContribution(MI, Contributors, Contributed, MRI); + } + } + // propagate contribution. 
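  // Sketch of the fixed point computed below: repeatedly walk the function in
  // reverse post order and fold each contributor's own contributor set into
  // the instruction's set,
  //   MIContributorMap[MI] |= MIContributorMap[C]  for every C already in it,
  // until no set grows. Sets only ever gain elements, so the loop terminates
  // once the transitive closure is reached.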
+ bool IsUpdated = true; + while (IsUpdated) { + IsUpdated = propagateContribution(); + } +} + +void ContributionList::writeInst(MachineInstr &MI, const SIInstrInfo *SIII, + raw_ostream &OS) { + OS << "\n{\n"; + unsigned ID = MIIndexMap[&MI]; + auto WriteSlot = [&ID, &OS]() { OS << ID; }; + + json_pair("ID", WriteSlot, OS); + + OS << ","; + + auto WriteAsm = [&MI, &SIII, &OS]() { + MI.print(OS, /*IsStandalone*/ true, /*SkipOpers*/ false, + /*SkipDebugLoc*/ true, /*AddNewLine*/ false, SIII); + }; + json_pair("asm", WriteAsm, OS); + + OS << ",\n"; + + auto &Contributors = MIContributorMap[&MI]; + auto WriteContributor = [&Contributors, this, &OS]() { + for (auto *MI : Contributors) { + unsigned ID = MIIndexMap[MI]; + OS << ID << ","; + } + }; + + json_array("contributors", WriteContributor, OS); + OS << ",\n"; + + auto &Contributeds = MIContributedToMap[&MI]; + auto WriteContributed = [&Contributeds, this, &OS]() { + for (auto *MI : Contributeds) { + unsigned ID = MIIndexMap[MI]; + OS << ID << ","; + } + }; + + json_array("contributed", WriteContributed, OS); + OS << "\n}\n"; +} + +void ContributionList::writeBlock(MachineBasicBlock &MBB, + const SIInstrInfo *SIII, raw_ostream &OS) { + OS << "{\n"; + auto WriteName = [&MBB, &OS]() { OS << MBB.getName(); }; + json_pair("name", WriteName, OS); + + OS << ","; + + auto WriteIndex = [&MBB, &OS]() { OS << MBB.getNumber(); }; + json_pair("id", WriteIndex, OS); + + OS << ",\n"; + + auto WriteInsts = [this, &MBB, &SIII, &OS]() { + for (MachineInstr &MI : MBB) { + if (MI.isDebugInstr()) + continue; + writeInst(MI, SIII, OS); + OS << ",\n"; + } + }; + + json_array("instructions", WriteInsts, OS); + + OS << ",\n"; + + auto WritePreds = [&MBB, &OS]() { + for (MachineBasicBlock *Pred : MBB.predecessors()) { + OS << Pred->getNumber() << ","; + } + }; + + json_array("preds", WritePreds, OS); + + OS << ","; + + auto WriteSuccs = [&MBB, &OS]() { + for (MachineBasicBlock *Succ : MBB.successors()) { + OS << Succ->getNumber() << ","; + } + }; + + json_array("succs", WriteSuccs, OS); + + OS << "}"; +} + +void ContributionList::write(raw_ostream &OS) { + unsigned ID = 0; + // Build ID for Write. + ReversePostOrderTraversal RPOT(&MF); + for (auto *MBB : RPOT) { + for (auto &MI : *MBB) { + MIIndexMap[&MI] = ID++; + } + } + + const GCNSubtarget *ST = &MF.getSubtarget(); + const auto *SIII = ST->getInstrInfo(); + + OS << "{\n"; + auto WriteName = [this, &OS]() { OS << MF.getName(); }; + json_pair("name", WriteName, OS); + + OS << ",\n"; + + auto WriteBlocks = [this, &SIII, &RPOT, &OS]() { + for (auto *MBB : RPOT) { + writeBlock(*MBB, SIII, OS); + OS << ",\n"; + } + }; + + json_array("blocks", WriteBlocks, OS); + + OS << "\n}"; +} +} // namespace + +namespace llvm { + +void write_contribution_list(llvm::MachineFunction &MF, const char *Filename) { + int FD = -1; + SmallString<128> TmpFilename(Filename); + std::error_code EC = sys::fs::createUniqueFile(TmpFilename, FD, TmpFilename); + if (EC) { + errs() << "Error: " << EC.message() << "\n"; + return; + } + + raw_fd_ostream O(FD, /*shouldClose=*/true); + ContributionList CL(MF); + CL.build(); + + CL.write(O); + + O.flush(); + O.close(); +} +} // namespace llvm + +static bool isPhysReg(const MachineOperand &Op) { + return Op.isReg() && Op.getReg().isPhysical(); +} + +// Sometimes split bb uses physical registers defined in BB, have to add them to +// live-in or the ir is malformed. 
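// For example (illustrative MIR, not taken from a real trace): if a block is
// split right after
//   $vcc = V_CMP_EQ_U32_e64 ...
// and the new successor block begins with
//   %1:vgpr_32 = V_CNDMASK_B32_e32 ..., implicit $vcc
// then $vcc is read in the new block without a local def, so it has to be
// added to that block's live-in list or the MIR verifier will reject it.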
+void llvm::updatePhysRegLiveInForBlock(MachineBasicBlock *NewBB, + const MachineRegisterInfo *MRI) { + // Initialize with current set of liveins. For new blocks this will be empty. + SmallDenseSet DefSet; + for (const MachineBasicBlock::RegisterMaskPair &P : NewBB->liveins()) { + DefSet.insert(P.PhysReg); + } + + for (auto &MI : *NewBB) { + // Add all undefined physical registers to the live in set. + for (MachineOperand &Use : MI.operands()) { + // Only process physreg uses. + if (!isPhysReg(Use) || !Use.isUse()) + continue; + + // Reserved regs do not need to be tracked through live-in sets. + Register Reg = Use.getReg(); + if (Use.isImplicit() && MRI && MRI->isReserved(Reg)) + continue; + + if (!DefSet.count(Reg)) + NewBB->addLiveIn(Reg); + } + + // Add all physical register defs (exlicit+implicit) to the def register + // set. + for (MachineOperand &Def : MI.operands()) { + // Only process physreg defs. + if (!isPhysReg(Def) || !Def.isDef()) + continue; + DefSet.insert(Def.getReg()); + } + } +} + +void llvm::buildPhysRegLiveInForBlock(MachineBasicBlock *NewBB, + SmallDenseSet &LiveOutSet, + const MachineRegisterInfo *MRI) { + for (auto RIt = NewBB->rbegin(); RIt != NewBB->rend(); RIt++) { + auto &MI = *RIt; + // Add all physical register defs (exlicit+implicit) to the def register + // set. + for (MachineOperand &Def : MI.operands()) { + // Only process physreg defs. + if (!isPhysReg(Def) || !Def.isDef()) + continue; + LiveOutSet.erase(Def.getReg()); + } + // Add all undefined physical registers to the live in set. + for (MachineOperand &Use : MI.operands()) { + // Only process physreg uses. + if (!isPhysReg(Use) || !Use.isUse()) + continue; + + // Reserved regs do not need to be tracked through live-in sets. + Register Reg = Use.getReg(); + if (Use.isImplicit() && MRI && MRI->isReserved(Reg)) + continue; + + if (!LiveOutSet.count(Reg)) + LiveOutSet.insert(Reg); + } + } + for (unsigned Reg : LiveOutSet) { + NewBB->addLiveIn(Reg); + } +} + +MachineReg llvm::createVirtualRegForOperand(MachineOpcode Opcode, + unsigned OpNum, + MachineFunction &MF) { + const TargetSubtargetInfo &ST = MF.getSubtarget(); + const TargetRegisterInfo *TRI = ST.getRegisterInfo(); + const TargetInstrInfo *TII = ST.getInstrInfo(); + const MCInstrDesc &Desc = TII->get(Opcode); + const TargetRegisterClass *RC = TII->getRegClass(Desc, OpNum, TRI, MF); + if (!RC) { + llvm::report_fatal_error( + "Unable to create virtual reg for instruction operand"); + } + + MachineRegisterInfo &MRI = MF.getRegInfo(); + return MRI.createVirtualRegister(RC); +} + +MachineReg llvm::createVirtualDstReg(MachineOpcode Opcode, + MachineFunction &MF) { + return llvm::createVirtualRegForOperand(Opcode, 0, MF); +} + +// Return true if the MI is a copy of exec. +// If true then sets pDst to the destination register. 
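// A minimal usage sketch (hypothetical caller, not part of this patch):
//   MachineReg SavedExec = 0;
//   if (llvm::isExecCopy(MI, AMDGPU::EXEC, &SavedExec)) {
//     // MI looked like "%saved:sreg_64 = S_MOV_B64 $exec", or a COPY of the
//     // exec register that was passed in; SavedExec now holds the
//     // destination register of that copy.
//   }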
+bool llvm::isExecCopy(const MachineInstr &MI, MachineReg Exec, + MachineReg *OutDst) { + enum { DST = 0, SRC = 1 }; + bool FoundCopy = false; + if (MI.getOpcode() == AMDGPU::COPY || MI.getOpcode() == AMDGPU::S_MOV_B32 || + MI.getOpcode() == AMDGPU::S_MOV_B64) { + const MachineOperand &Src = MI.getOperand(SRC); + if (Src.isReg() && Src.getReg() == Exec) { + FoundCopy = true; + } + } + if (FoundCopy) { + *OutDst = MI.getOperand(DST).getReg(); + } + + return FoundCopy; +} + +bool llvm::isSccLiveAt(llvm::MachineBasicBlock *MBB, + llvm::MachineBasicBlock::iterator MI) { + const TargetRegisterInfo *TRI = + MBB->getParent()->getRegInfo().getTargetRegisterInfo(); + for (auto It = MI; It != MBB->end(); ++It) { + const MachineInstr &CurMI = *It; + // Hit use of scc, it is live. + if (CurMI.readsRegister(AMDGPU::SCC, TRI)) + return true; + // Hit def of scc first, not live. + if (CurMI.definesRegister(AMDGPU::SCC, TRI)) + return false; + } + // Reach the end of MBB, check live-ins of MBB successors. + for (const MachineBasicBlock *Succ : MBB->successors()) { + if (Succ->isLiveIn(AMDGPU::SCC)) + return true; + } + return false; +} + +// +// This function is useful for when we need to insert a new +// instruction that defines scc in a block and we need to find +// a location that will not smash the existing value. +// +// Starting at `BeforeInst` it will look backwards to try to find +// a place in the block where scc is dead so we can insert our new +// def there. If no location can be found it will save and restore +// scc around BeforeInst. This way BeforeInst can safely be used +// as the new insert location. +// +MachineBasicBlock::iterator llvm::findOrCreateInsertionPointForSccDef( + MachineBasicBlock *MBB, MachineBasicBlock::iterator MI, + const TargetRegisterInfo *TRI, const SIInstrInfo *TII, + MachineRegisterInfo *MRI, SccDefInsertPointConstraintFlags Constraints) { + // If SCC is dead at MI when we can use MI as the insert point. + if (!llvm::isSccLiveAt(MBB, MI)) { + return MI; + } + + const bool CheckForExecWrite = + Constraints & SccDefInsertPointConstraintFlags::NoExecWrite; + + // Get the starting reverse iterator taking care to handle the MBB->end() + // case. + MachineBasicBlock::reverse_iterator Start; + if (MI == MBB->end()) { + Start = MBB->rbegin(); + } else { + Start = MI.getReverse(); + } + + // Otherwise, walk backwards through the block looking for a location where + // SCC is dead. + for (MachineBasicBlock::reverse_iterator It = Start, End = MBB->rend(); + It != End; ++It) { + // If the instruction modifies exec then we cannot use it as + // an insertion point (if that is a constraint from the caller). + // The check for EXEC works for both wave64 and wave32 because + // it will also catch Writes to the subregisters (e.g. exec_lo). + if (CheckForExecWrite && It->modifiesRegister(AMDGPU::EXEC, TRI)) { + break; + } + + if (It->modifiesRegister(AMDGPU::SCC, TRI) && + !It->readsRegister(AMDGPU::SCC, TRI)) { + return It->getIterator(); + } + } + + // If no safe location can be found in the block we can save and restore + // SCC around MI. There is no way to directly read or Write SCC so we use + // s_cselect to read the current value of SCC and s_cmp to Write the saved + // value back to SCC. + // + // The generated code will look like this; + // + // S_CSELECT_B32 %SavedSCC, -1, 0 # Save SCC + // <----- Newly created safe insert point. 
+ // MI + // S_CMP_LG_U32 %SavedSCC, 0 # Restore SCC + // + Register TmpScc = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + DebugLoc DL = MI->getDebugLoc(); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), TmpScc) + .addImm(-1) + .addImm(0); + BuildMI(*MBB, std::next(MI->getIterator()), DL, + TII->get(AMDGPU::S_CMP_LG_U32)) + .addReg(TmpScc, RegState::Kill) + .addImm(0); + + return MI; +} + +namespace { +bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes, + SmallDenseSet &TouchedMBBSet) { + MachineInstr *StartMI = Indexes->getInstructionFromIndex(Seg->start); + MachineInstr *EndMI = Indexes->getInstructionFromIndex(Seg->end); + // Treat non inst as not local. + if (!StartMI || !EndMI) + return false; + // is local when parent MBB the same. + bool IsSameMBB = StartMI->getParent() == EndMI->getParent(); + if (!IsSameMBB) + return false; + // Collect touched MBB. + MachineBasicBlock *MBB = StartMI->getParent(); + TouchedMBBSet.insert(MBB); + return true; +} + +bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes, + SmallDenseSet &TouchedMBBSet) { + for (const LiveRange::Segment &Seg : Range->segments) { + if (!isLocalSegment(&Seg, Indexes, TouchedMBBSet)) + return false; + } + return true; +} + +bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes) { + MachineInstr *StartMI = Indexes->getInstructionFromIndex(Seg->start); + MachineInstr *EndMI = Indexes->getInstructionFromIndex(Seg->end); + // Treat non inst as not local. + if (!StartMI || !EndMI) + return false; + // is local when parent MBB the same. + return StartMI->getParent() == EndMI->getParent(); +} + +bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes) { + for (const LiveRange::Segment &Seg : Range->segments) { + if (!isLocalSegment(&Seg, Indexes)) + return false; + } + return true; +} + +} // namespace + +// In case like float4 v, v.x used and defined in one block, v.y used and define +// in another block, one live interval could touch more than one MBB. +// TouchedMBBSet is used for scheduling where local live interval could cross +// multiple regions, need to calculate livereg for each region inside touched +// MBB. +bool llvm::isLocalLiveInterval( + const LiveInterval &LI, SlotIndexes *Indexes, + SmallDenseSet &TouchedMBBSet) { + if (LI.hasSubRanges()) { + for (const auto &S : LI.subranges()) { + if (!isLocalLiveRange(&S, Indexes, TouchedMBBSet)) + return false; + } + } + return isLocalLiveRange(&LI, Indexes, TouchedMBBSet); +} + +bool llvm::isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes) { + if (LI.hasSubRanges()) { + for (const auto &S : LI.subranges()) { + if (!isLocalLiveRange(&S, Indexes)) + return false; + } + } + return isLocalLiveRange(&LI, Indexes); +} + +// This is used to speed up reg pressure calculation. +// If instruction is moved, the cached liveset will be out of date. +// Before instruction is moved, the value will be correct. +void llvm::buildEndLiveMap( + llvm::LiveIntervals *LIS, llvm::MachineFunction &MF, + const llvm::MachineRegisterInfo &MRI, + llvm::DenseMap &MBBLiveMap, + bool After) { + // When only have one block, end live reg must be empty. + if (MF.size() == 1) + return; + auto *SlotIndexes = LIS->getSlotIndexes(); + DenseMap MBBOutputSlotMap; + for (MachineBasicBlock &MBB : MF) { + auto BBEnd = MBB.rbegin(); + + // R.End doesn't point to the boundary instruction. + // Skip Debug instr. 
+ if (llvm::getNonDebugMBBEnd(BBEnd, MBB)) { + auto SI = SlotIndexes->getInstructionIndex(*BBEnd); + MBBOutputSlotMap[&MBB] = After ? SI.getDeadSlot() : SI.getBaseIndex(); + } + } + + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + auto Reg = Register::index2VirtReg(I); + if (!LIS->hasInterval(Reg)) + continue; + + const auto &LI = LIS->getInterval(Reg); + + // Skip local live interval to make live input/ouput faster. + if (llvm::isLocalLiveInterval(LI, SlotIndexes)) + continue; + + for (auto OutputIt : MBBOutputSlotMap) { + MachineBasicBlock *MBB = OutputIt.first; + auto SI = OutputIt.second; + + auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI); + if (LiveMask.any()) + MBBLiveMap[MBB][Reg] = LiveMask; + } + } +} + +unsigned llvm::getCurrentVGPRCount(llvm::MachineFunction &MF, + const SIRegisterInfo *SIRI) { + auto &MRI = MF.getRegInfo(); + for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + return SIRI->getHWRegIndex(Reg) - SIRI->getHWRegIndex(AMDGPU::VGPR0) + 1; + } + } + return 0; +} + +unsigned llvm::getCurrentSGPRCount(llvm::MachineFunction &MF, + const SIRegisterInfo *SIRI) { + const SIMachineFunctionInfo *MFI = MF.getInfo(); + Register ScratchRSrcReg = MFI->getScratchRSrcReg(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + unsigned MaxSGPR = 0; + for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + // Skip scratch reserved reg, which is a big register that don't really + // contribute to this stat. + if (ScratchRSrcReg != 0) { + if (SIRI->isSubRegister(ScratchRSrcReg, Reg)) + continue; + } + MaxSGPR = SIRI->getHWRegIndex(Reg) - SIRI->getHWRegIndex(AMDGPU::SGPR0); + break; + } + } + return 1 + llvm::RegForVCC + MaxSGPR; +} + +void llvm::dumpLiveSet(const LiveSet &LiveSet, const SIRegisterInfo *SIRI) { + + dbgs() << "\n live set: \n"; + for (auto It : LiveSet) { + int Reg = It.first; + dbgs() << printReg(Reg, SIRI); + if (It.second.any()) { + dbgs() << " mask:" << It.second.getAsInteger(); + } + dbgs() << "\n"; + } +} + +// Test if all fast math flags of this Machine Instr are set. This allows +// all non-strict floating-point transforms. 
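// Hedged sketch of a caller (illustrative only): a transform that is only
// sound under fast math can gate itself on this helper, e.g.
//   if (llvm::isFastMathInst(MI)) {
//     // nsz/arcp/nnan/ninf/contract/afn/reassoc are all set on MI, so a
//     // value-changing rewrite such as folding a multiply-add into FMA is
//     // acceptable here.
//   }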
+bool llvm::isFastMathInst(llvm::MachineInstr &MI) { + // Follow the checks in isFast() in SelectionDAGNodes.h + return MI.getFlag(llvm::MachineInstr::MIFlag::FmNsz) && + MI.getFlag(llvm::MachineInstr::MIFlag::FmArcp) && + MI.getFlag(llvm::MachineInstr::MIFlag::FmNoNans) && + MI.getFlag(llvm::MachineInstr::MIFlag::FmNoInfs) && + MI.getFlag(llvm::MachineInstr::MIFlag::FmContract) && + MI.getFlag(llvm::MachineInstr::MIFlag::FmAfn) && + MI.getFlag(llvm::MachineInstr::MIFlag::FmReassoc); +} +#if 0 +bool llvm::IsLdsSpillSupportedForHwStage(xmd::HwStage Stage) +{ + switch (Stage) + { + case xmd::HwStage::PS: + case xmd::HwStage::CS: + return true; + default: + return false; + } +} +#endif + +MachineBasicBlock::succ_iterator +llvm::findSuccessor(llvm::MachineBasicBlock *MBB, + llvm::MachineBasicBlock *Succ) { + for (MachineBasicBlock::succ_iterator It = MBB->succ_begin(), + End = MBB->succ_end(); + It != End; ++It) { + if (*It == Succ) { + return It; + } + } + + return MBB->succ_end(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h new file mode 100644 index 0000000000000..7aa053b9f7fe8 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h @@ -0,0 +1,213 @@ +//===------- AMDGPUMIRUtils.h - Helpers for MIR passes --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Helper functions for MIR passes. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMIRUTILS_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMIRUTILS_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/MC/LaneBitmask.h" + +namespace llvm { + +class MachineFunction; +class LiveIntervals; +class LiveInterval; +class MachineRegisterInfo; +class SIRegisterInfo; +class SIInstrInfo; +class MachineInstr; +class MachinePostDominatorTree; +class MachineLoopInfo; +class MachineDominatorTree; +class raw_ostream; +class TargetInstrInfo; +class TargetRegisterInfo; + +typedef unsigned MachineReg; +typedef unsigned MachineOpcode; + +constexpr unsigned RegForVCC = 2; +constexpr unsigned VGPR_LIMIT = 256; +// Post RA remat only try to help case when pressue is OK before RA but RA +// result is higher. The diff should not be too much. So just use 4 as threshold +// here. 
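// Illustrative numbers: if the pre-RA estimate fits the occupancy target at
// 84 VGPRs but the allocator ends up needing 87, the difference of 3 is under
// this threshold and post-RA remat will try to close the gap; a difference of
// 10 is considered too large to recover this way.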
+constexpr unsigned PostRARematThreshHold = 4; + +using LiveSet = llvm::DenseMap; + +unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask, + const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI); +void collectLiveSetPressure(const LiveSet &LiveSet, + const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI, + unsigned &VPressure, unsigned &SPressure); + +bool isExecUpdateForControlFlow(llvm::MachineInstr &MI); + +bool isSub0Sub1SingleDef(unsigned Reg, const llvm::MachineRegisterInfo &MRI); + +llvm::LaneBitmask getRegMask(const llvm::MachineOperand &MO, + const llvm::MachineRegisterInfo &MRI); +void andLiveRegSet(LiveSet &TargetSet, const LiveSet &InputSet); +void andNotLiveRegSet(LiveSet &TargetSet, const LiveSet &InputSet); +void mergeLiveRegSet(LiveSet &TargetSet, const LiveSet &InputSet); +llvm::MachineBasicBlock *split(llvm::MachineInstr *I); + +// For inst like S_BUFFER_LOAD_DWORDX16, change to S_BUFFER_LOAD_DWORDX4 if only +// used 4 lanes. +bool removeUnusedLanes(llvm::MachineInstr &MI, llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *TRI, + const llvm::SIInstrInfo *TII, + llvm::SlotIndexes *SlotIndexes); + +bool reach_block(llvm::MachineBasicBlock *FromBB, + llvm::MachineDominatorTree *DT, + llvm::MachinePostDominatorTree *PDT, llvm::MachineLoopInfo *LI, + llvm::MachineBasicBlock *ToBB); + +void viewCFGWithPhi(llvm::MachineFunction &MF); +void write_contribution_list(llvm::MachineFunction &MF, const char *Filename); + +bool getNonDebugMBBEnd(llvm::MachineBasicBlock::reverse_iterator &BBEnd, + llvm::MachineBasicBlock &MBB); + +void updatePhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB, + const llvm::MachineRegisterInfo *MRI); + +void buildPhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB, + llvm::SmallDenseSet &LiveOutSet, + const llvm::MachineRegisterInfo *MRI); + +MachineReg createVirtualRegForOperand(MachineOpcode Opcode, unsigned Operand, + llvm::MachineFunction &MF); + +MachineReg createVirtualDstReg(MachineOpcode Opcode, llvm::MachineFunction &MF); + +bool isExecCopy(const llvm::MachineInstr &MI, MachineReg Exec, + MachineReg *OutDst); + +bool isSccLiveAt(llvm::MachineBasicBlock *MBB, + llvm::MachineBasicBlock::iterator MI); + +// An enum used to pass additional constraints to +// `FindOrCreateInsertionPointForSccDef()`. This will further +// constrain the location where the scc def can be inserted. +enum SccDefInsertPointConstraintFlags { + None = 0, // No additional constraints. + NoExecWrite = 1, // Should be no modification of exec between BeforeInst and + // insert point. +}; + +// Look for a safe place to insert an instruction that defines scc. +// +// +// This function is useful for when we need to insert a new +// instruction that defines scc in a block and we need to find +// a location that will not smash the existing value. +// +// Starting at `BeforeInst` it will look backwards to try to find +// a place in the block where scc is dead so we can insert our new +// def there. If no location can be found it will save and restore +// scc around BeforeInst. This way BeforeInst can safely be used +// as the new insert location. 
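// Hedged usage sketch (the caller below is illustrative, not from this patch):
//   auto InsertPt = llvm::findOrCreateInsertionPointForSccDef(
//       MBB, BeforeInst, TRI, TII, &MRI,
//       SccDefInsertPointConstraintFlags::NoExecWrite);
//   BuildMI(*MBB, InsertPt, DL, TII->get(AMDGPU::S_CMP_LG_U32))
//       .addReg(SomeReg)
//       .addImm(0); // the new scc-defining instruction is safe at InsertPt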
+// +llvm::MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef( + llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator BeforeInst, + const llvm::TargetRegisterInfo *TRI, const llvm::SIInstrInfo *TII, + llvm::MachineRegisterInfo *MRI, + SccDefInsertPointConstraintFlags Constraints = + SccDefInsertPointConstraintFlags::None); + +// Check if LI live cross basic blocks, save all touched basic block if is +// local. +bool isLocalLiveInterval( + const llvm::LiveInterval &LI, llvm::SlotIndexes *Indexes, + llvm::SmallDenseSet &TouchedMBBSet); +bool isLocalLiveInterval(const llvm::LiveInterval &LI, + llvm::SlotIndexes *Indexes); + +// build liveRegSet at end of each MBB. +void buildEndLiveMap( + llvm::LiveIntervals *LIS, llvm::MachineFunction &MF, + const llvm::MachineRegisterInfo &MRI, + llvm::DenseMap &MBBLiveMap, bool After); + +void dumpLiveSet(const LiveSet &LiveSet, const llvm::SIRegisterInfo *SIRI); + +unsigned getCurrentVGPRCount(llvm::MachineFunction &MF, + const llvm::SIRegisterInfo *SIRI); +unsigned getCurrentSGPRCount(llvm::MachineFunction &MF, + const llvm::SIRegisterInfo *SIRI); + +bool isFastMathInst(llvm::MachineInstr &MI); + +namespace pressure { +void print_reg(llvm::Register Reg, const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI, llvm::raw_ostream &OS); +void write_pressure(llvm::MachineFunction &MF, llvm::LiveIntervals *LIS, + const char *Filename); +void write_pressure(llvm::MachineFunction &MF, llvm::LiveIntervals *LIS, + llvm::raw_ostream &OS); +} // namespace pressure + +// Look for the successor `Succ` of the given `MBB`. +// Returns MBB->succ_end() if `Succ` is not a successor of MBB. +llvm::MachineBasicBlock::succ_iterator +findSuccessor(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock *Succ); + +// The enum and helper function for v_perm selection mask. +// +// The input byte layout of v_perm is as below: +// +// BYTE in[8] +// in[0] = $src1_BYTE0; +// in[1] = $src1_BYTE1; +// in[2] = $src1_BYTE2; +// in[3] = $src1_BYTE3; +// in[4] = $src0_BYTE0; +// in[5] = $src0_BYTE1; +// in[6] = $src0_BYTE2; +// in[7] = $src0_BYTE3; +// +enum class V_PERM_IN_BYTE_POS { + src1_BYTE0 = 0, + src1_BYTE1, + src1_BYTE2, + src1_BYTE3, + src0_BYTE0, + src0_BYTE1, + src0_BYTE2, + src0_BYTE3 +}; + +// The 4 arguments specify which input byte will be output +// out[0] = Sel_0; +// out[1] = Sel_1; +// out[2] = Sel_2; +// out[3] = Sel_3; +// +constexpr int buildVPermSelectMask(V_PERM_IN_BYTE_POS Sel_0, + V_PERM_IN_BYTE_POS Sel_1, + V_PERM_IN_BYTE_POS Sel_2, + V_PERM_IN_BYTE_POS Sel_3) { + return (((int)Sel_3 << 24) | ((int)Sel_2 << 16) | ((int)Sel_1 << 8) | + (int)Sel_0); +} +} // namespace llvm + +#endif diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp new file mode 100644 index 0000000000000..e313c1f264a92 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp @@ -0,0 +1,188 @@ +//==- AMDGPUOccupancyAndLatencyHelper.cpp - Helpers for occupancy + latency ==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==------------------------------------------------------------------------==// +// +/// \file +/// \brief Helper functions for occupancy and latency. 
+// +//==------------------------------------------------------------------------==// + +#include "AMDGPUOccupancyAndLatencyHelper.h" +#include "AMDGPUSubtarget.h" +#include "GCNSubtarget.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" + +#include "llvm/CodeGen/MachineLoopInfo.h" + +#include + +namespace llvm { + +// Other info which can help compare schedule result. +float SchedScore::computeScore() const { + // Occupancy 1 cannot mix alu. + unsigned MixHidenAlu = Alu - MixAlu; + if (Occupancy == 1) + MixHidenAlu = 0; + return ((float)MemLatency - (float)MixHidenAlu) / (float)Occupancy - + LatencyHide; +} +float SchedScore::computeScore2() const { + float Cycles = 0; + Cycles = (MixAlu * Occupancy + MemLatency); + Cycles /= Occupancy; + return Cycles; +} + +void SchedScore::sum(const SchedScore &S, unsigned LoopDepth) { + unsigned LoopCount = LoopDepth > 0 ? std::pow(3, LoopDepth) : 1; + LatencyHide += LoopCount * S.LatencyHide; + MemLatency += LoopCount * S.MemLatency; + MixAlu += LoopCount * S.MixAlu; + Alu += LoopCount * S.Alu; + Lds += LoopCount * S.Lds; + SgprSpill |= S.SgprSpill; +} +bool SchedScore::isBetter(const SchedScore &S) const { + float Score = computeScore(); + float NewScore = S.computeScore(); + bool SpillBetter = !SgprSpill && S.SgprSpill; + return SpillBetter ? true : NewScore >= Score; +} +// Does more occupancy give more perf. +bool SchedScore::isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc) const { + unsigned Gain = latencyGain(TargetOccupancy, ExtraOcc); + // 10% is good enough. + if ((10 * Gain) >= Alu) + return true; + return false; +} + +unsigned SchedScore::latencyGain(unsigned TgtOcc, unsigned ExtraOcc) const { + unsigned Latency = MemLatency; + return (Latency / (TgtOcc)) - (Latency / (TgtOcc + ExtraOcc)); +} + +// AMDGPULatencyTracker +AMDGPULatencyTracker::AMDGPULatencyTracker(const GCNSubtarget &ST) + : SIII(ST.getInstrInfo()), ItinerayData(ST.getInstrItineraryData()) {} + +void AMDGPULatencyTracker::scan(const MachineInstr &MI) { + if (MI.isDebugInstr()) + return; + int Latency = SIII->getInstrLatency(ItinerayData, MI); + // If inside latency hide. + if (!LatencyMIs.empty()) { + bool IsWaitCnt = false; + for (auto &MO : MI.operands()) { + if (MO.isReg()) { + Register Reg = MO.getReg(); + auto It = LatencyMIs.find(Reg); + if (It != LatencyMIs.end()) { + IsWaitCnt = true; + // If MI use mem result, update latency to mem latency. + int Cycle = It->second; + if (Cycle > Latency) + Latency = Cycle; + } + } + } + // Update latency for each mem latency inst. + for (auto It = LatencyMIs.begin(); It != LatencyMIs.end();) { + auto Prev = It; + auto L = (It++); + int Cycle = L->second; + if (Cycle <= Latency) { + // Only left cycles. + // Remove the reg. + LatencyMIs.erase(Prev); + if (IsWaitCnt && Cycle == Latency) { + Score.MemLatency += Cycle; + // Only count memLatency once, the rest is hide. + IsWaitCnt = false; + } else { + // Hide cycle or count mem latency? + Score.LatencyHide += Cycle; + } + } else { + L->second -= Latency; + // Hide latency. + Score.LatencyHide += Latency; + } + } + + } else { + // TODO: check branch/lds? + // TODO: check prevVAlu? 
+ auto GetAluStatus = [](const MachineInstr &MI, + const llvm::SIInstrInfo *SIII) { + AluStatus Status = AluStatus::Nothing; + if (SIII->isVALU(MI.getOpcode())) { + Status = AluStatus::Vector; + } else if (SIII->isSALU(MI.getOpcode())) { + Status = AluStatus::Scalar; + } + return Status; + }; + AluStatus Status = GetAluStatus(MI, SIII); + + switch (PrevStatus) { + case AluStatus::Nothing: { + Score.Alu += Latency; + Score.MixAlu += Latency; + PrevStatus = Status; + } break; + case AluStatus::Vector: + case AluStatus::Scalar: { + Score.Alu += Latency; + // Ignore mix alu. + if (PrevStatus != Status) { + PrevStatus = AluStatus::Nothing; + } else { + Score.MixAlu += Latency; + } + } break; + } + } + // Update latency inst. + if (SIII->isHighLatencyInstruction(MI) && MI.mayLoad()) { + Register Reg = MI.getOperand(0).getReg(); + // TODO: get correct latency. + // SIII->getInstrLatency(ItinerayData, MI); + constexpr unsigned kHighLetency = 180; + LatencyMIs[Reg] = kHighLetency; + } else if (SIII->isLowLatencyInstruction(MI) && MI.mayLoad()) { + Register Reg = MI.getOperand(0).getReg(); + // TODO: get correct latency. + // SIII->getInstrLatency(ItinerayData, MI); + constexpr unsigned kLowLetency = 35; + LatencyMIs[Reg] = kLowLetency; + } +} + +SchedScore collectLatency(MachineFunction &MF, const llvm::GCNSubtarget &ST, + const llvm::MachineLoopInfo *MLI) { + SchedScore TotalScore; + for (auto &MFI : MF) { + MachineBasicBlock &MBB = MFI; + MachineBasicBlock::iterator Next; + AMDGPULatencyTracker LatencyTracker(ST); + for (auto &MI : MBB) { + LatencyTracker.scan(MI); + } + unsigned LoopDepth = 0; + if (MLI) { + LoopDepth = MLI->getLoopDepth(&MBB); + } + TotalScore.sum(LatencyTracker.Score, LoopDepth); + } + return TotalScore; +} + +} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h new file mode 100644 index 0000000000000..c04afe61c9809 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h @@ -0,0 +1,75 @@ +//==- AMDGPUOccupancyAndLatencyHelper.cpp - Helpers for occupancy + latency ==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Helper functions for occupancy and latency. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/DenseMap.h" +#include "llvm/MC/MCInstrItineraries.h" + +namespace llvm { + +class MachineFunction; +class GCNSubtarget; +class MachineInstr; +class SIInstrInfo; +class MachineLoopInfo; + +struct SchedScore { + // Score for this Sched result. + unsigned Occupancy = 0; + bool SgprSpill = false; + unsigned LatencyHide = 0; // Only latency hide will split 2 load into 2 pass? + unsigned MemLatency = 0; // Only save mem latency. + // We want mem latency small and hide big. Compare + // memLatency - hide * Occ, smaller is better. + unsigned MixAlu = 0; // VAlu and SAlu can running parallel if Occ > 1. + unsigned Alu = 0; // avoid sequence of s_alu inst count less then occupancy. + unsigned Lds = 0; // Todo: count lds. + SchedScore() {} + + // Other info which can help compare schedule result. 
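  // Rough intuition with made-up numbers: MemLatency = 800, Alu = 300,
  // MixAlu = 100 (so 200 cycles of VALU/SALU work can overlap), Occupancy = 4
  // and LatencyHide = 100 give
  //   computeScore() = (800 - 200) / 4 - 100 = 50.
  // At Occupancy = 1 the mixed-ALU term is dropped entirely, because a single
  // wave cannot overlap VALU and SALU work.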
+ float computeScore() const; + float computeScore2() const; + + void sum(const SchedScore &S, unsigned LoopDepth = 0); + bool isBetter(const SchedScore &S) const; + bool isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc = 1) const; + // More latency can be hiden with ExtraOcc. + unsigned latencyGain(unsigned TargetOccupancy, unsigned ExtraOcc) const; +}; + +struct AMDGPULatencyTracker { + AMDGPULatencyTracker(const llvm::GCNSubtarget &ST); + const llvm::SIInstrInfo *SIII; + const llvm::InstrItineraryData *ItinerayData; + // Latency MI dst reg to cycle map. + llvm::DenseMap LatencyMIs; + SchedScore Score; + // Low latency MI not wait. + unsigned HideLatency = 0; + unsigned MemLatency = 0; + // For simple, only consider mixture as one valu one salu. + // Not group now. + unsigned PrevSAlu = 0; + unsigned PrevVAlu = 0; + enum class AluStatus { + Nothing, + Vector, + Scalar, + } PrevStatus = AluStatus::Nothing; + void scan(const llvm::MachineInstr &MI); +}; + +SchedScore collectLatency(llvm::MachineFunction &MF, + const llvm::GCNSubtarget &ST, + const llvm::MachineLoopInfo *MLI = nullptr); +} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp new file mode 100644 index 0000000000000..548bfa508c735 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp @@ -0,0 +1,1771 @@ +//===----------- AMDGPUSubExpDag.cpp - AMDGPU Sub Expression DAG ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU Sub Expression DAG. Helper for building a dag based on sub +/// expressions. +// +//===----------------------------------------------------------------------===// + +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/SlotIndexes.h" + +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/raw_ostream.h" + +#include "llvm/ADT/IntEqClasses.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Support/GraphWriter.h" + +#include "llvm/Support/Debug.h" + +#include "AMDGPUMIRUtils.h" +#include "AMDGPUSubExpDag.h" +#include "GCNRegPressure.h" +#include + +#define DEBUG_TYPE "xb-sub-exp-dag" +using namespace llvm; + +namespace llvm { + +// Expression Dag. + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void SubExp::dump(const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI) const { + dbgs() << "\nSubExp:\n"; + dbgs() << "input regs:\n"; + for (auto &Input : InputLive) { + pressure::print_reg(Input.first, MRI, SIRI, llvm::dbgs()); + dbgs() << "\n"; + } + dbgs() << "output regs:\n"; + for (auto &Output : OutputLive) { + pressure::print_reg(Output.first, MRI, SIRI, llvm::dbgs()); + dbgs() << "\n"; + } + + for (MachineInstr *MI : SUnits) { + MI->dump(); + } + dbgs() << "End of SubExp\n"; +} +#endif + +bool SubExp::modifiesRegister(unsigned Reg, const SIRegisterInfo *SIRI) const { + for (const MachineInstr *MI : SUnits) { + if (MI->modifiesRegister(Reg, SIRI)) { + return true; + } + } + + return false; +} + +void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI) { + SMaxSize = std::max(SInputSize, SOutputSize); + VMaxSize = std::max(VInputSize, VOutputSize); + + DenseMap LiveRegs; + GCNRegPressure CurPressure; + + // Add output to pressure. 
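  // Sketch of the walk below: seed LiveRegs with the lane masks defined by the
  // bottom roots, then scan SUnits in reverse, clearing lanes at defs and
  // adding lanes at uses, and fold the running SGPR/VGPR pressure maxima into
  // SMaxSize/VMaxSize after each instruction.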
+ for (MachineInstr *MI : BottomRoots) { + for (MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + if (!MO.isDef()) + continue; + Register Reg = MO.getReg(); + if (!Reg.isVirtual()) + continue; + LaneBitmask Mask = getRegMask(MO, MRI); + auto It = LiveRegs.find(Reg); + if (It != LiveRegs.end()) { + LiveRegs[Reg] = Mask | It->second; + } else { + LiveRegs[Reg] = Mask; + } + } + } + + for (auto It : LiveRegs) { + LaneBitmask EmptyMask; + CurPressure.inc(It.first, EmptyMask, It.second, MRI); + } + + for (auto It = SUnits.rbegin(); It != SUnits.rend(); It++) { + MachineInstr *MI = *It; + auto *ST = + &MI->getMF() + ->getSubtarget(); // TODO: Better way to get this. + for (MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + Register Reg = MO.getReg(); + if (!Reg.isVirtual()) { + if (Reg == AMDGPU::SCC) + IsTouchSCC = true; + continue; + } + + LaneBitmask LiveMask = getRegMask(MO, MRI); + LaneBitmask PrevMask; + auto LiveIt = LiveRegs.find(Reg); + if (LiveIt != LiveRegs.end()) { + PrevMask = LiveIt->second; + } + + if (MO.isDef()) { + LiveMask = PrevMask & (~(LiveMask)); + } else { + LiveMask = PrevMask | LiveMask; + } + + CurPressure.inc(Reg, PrevMask, LiveMask, MRI); + LiveRegs[Reg] = LiveMask; + } + + unsigned SSize = CurPressure.getSGPRNum(); + unsigned VSize = CurPressure.getVGPRNum(ST->hasGFX90AInsts()); + if (SSize > SMaxSize) + SMaxSize = SSize; + if (VSize > VMaxSize) + VMaxSize = VSize; + } +} + +bool SubExp::isSafeToMove(const MachineRegisterInfo &MRI) const { + if (IsMultiDefOutput) + return false; + if (IsHasTerminatorInst) + return false; + if (IsUseIncomingReg) + return false; + + // Input should be single def. + for (unsigned Reg : TopRegs) { + if (!MRI.hasOneDef(Reg) && !llvm::isSub0Sub1SingleDef(Reg, MRI)) + return false; + } + return true; +} + +ExpDag::ExpDag(const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI, const SIInstrInfo *SIII, + const bool IsJoinInput) + : MRI(MRI), SIRI(SIRI), SIII(SIII), IsJoinInputToSubExp(IsJoinInput) {} + +template +void ExpDag::initNodes(const LiveSet &InputLiveReg, T &Insts) { + unsigned NodeSize = InputLiveReg.size() + Insts.size(); + SUnits.reserve(NodeSize); + + for (MachineInstr *MI : Insts) { + if (MI->isDebugInstr()) + continue; + SUnits.emplace_back(MI, SUnits.size()); + SUnit *SU = &SUnits.back(); + SUnitMIMap[SU] = MI; + MISUnitMap[MI] = SU; + } + + for (auto It : InputLiveReg) { + unsigned Reg = It.first; + SUnits.emplace_back(); + SUnit *SU = &SUnits.back(); + SU->NodeNum = SUnits.size() - 1; + SUnitInputMap[SU] = Reg; + InputSUnitMap[Reg] = SU; + } +} + +template void ExpDag::initNodes>( + const LiveSet &InputLiveReg, DenseSet &instRange); + +template void ExpDag::initNodes>( + const LiveSet &InputLiveReg, std::vector &instRange); + +template +void ExpDag::build(const LiveSet &InputLiveReg, const LiveSet &OutputLiveReg, + T &Insts) { + initNodes(InputLiveReg, Insts); + addDataDep(); + addCtrlDep(); + buildSubExp(InputLiveReg, OutputLiveReg, SIRI, SIII); +} + +template void +ExpDag::build>(const LiveSet &InputLiveReg, + const LiveSet &OutputLiveReg, + DenseSet &instRange); +template void ExpDag::build>( + const LiveSet &InputLiveReg, const LiveSet &OutputLiveReg, + std::vector &instRange); + +void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { + IntEqClasses SubtreeClasses(SUnits.size()); + std::vector PassThruInputs; + for (SUnit &SU : SUnits) { + if (SU.NumPredsLeft == 0 && 
SU.NumSuccsLeft == 0) { + PassThruInputs.emplace_back(SU.NodeNum); + continue; + } + if (!IsJoinInputToSubExp && !SU.isInstr()) + continue; + // Join prev. + for (SDep &PreDep : SU.Preds) { + SUnit *PreSU = PreDep.getSUnit(); + if (!IsJoinInputToSubExp && !PreSU->isInstr()) + continue; + SubtreeClasses.join(SU.NodeNum, PreSU->NodeNum); + } + // Join succ. + for (SDep &SucDep : SU.Succs) { + SUnit *SucSU = SucDep.getSUnit(); + SubtreeClasses.join(SU.NodeNum, SucSU->NodeNum); + } + } + SubtreeClasses.compress(); + + unsigned NumSubExps = SubtreeClasses.getNumClasses(); + // Not count PassThruInputs for subExps since they're exp with only 1 SU. + // SubExpIndexMap is used to pack SubIdx within updated NumSubExps. + NumSubExps -= PassThruInputs.size(); + SubExps.resize(NumSubExps); + DenseMap SubExpIndexMap; + + // Add SU to sub exp. + for (SUnit &SU : SUnits) { + if (SU.NumPredsLeft == 0 && SU.NumSuccsLeft == 0) { + continue; + } + unsigned SubIdx = SubtreeClasses[SU.NodeNum]; + unsigned OriginSubIdx = SubIdx; + // Pack subidx. + if (SubExpIndexMap.count(SubIdx) == 0) { + unsigned Count = SubExpIndexMap.size(); + SubExpIndexMap.insert(std::make_pair(SubIdx, Count)); + } + SubIdx = SubExpIndexMap[SubIdx]; + // Use NodeQueueId as SubIdx. We don't do schedule on ExpDag. + SU.NodeQueueId = SubIdx; + + SubExp &Exp = SubExps[SubIdx]; + auto It = SUnitInputMap.find(&SU); + if (It != SUnitInputMap.end()) { + // Input. + Register Reg = It->second; + Exp.TopRegs.insert(Reg); + } else { + MachineInstr *MI = SU.getInstr(); + MachineBasicBlock *MBB = MI->getParent(); + Exp.FromBB = MBB; + for (MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + if (!MO.isUse()) + continue; + Register Reg = MO.getReg(); + if (MRI.getLiveInPhysReg(Reg) || MRI.getLiveInVirtReg(Reg)) { + Exp.IsUseIncomingReg = true; + } + } + + Exp.SUnits.emplace_back(MI); + if (SU.NumSuccsLeft == 0) { + Exp.BottomRoots.insert(MI); + if (MI->isTerminator()) + Exp.IsHasTerminatorInst = true; + } + if (MI->isNotDuplicable()) + Exp.IsNotSafeToCopy = true; + // Skip Scalar mem access since no scalar store. + if (MI->mayLoadOrStore() && !SIII->isSMRD(*MI)) { + Exp.IsHasMemInst = true; + } + // Add bottom regs. + for (MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + if (!MO.isDef()) + continue; + Register Reg = MO.getReg(); + // physical reg is not in live reg. + if (!Reg.isVirtual()) + continue; + if (SU.NumSuccsLeft) { + // For SU which has used in current blk. + // Check if used in other blks or subExps. + bool IsUsedInOtherBlk = false; + for (auto &UserMI : MRI.use_nodbg_instructions(Reg)) { + if (UserMI.getParent() != MBB) { + IsUsedInOtherBlk = true; + break; + } + auto SuIt = MISUnitMap.find(&UserMI); + // When UserMI is not in dag, treat it as other block. + if (SuIt == MISUnitMap.end()) { + IsUsedInOtherBlk = true; + break; + } + SUnit *UseSU = SuIt->second; + // UserMI should always be in same subExp. + unsigned UseSubIdx = SubtreeClasses[UseSU->NodeNum]; + if (UseSubIdx != OriginSubIdx) { + // When reg has multiple def, it is possible for user def in + // different subExp. + if (MRI.getUniqueVRegDef(Reg)) + llvm::report_fatal_error("user and def in different subExp"); + break; + } + } + if (!IsUsedInOtherBlk) + continue; + } + Exp.BottomRegs.insert(Reg); + if (!MRI.getUniqueVRegDef(Reg)) { + Exp.IsMultiDefOutput = true; + } + } + } + } + // Calc reg for SubExp. + // Get block live in and live out. + // Only reg will miss live mask. 
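  // Sketch of the loop below: every TopReg of a sub-expression must be live at
  // the block start, so its lane mask comes from StartLiveReg; a BottomReg only
  // contributes to OutputLive when it is still live at the block end
  // (EndLiveReg), otherwise it is killed inside the block and deliberately left
  // out of the output pressure that collectLiveSetPressure computes.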
+ for (SubExp &Exp : SubExps) { + for (unsigned Reg : Exp.TopRegs) { + auto It = StartLiveReg.find(Reg); + assert(It != StartLiveReg.end() && + "cannot find input reg in block start live"); + Exp.InputLive[Reg] |= It->second; + } + + for (unsigned Reg : Exp.BottomRegs) { + auto It = EndLiveReg.find(Reg); + if (It == EndLiveReg.end()) { + //"cannot find output reg in block end live"); + // Bottom reg is killed inside current block, did not get out of the + // block. + // Or the bottom reg is not treat as output in this dag, not save to + // OutputLive which will affect profit count. + continue; + } + Exp.OutputLive[Reg] |= It->second; + } + + collectLiveSetPressure(Exp.InputLive, MRI, SIRI, Exp.VInputSize, + Exp.SInputSize); + collectLiveSetPressure(Exp.OutputLive, MRI, SIRI, Exp.VOutputSize, + Exp.SOutputSize); + } +} + +void ExpDag::addDataDep() { + DenseMap CurDefMI; + + for (SUnit &SU : SUnits) { + if (!SU.isInstr()) + continue; + MachineInstr *MI = SU.getInstr(); + + // Link use to the def. + for (MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + + Register Reg = MO.getReg(); + SUnit *DefSU = nullptr; + + auto CurDefIt = CurDefMI.find(Reg); + // Check def inst first. + if (CurDefIt != CurDefMI.end()) { + MachineInstr *CurDef = CurDefIt->second; + DefSU = MISUnitMap[CurDef]; + } else { + // physical reg is not in live reg. + if (!Reg.isVirtual()) + continue; + if (MO.isUndef()) + continue; + // Is it OK for degbug instr MO cannot find def? + if (MI->isDebugInstr()) + continue; + // Should be an input. + assert(InputSUnitMap.count(Reg) > 0 && "cannot find def"); + DefSU = InputSUnitMap[Reg]; + } + SU.addPred(SDep(DefSU, SDep::Data, Reg)); + } + + // Add def to curDefMI; + for (MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + if (!MO.isDef()) + continue; + Register Reg = MO.getReg(); + + // For case like: + // undef_ %808.sub0:sgpr_64 = COPY killed %795:sgpr_32 + // %808.sub1:sgpr_64 = S_MOV_B32 0 + // When partially write, link MI to previous def. + if (MO.getSubReg() != 0) { + SUnit *DefSU = nullptr; + auto CurDefIt = CurDefMI.find(Reg); + // Check def inst first. + if (CurDefIt != CurDefMI.end()) { + MachineInstr *CurDef = CurDefIt->second; + DefSU = MISUnitMap[CurDef]; + // Add link between different defs. + SU.addPred(SDep(DefSU, SDep::Data, Reg)); + } + } + + CurDefMI[Reg] = MI; + } + } +} + +void ExpDag::addCtrlDep() { + // TODO: add depend for memory, barrier. 
+} + +BlockExpDag::BlockExpDag(llvm::MachineBasicBlock *B, llvm::LiveIntervals *LIS, + const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI, + const llvm::SIInstrInfo *SIII) + : ExpDag(MRI, SIRI, SIII, /*IsJoinInput*/ true), LIS(LIS), MBB(B) {} + +void BlockExpDag::build() { + auto *SlotIndexes = LIS->getSlotIndexes(); + const auto StartIdx = SlotIndexes->getMBBStartIdx(MBB); + const auto StartLiveReg = llvm::getLiveRegs(StartIdx, *LIS, MRI); + + const auto EndIdx = SlotIndexes->getMBBEndIdx(MBB); + const auto EndLiveReg = llvm::getLiveRegs(EndIdx, *LIS, MRI); + + std::vector Insts; + for (MachineInstr &MI : *MBB) { + Insts.emplace_back(&MI); + } + + ExpDag::build(StartLiveReg, EndLiveReg, Insts); +} + +void BlockExpDag::buildWithPressure() { + auto *SlotIndexes = LIS->getSlotIndexes(); + const auto StartIdx = SlotIndexes->getMBBStartIdx(MBB); + const auto StartLiveReg = llvm::getLiveRegs(StartIdx, *LIS, MRI); + + const auto EndIdx = SlotIndexes->getMBBEndIdx(MBB); + const auto EndLiveReg = llvm::getLiveRegs(EndIdx, *LIS, MRI); + + std::vector Insts; + for (MachineInstr &MI : *MBB) { + Insts.emplace_back(&MI); + } + + ExpDag::build(StartLiveReg, EndLiveReg, Insts); + // Build pressure. + buildPressure(StartLiveReg, EndLiveReg); +} + +void BlockExpDag::buildAvail(const LiveSet &PassThruSet, + DenseMap &DagAvailRegMap) { + DenseSet Processed; + + DenseSet WorkList; + MachineInstr &BeginMI = MBB->instr_front(); + + // Calc avaialbe for each node, live is avail & sum(input of success). + // If a reg is avaiable from the node, then success node can use it from this + // node. For dag live, pred output don't need to have all input a node needs. + // As long as all pred outputs can cover inputs, it is OK. + for (SUnit &SU : SUnits) { + if (SU.NumPredsLeft == 0) { + GCNDownwardRPTracker RP(*LIS); + RP.reset(BeginMI, &PassThruSet); + MachineInstr *MI = SU.getInstr(); + if (MI) { + RP.reset(*MI, &PassThruSet); + RP.advance(); + } + DagAvailRegMap[&SU] = RP.getLiveRegs(); + + // Add succ to work list. + for (auto &Succ : SU.Succs) { + SUnit *SuccSU = Succ.getSUnit(); + if (SuccSU->NumPredsLeft > 0) + SuccSU->NumPredsLeft--; + WorkList.insert(SuccSU); + } + } + } + while (!WorkList.empty()) { + SmallVector ReadyNodes; + for (SUnit *SU : WorkList) { + if (SU->NumPredsLeft > 0) + continue; + ReadyNodes.emplace_back(SU); + // Ready, move it to Processed. + Processed.insert(SU); + // Only update 1 node once. + // Order of schedle here should not affect pressure. + break; + } + + for (SUnit *SU : ReadyNodes) { + // Remove SU from worklist. + WorkList.erase(SU); + + MachineInstr *MI = SU->getInstr(); + // Calc pressure based on pred nodes. + GCNRPTracker::LiveRegSet DagLive; + for (auto &Pred : SU->Preds) { + SUnit *PredSU = Pred.getSUnit(); + GCNRPTracker::LiveRegSet PredLive = DagAvailRegMap[PredSU]; + + GCNDownwardRPTracker RP(*LIS); + RP.reset(BeginMI, &PredLive); + if (MI) { + RP.reset(*MI, &PredLive); + // Update PredLive based on MI. + RP.advance(); + } + llvm::mergeLiveRegSet(DagLive, RP.getLiveRegs()); + } + DagAvailRegMap[SU] = DagLive; + + // Add succ to work list. 
+ for (auto &Succ : SU->Succs) { + SUnit *SuccSU = Succ.getSUnit(); + if (SuccSU->NumPredsLeft > 0) + SuccSU->NumPredsLeft--; + WorkList.insert(SuccSU); + } + } + + // Skip dead loop + if (ReadyNodes.empty()) { + printf("dead loop when build dag pressure"); + break; + } + } + + assert(WorkList.empty() && "schedule failed for available reg"); +} + +void BlockExpDag::buildPressure(const LiveSet &StartLiveReg, + const LiveSet &EndLiveReg) { + if (MBB->empty()) + return; + DenseMap DagAvailRegMap; + GCNRPTracker::LiveRegSet PassThruSet; + for (auto It : StartLiveReg) { + Register Reg = It.first; + auto EndReg = EndLiveReg.find(Reg); + if (EndReg == EndLiveReg.end()) + continue; + + LaneBitmask Mask = It.second; + LaneBitmask EndMask = EndReg->second; + Mask &= EndMask; + if (Mask.getAsInteger() == 0) + continue; + PassThruSet[Reg] = Mask; + } + + // Build avial for each nodes. + buildAvail(PassThruSet, DagAvailRegMap); + + // Calc avaialbe for each node, live is avail & sum(input of success). + // If a reg is avaiable from the node, then success node can use it from this + // node. For dag live, pred output don't need to have all input a node needs. + // As long as all pred outputs can cover inputs, it is OK. + DenseSet Processed; + + DenseSet WorkList; + MachineInstr &BeginMI = MBB->instr_front(); + + for (SUnit &SU : SUnits) { + if (SU.NumSuccsLeft == 0) { + // Calc pressure based on pass thru. + // Using pass thru as base because output of current SU should not + // affect other output SUs. + GCNUpwardRPTracker RP(*LIS); + RP.reset(BeginMI, &PassThruSet, /*After*/ true); + MachineInstr *MI = SU.getInstr(); + if (MI) { + RP.reset(*MI, &PassThruSet, /*After*/ true); + RP.recede(*MI); + } + DagPressureMap[&SU] = RP.getLiveRegs(); + // Add pred to work list. + for (auto &Pred : SU.Preds) { + SUnit *PredSU = Pred.getSUnit(); + PredSU->NumSuccsLeft--; + WorkList.insert(PredSU); + } + } + } + + while (!WorkList.empty()) { + SmallVector ReadyNodes; + for (SUnit *SU : WorkList) { + if (SU->NumSuccsLeft > 0) + continue; + ReadyNodes.emplace_back(SU); + // Ready, move it to Processed. + Processed.insert(SU); + // Only update 1 node once. + // Order of schedle here should not affect pressure. + break; + } + + for (SUnit *SU : ReadyNodes) { + // Remove SU from worklist. + WorkList.erase(SU); + + MachineInstr *MI = SU->getInstr(); + // Calc pressure based on succ nodes. + GCNRPTracker::LiveRegSet DagLive; + for (auto &Succ : SU->Succs) { + SUnit *SuccSU = Succ.getSUnit(); + GCNRPTracker::LiveRegSet SuccLive = DagPressureMap[SuccSU]; + + GCNUpwardRPTracker RP(*LIS); + RP.reset(BeginMI, &SuccLive, /*After*/ true); + if (MI) { + RP.reset(*MI, &SuccLive, /*After*/ true); + // Update SuccLive based on MI. + RP.recede(*MI); + } + llvm::mergeLiveRegSet(DagLive, RP.getLiveRegs()); + } + // Remove live which not avail in SU. + GCNRPTracker::LiveRegSet AvailLive = DagAvailRegMap[SU]; + llvm::andLiveRegSet(DagLive, AvailLive); + DagPressureMap[SU] = DagLive; + + // Add pred to work list. + for (auto &Pred : SU->Preds) { + SUnit *PredSU = Pred.getSUnit(); + PredSU->NumSuccsLeft--; + WorkList.insert(PredSU); + } + } + + // Skip dead loop + if (ReadyNodes.empty()) { + printf("dead loop when build dag pressure"); + break; + } + } +} + +// dump functions. 
+ +std::string ExpDag::getGraphNodeLabel(const SUnit *SU) const { + std::string S; + raw_string_ostream OSS(S); + auto It = SUnitInputMap.find(SU); + if (It != SUnitInputMap.end()) { + OSS << "second) << ">"; + } else { + SU->getInstr()->print(OSS, /*SkipOpers=*/true); + } + + return OSS.str(); +} + +/// Return the label. +std::string ExpDag::getDAGName() const { return "dag.exp"; } + +/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG +/// rendered using 'dot'. +/// +void ExpDag::viewGraph(const Twine &Name, const Twine &Title) const { + // This code is only for debugging! +#ifndef NDEBUG + ViewGraph(const_cast(this), Name, false, Title); +#else + errs() << "BlockExpDag::viewGraph is only available in debug builds on " + << "systems with Graphviz or gv!\n"; +#endif // NDEBUG +} + +void ExpDag::dump() { + viewGraph(getDAGName(), "Exp Dag Graph for " + getDAGName()); +} + +} // namespace llvm + +// Expression Dag dump. +namespace llvm { + +static DenseSet ViewNodes; + +template <> +struct DOTGraphTraits : public DefaultDOTGraphTraits { + + DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} + + static std::string getGraphName(const llvm::ExpDag *) { + return "ExpDag graph"; + } + + static bool renderGraphFromBottomUp() { return true; } + + static bool isNodeHidden(const SUnit *Node, const llvm::ExpDag *) { + if (ViewNodes.empty()) + return false; + + return ViewNodes.count(Node) == 0; + } + + static std::string getNodeIdentifierLabel(const SUnit *Node, + const llvm::ExpDag *) { + std::string R; + raw_string_ostream OS(R); + OS << static_cast(Node); + return R; + } + + /// If you want to override the dot attributes printed for a particular + /// edge, override this method. + static std::string getEdgeAttributes(const SUnit *, SUnitIterator EI, + const llvm::ExpDag *) { + if (EI.isArtificialDep()) + return "color=cyan,style=dashed"; + if (EI.isCtrlDep()) + return "color=blue,style=dashed"; + return ""; + } + + static std::string getNodeLabel(const SUnit *SU, const llvm::ExpDag *) { + std::string Str; + raw_string_ostream SS(Str); + SS << "SU:" << SU->NodeNum; + return SS.str(); + } + static std::string getNodeDescription(const SUnit *SU, + const llvm::ExpDag *G) { + return G->getGraphNodeLabel(SU); + } + static std::string getNodeAttributes(const SUnit *N, const llvm::ExpDag *) { + std::string Str("shape=Mrecord"); + + Str += ",style=filled,fillcolor=\"#"; + // Use NodeQueueId as SubIdx for ExpDag. + Str += DOT::getColorString(N->NodeQueueId); + Str += '"'; + + return Str; + } + + static void addCustomGraphFeatures(llvm::ExpDag *G, + GraphWriter &GW) { + return G->addCustomGraphFeatures(GW); + } +}; + +template <> struct GraphTraits : public GraphTraits { + using nodes_iterator = pointer_iterator::iterator>; + static nodes_iterator nodes_begin(llvm::ExpDag *G) { + return nodes_iterator(G->SUnits.begin()); + } + static nodes_iterator nodes_end(llvm::ExpDag *G) { + return nodes_iterator(G->SUnits.end()); + } +}; + +} // namespace llvm + +namespace llvm { +void getRegBound(llvm::MachineBasicBlock *MBB, + const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI, const SIInstrInfo *SIII, + llvm::LiveIntervals *LIS, unsigned &MaxVGPR, + unsigned &MaxSGPR) { + // TODO: calc real reg bound. 
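+  // Until a real bound is computed, start from the architectural limits; when
+  // the block is not empty they are overwritten below with the pressure of the
+  // HRB-based schedule.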
+  MaxVGPR = AMDGPU::VGPR255 - AMDGPU::VGPR0;
+  MaxSGPR = AMDGPU::SGPR104 - AMDGPU::SGPR0;
+
+  const auto &EndSlot = LIS->getMBBEndIdx(MBB);
+  const GCNRPTracker::LiveRegSet OutputLive =
+      llvm::getLiveRegs(EndSlot, *LIS, MRI);
+
+  auto *ST =
+      &MBB->getParent()
+           ->getSubtarget<GCNSubtarget>(); // TODO: Better way to get this.
+  if (MBB->empty()) {
+    GCNRegPressure MaxPressure = getRegPressure(MRI, OutputLive);
+    MaxSGPR = MaxPressure.getSGPRNum();
+    MaxVGPR = MaxPressure.getVGPRNum(ST->hasGFX90AInsts());
+    return;
+  }
+
+  BlockExpDag Dag(MBB, LIS, MRI, SIRI, SIII);
+  Dag.build();
+
+  std::vector<SUnit> &SUnits = Dag.SUnits;
+  // Remove input nodes.
+  for (SUnit &SU : SUnits) {
+    if (!SU.isInstr())
+      continue;
+    std::vector<SDep> InputDeps;
+    for (SDep &Dep : SU.Preds) {
+      SUnit *Pred = Dep.getSUnit();
+      if (Pred->isInstr())
+        continue;
+      InputDeps.emplace_back(Dep);
+    }
+    for (SDep &Dep : InputDeps) {
+      SU.removePred(Dep);
+    }
+  }
+
+  const unsigned InputSize = Dag.InputSUnitMap.size();
+  const unsigned InstNodeSize = SUnits.size() - InputSize;
+  SUnits.erase(SUnits.begin() + InstNodeSize, SUnits.end());
+
+  std::vector<SUnit *> BotRoots;
+  for (SUnit &SU : SUnits) {
+    if (SU.NumSuccsLeft == 0)
+      BotRoots.emplace_back(&SU);
+  }
+
+  auto SchedResult = hrbSched(SUnits, BotRoots, MRI, SIRI);
+
+  GCNUpwardRPTracker RPTracker(*LIS);
+  RPTracker.reset(MBB->front(), &OutputLive, /*After*/ true);
+  for (auto It = SchedResult.rbegin(); It != SchedResult.rend(); It++) {
+    const SUnit *SU = *It;
+    if (!SU->isInstr())
+      continue;
+    MachineInstr *MI = SU->getInstr();
+    RPTracker.recede(*MI);
+  }
+
+  GCNRegPressure MaxPressure = RPTracker.getMaxPressureAndReset();
+  MaxSGPR = MaxPressure.getSGPRNum();
+  MaxVGPR = MaxPressure.getVGPRNum(ST->hasGFX90AInsts());
+}
+} // namespace llvm
+
+// HRB
+namespace {
+
+std::vector<SUnit *> buildWorkList(std::vector<SUnit> &SUnits) {
+  std::vector<SUnit *> ResultList;
+  ResultList.reserve(SUnits.size());
+  for (SUnit &SU : SUnits) {
+    ResultList.emplace_back(&SU);
+  }
+  return ResultList;
+}
+
+void sortByHeight(std::vector<SUnit *> &WorkList) {
+  std::sort(WorkList.begin(), WorkList.end(),
+            [](const SUnit *A, const SUnit *B) {
+              // Lowest height first.
+              if (A->getHeight() < B->getHeight())
+                return true;
+              // If height is the same, bigger NodeNum first.
+              if (A->getHeight() == B->getHeight())
+                return A->NodeNum > B->NodeNum;
+              return false;
+            });
+}
+
+void sortByInChain(std::vector<SUnit *> &WorkList, DenseSet<SUnit *> &Chained) {
+  // In-chain nodes go to the end.
+  std::sort(WorkList.begin(), WorkList.end(),
+            [&Chained](const SUnit *A, const SUnit *B) {
+              return Chained.count(A) < Chained.count(B);
+            });
+}
+
+const TargetRegisterClass *getRegClass(SUnit *SU,
+                                       const MachineRegisterInfo &MRI,
+                                       const SIRegisterInfo *SIRI) {
+  if (!SU->isInstr())
+    return nullptr;
+  MachineInstr *MI = SU->getInstr();
+  if (MI->getNumDefs() == 0)
+    return nullptr;
+
+  // For an MI with more than one def, always use the first def.
+ MachineOperand *MO = MI->defs().begin(); + if (!MO->isReg()) + return nullptr; + Register Reg = MO->getReg(); + return SIRI->getRegClassForReg(MRI, Reg); +} + +unsigned getVGPRSize(const TargetRegisterClass *RC, + const SIRegisterInfo *SIRI) { + if (!RC) + return 0; + if (SIRI->isSGPRClass(RC)) + return 0; + return RC->getLaneMask().getNumLanes(); +} +unsigned getSGPRSize(const TargetRegisterClass *RC, + const SIRegisterInfo *SIRI) { + if (!RC) + return 0; + if (!SIRI->isSGPRClass(RC)) + return 0; + return RC->getLaneMask().getNumLanes(); +} + +} // namespace + +namespace llvm { + +void HRB::Lineage::addNode(llvm::SUnit *SU) { Nodes.emplace_back(SU); } +unsigned HRB::Lineage::getSize() const { + return RC ? RC->getLaneMask().getNumLanes() : 0; +} +unsigned HRB::Lineage::length() const { return Nodes.size(); } + +SUnit *HRB::Lineage::getHead() const { return Nodes.front(); } +SUnit *HRB::Lineage::getTail() const { return Nodes.back(); } + +void HRB::buildLinear(std::vector &SUnits) { + // Working list from TopRoots. + std::vector WorkList = buildWorkList(SUnits); + IntEqClasses EqClasses(SUnits.size()); + + while (!WorkList.empty()) { + sortByHeight(WorkList); + // Highest SU. + SUnit *SU = WorkList.back(); + WorkList.pop_back(); + if (!SU->isInstr()) + continue; + if (ChainedNodes.count(SU) > 0) + continue; + IsRecomputeHeight = false; + Lineage Lineage = buildChain(SU, SUnits); + + // Remove chained nodes from worklist. + sortByInChain(WorkList, ChainedNodes); + while (!WorkList.empty()) { + SUnit *Back = WorkList.back(); + if (ChainedNodes.count(Back)) + WorkList.pop_back(); + else + break; + } + + Lineages.emplace_back(Lineage); + + if (IsRecomputeHeight) { + // Update height from tail. + SUnit *Tail = Lineage.Nodes.back(); + Tail->setDepthDirty(); + Tail->getHeight(); + } + } + + DenseSet TailSet; + for (Lineage &L : Lineages) { + if (L.Nodes.size() < 2) + continue; + auto It = L.Nodes.rbegin(); + It++; + SUnit *Tail = L.Nodes.back(); + // If already as tail for other Lineage, start from next. + if (TailSet.count(Tail) > 0) { + Tail = *It; + It++; + } else { + TailSet.insert(Tail); + } + for (; It != L.Nodes.rend(); It++) { + SUnit *SU = *It; + if (Tail->NodeNum == (unsigned)-1) + continue; + EqClasses.join(SU->NodeNum, Tail->NodeNum); + } + } + + EqClasses.compress(); + // TODO: assign sub class to node. + for (Lineage &L : Lineages) { + for (SUnit *SU : L.Nodes) { + if (SU->NodeNum == (unsigned)-1) + continue; + unsigned SubIdx = EqClasses[SU->NodeNum]; + //// Pack subidx. + // if (EqClasses.count(SubIdx) == 0) + // EqClasses[SubIdx] = EqClasses.size(); + SubIdx = EqClasses[SubIdx]; + // Use NodeQueueId as SubIdx. We don't do schedule on ExpDag. + SU->NodeQueueId = SubIdx; + } + } + + LLVM_DEBUG( + dbgs() << "Chained Nodes:"; for (SUnit *SU : ChainedNodes) { + dbgs() << " " << SU->NodeNum << "\n"; + } for (unsigned i = 0; i < Lineages.size(); i++) { + dbgs() << "Lineage" << i << ":"; + Lineage &L = Lineages[i]; + for (SUnit *SU : L.Nodes) { + dbgs() << " " << SU->NodeNum; + } + dbgs() << "\n"; + }); +} + +SUnit *HRB::findHeir(SUnit *SU, std::vector &SUnits) { + std::vector Candidates; + for (SDep &Dep : SU->Succs) { + // Only check data dep. + if (Dep.getKind() != SDep::Data) + continue; + + SUnit *Succ = Dep.getSUnit(); + Candidates.emplace_back(Succ); + } + + if (Candidates.empty()) + return nullptr; + + if (Candidates.size() == 1) + return Candidates.front(); + + sortByHeight(Candidates); + // Lowest height. 
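+  // The lowest-height data successor becomes the heir; the loop below forces
+  // every other data successor to be scheduled before it by adding an
+  // artificial edge, so the chain can keep growing from the heir.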
+ SUnit *Heir = Candidates.front(); + SmallVector SameHeightCandidate; + for (SUnit *SU : Candidates) { + if (Heir->getHeight() != SU->getHeight()) + break; + SameHeightCandidate.emplace_back(SU); + } + // Make sure choose lowest dependence between SameHeightCandidate. + if (SameHeightCandidate.size() > 1) { + for (size_t i = 1; i < SameHeightCandidate.size(); i++) { + SUnit *SU = SameHeightCandidate[i]; + // If Heir is pred of SU, use SU. + if (canReach(SU, Heir)) + Heir = SU; + } + } + + unsigned HeriHeight = Heir->getHeight(); + + // if lowest node is in ChainedNodes, try to find same height nodes? + + for (SDep &Dep : SU->Succs) { + // Only check data dep. + if (Dep.getKind() != SDep::Data) + continue; + SUnit *Succ = Dep.getSUnit(); + if (Succ == Heir) + continue; + // Avoid cycle in DAG. + if (canReach(Heir, Succ)) + return nullptr; + // Make sure Succ is before Heir. + Heir->addPred(SDep(Succ, SDep::Artificial)); + updateReachForEdge(Succ, Heir, SUnits); + LLVM_DEBUG(dbgs() << "add edge from " << Succ->NodeNum << "(" + << Succ->getHeight() << ") to " << Heir->NodeNum << "(" + << HeriHeight << ")\n"); + // Update height if need. + unsigned Height = Succ->getHeight(); + if (Height <= HeriHeight) { + IsRecomputeHeight = true; + } + } + return Heir; +} + +HRB::Lineage HRB::buildChain(SUnit *Node, std::vector &SUnits) { + HRB::Lineage Chain; + Chain.addNode(Node); + ChainedNodes.insert(Node); + LLVM_DEBUG(dbgs() << "start chain " << Node->NodeNum << "(" + << Node->getHeight() << ")\n"); + while (Node->NumSuccsLeft > 0) { + SUnit *Heir = findHeir(Node, SUnits); + if (!Heir) + break; + Chain.addNode(Heir); + + LLVM_DEBUG(dbgs() << "add node to chain " << Heir->NodeNum << "\n"); + if (ChainedNodes.count(Heir) > 0) + break; + ChainedNodes.insert(Heir); + + Node = Heir; + } + // Find biggest vgpr RC for the chain. + // TODO: Build conflict and allocate on each edge of the chain. + const TargetRegisterClass *RC = nullptr; + unsigned MaxRCSize = 0; + for (SUnit *SU : Chain.Nodes) { + const TargetRegisterClass *SuRC = getRegClass(SU, MRI, SIRI); + unsigned RCSize = getVGPRSize(SuRC, SIRI); + if (RCSize > MaxRCSize) { + MaxRCSize = RCSize; + RC = SuRC; + } + } + if (!RC) { + // TODO: Find biggest sgpr RC. + unsigned MaxRCSize = 0; + for (SUnit *SU : Chain.Nodes) { + const TargetRegisterClass *SuRC = getRegClass(SU, MRI, SIRI); + unsigned RCSize = getSGPRSize(SuRC, SIRI); + if (RCSize > MaxRCSize) { + MaxRCSize = RCSize; + RC = SuRC; + } + } + } + Chain.RC = RC; + return Chain; +} + +void HRB::buildConflict() { + + for (unsigned i = 0; i < Lineages.size(); i++) { + Lineage &A = Lineages[i]; + for (unsigned j = i + 1; j < Lineages.size(); j++) { + Lineage &B = Lineages[j]; + if (isConflict(A, B)) { + Color.Conflicts[i].insert(j); + Color.Conflicts[j].insert(i); + LLVM_DEBUG(dbgs() << i << " conflict" << j << "\n"); + } + } + // SelfConflict. + Color.Conflicts[i].insert(i); + } +} + +bool HRB::canReach(llvm::SUnit *A, llvm::SUnit *B) { + auto It = ReachMap.find(A); + // If no reach info, treat as reach. 
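+  // Nodes without reach info are treated as reachable; this is the
+  // conservative answer for the cycle check in findHeir, which refuses to add
+  // an artificial edge when the destination may already reach the source.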
+  if (It == ReachMap.end())
+    return true;
+  DenseSet<SUnit *> &CurReach = It->second;
+  return CurReach.find(B) != CurReach.end();
+}
+
+void HRB::updateReachForEdge(llvm::SUnit *A, llvm::SUnit *B,
+                             std::vector<SUnit> &SUnits) {
+  DenseSet<SUnit *> &ReachA = ReachMap[A];
+  ReachA.insert(B);
+  DenseSet<SUnit *> &ReachB = ReachMap[B];
+  ReachA.insert(ReachB.begin(), ReachB.end());
+
+  for (SUnit &SU : SUnits) {
+    if (!canReach(&SU, A))
+      continue;
+
+    DenseSet<SUnit *> &CurReach = ReachMap[&SU];
+    CurReach.insert(ReachA.begin(), ReachA.end());
+  }
+}
+
+void HRB::buildReachRelation(ArrayRef<SUnit *> BotRoots) {
+  // Add a fake entry to do the PostOrder traversal.
+  // SUnit traversal follows Pred edges, so a reverse post order is needed.
+  SUnit FakeEntry;
+  SmallVector<SDep, 4> FakeDeps;
+  for (SUnit *Root : BotRoots) {
+    SDep Dep = SDep(Root, SDep::Artificial);
+    FakeEntry.addPred(Dep);
+    FakeDeps.emplace_back(Dep);
+  }
+
+  ReversePostOrderTraversal<SUnit *> RPOT(&FakeEntry);
+  for (SUnit *SU : RPOT) {
+    // Create Reach Set first.
+    ReachMap[SU].clear();
+  }
+  for (SUnit *SU : RPOT) {
+    DenseSet<SUnit *> &CurReach = ReachMap[SU];
+    // All Preds can reach SU and SU's reach.
+    for (SDep &Dep : SU->Preds) {
+      // Ignore weak deps.
+      if (Dep.isWeak())
+        continue;
+      DenseSet<SUnit *> &PrevReach = ReachMap[Dep.getSUnit()];
+      PrevReach.insert(SU);
+      PrevReach.insert(CurReach.begin(), CurReach.end());
+    }
+    assert(CurReach.count(SU) == 0 && "dead loop");
+  }
+  // Remove fake entry.
+  for (SDep &Dep : FakeDeps) {
+    FakeEntry.removePred(Dep);
+  }
+  ReachMap.erase(&FakeEntry);
+
+  LLVM_DEBUG(for (Lineage &L : Lineages) {
+    for (SUnit *SU : L.Nodes) {
+      DenseSet<SUnit *> &CurReach = ReachMap[SU];
+      dbgs() << SU->NodeNum << " reach: ";
+      for (SUnit *R : CurReach) {
+        dbgs() << R->NodeNum << " ";
+      }
+      dbgs() << "\n";
+    }
+  });
+}
+
+bool HRB::isConflict(const Lineage &A, const Lineage &B) {
+  // Keep conflicts between sgpr and vgpr lineages to help group lineages when
+  // colors are shared. Keeping the conflict groups lineages so a color is not
+  // mixed between different sub exps.
+  SUnit *Head0 = A.getHead();
+  SUnit *Tail0 = A.getTail();
+  SUnit *Head1 = B.getHead();
+  SUnit *Tail1 = B.getTail();
+  DenseSet<SUnit *> &Reach0 = ReachMap[Head0];
+  DenseSet<SUnit *> &Reach1 = ReachMap[Head1];
+  bool R01 = Reach0.count(Tail1) != 0;
+  bool R10 = Reach1.count(Tail0) != 0;
+  return R01 && R10;
+}
+bool HRB::canFuse(const Lineage &A, const Lineage &B) {
+  if (A.RC != B.RC) {
+    // A lineage without an RC will not conflict with other nodes.
+    if (!A.RC)
+      return false;
+    if (!B.RC)
+      return false;
+    // SGPR and VGPR do not conflict.
+    if (SIRI->isSGPRClass(A.RC) != SIRI->isSGPRClass(B.RC))
+      return false;
+  }
+  // Can fuse if a.head reaches b.tail but b.head does not reach a.tail, and
+  // vice versa.
+  SUnit *Head0 = A.getHead();
+  SUnit *Tail0 = A.getTail();
+  SUnit *Head1 = B.getHead();
+  SUnit *Tail1 = B.getTail();
+  DenseSet<SUnit *> &Reach0 = ReachMap[Head0];
+  DenseSet<SUnit *> &Reach1 = ReachMap[Head1];
+  bool R01 = Reach0.count(Tail1) != 0;
+  bool R10 = Reach1.count(Tail0) != 0;
+  return R01 != R10;
+}
+
+bool HRB::tryFuse(Lineage &A, Lineage &B, std::vector<SUnit> &SUnits) {
+
+  // Can fuse if a.head reaches b.tail but b.head does not reach a.tail, and
+  // vice versa.
+  SUnit *Head0 = A.getHead();
+  SUnit *Tail0 = A.getTail();
+  SUnit *Head1 = B.getHead();
+  SUnit *Tail1 = B.getTail();
+  DenseSet<SUnit *> &Reach0 = ReachMap[Head0];
+  DenseSet<SUnit *> &Reach1 = ReachMap[Head1];
+  bool R01 = Reach0.count(Tail1) != 0;
+  bool R10 = Reach1.count(Tail0) != 0;
+  if (R01 == R10)
+    return false;
+  Lineage *NewHead = &A;
+  Lineage *NewTail = &B;
+  if (R01) {
+    // a reaches b, b cannot reach a.
+    // link a.tail->b.head.
+ NewHead = &A; + NewTail = &B; + } else { + // b reach a, a cannot reach b. + // link b.tail->a.head. + NewHead = &B; + NewTail = &A; + } + + // Merge reg class. + const TargetRegisterClass *RC0 = NewHead->RC; + const TargetRegisterClass *RC1 = NewTail->RC; + unsigned RC0Size = getVGPRSize(RC0, SIRI); + unsigned RC1Size = getVGPRSize(RC1, SIRI); + if (RC1Size > RC0Size) + NewHead->RC = RC1; + // Merge chain. + SUnit *FuseTail = NewHead->getTail(); + SUnit *FuseHead = NewTail->getHead(); + assert(ReachMap[FuseHead].count(FuseTail) == 0 && ""); + FuseHead->addPred(SDep(FuseTail, SDep::Artificial)); + LLVM_DEBUG(dbgs() << "fuse " << FuseTail->NodeNum << "->" << FuseHead->NodeNum + << "\n"); + // Update reach map. + updateReachForEdge(FuseTail, FuseHead, SUnits); + // Merge Nodes. + NewHead->Nodes.append(NewTail->Nodes.begin(), NewTail->Nodes.end()); + // Clear newTail. + NewTail->Nodes.clear(); + NewTail->RC = nullptr; + return true; +} + +void HRB::fusionLineages(std::vector &SUnits) { + if (Lineages.empty()) + return; + bool IsUpdated = true; + while (IsUpdated) { + IsUpdated = false; + int Size = Lineages.size(); + for (int i = 0; i < Size; i++) { + Lineage &A = Lineages[i]; + if (A.length() == 0) + continue; + + for (int j = i + 1; j < Size; j++) { + Lineage &B = Lineages[j]; + if (B.length() == 0) + continue; + if (tryFuse(A, B, SUnits)) { + IsUpdated = true; + if (A.length() == 0) + break; + } + } + } + // Remove empty lineages. + std::sort(Lineages.begin(), Lineages.end(), + [](const Lineage &A, const Lineage &B) { + return A.length() > B.length(); + }); + while (Lineages.back().length() == 0) { + Lineages.pop_back(); + } + } + // Set ID after fusion. + unsigned ID = 0; + for (Lineage &L : Lineages) { + L.ID = ID++; + } +} + +unsigned HRB::colorLineages(std::vector &InLineages, + DenseMap &AllocMap, + const unsigned Limit) { + // allocate long Lineage first. How about size of RC? + std::sort(InLineages.begin(), InLineages.end(), + [](const Lineage *a, const Lineage *b) { + // Make sure root allocate first. 
+ return a->length() > b->length(); + }); + + unsigned MaxColor = 0; + const unsigned VGPR_LIMIT = 256 * 4; + + for (Lineage *L : InLineages) { + unsigned ID = L->ID; + auto &Conflict = Color.Conflicts[ID]; + std::bitset Colors; + for (unsigned j : Conflict) { + Lineage *LineageC = &Lineages[j]; + if (AllocMap.count(LineageC) == 0) + continue; + unsigned C = AllocMap[LineageC]; + unsigned S = LineageC->getSize(); + for (unsigned i = 0; i < S; i++) { + unsigned Pos = C + i; + Colors.set(Pos); + } + } + + unsigned Color = Limit; + unsigned Size = L->getSize(); + for (unsigned i = 0; i < Limit - Size;) { + unsigned OldI = i; + for (unsigned j = 0; j < Size; j++) { + unsigned Pos = i + Size - 1 - j; + if (Colors.test(Pos)) { + i = Pos + 1; + break; + } + } + + if (i != OldI) + continue; + Color = i; + break; + } + + AllocMap[L] = Color; + Color += Size; + if (Color > MaxColor) + MaxColor = Color; + } + return MaxColor; +} + +void HRB::ColorResult::colorSU(SUnit *SU, unsigned Color) { + ColorMap[SU] = Color; +} + +unsigned HRB::ColorResult::getLineage(SUnit *SU) const { + return LineageMap.find(SU)->second; +} + +bool HRB::ColorResult::isConflict(const SUnit *SU0, unsigned Lineage) const { + const unsigned L = LineageMap.find(SU0)->second; + const auto &Conflict = Conflicts.find(L)->second; + return Conflict.count(Lineage) > 0; +} + +bool HRB::ColorResult::isHead(SUnit *SU) const { return HeadSet.count(SU); } +bool HRB::ColorResult::isTail(SUnit *SU) const { return TailSet.count(SU); } + +const SUnit *HRB::ColorResult::getTail(SUnit *SU) const { + if (!isHead(SU)) + return nullptr; + auto It = HeadTailMap.find(SU); + return It->second; +} + +unsigned HRB::ColorResult::getColor(const llvm::SUnit *SU) const { + auto It = ColorMap.find(SU); + return It->second; +} + +unsigned HRB::ColorResult::getSize(const llvm::SUnit *SU) const { + auto It = SizeMap.find(SU); + return It->second; +} + +HRB::ColorResult &HRB::coloring() { + // Collect VGPR lineages. + std::vector VgprLineages; + for (Lineage &L : Lineages) { + const auto *RC = L.RC; + if (!RC) + continue; + if (SIRI->isSGPRClass(RC)) + continue; + VgprLineages.emplace_back(&L); + } + + const unsigned VGPR_LIMIT = 256 * 4; + DenseMap VAllocMap; + const unsigned MaxVGPR = colorLineages(VgprLineages, VAllocMap, VGPR_LIMIT); + + // Collect SGPR lineages. + std::vector SgprLineages; + for (Lineage &L : Lineages) { + const auto *RC = L.RC; + if (!RC) + continue; + if (!SIRI->isSGPRClass(RC)) + continue; + SgprLineages.emplace_back(&L); + } + + const unsigned SGPR_LIMIT = 104; + DenseMap SAllocMap; + const unsigned MaxSGPR = colorLineages(SgprLineages, SAllocMap, SGPR_LIMIT); + // +1 for each type of lineages(SGPR, VGPR, no reg). + const unsigned MaxReg = MaxSGPR + 1 + MaxVGPR + 1 + 1; + const unsigned SgprBase = MaxVGPR + 1; + + for (Lineage &L : Lineages) { + // Collect HeadSet. + Color.HeadSet.insert(L.getHead()); + Color.TailSet.insert(L.getTail()); + Color.HeadTailMap[L.getHead()] = L.getTail(); + // Save color. + const auto *RC = L.RC; + // All no reg lineage goes to maxReg. 
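+    // Color layout: VGPR lineages occupy [0, MaxVGPR], SGPR lineages are
+    // rebased to [SgprBase, SgprBase + MaxSGPR], and lineages without a
+    // register class all share the single color MaxReg.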
+    unsigned RegColor = MaxReg;
+    if (!RC) {
+    } else if (SIRI->isSGPRClass(RC)) {
+      RegColor = SAllocMap[&L] + SgprBase;
+    } else {
+      RegColor = VAllocMap[&L];
+    }
+    unsigned Size = L.getSize();
+    for (SUnit *SU : L.Nodes) {
+      Color.colorSU(SU, RegColor);
+      Color.SizeMap[SU] = Size;
+      Color.LineageMap[SU] = L.ID;
+    }
+  }
+  Color.MaxReg = MaxReg;
+  Color.MaxSGPR = MaxSGPR;
+  Color.MaxVGPR = MaxVGPR;
+
+  for (unsigned i = 0; i < Lineages.size(); i++) {
+    Lineage &A = Lineages[i];
+    SUnit *HeadA = A.getHead();
+    unsigned ColorA = Color.getColor(HeadA);
+    unsigned SizeA = Color.getSize(HeadA);
+    for (unsigned j = i + 1; j < Lineages.size(); j++) {
+      Lineage &B = Lineages[j];
+
+      SUnit *HeadB = B.getHead();
+      unsigned ColorB = Color.getColor(HeadB);
+      unsigned SizeB = Color.getSize(HeadB);
+
+      if (ColorB >= (ColorA + SizeA))
+        continue;
+      if (ColorA >= (ColorB + SizeB))
+        continue;
+      Color.ShareColorLineages.insert(i);
+      Color.ShareColorLineages.insert(j);
+    }
+  }
+
+  return Color;
+}
+
+void HRB::dump() {
+  for (unsigned i = 0; i < Lineages.size(); i++) {
+    dbgs() << "Lineage" << i << ":";
+    Lineage &L = Lineages[i];
+    for (SUnit *SU : L.Nodes) {
+      dbgs() << " " << SU->NodeNum;
+    }
+    dbgs() << "\n";
+    if (!Color.ColorMap.empty()) {
+      dbgs() << "color:" << Color.getColor(L.getHead())
+             << " size: " << Color.getSize(L.getHead()) << "\n";
+    }
+    if (!ReachMap.empty()) {
+      dbgs() << "conflict:";
+      for (unsigned j = 0; j < Lineages.size(); j++) {
+        if (i == j)
+          continue;
+        if (isConflict(L, Lineages[j])) {
+          dbgs() << " " << j;
+        }
+      }
+      dbgs() << "\n";
+    }
+  }
+}
+
+void HRB::dumpReachMap() {
+  if (!ReachMap.empty()) {
+    dbgs() << "reachMap:";
+    for (auto It : ReachMap) {
+      SUnit *SU = It.first;
+      auto &Reach = It.second;
+      if (SU->isInstr()) {
+        MachineInstr *MI = SU->getInstr();
+        MI->print(dbgs());
+      }
+      dbgs() << SU->NodeNum << " can reach:\n";
+      for (SUnit *R : Reach) {
+        dbgs() << R->NodeNum << " ";
+      }
+      dbgs() << "\n";
+    }
+    dbgs() << "\n";
+  }
+}
+
+// Schedule based on HRB lineages and the color result.
+
+std::vector<const SUnit *> hrbSched(std::vector<SUnit> &SUnits,
+                                    std::vector<SUnit *> &BRoots,
+                                    const llvm::MachineRegisterInfo &MRI,
+                                    const llvm::SIRegisterInfo *SIRI) {
+  HRB Hrb(MRI, SIRI);
+  // Build reach info to avoid dead loops when building linear chains.
+  Hrb.buildReachRelation(BRoots);
+  Hrb.buildLinear(SUnits);
+
+  std::sort(BRoots.begin(), BRoots.end(), [](const SUnit *A, const SUnit *B) {
+    return A->NumSuccsLeft < B->NumSuccsLeft;
+  });
+  while (!BRoots.empty() && BRoots.back()->NumSuccsLeft > 0) {
+    BRoots.pop_back();
+  }
+
+  Hrb.buildReachRelation(BRoots);
+  Hrb.fusionLineages(SUnits);
+  Hrb.buildConflict();
+  const HRB::ColorResult &ColorRes = Hrb.coloring();
+
+  LLVM_DEBUG(Hrb.dump());
+
+  // All lineage heads which have no Pred are TopRoots.
+  // Put top roots in the worklist.
+  // While the worklist is not empty:
+  //   if not a head, or its color is available,
+  //     it is a candidate.
+  //   choose the best candidate by height.
+  //   update the worklist.
+  std::vector<SUnit *> ReadyList;
+  for (SUnit &SU : SUnits) {
+    if (SU.NumPredsLeft == 0)
+      ReadyList.emplace_back(&SU); //.insert(&SU);
+  }
+  // When there is more than one sub exp in the DAG, make sure not to mix
+  // different sub exps, or scheduling will dead loop because a color goes to
+  // different sub exps.
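+  // The bitset below tracks which color slots are currently held by lineages
+  // that have started (head scheduled) but not yet finished (tail scheduled);
+  // a new head may only be picked when its whole color range is free.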
+ + std::bitset<512 * 2> Colors; + auto IsColorAvail = [&Colors](unsigned Color, unsigned Size) -> bool { + for (unsigned i = 0; i < Size; i++) { + unsigned Pos = Color + i; + if (Colors.test(Pos)) + return false; + } + return true; + }; + auto AllocColor = [&Colors](unsigned Color, unsigned Size) { + for (unsigned i = 0; i < Size; i++) { + unsigned Pos = Color + i; + assert(!Colors.test(Pos) && "color already allocated"); + LLVM_DEBUG(dbgs() << Pos << "is allocated\n"); + Colors.set(Pos); + } + }; + + auto FreeColor = [&Colors](unsigned Color, unsigned Size) { + for (unsigned i = 0; i < Size; i++) { + unsigned Pos = Color + i; + assert(Colors.test(Pos) && "color has not been allocated"); + LLVM_DEBUG(dbgs() << Pos << "is free\n"); + Colors.reset(Pos); + } + }; + + // Save color and size for tail to support case two lineage share tail. + // When finish a tail, free color for working lineage which end with tail. + DenseMap, 2>> + TailMap; + + // For lineages share same color, need to choose correct order. + // If l0 has color 0, l1 has color 1, l2 has color 0, l3 has color 1. + // l0 and l3 conflict, l1 and l2 conflict. + // l0 and l3 must sched together. + // If sched l0 and l1, it may dead lock for l0 wait something in l3 and l1 + // wait something in l2. + // ShareColorLineages will mark lineages which share color with other + // lineages. When sched, choose new lineages which has more conflict with + // ShareColorLineages. + const DenseSet &ShareColorLineages = ColorRes.ShareColorLineages; + + std::vector Schedule; + DenseSet UnfinishedLineages; + while (!ReadyList.empty()) { + // Make sure node conflict with predLineage first. + std::sort(ReadyList.begin(), ReadyList.end(), + [&UnfinishedLineages, &ColorRes](const SUnit *A, const SUnit *B) { + unsigned ConfA = 0; + for (unsigned L : UnfinishedLineages) { + if (ColorRes.isConflict(A, L)) + ConfA++; + } + unsigned ConfB = 0; + for (unsigned L : UnfinishedLineages) { + if (ColorRes.isConflict(B, L)) + ConfB++; + } + return ConfA > ConfB; + }); + + LLVM_DEBUG(dbgs() << "ReadyList:\n"; for (SUnit *SU : ReadyList) { + dbgs() << " " << SU->NodeNum; + } dbgs() << "\n";); + SUnit *Candidate = nullptr; + for (auto It = ReadyList.begin(); It != ReadyList.end(); It++) { + SUnit *SU = *It; + unsigned Color = ColorRes.getColor(SU); + unsigned Size = ColorRes.getSize(SU); + // If SU is not head or color is available, SU is the candidate. + if (ColorRes.isHead(SU)) { + if (!IsColorAvail(Color, Size)) + continue; + // alloc color. + AllocColor(Color, Size); + // save tail color. + const SUnit *Tail = ColorRes.getTail(SU); + unsigned ID = ColorRes.getLineage(SU); + SmallVector, 2> &TailColors = + TailMap[Tail]; + TailColors.emplace_back(std::make_tuple(Color, Size, ID)); + if (ShareColorLineages.count(ID)) + UnfinishedLineages.insert(ID); + } + + // free color for working lineage which end with SU. + if (ColorRes.isTail(SU)) { + auto &TailColors = TailMap[SU]; + for (auto &TailTuple : TailColors) { + unsigned LineageColor, LineageSize, ID; + std::tie(LineageColor, LineageSize, ID) = TailTuple; + FreeColor(LineageColor, LineageSize); + if (ShareColorLineages.count(ID)) + UnfinishedLineages.insert(ID); + } + // Clear the tail. + TailMap.erase(SU); + } + + Candidate = SU; + // Remove Candidate from ReadyList. + ReadyList.erase(It); + break; + } + + if (!Candidate) { + // In case failed to find candidate, start a lineage if there is one. 
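+      // Every ready node was a head whose color range is still taken, so
+      // force-start a new lineage here without the color check to guarantee
+      // forward progress.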
+ for (auto It = ReadyList.begin(); It != ReadyList.end(); It++) { + SUnit *SU = *It; + + if (!ColorRes.isHead(SU)) { + continue; + } + Candidate = SU; + // Remove Candidate from ReadyList. + ReadyList.erase(It); + break; + } + } + assert(Candidate && "fail to find a Candidate"); + LLVM_DEBUG(dbgs() << "Sched " << Candidate->NodeNum << "\n"); + + // Add all Candidate succ which is Ready. + for (SDep &Dep : Candidate->Succs) { + if (Dep.isWeak()) + continue; + SUnit *Succ = Dep.getSUnit(); + + if (Succ->NumPredsLeft > 0) + Succ->NumPredsLeft--; + LLVM_DEBUG(dbgs() << "Succ " << Succ->NodeNum << " has " + << Succ->NumPredsLeft << " preds\n"); + if (Succ->NumPredsLeft == 0) + ReadyList.emplace_back(Succ); + } + + // Sched Candidate. + assert(Candidate->isInstr() && "Candidate must be instr Node"); + Schedule.emplace_back(Candidate); + } + assert(Schedule.size() == SUnits.size() && "SUnit size should match"); + return Schedule; +} + +} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h new file mode 100644 index 0000000000000..c19190c6afe24 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h @@ -0,0 +1,195 @@ +#pragma once + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/MC/LaneBitmask.h" + +#include "llvm/CodeGen/ScheduleDAG.h" // For SUnit. + +namespace llvm { +class MachineFunction; +class LiveIntervals; +class MachineRegisterInfo; +class SIRegisterInfo; +class SIInstrInfo; +class MachineInstr; +class MachineBasicBlock; +template class GraphWriter; +class SUnit; +class IntEqClasses; +class Twine; + +using LiveSet = llvm::DenseMap; + +// SubExp and BlockExpDag. +struct SubExp { + // Keep original order for sunits. + std::vector SUnits; + llvm::DenseSet TopRegs; + llvm::DenseSet BottomRoots; + llvm::DenseSet BottomRegs; + bool IsMultiDefOutput = false; + bool IsHasTerminatorInst = false; + bool IsUseIncomingReg = false; + bool IsMoveIntoLoop = false; + bool IsNotSafeToCopy = false; + bool IsHasMemInst = false; + bool IsHoist = false; + // If temp/out reg is used by inst not in the subExp, cannot move since not + // all users will be move. But OK to clone. + bool IsCloneOnly = false; + bool IsTouchSCC = false; + llvm::MachineBasicBlock *FromBB; + llvm::MachineBasicBlock *ToBB; + unsigned SInputSize; + unsigned VInputSize; + unsigned SOutputSize; + unsigned VOutputSize; + unsigned SMaxSize; + unsigned VMaxSize; + LiveSet InputLive; + LiveSet OutputLive; + bool isSafeToMove(const llvm::MachineRegisterInfo &MRI) const; + void calcMaxPressure(const llvm::MachineRegisterInfo &MRI); + void dump(const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI) const; + bool modifiesRegister(unsigned Reg, const llvm::SIRegisterInfo *SIRI) const; +}; + +struct ExpDag { + ExpDag(const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI, + const llvm::SIInstrInfo *SIII, const bool IsJoinInput); + const llvm::MachineRegisterInfo &MRI; + const llvm::SIRegisterInfo *SIRI; + const llvm::SIInstrInfo *SIII; + const bool IsJoinInputToSubExp; + + std::vector SUnits; ///< The scheduling units. 
+  llvm::DenseMap<llvm::MachineInstr *, llvm::SUnit *> MISUnitMap;
+  llvm::DenseMap<llvm::SUnit *, llvm::MachineInstr *> SUnitMIMap;
+  llvm::DenseMap<unsigned, llvm::SUnit *> InputSUnitMap;
+  llvm::DenseMap<const llvm::SUnit *, unsigned> SUnitInputMap;
+  std::vector<SubExp> SubExps;
+  template <typename T>
+  void build(const LiveSet &InputLiveReg, const LiveSet &OutputLiveReg,
+             T &Insts);
+  void dump();
+  void viewGraph(const llvm::Twine &Name, const llvm::Twine &Title) const;
+  /// Returns a label for an SUnit node in a visualization of the ScheduleDAG.
+  std::string getGraphNodeLabel(const llvm::SUnit *SU) const;
+  std::string getDAGName() const;
+  /// Adds custom features for a visualization of the ScheduleDAG.
+  void addCustomGraphFeatures(llvm::GraphWriter<ExpDag *> &) const {}
+
+private:
+  template <typename T> void initNodes(const LiveSet &InputLiveReg, T &Insts);
+  void addDataDep();
+  void addCtrlDep();
+  void buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg,
+                   const llvm::SIRegisterInfo *SIRI,
+                   const llvm::SIInstrInfo *SIII);
+};
+
+struct BlockExpDag : public ExpDag {
+  BlockExpDag(llvm::MachineBasicBlock *B, llvm::LiveIntervals *LIS,
+              const llvm::MachineRegisterInfo &MRI,
+              const llvm::SIRegisterInfo *SIRI, const llvm::SIInstrInfo *SIII);
+  llvm::LiveIntervals *LIS;
+  llvm::MachineBasicBlock *MBB;
+  llvm::DenseMap<llvm::SUnit *, LiveSet> DagPressureMap;
+  std::vector<std::vector<llvm::SUnit *>> SUnitsInSameDepth;
+  std::vector<SubExp> SubExps;
+  void build();
+  void buildWithPressure();
+
+private:
+  void buildAvail(const LiveSet &PassThruSet,
+                  llvm::DenseMap<llvm::SUnit *, LiveSet> &DagAvailRegMap);
+  void buildPressure(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg);
+};
+
+void getRegBound(llvm::MachineBasicBlock *MBB,
+                 const llvm::MachineRegisterInfo &MRI,
+                 const llvm::SIRegisterInfo *SIRI,
+                 const llvm::SIInstrInfo *SIII, llvm::LiveIntervals *LIS,
+                 unsigned &MaxVGPR, unsigned &MaxSGPR);
+
+// Currently SGPR and VGPR are mixed when building lineages to avoid cycles.
+// This may waste registers.
+// Based on "Minimum Register Instruction Sequencing to Reduce Register Spills
+// in Out-of-Order Issue Superscalar Architectures".
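+// A Lineage is a def-use chain of DAG nodes intended to reuse one register
+// range; coloring assigns each lineage a slot in a virtual register file so
+// the register bound of the resulting schedule can be estimated.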
+class HRB { +public: + struct Lineage { + unsigned ID = 0; + const llvm::TargetRegisterClass *RC = nullptr; + llvm::SmallVector Nodes; + llvm::SUnit *getHead() const; + llvm::SUnit *getTail() const; + void addNode(llvm::SUnit *); + unsigned getSize() const; + unsigned length() const; + }; + struct ColorResult { + llvm::DenseMap ColorMap; + llvm::DenseMap SizeMap; + llvm::DenseMap LineageMap; + llvm::DenseMap> Conflicts; + llvm::DenseSet ShareColorLineages; + llvm::DenseSet HeadSet; + llvm::DenseSet TailSet; + llvm::DenseMap HeadTailMap; + unsigned MaxReg = 0; + unsigned MaxVGPR = 0; + unsigned MaxSGPR = 0; + void colorSU(llvm::SUnit *SU, unsigned Color); + unsigned getLineage(llvm::SUnit *SU) const; + bool isConflict(const llvm::SUnit *SU0, unsigned Lineage) const; + bool isHead(llvm::SUnit *SU) const; + bool isTail(llvm::SUnit *SU) const; + const llvm::SUnit *getTail(llvm::SUnit *SU) const; + unsigned getColor(const llvm::SUnit *SU) const; + unsigned getSize(const llvm::SUnit *SU) const; + }; + HRB(const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI) + : MRI(MRI), SIRI(SIRI) {} + + void buildLinear(std::vector &SUnits); + void buildConflict(); + void buildReachRelation(llvm::ArrayRef BotRoots); + llvm::DenseMap> &getReachMap() { + return ReachMap; + } + bool canReach(llvm::SUnit *a, llvm::SUnit *B); + void updateReachForEdge(llvm::SUnit *A, llvm::SUnit *B, + std::vector &SUnits); + void fusionLineages(std::vector &SUnits); + ColorResult &coloring(); + void dump(); + void dumpReachMap(); + +private: + Lineage buildChain(llvm::SUnit *Node, std::vector &SUnits); + llvm::SUnit *findHeir(llvm::SUnit *SU, std::vector &SUnits); + bool isConflict(const Lineage &A, const Lineage &B); + bool canFuse(const Lineage &A, const Lineage &B); + bool tryFuse(Lineage &A, Lineage &B, std::vector &SUnits); + unsigned colorLineages(std::vector &Lineages, + llvm::DenseMap &AllocMap, + const unsigned Limit); + + llvm::DenseSet ChainedNodes; + llvm::DenseMap> ReachMap; + bool IsRecomputeHeight = false; + std::vector Lineages; + ColorResult Color; + const llvm::MachineRegisterInfo &MRI; + const llvm::SIRegisterInfo *SIRI; +}; + +std::vector hrbSched(std::vector &SUnits, + std::vector &BRoots, + const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI); + +} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index d0454cce15756..d680e01e3f8fb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -394,6 +394,12 @@ static cl::opt cl::desc("Enable s_delay_alu insertion"), cl::init(true), cl::Hidden); +// Enable Hot block rematerialize +static cl::opt + EnableHotBlockRemat("amdgpu-enable-hot-block-remat", + cl::desc("Enable HotBlock Rematerialize optimization"), + cl::init(false), cl::Hidden); + // Enable GFX11+ VOPD static cl::opt EnableVOPD("amdgpu-enable-vopd", @@ -517,6 +523,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUAtomicOptimizerPass(*PR); initializeAMDGPULowerKernelArgumentsPass(*PR); initializeAMDGPUPromoteKernelArgumentsPass(*PR); + initializeAMDGPUHotBlockRematerializePass(*PR); initializeAMDGPULowerKernelAttributesPass(*PR); initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(*PR); initializeAMDGPUPostLegalizerCombinerPass(*PR); @@ -1522,6 +1529,10 @@ void GCNPassConfig::addOptimizedRegAlloc() { if (TM->getOptLevel() > CodeGenOptLevel::Less) insertPass(&MachineSchedulerID, 
&SIFormMemoryClausesID); + // Must be run before phi elimination + if (isPassEnabled(EnableHotBlockRemat)) + addPass(&AMDGPUHotBlockRematerializeID); + TargetPassConfig::addOptimizedRegAlloc(); } diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 09a3096602fc3..eac9b57dc9973 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -59,6 +59,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUFrameLowering.cpp AMDGPUGlobalISelDivergenceLowering.cpp AMDGPUGlobalISelUtils.cpp + AMDGPUHotBlockRematerialize.cpp AMDGPUHSAMetadataStreamer.cpp AMDGPUInsertDelayAlu.cpp AMDGPUInstCombineIntrinsic.cpp @@ -81,10 +82,12 @@ add_llvm_target(AMDGPUCodeGen AMDGPUMacroFusion.cpp AMDGPUMCInstLower.cpp AMDGPUMemoryUtils.cpp + AMDGPUMIRUtils.cpp AMDGPUIGroupLP.cpp AMDGPUMCResourceInfo.cpp AMDGPUMarkLastScratchLoad.cpp AMDGPUMIRFormatter.cpp + AMDGPUOccupancyAndLatencyHelper.cpp AMDGPUPerfHintAnalysis.cpp AMDGPUPostLegalizerCombiner.cpp AMDGPUPreLegalizerCombiner.cpp @@ -106,6 +109,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUSelectionDAGInfo.cpp AMDGPUSetWavePriority.cpp AMDGPUSplitModule.cpp + AMDGPUSubExpDag.cpp AMDGPUSubtarget.cpp AMDGPUTargetMachine.cpp AMDGPUTargetObjectFile.cpp diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index a438ad00bc41d..390c2f05ffe69 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -549,22 +549,22 @@ bool GCNDownwardRPTracker::advanceBeforeNext(MachineInstr *MI, if (!S.liveAt(SI)) { if (It == LiveRegs.end()) { It = LiveRegs.find(MO.getReg()); - if (It == LiveRegs.end()) - llvm_unreachable("register isn't live"); } - auto PrevMask = It->second; - It->second &= ~S.LaneMask; - CurPressure.inc(MO.getReg(), PrevMask, It->second, *MRI); + if (It != LiveRegs.end()) { + auto PrevMask = It->second; + It->second &= ~S.LaneMask; + CurPressure.inc(MO.getReg(), PrevMask, It->second, *MRI); + } } } if (It != LiveRegs.end() && It->second.none()) LiveRegs.erase(It); } else if (!LI.liveAt(SI)) { auto It = LiveRegs.find(MO.getReg()); - if (It == LiveRegs.end()) - llvm_unreachable("register isn't live"); - CurPressure.inc(MO.getReg(), It->second, LaneBitmask::getNone(), *MRI); - LiveRegs.erase(It); + if (It != LiveRegs.end()) { + CurPressure.inc(MO.getReg(), It->second, LaneBitmask::getNone(), *MRI); + LiveRegs.erase(It); + } } } diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 7554b9f578fcb..aa4b3f948b726 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -47,6 +47,10 @@ struct GCNRegPressure { void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); } + unsigned getMaxSGPR() const { + return std::max(getSGPRNum(), getSGPRTuplesWeight()); + } + /// \returns the SGPR32 pressure unsigned getSGPRNum() const { return Value[SGPR32]; } /// \returns the aggregated ArchVGPR32, AccVGPR32 pressure dependent upon \p diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 79ef1432d512a..14db2b39ef9d4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1332,6 +1332,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { bool isLowLatencyInstruction(const MachineInstr &MI) const; bool isHighLatencyDef(int Opc) const override; + bool isHighLatencyInstruction(const MachineInstr &MI) const { + return isHighLatencyDef(MI.getOpcode()); + } /// 
Return the descriptor of the target-specific machine instruction /// that corresponds to the specified pseudo or native opcode. diff --git a/llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir b/llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir new file mode 100644 index 0000000000000..bfb8e85c8aef6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir @@ -0,0 +1,643 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-sub-exp-remat -amdgpu-remat-enable-sub-exp-remat-aggressive | FileCheck %s + +# Check that the whole expression gets CLONED to uses in bb.2. +# CHECK: bb.0: +# CHECK: %[[#r500:]]:vgpr_32 = V_MOV_B32_e32 $vgpr0 +# CHECK: %[[#r501:]]:vgpr_32 = V_MOV_B32_e32 $vgpr1 +# CHECK: bb.1: +# CHECK: bb.2: +# CHECK: %[[#r502:]]:vgpr_32 = V_MUL_F32_e32 %[[#r500]], %[[#r500]] +# CHECK: %[[#r503:]]:vgpr_32 = V_MUL_F32_e32 %[[#r500]], %[[#r501]] +# CHECK: %[[#r504:]]:vgpr_32 = V_MUL_F32_e32 %[[#r501]], %[[#r501]] +# CHECK: %[[#r505:]]:vgpr_32 = V_MUL_F32_e32 %[[#r502]], %[[#r502]] +# CHECK: %[[#r506:]]:vgpr_32 = V_MUL_F32_e32 %[[#r502]], %[[#r503]] +# CHECK: %[[#r507:]]:vgpr_32 = V_MUL_F32_e32 %[[#r503]], %[[#r503]] +# CHECK: %[[#r508:]]:vgpr_32 = V_MUL_F32_e32 %[[#r503]], %[[#r504]] +# CHECK: %[[#r509:]]:vgpr_32 = V_MUL_F32_e32 %[[#r504]], %[[#r504]] +# CHECK: %[[#r5010:]]:vgpr_32 = V_MUL_F32_e32 %[[#r505]], %[[#r505]] +# CHECK: %[[#r5011:]]:vgpr_32 = V_MUL_F32_e32 %[[#r505]], %[[#r506]] +# CHECK: %[[#r5012:]]:vgpr_32 = V_MUL_F32_e32 %[[#r506]], %[[#r506]] +# CHECK: %[[#r5013:]]:vgpr_32 = V_MUL_F32_e32 %[[#r506]], %[[#r507]] +# CHECK: %[[#r5014:]]:vgpr_32 = V_MUL_F32_e32 %[[#r507]], %[[#r507]] +# CHECK: %[[#r5015:]]:vgpr_32 = V_MUL_F32_e32 %[[#r507]], %[[#r508]] +# CHECK: %[[#r5016:]]:vgpr_32 = V_MUL_F32_e32 %[[#r508]], %[[#r508]] +# CHECK: %[[#r5017:]]:vgpr_32 = V_MUL_F32_e32 %[[#r508]], %[[#r509]] +# CHECK: %[[#r5018:]]:vgpr_32 = V_MUL_F32_e32 %[[#r509]], %[[#r509]] +# CHECK: %[[#r5019:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5010]], %[[#r5010]] +# CHECK: %[[#r5020:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5010]], %[[#r5011]] +# CHECK: %[[#r5021:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5011]], %[[#r5011]] +# CHECK: %[[#r5022:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5011]], %[[#r5012]] +# CHECK: %[[#r5023:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5012]], %[[#r5012]] +# CHECK: %[[#r5024:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5012]], %[[#r5013]] +# CHECK: %[[#r5025:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5013]], %[[#r5013]] +# CHECK: %[[#r5026:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5013]], %[[#r5014]] +# CHECK: %[[#r5027:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5014]], %[[#r5014]] +# CHECK: %[[#r5028:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5014]], %[[#r5015]] +# CHECK: %[[#r5029:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5015]], %[[#r5015]] +# CHECK: %[[#r5030:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5015]], %[[#r5016]] +# CHECK: %[[#r5031:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5016]], %[[#r5016]] +# CHECK: %[[#r5032:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5016]], %[[#r5017]] +# CHECK: %[[#r5033:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5017]], %[[#r5017]] +# CHECK: %[[#r5034:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5017]], %[[#r5018]] +# CHECK: %[[#r5035:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5018]], %[[#r5018]] +# CHECK: %[[#r5036:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5019]], %[[#r5019]] +# CHECK: %[[#r5037:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5019]], %[[#r5020]] +# CHECK: %[[#r5038:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5020]], %[[#r5020]] +# CHECK: %[[#r5039:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5020]], %[[#r5021]] +# CHECK: 
%[[#r5040:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5021]], %[[#r5021]] +# CHECK: %[[#r5041:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5021]], %[[#r5022]] +# CHECK: %[[#r5042:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5022]], %[[#r5022]] +# CHECK: %[[#r5043:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5022]], %[[#r5023]] +# CHECK: %[[#r5044:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5023]], %[[#r5023]] +# CHECK: %[[#r5045:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5023]], %[[#r5024]] +# CHECK: %[[#r5046:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5024]], %[[#r5024]] +# CHECK: %[[#r5047:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5024]], %[[#r5025]] +# CHECK: %[[#r5048:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5025]], %[[#r5025]] +# CHECK: %[[#r5049:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5025]], %[[#r5026]] +# CHECK: %[[#r5050:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5026]], %[[#r5026]] +# CHECK: %[[#r5051:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5026]], %[[#r5027]] +# CHECK: %[[#r5052:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5027]], %[[#r5027]] +# CHECK: %[[#r5053:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5027]], %[[#r5028]] +# CHECK: %[[#r5054:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5028]], %[[#r5028]] +# CHECK: %[[#r5055:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5028]], %[[#r5029]] +# CHECK: %[[#r5056:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5029]], %[[#r5029]] +# CHECK: %[[#r5057:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5029]], %[[#r5030]] +# CHECK: %[[#r5058:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5030]], %[[#r5030]] +# CHECK: %[[#r5059:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5030]], %[[#r5031]] +# CHECK: %[[#r5060:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5031]], %[[#r5031]] +# CHECK: %[[#r5061:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5031]], %[[#r5032]] +# CHECK: %[[#r5062:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5032]], %[[#r5032]] +# CHECK: %[[#r5063:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5032]], %[[#r5033]] +# CHECK: %[[#r5064:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5033]], %[[#r5033]] +# CHECK: %[[#r5065:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5033]], %[[#r5034]] +# CHECK: %[[#r5066:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5034]], %[[#r5034]] +# CHECK: %[[#r5067:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5034]], %[[#r5035]] +# CHECK: %[[#r5068:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5035]], %[[#r5035]] +# CHECK: %[[#r5069:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5036]], %[[#r5036]] +# CHECK: %[[#r5070:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5036]], %[[#r5037]] +# CHECK: %[[#r5071:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5037]], %[[#r5037]] +# CHECK: %[[#r5072:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5037]], %[[#r5038]] +# CHECK: %[[#r5073:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5038]], %[[#r5038]] +# CHECK: %[[#r5074:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5038]], %[[#r5039]] +# CHECK: %[[#r5075:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5039]], %[[#r5039]] +# CHECK: %[[#r5076:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5039]], %[[#r5040]] +# CHECK: %[[#r5077:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5040]], %[[#r5040]] +# CHECK: %[[#r5078:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5040]], %[[#r5041]] +# CHECK: %[[#r5079:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5041]], %[[#r5041]] +# CHECK: %[[#r5080:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5041]], %[[#r5042]] +# CHECK: %[[#r5081:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5042]], %[[#r5042]] +# CHECK: %[[#r5082:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5042]], %[[#r5043]] +# CHECK: %[[#r5083:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5043]], %[[#r5043]] +# CHECK: %[[#r5084:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5043]], %[[#r5044]] +# CHECK: %[[#r5085:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5044]], %[[#r5044]] +# CHECK: %[[#r5086:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5044]], %[[#r5045]] +# CHECK: %[[#r5087:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5045]], %[[#r5045]] +# CHECK: %[[#r5088:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5045]], %[[#r5046]] +# CHECK: %[[#r5089:]]:vgpr_32 = 
V_MUL_F32_e32 %[[#r5046]], %[[#r5046]] +# CHECK: %[[#r5090:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5046]], %[[#r5047]] +# CHECK: %[[#r5091:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5047]], %[[#r5047]] +# CHECK: %[[#r5092:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5047]], %[[#r5048]] +# CHECK: %[[#r5093:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5048]], %[[#r5048]] +# CHECK: %[[#r5094:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5048]], %[[#r5049]] +# CHECK: %[[#r5095:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5049]], %[[#r5049]] +# CHECK: %[[#r5096:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5049]], %[[#r5050]] +# CHECK: %[[#r5097:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5050]], %[[#r5050]] +# CHECK: %[[#r5098:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5050]], %[[#r5051]] +# CHECK: %[[#r5099:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5051]], %[[#r5051]] +# CHECK: %[[#r50100:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5051]], %[[#r5052]] +# CHECK: %[[#r50101:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5052]], %[[#r5052]] +# CHECK: %[[#r50102:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5052]], %[[#r5053]] +# CHECK: %[[#r50103:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5053]], %[[#r5053]] +# CHECK: %[[#r50104:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5053]], %[[#r5054]] +# CHECK: %[[#r50105:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5054]], %[[#r5054]] +# CHECK: %[[#r50106:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5054]], %[[#r5055]] +# CHECK: %[[#r50107:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5055]], %[[#r5055]] +# CHECK: %[[#r50108:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5055]], %[[#r5056]] +# CHECK: %[[#r50109:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5056]], %[[#r5056]] +# CHECK: %[[#r50110:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5056]], %[[#r5057]] +# CHECK: %[[#r50111:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5057]], %[[#r5057]] +# CHECK: %[[#r50112:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5057]], %[[#r5058]] +# CHECK: %[[#r50113:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5058]], %[[#r5058]] +# CHECK: %[[#r50114:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5058]], %[[#r5059]] +# CHECK: %[[#r50115:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5059]], %[[#r5059]] +# CHECK: %[[#r50116:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5059]], %[[#r5060]] +# CHECK: %[[#r50117:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5060]], %[[#r5060]] +# CHECK: %[[#r50118:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5060]], %[[#r5061]] +# CHECK: %[[#r50119:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5061]], %[[#r5061]] +# CHECK: %[[#r50120:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5061]], %[[#r5062]] +# CHECK: %[[#r50121:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5062]], %[[#r5062]] +# CHECK: %[[#r50122:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5062]], %[[#r5063]] +# CHECK: %[[#r50123:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5063]], %[[#r5063]] +# CHECK: %[[#r50124:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5063]], %[[#r5064]] +# CHECK: %[[#r50125:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5064]], %[[#r5064]] +# CHECK: %[[#r50126:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5064]], %[[#r5065]] +# CHECK: %[[#r50127:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5065]], %[[#r5065]] +# CHECK: %[[#r50128:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5065]], %[[#r5066]] +# CHECK: %[[#r50129:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5066]], %[[#r5066]] +# CHECK: %[[#r50130:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5066]], %[[#r5067]] +# CHECK: %[[#r50131:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5067]], %[[#r5067]] +# CHECK: %[[#r50132:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5067]], %[[#r5068]] +# CHECK: %[[#r50133:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5068]], %[[#r5068]] + + +--- | + source_filename = ".\main.ll" + define amdgpu_ps void @main() #1 { + ret void + } + attributes #1 = { "target-cpu"="gfx1010" } + !llvm.ident = !{!0} + !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"} +... 
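+# The MIR below defines a long V_MUL_F32 dependence chain in bb.0; the CHECK
+# lines above expect the pass to clone the whole chain into bb.2 next to its
+# uses.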
+--- +name: main +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } + - { reg: '$sgpr1' } + - { reg: '$sgpr2' } + - { reg: '$sgpr3' } + - { reg: '$sgpr4' } + - { reg: '$sgpr5' } + - { reg: '$sgpr6' } + - { reg: '$sgpr7' } + - { reg: '$sgpr8' } + - { reg: '$sgpr8' } + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1 + + %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1 + ; undef %0.sub0:sgpr_64 = COPY $sgpr0 + ; undef %0.sub1:sgpr_64 = COPY $sgpr1 + + %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3 + ; undef %1.sub0:sgpr_128 = COPY $sgpr4 + ; undef %1.sub1:sgpr_128 = COPY $sgpr5 + ; undef %1.sub2:sgpr_128 = COPY $sgpr6 + ; undef %1.sub3:sgpr_128 = COPY $sgpr7 + + + %500:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %501:vgpr_32 = V_MOV_B32_e32 $vgpr1, implicit $exec + %502:vgpr_32 = V_MUL_F32_e32 %500, %500, implicit $exec, implicit $mode + %503:vgpr_32 = V_MUL_F32_e32 %500, %501, implicit $exec, implicit $mode + %504:vgpr_32 = V_MUL_F32_e32 %501, %501, implicit $exec, implicit $mode + %505:vgpr_32 = V_MUL_F32_e32 %502, %502, implicit $exec, implicit $mode + %506:vgpr_32 = V_MUL_F32_e32 %502, %503, implicit $exec, implicit $mode + %507:vgpr_32 = V_MUL_F32_e32 %503, %503, implicit $exec, implicit $mode + %508:vgpr_32 = V_MUL_F32_e32 %503, %504, implicit $exec, implicit $mode + %509:vgpr_32 = V_MUL_F32_e32 %504, %504, implicit $exec, implicit $mode + %5010:vgpr_32 = V_MUL_F32_e32 %505, %505, implicit $exec, implicit $mode + %5011:vgpr_32 = V_MUL_F32_e32 %505, %506, implicit $exec, implicit $mode + %5012:vgpr_32 = V_MUL_F32_e32 %506, %506, implicit $exec, implicit $mode + %5013:vgpr_32 = V_MUL_F32_e32 %506, %507, implicit $exec, implicit $mode + %5014:vgpr_32 = V_MUL_F32_e32 %507, %507, implicit $exec, implicit $mode + %5015:vgpr_32 = V_MUL_F32_e32 %507, %508, implicit $exec, implicit $mode + %5016:vgpr_32 = V_MUL_F32_e32 %508, %508, implicit $exec, implicit $mode + %5017:vgpr_32 = V_MUL_F32_e32 %508, %509, implicit $exec, implicit $mode + %5018:vgpr_32 = V_MUL_F32_e32 %509, %509, implicit $exec, implicit $mode + %5019:vgpr_32 = V_MUL_F32_e32 %5010, %5010, implicit $exec, implicit $mode + %5020:vgpr_32 = V_MUL_F32_e32 %5010, %5011, implicit $exec, implicit $mode + %5021:vgpr_32 = V_MUL_F32_e32 %5011, %5011, implicit $exec, implicit $mode + %5022:vgpr_32 = V_MUL_F32_e32 %5011, %5012, implicit $exec, implicit $mode + %5023:vgpr_32 = V_MUL_F32_e32 %5012, %5012, implicit $exec, implicit $mode + %5024:vgpr_32 = V_MUL_F32_e32 %5012, %5013, implicit $exec, implicit $mode + %5025:vgpr_32 = V_MUL_F32_e32 %5013, %5013, implicit $exec, implicit $mode + %5026:vgpr_32 = V_MUL_F32_e32 %5013, %5014, implicit $exec, implicit $mode + %5027:vgpr_32 = V_MUL_F32_e32 %5014, %5014, implicit $exec, implicit $mode + %5028:vgpr_32 = V_MUL_F32_e32 %5014, %5015, implicit $exec, implicit $mode + %5029:vgpr_32 = V_MUL_F32_e32 %5015, %5015, implicit $exec, implicit $mode + %5030:vgpr_32 = V_MUL_F32_e32 %5015, %5016, implicit $exec, implicit $mode + %5031:vgpr_32 = V_MUL_F32_e32 %5016, %5016, implicit $exec, implicit $mode + %5032:vgpr_32 = V_MUL_F32_e32 %5016, %5017, implicit $exec, implicit $mode + %5033:vgpr_32 = V_MUL_F32_e32 %5017, %5017, implicit $exec, implicit $mode + %5034:vgpr_32 = V_MUL_F32_e32 %5017, %5018, implicit $exec, implicit $mode + %5035:vgpr_32 = V_MUL_F32_e32 %5018, %5018, 
implicit $exec, implicit $mode + %5036:vgpr_32 = V_MUL_F32_e32 %5019, %5019, implicit $exec, implicit $mode + %5037:vgpr_32 = V_MUL_F32_e32 %5019, %5020, implicit $exec, implicit $mode + %5038:vgpr_32 = V_MUL_F32_e32 %5020, %5020, implicit $exec, implicit $mode + %5039:vgpr_32 = V_MUL_F32_e32 %5020, %5021, implicit $exec, implicit $mode + %5040:vgpr_32 = V_MUL_F32_e32 %5021, %5021, implicit $exec, implicit $mode + %5041:vgpr_32 = V_MUL_F32_e32 %5021, %5022, implicit $exec, implicit $mode + %5042:vgpr_32 = V_MUL_F32_e32 %5022, %5022, implicit $exec, implicit $mode + %5043:vgpr_32 = V_MUL_F32_e32 %5022, %5023, implicit $exec, implicit $mode + %5044:vgpr_32 = V_MUL_F32_e32 %5023, %5023, implicit $exec, implicit $mode + %5045:vgpr_32 = V_MUL_F32_e32 %5023, %5024, implicit $exec, implicit $mode + %5046:vgpr_32 = V_MUL_F32_e32 %5024, %5024, implicit $exec, implicit $mode + %5047:vgpr_32 = V_MUL_F32_e32 %5024, %5025, implicit $exec, implicit $mode + %5048:vgpr_32 = V_MUL_F32_e32 %5025, %5025, implicit $exec, implicit $mode + %5049:vgpr_32 = V_MUL_F32_e32 %5025, %5026, implicit $exec, implicit $mode + %5050:vgpr_32 = V_MUL_F32_e32 %5026, %5026, implicit $exec, implicit $mode + %5051:vgpr_32 = V_MUL_F32_e32 %5026, %5027, implicit $exec, implicit $mode + %5052:vgpr_32 = V_MUL_F32_e32 %5027, %5027, implicit $exec, implicit $mode + %5053:vgpr_32 = V_MUL_F32_e32 %5027, %5028, implicit $exec, implicit $mode + %5054:vgpr_32 = V_MUL_F32_e32 %5028, %5028, implicit $exec, implicit $mode + %5055:vgpr_32 = V_MUL_F32_e32 %5028, %5029, implicit $exec, implicit $mode + %5056:vgpr_32 = V_MUL_F32_e32 %5029, %5029, implicit $exec, implicit $mode + %5057:vgpr_32 = V_MUL_F32_e32 %5029, %5030, implicit $exec, implicit $mode + %5058:vgpr_32 = V_MUL_F32_e32 %5030, %5030, implicit $exec, implicit $mode + %5059:vgpr_32 = V_MUL_F32_e32 %5030, %5031, implicit $exec, implicit $mode + %5060:vgpr_32 = V_MUL_F32_e32 %5031, %5031, implicit $exec, implicit $mode + %5061:vgpr_32 = V_MUL_F32_e32 %5031, %5032, implicit $exec, implicit $mode + %5062:vgpr_32 = V_MUL_F32_e32 %5032, %5032, implicit $exec, implicit $mode + %5063:vgpr_32 = V_MUL_F32_e32 %5032, %5033, implicit $exec, implicit $mode + %5064:vgpr_32 = V_MUL_F32_e32 %5033, %5033, implicit $exec, implicit $mode + %5065:vgpr_32 = V_MUL_F32_e32 %5033, %5034, implicit $exec, implicit $mode + %5066:vgpr_32 = V_MUL_F32_e32 %5034, %5034, implicit $exec, implicit $mode + %5067:vgpr_32 = V_MUL_F32_e32 %5034, %5035, implicit $exec, implicit $mode + %5068:vgpr_32 = V_MUL_F32_e32 %5035, %5035, implicit $exec, implicit $mode + %5069:vgpr_32 = V_MUL_F32_e32 %5036, %5036, implicit $exec, implicit $mode + %5070:vgpr_32 = V_MUL_F32_e32 %5036, %5037, implicit $exec, implicit $mode + %5071:vgpr_32 = V_MUL_F32_e32 %5037, %5037, implicit $exec, implicit $mode + %5072:vgpr_32 = V_MUL_F32_e32 %5037, %5038, implicit $exec, implicit $mode + %5073:vgpr_32 = V_MUL_F32_e32 %5038, %5038, implicit $exec, implicit $mode + %5074:vgpr_32 = V_MUL_F32_e32 %5038, %5039, implicit $exec, implicit $mode + %5075:vgpr_32 = V_MUL_F32_e32 %5039, %5039, implicit $exec, implicit $mode + %5076:vgpr_32 = V_MUL_F32_e32 %5039, %5040, implicit $exec, implicit $mode + %5077:vgpr_32 = V_MUL_F32_e32 %5040, %5040, implicit $exec, implicit $mode + %5078:vgpr_32 = V_MUL_F32_e32 %5040, %5041, implicit $exec, implicit $mode + %5079:vgpr_32 = V_MUL_F32_e32 %5041, %5041, implicit $exec, implicit $mode + %5080:vgpr_32 = V_MUL_F32_e32 %5041, %5042, implicit $exec, implicit $mode + %5081:vgpr_32 = V_MUL_F32_e32 %5042, %5042, implicit 
$exec, implicit $mode + %5082:vgpr_32 = V_MUL_F32_e32 %5042, %5043, implicit $exec, implicit $mode + %5083:vgpr_32 = V_MUL_F32_e32 %5043, %5043, implicit $exec, implicit $mode + %5084:vgpr_32 = V_MUL_F32_e32 %5043, %5044, implicit $exec, implicit $mode + %5085:vgpr_32 = V_MUL_F32_e32 %5044, %5044, implicit $exec, implicit $mode + %5086:vgpr_32 = V_MUL_F32_e32 %5044, %5045, implicit $exec, implicit $mode + %5087:vgpr_32 = V_MUL_F32_e32 %5045, %5045, implicit $exec, implicit $mode + %5088:vgpr_32 = V_MUL_F32_e32 %5045, %5046, implicit $exec, implicit $mode + %5089:vgpr_32 = V_MUL_F32_e32 %5046, %5046, implicit $exec, implicit $mode + %5090:vgpr_32 = V_MUL_F32_e32 %5046, %5047, implicit $exec, implicit $mode + %5091:vgpr_32 = V_MUL_F32_e32 %5047, %5047, implicit $exec, implicit $mode + %5092:vgpr_32 = V_MUL_F32_e32 %5047, %5048, implicit $exec, implicit $mode + %5093:vgpr_32 = V_MUL_F32_e32 %5048, %5048, implicit $exec, implicit $mode + %5094:vgpr_32 = V_MUL_F32_e32 %5048, %5049, implicit $exec, implicit $mode + %5095:vgpr_32 = V_MUL_F32_e32 %5049, %5049, implicit $exec, implicit $mode + %5096:vgpr_32 = V_MUL_F32_e32 %5049, %5050, implicit $exec, implicit $mode + %5097:vgpr_32 = V_MUL_F32_e32 %5050, %5050, implicit $exec, implicit $mode + %5098:vgpr_32 = V_MUL_F32_e32 %5050, %5051, implicit $exec, implicit $mode + %5099:vgpr_32 = V_MUL_F32_e32 %5051, %5051, implicit $exec, implicit $mode + %50100:vgpr_32 = V_MUL_F32_e32 %5051, %5052, implicit $exec, implicit $mode + %50101:vgpr_32 = V_MUL_F32_e32 %5052, %5052, implicit $exec, implicit $mode + %50102:vgpr_32 = V_MUL_F32_e32 %5052, %5053, implicit $exec, implicit $mode + %50103:vgpr_32 = V_MUL_F32_e32 %5053, %5053, implicit $exec, implicit $mode + %50104:vgpr_32 = V_MUL_F32_e32 %5053, %5054, implicit $exec, implicit $mode + %50105:vgpr_32 = V_MUL_F32_e32 %5054, %5054, implicit $exec, implicit $mode + %50106:vgpr_32 = V_MUL_F32_e32 %5054, %5055, implicit $exec, implicit $mode + %50107:vgpr_32 = V_MUL_F32_e32 %5055, %5055, implicit $exec, implicit $mode + %50108:vgpr_32 = V_MUL_F32_e32 %5055, %5056, implicit $exec, implicit $mode + %50109:vgpr_32 = V_MUL_F32_e32 %5056, %5056, implicit $exec, implicit $mode + %50110:vgpr_32 = V_MUL_F32_e32 %5056, %5057, implicit $exec, implicit $mode + %50111:vgpr_32 = V_MUL_F32_e32 %5057, %5057, implicit $exec, implicit $mode + %50112:vgpr_32 = V_MUL_F32_e32 %5057, %5058, implicit $exec, implicit $mode + %50113:vgpr_32 = V_MUL_F32_e32 %5058, %5058, implicit $exec, implicit $mode + %50114:vgpr_32 = V_MUL_F32_e32 %5058, %5059, implicit $exec, implicit $mode + %50115:vgpr_32 = V_MUL_F32_e32 %5059, %5059, implicit $exec, implicit $mode + %50116:vgpr_32 = V_MUL_F32_e32 %5059, %5060, implicit $exec, implicit $mode + %50117:vgpr_32 = V_MUL_F32_e32 %5060, %5060, implicit $exec, implicit $mode + %50118:vgpr_32 = V_MUL_F32_e32 %5060, %5061, implicit $exec, implicit $mode + %50119:vgpr_32 = V_MUL_F32_e32 %5061, %5061, implicit $exec, implicit $mode + %50120:vgpr_32 = V_MUL_F32_e32 %5061, %5062, implicit $exec, implicit $mode + %50121:vgpr_32 = V_MUL_F32_e32 %5062, %5062, implicit $exec, implicit $mode + %50122:vgpr_32 = V_MUL_F32_e32 %5062, %5063, implicit $exec, implicit $mode + %50123:vgpr_32 = V_MUL_F32_e32 %5063, %5063, implicit $exec, implicit $mode + %50124:vgpr_32 = V_MUL_F32_e32 %5063, %5064, implicit $exec, implicit $mode + %50125:vgpr_32 = V_MUL_F32_e32 %5064, %5064, implicit $exec, implicit $mode + %50126:vgpr_32 = V_MUL_F32_e32 %5064, %5065, implicit $exec, implicit $mode + %50127:vgpr_32 = V_MUL_F32_e32 %5065, 
%5065, implicit $exec, implicit $mode + %50128:vgpr_32 = V_MUL_F32_e32 %5065, %5066, implicit $exec, implicit $mode + %50129:vgpr_32 = V_MUL_F32_e32 %5066, %5066, implicit $exec, implicit $mode + %50130:vgpr_32 = V_MUL_F32_e32 %5066, %5067, implicit $exec, implicit $mode + %50131:vgpr_32 = V_MUL_F32_e32 %5067, %5067, implicit $exec, implicit $mode + %50132:vgpr_32 = V_MUL_F32_e32 %5067, %5068, implicit $exec, implicit $mode + %50133:vgpr_32 = V_MUL_F32_e32 %5068, %5068, implicit $exec, implicit $mode + EXP 0, %500, %500, %500, %500, -1, -1, 15, implicit $exec + EXP 0, %501, %501, %501, %501, -1, -1, 15, implicit $exec + EXP 0, %502, %502, %502, %502, -1, -1, 15, implicit $exec + EXP 0, %503, %503, %503, %503, -1, -1, 15, implicit $exec + EXP 0, %504, %504, %504, %504, -1, -1, 15, implicit $exec + EXP 0, %505, %505, %505, %505, -1, -1, 15, implicit $exec + EXP 0, %506, %506, %506, %506, -1, -1, 15, implicit $exec + EXP 0, %507, %507, %507, %507, -1, -1, 15, implicit $exec + EXP 0, %508, %508, %508, %508, -1, -1, 15, implicit $exec + EXP 0, %509, %509, %509, %509, -1, -1, 15, implicit $exec + EXP 0, %5010, %5010, %5010, %5010, -1, -1, 15, implicit $exec + EXP 0, %5011, %5011, %5011, %5011, -1, -1, 15, implicit $exec + EXP 0, %5012, %5012, %5012, %5012, -1, -1, 15, implicit $exec + EXP 0, %5013, %5013, %5013, %5013, -1, -1, 15, implicit $exec + EXP 0, %5014, %5014, %5014, %5014, -1, -1, 15, implicit $exec + EXP 0, %5015, %5015, %5015, %5015, -1, -1, 15, implicit $exec + EXP 0, %5016, %5016, %5016, %5016, -1, -1, 15, implicit $exec + EXP 0, %5017, %5017, %5017, %5017, -1, -1, 15, implicit $exec + EXP 0, %5018, %5018, %5018, %5018, -1, -1, 15, implicit $exec + EXP 0, %5019, %5019, %5019, %5019, -1, -1, 15, implicit $exec + EXP 0, %5020, %5020, %5020, %5020, -1, -1, 15, implicit $exec + EXP 0, %5021, %5021, %5021, %5021, -1, -1, 15, implicit $exec + EXP 0, %5022, %5022, %5022, %5022, -1, -1, 15, implicit $exec + EXP 0, %5023, %5023, %5023, %5023, -1, -1, 15, implicit $exec + EXP 0, %5024, %5024, %5024, %5024, -1, -1, 15, implicit $exec + EXP 0, %5025, %5025, %5025, %5025, -1, -1, 15, implicit $exec + EXP 0, %5026, %5026, %5026, %5026, -1, -1, 15, implicit $exec + EXP 0, %5027, %5027, %5027, %5027, -1, -1, 15, implicit $exec + EXP 0, %5028, %5028, %5028, %5028, -1, -1, 15, implicit $exec + EXP 0, %5029, %5029, %5029, %5029, -1, -1, 15, implicit $exec + EXP 0, %5030, %5030, %5030, %5030, -1, -1, 15, implicit $exec + EXP 0, %5031, %5031, %5031, %5031, -1, -1, 15, implicit $exec + EXP 0, %5032, %5032, %5032, %5032, -1, -1, 15, implicit $exec + EXP 0, %5033, %5033, %5033, %5033, -1, -1, 15, implicit $exec + EXP 0, %5034, %5034, %5034, %5034, -1, -1, 15, implicit $exec + EXP 0, %5035, %5035, %5035, %5035, -1, -1, 15, implicit $exec + EXP 0, %5036, %5036, %5036, %5036, -1, -1, 15, implicit $exec + EXP 0, %5037, %5037, %5037, %5037, -1, -1, 15, implicit $exec + EXP 0, %5038, %5038, %5038, %5038, -1, -1, 15, implicit $exec + EXP 0, %5039, %5039, %5039, %5039, -1, -1, 15, implicit $exec + EXP 0, %5040, %5040, %5040, %5040, -1, -1, 15, implicit $exec + EXP 0, %5041, %5041, %5041, %5041, -1, -1, 15, implicit $exec + EXP 0, %5042, %5042, %5042, %5042, -1, -1, 15, implicit $exec + EXP 0, %5043, %5043, %5043, %5043, -1, -1, 15, implicit $exec + EXP 0, %5044, %5044, %5044, %5044, -1, -1, 15, implicit $exec + EXP 0, %5045, %5045, %5045, %5045, -1, -1, 15, implicit $exec + EXP 0, %5046, %5046, %5046, %5046, -1, -1, 15, implicit $exec + EXP 0, %5047, %5047, %5047, %5047, -1, -1, 15, implicit $exec + EXP 0, %5048, 
%5048, %5048, %5048, -1, -1, 15, implicit $exec + EXP 0, %5049, %5049, %5049, %5049, -1, -1, 15, implicit $exec + EXP 0, %5050, %5050, %5050, %5050, -1, -1, 15, implicit $exec + EXP 0, %5051, %5051, %5051, %5051, -1, -1, 15, implicit $exec + EXP 0, %5052, %5052, %5052, %5052, -1, -1, 15, implicit $exec + EXP 0, %5053, %5053, %5053, %5053, -1, -1, 15, implicit $exec + EXP 0, %5054, %5054, %5054, %5054, -1, -1, 15, implicit $exec + EXP 0, %5055, %5055, %5055, %5055, -1, -1, 15, implicit $exec + EXP 0, %5056, %5056, %5056, %5056, -1, -1, 15, implicit $exec + EXP 0, %5057, %5057, %5057, %5057, -1, -1, 15, implicit $exec + EXP 0, %5058, %5058, %5058, %5058, -1, -1, 15, implicit $exec + EXP 0, %5059, %5059, %5059, %5059, -1, -1, 15, implicit $exec + EXP 0, %5060, %5060, %5060, %5060, -1, -1, 15, implicit $exec + EXP 0, %5061, %5061, %5061, %5061, -1, -1, 15, implicit $exec + EXP 0, %5062, %5062, %5062, %5062, -1, -1, 15, implicit $exec + EXP 0, %5063, %5063, %5063, %5063, -1, -1, 15, implicit $exec + EXP 0, %5064, %5064, %5064, %5064, -1, -1, 15, implicit $exec + EXP 0, %5065, %5065, %5065, %5065, -1, -1, 15, implicit $exec + EXP 0, %5066, %5066, %5066, %5066, -1, -1, 15, implicit $exec + EXP 0, %5067, %5067, %5067, %5067, -1, -1, 15, implicit $exec + EXP 0, %5068, %5068, %5068, %5068, -1, -1, 15, implicit $exec + EXP 0, %5069, %5069, %5069, %5069, -1, -1, 15, implicit $exec + EXP 0, %5070, %5070, %5070, %5070, -1, -1, 15, implicit $exec + EXP 0, %5071, %5071, %5071, %5071, -1, -1, 15, implicit $exec + EXP 0, %5072, %5072, %5072, %5072, -1, -1, 15, implicit $exec + EXP 0, %5073, %5073, %5073, %5073, -1, -1, 15, implicit $exec + EXP 0, %5074, %5074, %5074, %5074, -1, -1, 15, implicit $exec + EXP 0, %5075, %5075, %5075, %5075, -1, -1, 15, implicit $exec + EXP 0, %5076, %5076, %5076, %5076, -1, -1, 15, implicit $exec + EXP 0, %5077, %5077, %5077, %5077, -1, -1, 15, implicit $exec + EXP 0, %5078, %5078, %5078, %5078, -1, -1, 15, implicit $exec + EXP 0, %5079, %5079, %5079, %5079, -1, -1, 15, implicit $exec + EXP 0, %5080, %5080, %5080, %5080, -1, -1, 15, implicit $exec + EXP 0, %5081, %5081, %5081, %5081, -1, -1, 15, implicit $exec + EXP 0, %5082, %5082, %5082, %5082, -1, -1, 15, implicit $exec + EXP 0, %5083, %5083, %5083, %5083, -1, -1, 15, implicit $exec + EXP 0, %5084, %5084, %5084, %5084, -1, -1, 15, implicit $exec + EXP 0, %5085, %5085, %5085, %5085, -1, -1, 15, implicit $exec + EXP 0, %5086, %5086, %5086, %5086, -1, -1, 15, implicit $exec + EXP 0, %5087, %5087, %5087, %5087, -1, -1, 15, implicit $exec + EXP 0, %5088, %5088, %5088, %5088, -1, -1, 15, implicit $exec + EXP 0, %5089, %5089, %5089, %5089, -1, -1, 15, implicit $exec + EXP 0, %5090, %5090, %5090, %5090, -1, -1, 15, implicit $exec + EXP 0, %5091, %5091, %5091, %5091, -1, -1, 15, implicit $exec + EXP 0, %5092, %5092, %5092, %5092, -1, -1, 15, implicit $exec + EXP 0, %5093, %5093, %5093, %5093, -1, -1, 15, implicit $exec + EXP 0, %5094, %5094, %5094, %5094, -1, -1, 15, implicit $exec + EXP 0, %5095, %5095, %5095, %5095, -1, -1, 15, implicit $exec + EXP 0, %5096, %5096, %5096, %5096, -1, -1, 15, implicit $exec + EXP 0, %5097, %5097, %5097, %5097, -1, -1, 15, implicit $exec + EXP 0, %5098, %5098, %5098, %5098, -1, -1, 15, implicit $exec + EXP 0, %5099, %5099, %5099, %5099, -1, -1, 15, implicit $exec + EXP 0, %50100, %50100, %50100, %50100, -1, -1, 15, implicit $exec + EXP 0, %50101, %50101, %50101, %50101, -1, -1, 15, implicit $exec + EXP 0, %50102, %50102, %50102, %50102, -1, -1, 15, implicit $exec + EXP 0, %50103, %50103, %50103, 
%50103, -1, -1, 15, implicit $exec + EXP 0, %50104, %50104, %50104, %50104, -1, -1, 15, implicit $exec + EXP 0, %50105, %50105, %50105, %50105, -1, -1, 15, implicit $exec + EXP 0, %50106, %50106, %50106, %50106, -1, -1, 15, implicit $exec + EXP 0, %50107, %50107, %50107, %50107, -1, -1, 15, implicit $exec + EXP 0, %50108, %50108, %50108, %50108, -1, -1, 15, implicit $exec + EXP 0, %50109, %50109, %50109, %50109, -1, -1, 15, implicit $exec + EXP 0, %50110, %50110, %50110, %50110, -1, -1, 15, implicit $exec + EXP 0, %50111, %50111, %50111, %50111, -1, -1, 15, implicit $exec + EXP 0, %50112, %50112, %50112, %50112, -1, -1, 15, implicit $exec + EXP 0, %50113, %50113, %50113, %50113, -1, -1, 15, implicit $exec + EXP 0, %50114, %50114, %50114, %50114, -1, -1, 15, implicit $exec + EXP 0, %50115, %50115, %50115, %50115, -1, -1, 15, implicit $exec + EXP 0, %50116, %50116, %50116, %50116, -1, -1, 15, implicit $exec + EXP 0, %50117, %50117, %50117, %50117, -1, -1, 15, implicit $exec + EXP 0, %50118, %50118, %50118, %50118, -1, -1, 15, implicit $exec + EXP 0, %50119, %50119, %50119, %50119, -1, -1, 15, implicit $exec + EXP 0, %50120, %50120, %50120, %50120, -1, -1, 15, implicit $exec + EXP 0, %50121, %50121, %50121, %50121, -1, -1, 15, implicit $exec + EXP 0, %50122, %50122, %50122, %50122, -1, -1, 15, implicit $exec + EXP 0, %50123, %50123, %50123, %50123, -1, -1, 15, implicit $exec + EXP 0, %50124, %50124, %50124, %50124, -1, -1, 15, implicit $exec + EXP 0, %50125, %50125, %50125, %50125, -1, -1, 15, implicit $exec + EXP 0, %50126, %50126, %50126, %50126, -1, -1, 15, implicit $exec + EXP 0, %50127, %50127, %50127, %50127, -1, -1, 15, implicit $exec + EXP 0, %50128, %50128, %50128, %50128, -1, -1, 15, implicit $exec + EXP 0, %50129, %50129, %50129, %50129, -1, -1, 15, implicit $exec + EXP 0, %50130, %50130, %50130, %50130, -1, -1, 15, implicit $exec + EXP 0, %50131, %50131, %50131, %50131, -1, -1, 15, implicit $exec + EXP 0, %50132, %50132, %50132, %50132, -1, -1, 15, implicit $exec + EXP 0, %50133, %50133, %50133, %50133, -1, -1, 15, implicit $exec + + + %8000:vgpr_32 = IMPLICIT_DEF + %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %116:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + + %8001:vgpr_32 = COPY %8000 + %8002:vgpr_32 = COPY %8000 + %8003:vgpr_32 = COPY %8000 + %8004:vgpr_32 = COPY %8000 + %8005:vgpr_32 = COPY %8000 + %8006:vgpr_32 = COPY %8000 + %8007:vgpr_32 = COPY %8000 + %8008:vgpr_32 = COPY %8000 + %8009:vgpr_32 = COPY %8000 + %8010:vgpr_32 = COPY %8000 + %8011:vgpr_32 = COPY %8000 + %8012:vgpr_32 = COPY %8000 + %8013:vgpr_32 = COPY %8000 + %8014:vgpr_32 = COPY %8000 + %8015:vgpr_32 = COPY %8000 + %8016:vgpr_32 = COPY %8000 + %8017:vgpr_32 = COPY %8000 + + %9001:vgpr_32 = COPY %8001 + %9002:vgpr_32 = COPY %8002 + %9003:vgpr_32 = COPY %8003 + %9004:vgpr_32 = COPY %8004 + %9005:vgpr_32 = COPY %8005 + %9006:vgpr_32 = COPY %8006 + %9007:vgpr_32 = COPY %8007 + %9008:vgpr_32 = COPY %8008 + %9009:vgpr_32 = COPY %8009 + %9010:vgpr_32 = COPY %8010 + %9011:vgpr_32 = COPY %8011 + %9012:vgpr_32 = COPY %8012 + %9013:vgpr_32 = COPY %8013 + %9014:vgpr_32 = COPY %8014 + %9015:vgpr_32 = COPY %8015 + %9016:vgpr_32 = COPY %8016 + %9017:vgpr_32 = COPY %8017 + + S_BRANCH %bb.2 + + bb.2: + + %3:vgpr_32 = IMPLICIT_DEF + + EXP 0, killed %500, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %501, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, 
killed %502, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %503, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %504, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %505, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %506, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %507, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %508, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %509, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5010, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5011, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5012, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5013, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5014, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5015, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5016, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5017, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5018, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5019, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5020, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5021, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5022, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5023, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5024, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5025, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5026, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5027, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5028, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5029, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5030, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5031, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5032, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5033, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5034, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5035, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5036, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5037, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5038, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5039, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5040, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5041, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5042, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5043, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5044, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5045, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5046, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5047, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5048, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5049, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5050, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5051, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5052, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5053, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5054, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5055, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5056, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5057, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5058, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5059, %3, %3, %3, 
-1, -1, 15, implicit $exec + EXP 0, killed %5060, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5061, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5062, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5063, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5064, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5065, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5066, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5067, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5068, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5069, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5070, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5071, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5072, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5073, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5074, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5075, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5076, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5077, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5078, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5079, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5080, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5081, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5082, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5083, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5084, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5085, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5086, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5087, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5088, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5089, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5090, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5091, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5092, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5093, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5094, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5095, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5096, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5097, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5098, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5099, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50100, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50101, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50102, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50103, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50104, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50105, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50106, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50107, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50108, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50109, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50110, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50111, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50112, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50113, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50114, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50115, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50116, %3, %3, %3, -1, 
-1, 15, implicit $exec + EXP 0, killed %50117, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50118, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50119, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50120, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50121, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50122, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50123, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50124, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50125, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50126, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50127, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50128, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50129, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50130, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50131, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50132, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50133, %3, %3, %3, -1, -1, 15, implicit $exec + + + S_ENDPGM 0 +... + \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/remat/group_remat_move.mir b/llvm/test/CodeGen/AMDGPU/remat/group_remat_move.mir new file mode 100644 index 0000000000000..ebd89451154ae --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/group_remat_move.mir @@ -0,0 +1,508 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-sub-exp-remat | FileCheck %s + +# Check that the whole expression gets moved to uses in bb.2. +# CHECK: bb.0: +# CHECK: %[[#r500:]]:vgpr_32 = V_MOV_B32_e32 $vgpr0 +# CHECK: %[[#r501:]]:vgpr_32 = V_MOV_B32_e32 $vgpr1 +# CHECK: bb.1: +# CHECK: bb.2: +# CHECK: %[[#r502:]]:vgpr_32 = V_MUL_F32_e32 %[[#r500]], %[[#r500]] +# CHECK: %[[#r503:]]:vgpr_32 = V_MUL_F32_e32 %[[#r500]], %[[#r501]] +# CHECK: %[[#r504:]]:vgpr_32 = V_MUL_F32_e32 %[[#r501]], %[[#r501]] +# CHECK: %[[#r505:]]:vgpr_32 = V_MUL_F32_e32 %[[#r502]], %[[#r502]] +# CHECK: %[[#r506:]]:vgpr_32 = V_MUL_F32_e32 %[[#r502]], %[[#r503]] +# CHECK: %[[#r507:]]:vgpr_32 = V_MUL_F32_e32 %[[#r503]], %[[#r503]] +# CHECK: %[[#r508:]]:vgpr_32 = V_MUL_F32_e32 %[[#r503]], %[[#r504]] +# CHECK: %[[#r509:]]:vgpr_32 = V_MUL_F32_e32 %[[#r504]], %[[#r504]] +# CHECK: %[[#r5010:]]:vgpr_32 = V_MUL_F32_e32 %[[#r505]], %[[#r505]] +# CHECK: %[[#r5011:]]:vgpr_32 = V_MUL_F32_e32 %[[#r505]], %[[#r506]] +# CHECK: %[[#r5012:]]:vgpr_32 = V_MUL_F32_e32 %[[#r506]], %[[#r506]] +# CHECK: %[[#r5013:]]:vgpr_32 = V_MUL_F32_e32 %[[#r506]], %[[#r507]] +# CHECK: %[[#r5014:]]:vgpr_32 = V_MUL_F32_e32 %[[#r507]], %[[#r507]] +# CHECK: %[[#r5015:]]:vgpr_32 = V_MUL_F32_e32 %[[#r507]], %[[#r508]] +# CHECK: %[[#r5016:]]:vgpr_32 = V_MUL_F32_e32 %[[#r508]], %[[#r508]] +# CHECK: %[[#r5017:]]:vgpr_32 = V_MUL_F32_e32 %[[#r508]], %[[#r509]] +# CHECK: %[[#r5018:]]:vgpr_32 = V_MUL_F32_e32 %[[#r509]], %[[#r509]] +# CHECK: %[[#r5019:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5010]], %[[#r5010]] +# CHECK: %[[#r5020:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5010]], %[[#r5011]] +# CHECK: %[[#r5021:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5011]], %[[#r5011]] +# CHECK: %[[#r5022:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5011]], %[[#r5012]] +# CHECK: %[[#r5023:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5012]], %[[#r5012]] +# CHECK: %[[#r5024:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5012]], %[[#r5013]] +# CHECK: %[[#r5025:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5013]], %[[#r5013]] +# CHECK: %[[#r5026:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5013]], 
%[[#r5014]] +# CHECK: %[[#r5027:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5014]], %[[#r5014]] +# CHECK: %[[#r5028:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5014]], %[[#r5015]] +# CHECK: %[[#r5029:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5015]], %[[#r5015]] +# CHECK: %[[#r5030:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5015]], %[[#r5016]] +# CHECK: %[[#r5031:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5016]], %[[#r5016]] +# CHECK: %[[#r5032:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5016]], %[[#r5017]] +# CHECK: %[[#r5033:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5017]], %[[#r5017]] +# CHECK: %[[#r5034:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5017]], %[[#r5018]] +# CHECK: %[[#r5035:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5018]], %[[#r5018]] +# CHECK: %[[#r5036:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5019]], %[[#r5019]] +# CHECK: %[[#r5037:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5019]], %[[#r5020]] +# CHECK: %[[#r5038:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5020]], %[[#r5020]] +# CHECK: %[[#r5039:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5020]], %[[#r5021]] +# CHECK: %[[#r5040:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5021]], %[[#r5021]] +# CHECK: %[[#r5041:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5021]], %[[#r5022]] +# CHECK: %[[#r5042:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5022]], %[[#r5022]] +# CHECK: %[[#r5043:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5022]], %[[#r5023]] +# CHECK: %[[#r5044:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5023]], %[[#r5023]] +# CHECK: %[[#r5045:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5023]], %[[#r5024]] +# CHECK: %[[#r5046:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5024]], %[[#r5024]] +# CHECK: %[[#r5047:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5024]], %[[#r5025]] +# CHECK: %[[#r5048:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5025]], %[[#r5025]] +# CHECK: %[[#r5049:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5025]], %[[#r5026]] +# CHECK: %[[#r5050:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5026]], %[[#r5026]] +# CHECK: %[[#r5051:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5026]], %[[#r5027]] +# CHECK: %[[#r5052:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5027]], %[[#r5027]] +# CHECK: %[[#r5053:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5027]], %[[#r5028]] +# CHECK: %[[#r5054:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5028]], %[[#r5028]] +# CHECK: %[[#r5055:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5028]], %[[#r5029]] +# CHECK: %[[#r5056:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5029]], %[[#r5029]] +# CHECK: %[[#r5057:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5029]], %[[#r5030]] +# CHECK: %[[#r5058:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5030]], %[[#r5030]] +# CHECK: %[[#r5059:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5030]], %[[#r5031]] +# CHECK: %[[#r5060:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5031]], %[[#r5031]] +# CHECK: %[[#r5061:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5031]], %[[#r5032]] +# CHECK: %[[#r5062:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5032]], %[[#r5032]] +# CHECK: %[[#r5063:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5032]], %[[#r5033]] +# CHECK: %[[#r5064:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5033]], %[[#r5033]] +# CHECK: %[[#r5065:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5033]], %[[#r5034]] +# CHECK: %[[#r5066:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5034]], %[[#r5034]] +# CHECK: %[[#r5067:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5034]], %[[#r5035]] +# CHECK: %[[#r5068:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5035]], %[[#r5035]] +# CHECK: %[[#r5069:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5036]], %[[#r5036]] +# CHECK: %[[#r5070:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5036]], %[[#r5037]] +# CHECK: %[[#r5071:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5037]], %[[#r5037]] +# CHECK: %[[#r5072:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5037]], %[[#r5038]] +# CHECK: %[[#r5073:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5038]], %[[#r5038]] +# CHECK: %[[#r5074:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5038]], %[[#r5039]] +# CHECK: %[[#r5075:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5039]], %[[#r5039]] +# CHECK: 
%[[#r5076:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5039]], %[[#r5040]] +# CHECK: %[[#r5077:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5040]], %[[#r5040]] +# CHECK: %[[#r5078:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5040]], %[[#r5041]] +# CHECK: %[[#r5079:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5041]], %[[#r5041]] +# CHECK: %[[#r5080:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5041]], %[[#r5042]] +# CHECK: %[[#r5081:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5042]], %[[#r5042]] +# CHECK: %[[#r5082:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5042]], %[[#r5043]] +# CHECK: %[[#r5083:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5043]], %[[#r5043]] +# CHECK: %[[#r5084:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5043]], %[[#r5044]] +# CHECK: %[[#r5085:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5044]], %[[#r5044]] +# CHECK: %[[#r5086:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5044]], %[[#r5045]] +# CHECK: %[[#r5087:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5045]], %[[#r5045]] +# CHECK: %[[#r5088:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5045]], %[[#r5046]] +# CHECK: %[[#r5089:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5046]], %[[#r5046]] +# CHECK: %[[#r5090:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5046]], %[[#r5047]] +# CHECK: %[[#r5091:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5047]], %[[#r5047]] +# CHECK: %[[#r5092:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5047]], %[[#r5048]] +# CHECK: %[[#r5093:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5048]], %[[#r5048]] +# CHECK: %[[#r5094:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5048]], %[[#r5049]] +# CHECK: %[[#r5095:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5049]], %[[#r5049]] +# CHECK: %[[#r5096:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5049]], %[[#r5050]] +# CHECK: %[[#r5097:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5050]], %[[#r5050]] +# CHECK: %[[#r5098:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5050]], %[[#r5051]] +# CHECK: %[[#r5099:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5051]], %[[#r5051]] +# CHECK: %[[#r50100:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5051]], %[[#r5052]] +# CHECK: %[[#r50101:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5052]], %[[#r5052]] +# CHECK: %[[#r50102:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5052]], %[[#r5053]] +# CHECK: %[[#r50103:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5053]], %[[#r5053]] +# CHECK: %[[#r50104:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5053]], %[[#r5054]] +# CHECK: %[[#r50105:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5054]], %[[#r5054]] +# CHECK: %[[#r50106:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5054]], %[[#r5055]] +# CHECK: %[[#r50107:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5055]], %[[#r5055]] +# CHECK: %[[#r50108:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5055]], %[[#r5056]] +# CHECK: %[[#r50109:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5056]], %[[#r5056]] +# CHECK: %[[#r50110:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5056]], %[[#r5057]] +# CHECK: %[[#r50111:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5057]], %[[#r5057]] +# CHECK: %[[#r50112:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5057]], %[[#r5058]] +# CHECK: %[[#r50113:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5058]], %[[#r5058]] +# CHECK: %[[#r50114:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5058]], %[[#r5059]] +# CHECK: %[[#r50115:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5059]], %[[#r5059]] +# CHECK: %[[#r50116:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5059]], %[[#r5060]] +# CHECK: %[[#r50117:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5060]], %[[#r5060]] +# CHECK: %[[#r50118:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5060]], %[[#r5061]] +# CHECK: %[[#r50119:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5061]], %[[#r5061]] +# CHECK: %[[#r50120:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5061]], %[[#r5062]] +# CHECK: %[[#r50121:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5062]], %[[#r5062]] +# CHECK: %[[#r50122:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5062]], %[[#r5063]] +# CHECK: %[[#r50123:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5063]], %[[#r5063]] +# CHECK: %[[#r50124:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5063]], %[[#r5064]] +# CHECK: 
%[[#r50125:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5064]], %[[#r5064]] +# CHECK: %[[#r50126:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5064]], %[[#r5065]] +# CHECK: %[[#r50127:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5065]], %[[#r5065]] +# CHECK: %[[#r50128:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5065]], %[[#r5066]] +# CHECK: %[[#r50129:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5066]], %[[#r5066]] +# CHECK: %[[#r50130:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5066]], %[[#r5067]] +# CHECK: %[[#r50131:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5067]], %[[#r5067]] +# CHECK: %[[#r50132:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5067]], %[[#r5068]] +# CHECK: %[[#r50133:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5068]], %[[#r5068]] + + +--- | + source_filename = ".\main.ll" + define amdgpu_ps void @main() #1 { + ret void + } + attributes #1 = { "target-cpu"="gfx1010" } + !llvm.ident = !{!0} + !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"} +... +--- +name: main +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } + - { reg: '$sgpr1' } + - { reg: '$sgpr2' } + - { reg: '$sgpr3' } + - { reg: '$sgpr4' } + - { reg: '$sgpr5' } + - { reg: '$sgpr6' } + - { reg: '$sgpr7' } + - { reg: '$sgpr8' } + - { reg: '$sgpr8' } + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1 + + %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1 + ; undef %0.sub0:sgpr_64 = COPY $sgpr0 + ; undef %0.sub1:sgpr_64 = COPY $sgpr1 + + %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3 + ; undef %1.sub0:sgpr_128 = COPY $sgpr4 + ; undef %1.sub1:sgpr_128 = COPY $sgpr5 + ; undef %1.sub2:sgpr_128 = COPY $sgpr6 + ; undef %1.sub3:sgpr_128 = COPY $sgpr7 + + %500:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %501:vgpr_32 = V_MOV_B32_e32 $vgpr1, implicit $exec + %502:vgpr_32 = V_MUL_F32_e32 %500, %500, implicit $exec, implicit $mode + %503:vgpr_32 = V_MUL_F32_e32 %500, %501, implicit $exec, implicit $mode + %504:vgpr_32 = V_MUL_F32_e32 %501, %501, implicit $exec, implicit $mode + %505:vgpr_32 = V_MUL_F32_e32 %502, %502, implicit $exec, implicit $mode + %506:vgpr_32 = V_MUL_F32_e32 %502, %503, implicit $exec, implicit $mode + %507:vgpr_32 = V_MUL_F32_e32 %503, %503, implicit $exec, implicit $mode + %508:vgpr_32 = V_MUL_F32_e32 %503, %504, implicit $exec, implicit $mode + %509:vgpr_32 = V_MUL_F32_e32 %504, %504, implicit $exec, implicit $mode + %5010:vgpr_32 = V_MUL_F32_e32 %505, %505, implicit $exec, implicit $mode + %5011:vgpr_32 = V_MUL_F32_e32 %505, %506, implicit $exec, implicit $mode + %5012:vgpr_32 = V_MUL_F32_e32 %506, %506, implicit $exec, implicit $mode + %5013:vgpr_32 = V_MUL_F32_e32 %506, %507, implicit $exec, implicit $mode + %5014:vgpr_32 = V_MUL_F32_e32 %507, %507, implicit $exec, implicit $mode + %5015:vgpr_32 = V_MUL_F32_e32 %507, %508, implicit $exec, implicit $mode + %5016:vgpr_32 = V_MUL_F32_e32 %508, %508, implicit $exec, implicit $mode + %5017:vgpr_32 = V_MUL_F32_e32 %508, %509, implicit $exec, implicit $mode + %5018:vgpr_32 = V_MUL_F32_e32 %509, %509, implicit $exec, implicit $mode + %5019:vgpr_32 = V_MUL_F32_e32 %5010, %5010, implicit $exec, implicit $mode + %5020:vgpr_32 = V_MUL_F32_e32 %5010, %5011, implicit $exec, implicit $mode + %5021:vgpr_32 = V_MUL_F32_e32 %5011, %5011, implicit $exec, implicit $mode + %5022:vgpr_32 = V_MUL_F32_e32 %5011, %5012, implicit $exec, implicit $mode + %5023:vgpr_32 = V_MUL_F32_e32 %5012, %5012, implicit $exec, implicit $mode + %5024:vgpr_32 = 
V_MUL_F32_e32 %5012, %5013, implicit $exec, implicit $mode + %5025:vgpr_32 = V_MUL_F32_e32 %5013, %5013, implicit $exec, implicit $mode + %5026:vgpr_32 = V_MUL_F32_e32 %5013, %5014, implicit $exec, implicit $mode + %5027:vgpr_32 = V_MUL_F32_e32 %5014, %5014, implicit $exec, implicit $mode + %5028:vgpr_32 = V_MUL_F32_e32 %5014, %5015, implicit $exec, implicit $mode + %5029:vgpr_32 = V_MUL_F32_e32 %5015, %5015, implicit $exec, implicit $mode + %5030:vgpr_32 = V_MUL_F32_e32 %5015, %5016, implicit $exec, implicit $mode + %5031:vgpr_32 = V_MUL_F32_e32 %5016, %5016, implicit $exec, implicit $mode + %5032:vgpr_32 = V_MUL_F32_e32 %5016, %5017, implicit $exec, implicit $mode + %5033:vgpr_32 = V_MUL_F32_e32 %5017, %5017, implicit $exec, implicit $mode + %5034:vgpr_32 = V_MUL_F32_e32 %5017, %5018, implicit $exec, implicit $mode + %5035:vgpr_32 = V_MUL_F32_e32 %5018, %5018, implicit $exec, implicit $mode + %5036:vgpr_32 = V_MUL_F32_e32 %5019, %5019, implicit $exec, implicit $mode + %5037:vgpr_32 = V_MUL_F32_e32 %5019, %5020, implicit $exec, implicit $mode + %5038:vgpr_32 = V_MUL_F32_e32 %5020, %5020, implicit $exec, implicit $mode + %5039:vgpr_32 = V_MUL_F32_e32 %5020, %5021, implicit $exec, implicit $mode + %5040:vgpr_32 = V_MUL_F32_e32 %5021, %5021, implicit $exec, implicit $mode + %5041:vgpr_32 = V_MUL_F32_e32 %5021, %5022, implicit $exec, implicit $mode + %5042:vgpr_32 = V_MUL_F32_e32 %5022, %5022, implicit $exec, implicit $mode + %5043:vgpr_32 = V_MUL_F32_e32 %5022, %5023, implicit $exec, implicit $mode + %5044:vgpr_32 = V_MUL_F32_e32 %5023, %5023, implicit $exec, implicit $mode + %5045:vgpr_32 = V_MUL_F32_e32 %5023, %5024, implicit $exec, implicit $mode + %5046:vgpr_32 = V_MUL_F32_e32 %5024, %5024, implicit $exec, implicit $mode + %5047:vgpr_32 = V_MUL_F32_e32 %5024, %5025, implicit $exec, implicit $mode + %5048:vgpr_32 = V_MUL_F32_e32 %5025, %5025, implicit $exec, implicit $mode + %5049:vgpr_32 = V_MUL_F32_e32 %5025, %5026, implicit $exec, implicit $mode + %5050:vgpr_32 = V_MUL_F32_e32 %5026, %5026, implicit $exec, implicit $mode + %5051:vgpr_32 = V_MUL_F32_e32 %5026, %5027, implicit $exec, implicit $mode + %5052:vgpr_32 = V_MUL_F32_e32 %5027, %5027, implicit $exec, implicit $mode + %5053:vgpr_32 = V_MUL_F32_e32 %5027, %5028, implicit $exec, implicit $mode + %5054:vgpr_32 = V_MUL_F32_e32 %5028, %5028, implicit $exec, implicit $mode + %5055:vgpr_32 = V_MUL_F32_e32 %5028, %5029, implicit $exec, implicit $mode + %5056:vgpr_32 = V_MUL_F32_e32 %5029, %5029, implicit $exec, implicit $mode + %5057:vgpr_32 = V_MUL_F32_e32 %5029, %5030, implicit $exec, implicit $mode + %5058:vgpr_32 = V_MUL_F32_e32 %5030, %5030, implicit $exec, implicit $mode + %5059:vgpr_32 = V_MUL_F32_e32 %5030, %5031, implicit $exec, implicit $mode + %5060:vgpr_32 = V_MUL_F32_e32 %5031, %5031, implicit $exec, implicit $mode + %5061:vgpr_32 = V_MUL_F32_e32 %5031, %5032, implicit $exec, implicit $mode + %5062:vgpr_32 = V_MUL_F32_e32 %5032, %5032, implicit $exec, implicit $mode + %5063:vgpr_32 = V_MUL_F32_e32 %5032, %5033, implicit $exec, implicit $mode + %5064:vgpr_32 = V_MUL_F32_e32 %5033, %5033, implicit $exec, implicit $mode + %5065:vgpr_32 = V_MUL_F32_e32 %5033, %5034, implicit $exec, implicit $mode + %5066:vgpr_32 = V_MUL_F32_e32 %5034, %5034, implicit $exec, implicit $mode + %5067:vgpr_32 = V_MUL_F32_e32 %5034, %5035, implicit $exec, implicit $mode + %5068:vgpr_32 = V_MUL_F32_e32 %5035, %5035, implicit $exec, implicit $mode + %5069:vgpr_32 = V_MUL_F32_e32 %5036, %5036, implicit $exec, implicit $mode + %5070:vgpr_32 = 
V_MUL_F32_e32 %5036, %5037, implicit $exec, implicit $mode + %5071:vgpr_32 = V_MUL_F32_e32 %5037, %5037, implicit $exec, implicit $mode + %5072:vgpr_32 = V_MUL_F32_e32 %5037, %5038, implicit $exec, implicit $mode + %5073:vgpr_32 = V_MUL_F32_e32 %5038, %5038, implicit $exec, implicit $mode + %5074:vgpr_32 = V_MUL_F32_e32 %5038, %5039, implicit $exec, implicit $mode + %5075:vgpr_32 = V_MUL_F32_e32 %5039, %5039, implicit $exec, implicit $mode + %5076:vgpr_32 = V_MUL_F32_e32 %5039, %5040, implicit $exec, implicit $mode + %5077:vgpr_32 = V_MUL_F32_e32 %5040, %5040, implicit $exec, implicit $mode + %5078:vgpr_32 = V_MUL_F32_e32 %5040, %5041, implicit $exec, implicit $mode + %5079:vgpr_32 = V_MUL_F32_e32 %5041, %5041, implicit $exec, implicit $mode + %5080:vgpr_32 = V_MUL_F32_e32 %5041, %5042, implicit $exec, implicit $mode + %5081:vgpr_32 = V_MUL_F32_e32 %5042, %5042, implicit $exec, implicit $mode + %5082:vgpr_32 = V_MUL_F32_e32 %5042, %5043, implicit $exec, implicit $mode + %5083:vgpr_32 = V_MUL_F32_e32 %5043, %5043, implicit $exec, implicit $mode + %5084:vgpr_32 = V_MUL_F32_e32 %5043, %5044, implicit $exec, implicit $mode + %5085:vgpr_32 = V_MUL_F32_e32 %5044, %5044, implicit $exec, implicit $mode + %5086:vgpr_32 = V_MUL_F32_e32 %5044, %5045, implicit $exec, implicit $mode + %5087:vgpr_32 = V_MUL_F32_e32 %5045, %5045, implicit $exec, implicit $mode + %5088:vgpr_32 = V_MUL_F32_e32 %5045, %5046, implicit $exec, implicit $mode + %5089:vgpr_32 = V_MUL_F32_e32 %5046, %5046, implicit $exec, implicit $mode + %5090:vgpr_32 = V_MUL_F32_e32 %5046, %5047, implicit $exec, implicit $mode + %5091:vgpr_32 = V_MUL_F32_e32 %5047, %5047, implicit $exec, implicit $mode + %5092:vgpr_32 = V_MUL_F32_e32 %5047, %5048, implicit $exec, implicit $mode + %5093:vgpr_32 = V_MUL_F32_e32 %5048, %5048, implicit $exec, implicit $mode + %5094:vgpr_32 = V_MUL_F32_e32 %5048, %5049, implicit $exec, implicit $mode + %5095:vgpr_32 = V_MUL_F32_e32 %5049, %5049, implicit $exec, implicit $mode + %5096:vgpr_32 = V_MUL_F32_e32 %5049, %5050, implicit $exec, implicit $mode + %5097:vgpr_32 = V_MUL_F32_e32 %5050, %5050, implicit $exec, implicit $mode + %5098:vgpr_32 = V_MUL_F32_e32 %5050, %5051, implicit $exec, implicit $mode + %5099:vgpr_32 = V_MUL_F32_e32 %5051, %5051, implicit $exec, implicit $mode + %50100:vgpr_32 = V_MUL_F32_e32 %5051, %5052, implicit $exec, implicit $mode + %50101:vgpr_32 = V_MUL_F32_e32 %5052, %5052, implicit $exec, implicit $mode + %50102:vgpr_32 = V_MUL_F32_e32 %5052, %5053, implicit $exec, implicit $mode + %50103:vgpr_32 = V_MUL_F32_e32 %5053, %5053, implicit $exec, implicit $mode + %50104:vgpr_32 = V_MUL_F32_e32 %5053, %5054, implicit $exec, implicit $mode + %50105:vgpr_32 = V_MUL_F32_e32 %5054, %5054, implicit $exec, implicit $mode + %50106:vgpr_32 = V_MUL_F32_e32 %5054, %5055, implicit $exec, implicit $mode + %50107:vgpr_32 = V_MUL_F32_e32 %5055, %5055, implicit $exec, implicit $mode + %50108:vgpr_32 = V_MUL_F32_e32 %5055, %5056, implicit $exec, implicit $mode + %50109:vgpr_32 = V_MUL_F32_e32 %5056, %5056, implicit $exec, implicit $mode + %50110:vgpr_32 = V_MUL_F32_e32 %5056, %5057, implicit $exec, implicit $mode + %50111:vgpr_32 = V_MUL_F32_e32 %5057, %5057, implicit $exec, implicit $mode + %50112:vgpr_32 = V_MUL_F32_e32 %5057, %5058, implicit $exec, implicit $mode + %50113:vgpr_32 = V_MUL_F32_e32 %5058, %5058, implicit $exec, implicit $mode + %50114:vgpr_32 = V_MUL_F32_e32 %5058, %5059, implicit $exec, implicit $mode + %50115:vgpr_32 = V_MUL_F32_e32 %5059, %5059, implicit $exec, implicit $mode + 
%50116:vgpr_32 = V_MUL_F32_e32 %5059, %5060, implicit $exec, implicit $mode + %50117:vgpr_32 = V_MUL_F32_e32 %5060, %5060, implicit $exec, implicit $mode + %50118:vgpr_32 = V_MUL_F32_e32 %5060, %5061, implicit $exec, implicit $mode + %50119:vgpr_32 = V_MUL_F32_e32 %5061, %5061, implicit $exec, implicit $mode + %50120:vgpr_32 = V_MUL_F32_e32 %5061, %5062, implicit $exec, implicit $mode + %50121:vgpr_32 = V_MUL_F32_e32 %5062, %5062, implicit $exec, implicit $mode + %50122:vgpr_32 = V_MUL_F32_e32 %5062, %5063, implicit $exec, implicit $mode + %50123:vgpr_32 = V_MUL_F32_e32 %5063, %5063, implicit $exec, implicit $mode + %50124:vgpr_32 = V_MUL_F32_e32 %5063, %5064, implicit $exec, implicit $mode + %50125:vgpr_32 = V_MUL_F32_e32 %5064, %5064, implicit $exec, implicit $mode + %50126:vgpr_32 = V_MUL_F32_e32 %5064, %5065, implicit $exec, implicit $mode + %50127:vgpr_32 = V_MUL_F32_e32 %5065, %5065, implicit $exec, implicit $mode + %50128:vgpr_32 = V_MUL_F32_e32 %5065, %5066, implicit $exec, implicit $mode + %50129:vgpr_32 = V_MUL_F32_e32 %5066, %5066, implicit $exec, implicit $mode + %50130:vgpr_32 = V_MUL_F32_e32 %5066, %5067, implicit $exec, implicit $mode + %50131:vgpr_32 = V_MUL_F32_e32 %5067, %5067, implicit $exec, implicit $mode + %50132:vgpr_32 = V_MUL_F32_e32 %5067, %5068, implicit $exec, implicit $mode + %50133:vgpr_32 = V_MUL_F32_e32 %5068, %5068, implicit $exec, implicit $mode + + + %8000:vgpr_32 = IMPLICIT_DEF + %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %116:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + + %8001:vgpr_32 = COPY %8000 + %8002:vgpr_32 = COPY %8000 + %8003:vgpr_32 = COPY %8000 + %8004:vgpr_32 = COPY %8000 + %8005:vgpr_32 = COPY %8000 + %8006:vgpr_32 = COPY %8000 + %8007:vgpr_32 = COPY %8000 + %8008:vgpr_32 = COPY %8000 + %8009:vgpr_32 = COPY %8000 + %8010:vgpr_32 = COPY %8000 + %8011:vgpr_32 = COPY %8000 + %8012:vgpr_32 = COPY %8000 + %8013:vgpr_32 = COPY %8000 + %8014:vgpr_32 = COPY %8000 + %8015:vgpr_32 = COPY %8000 + %8016:vgpr_32 = COPY %8000 + %8017:vgpr_32 = COPY %8000 + + %9001:vgpr_32 = COPY %8001 + %9002:vgpr_32 = COPY %8002 + %9003:vgpr_32 = COPY %8003 + %9004:vgpr_32 = COPY %8004 + %9005:vgpr_32 = COPY %8005 + %9006:vgpr_32 = COPY %8006 + %9007:vgpr_32 = COPY %8007 + %9008:vgpr_32 = COPY %8008 + %9009:vgpr_32 = COPY %8009 + %9010:vgpr_32 = COPY %8010 + %9011:vgpr_32 = COPY %8011 + %9012:vgpr_32 = COPY %8012 + %9013:vgpr_32 = COPY %8013 + %9014:vgpr_32 = COPY %8014 + %9015:vgpr_32 = COPY %8015 + %9016:vgpr_32 = COPY %8016 + %9017:vgpr_32 = COPY %8017 + + S_BRANCH %bb.2 + + bb.2: + + %3:vgpr_32 = IMPLICIT_DEF + + EXP 0, killed %500, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %501, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %502, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %503, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %504, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %505, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %506, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %507, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %508, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %509, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5010, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5011, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5012, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, 
killed %5013, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5014, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5015, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5016, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5017, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5018, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5019, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5020, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5021, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5022, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5023, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5024, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5025, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5026, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5027, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5028, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5029, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5030, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5031, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5032, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5033, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5034, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5035, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5036, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5037, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5038, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5039, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5040, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5041, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5042, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5043, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5044, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5045, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5046, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5047, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5048, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5049, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5050, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5051, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5052, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5053, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5054, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5055, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5056, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5057, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5058, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5059, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5060, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5061, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5062, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5063, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5064, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5065, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5066, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5067, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5068, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5069, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5070, %3, 
%3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5071, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5072, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5073, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5074, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5075, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5076, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5077, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5078, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5079, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5080, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5081, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5082, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5083, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5084, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5085, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5086, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5087, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5088, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5089, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5090, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5091, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5092, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5093, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5094, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5095, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5096, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5097, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5098, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5099, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50100, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50101, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50102, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50103, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50104, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50105, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50106, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50107, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50108, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50109, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50110, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50111, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50112, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50113, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50114, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50115, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50116, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50117, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50118, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50119, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50120, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50121, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50122, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50123, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50124, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50125, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50126, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed 
%50127, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50128, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50129, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50130, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50131, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50132, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50133, %3, %3, %3, -1, -1, 15, implicit $exec + + + S_ENDPGM 0 +... + diff --git a/llvm/test/CodeGen/AMDGPU/remat/in_blk.mir b/llvm/test/CodeGen/AMDGPU/remat/in_blk.mir new file mode 100644 index 0000000000000..6db673b849ef2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/in_blk.mir @@ -0,0 +1,760 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-sub-exp-remat -amdgpu-remat-enable-in-blk-remat | FileCheck %s + +# Check that pacifist insts are moved to their users within the block. +# CHECK: bb.0: +# CHECK: %[[#r500:]]:vgpr_32 = V_MOV_B32_e32 $vgpr0 +# CHECK: %[[#r501:]]:vgpr_32 = V_MOV_B32_e32 $vgpr1 +# CHECK: %[[#r502:]]:vgpr_32 = V_MUL_F32_e32 %[[#r500]], %[[#r500]] +# CHECK: EXP 0, %[[#r502]] +# CHECK: %[[#r503:]]:vgpr_32 = V_MUL_F32_e32 %[[#r500]], %[[#r501]] +# CHECK: EXP 0, %[[#r503]] +# CHECK: %[[#r504:]]:vgpr_32 = V_MUL_F32_e32 %[[#r501]], %[[#r501]] +# CHECK: EXP 0, %[[#r504]] +# CHECK: %[[#r505:]]:vgpr_32 = V_MUL_F32_e32 %[[#r502]], %[[#r502]] +# CHECK: EXP 0, %[[#r505]] +# CHECK: %[[#r506:]]:vgpr_32 = V_MUL_F32_e32 %[[#r502]], %[[#r503]] +# CHECK: EXP 0, %[[#r506]] +# CHECK: %[[#r507:]]:vgpr_32 = V_MUL_F32_e32 %[[#r503]], %[[#r503]] +# CHECK: EXP 0, %[[#r507]] +# CHECK: %[[#r508:]]:vgpr_32 = V_MUL_F32_e32 %[[#r503]], %[[#r504]] +# CHECK: EXP 0, %[[#r508]] +# CHECK: %[[#r509:]]:vgpr_32 = V_MUL_F32_e32 %[[#r504]], %[[#r504]] +# CHECK: EXP 0, %[[#r509]] +# CHECK: %[[#r5010:]]:vgpr_32 = V_MUL_F32_e32 %[[#r505]], %[[#r505]] +# CHECK: EXP 0, %[[#r5010]] +# CHECK: %[[#r5011:]]:vgpr_32 = V_MUL_F32_e32 %[[#r505]], %[[#r506]] +# CHECK: EXP 0, %[[#r5011]] +# CHECK: %[[#r5012:]]:vgpr_32 = V_MUL_F32_e32 %[[#r506]], %[[#r506]] +# CHECK: EXP 0, %[[#r5012]] +# CHECK: %[[#r5013:]]:vgpr_32 = V_MUL_F32_e32 %[[#r506]], %[[#r507]] +# CHECK: EXP 0, %[[#r5013]] +# CHECK: %[[#r5014:]]:vgpr_32 = V_MUL_F32_e32 %[[#r507]], %[[#r507]] +# CHECK: EXP 0, %[[#r5014]] +# CHECK: %[[#r5015:]]:vgpr_32 = V_MUL_F32_e32 %[[#r507]], %[[#r508]] +# CHECK: EXP 0, %[[#r5015]] +# CHECK: %[[#r5016:]]:vgpr_32 = V_MUL_F32_e32 %[[#r508]], %[[#r508]] +# CHECK: EXP 0, %[[#r5016]] +# CHECK: %[[#r5017:]]:vgpr_32 = V_MUL_F32_e32 %[[#r508]], %[[#r509]] +# CHECK: EXP 0, %[[#r5017]] +# CHECK: %[[#r5018:]]:vgpr_32 = V_MUL_F32_e32 %[[#r509]], %[[#r509]] +# CHECK: EXP 0, %[[#r5018]] +# CHECK: %[[#r5019:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5010]], %[[#r5010]] +# CHECK: EXP 0, %[[#r5019]] +# CHECK: %[[#r5020:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5010]], %[[#r5011]] +# CHECK: EXP 0, %[[#r5020]] +# CHECK: %[[#r5021:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5011]], %[[#r5011]] +# CHECK: EXP 0, %[[#r5021]] +# CHECK: %[[#r5022:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5011]], %[[#r5012]] +# CHECK: EXP 0, %[[#r5022]] +# CHECK: %[[#r5023:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5012]], %[[#r5012]] +# CHECK: EXP 0, %[[#r5023]] +# CHECK: %[[#r5024:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5012]], %[[#r5013]] +# CHECK: EXP 0, %[[#r5024]] +# CHECK: %[[#r5025:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5013]], %[[#r5013]] +# CHECK: EXP 0, %[[#r5025]] +# CHECK: %[[#r5026:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5013]], %[[#r5014]] +# CHECK: EXP 
0, %[[#r5026]] +# CHECK: %[[#r5027:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5014]], %[[#r5014]] +# CHECK: EXP 0, %[[#r5027]] +# CHECK: %[[#r5028:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5014]], %[[#r5015]] +# CHECK: EXP 0, %[[#r5028]] +# CHECK: %[[#r5029:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5015]], %[[#r5015]] +# CHECK: EXP 0, %[[#r5029]] +# CHECK: %[[#r5030:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5015]], %[[#r5016]] +# CHECK: EXP 0, %[[#r5030]] +# CHECK: %[[#r5031:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5016]], %[[#r5016]] +# CHECK: EXP 0, %[[#r5031]] +# CHECK: %[[#r5032:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5016]], %[[#r5017]] +# CHECK: EXP 0, %[[#r5032]] +# CHECK: %[[#r5033:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5017]], %[[#r5017]] +# CHECK: EXP 0, %[[#r5033]] +# CHECK: %[[#r5034:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5017]], %[[#r5018]] +# CHECK: EXP 0, %[[#r5034]] +# CHECK: %[[#r5035:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5018]], %[[#r5018]] +# CHECK: EXP 0, %[[#r5035]] +# CHECK: %[[#r5036:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5019]], %[[#r5019]] +# CHECK: EXP 0, %[[#r5036]] +# CHECK: %[[#r5037:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5019]], %[[#r5020]] +# CHECK: EXP 0, %[[#r5037]] +# CHECK: %[[#r5038:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5020]], %[[#r5020]] +# CHECK: EXP 0, %[[#r5038]] +# CHECK: %[[#r5039:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5020]], %[[#r5021]] +# CHECK: EXP 0, %[[#r5039]] +# CHECK: %[[#r5040:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5021]], %[[#r5021]] +# CHECK: EXP 0, %[[#r5040]] +# CHECK: %[[#r5041:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5021]], %[[#r5022]] +# CHECK: EXP 0, %[[#r5041]] +# CHECK: %[[#r5042:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5022]], %[[#r5022]] +# CHECK: EXP 0, %[[#r5042]] +# CHECK: %[[#r5043:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5022]], %[[#r5023]] +# CHECK: EXP 0, %[[#r5043]] +# CHECK: %[[#r5044:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5023]], %[[#r5023]] +# CHECK: EXP 0, %[[#r5044]] +# CHECK: %[[#r5045:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5023]], %[[#r5024]] +# CHECK: EXP 0, %[[#r5045]] +# CHECK: %[[#r5046:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5024]], %[[#r5024]] +# CHECK: EXP 0, %[[#r5046]] +# CHECK: %[[#r5047:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5024]], %[[#r5025]] +# CHECK: EXP 0, %[[#r5047]] +# CHECK: %[[#r5048:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5025]], %[[#r5025]] +# CHECK: EXP 0, %[[#r5048]] +# CHECK: %[[#r5049:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5025]], %[[#r5026]] +# CHECK: EXP 0, %[[#r5049]] +# CHECK: %[[#r5050:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5026]], %[[#r5026]] +# CHECK: EXP 0, %[[#r5050]] +# CHECK: %[[#r5051:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5026]], %[[#r5027]] +# CHECK: EXP 0, %[[#r5051]] +# CHECK: %[[#r5052:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5027]], %[[#r5027]] +# CHECK: EXP 0, %[[#r5052]] +# CHECK: %[[#r5053:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5027]], %[[#r5028]] +# CHECK: EXP 0, %[[#r5053]] +# CHECK: %[[#r5054:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5028]], %[[#r5028]] +# CHECK: EXP 0, %[[#r5054]] +# CHECK: %[[#r5055:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5028]], %[[#r5029]] +# CHECK: EXP 0, %[[#r5055]] +# CHECK: %[[#r5056:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5029]], %[[#r5029]] +# CHECK: EXP 0, %[[#r5056]] +# CHECK: %[[#r5057:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5029]], %[[#r5030]] +# CHECK: EXP 0, %[[#r5057]] +# CHECK: %[[#r5058:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5030]], %[[#r5030]] +# CHECK: EXP 0, %[[#r5058]] +# CHECK: %[[#r5059:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5030]], %[[#r5031]] +# CHECK: EXP 0, %[[#r5059]] +# CHECK: %[[#r5060:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5031]], %[[#r5031]] +# CHECK: EXP 0, %[[#r5060]] +# CHECK: %[[#r5061:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5031]], %[[#r5032]] +# CHECK: EXP 0, %[[#r5061]] +# 
CHECK: %[[#r5062:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5032]], %[[#r5032]] +# CHECK: EXP 0, %[[#r5062]] +# CHECK: %[[#r5063:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5032]], %[[#r5033]] +# CHECK: EXP 0, %[[#r5063]] +# CHECK: %[[#r5064:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5033]], %[[#r5033]] +# CHECK: EXP 0, %[[#r5064]] +# CHECK: %[[#r5065:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5033]], %[[#r5034]] +# CHECK: EXP 0, %[[#r5065]] +# CHECK: %[[#r5066:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5034]], %[[#r5034]] +# CHECK: EXP 0, %[[#r5066]] +# CHECK: %[[#r5067:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5034]], %[[#r5035]] +# CHECK: EXP 0, %[[#r5067]] +# CHECK: %[[#r5068:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5035]], %[[#r5035]] +# CHECK: EXP 0, %[[#r5068]] +# CHECK: %[[#r5069:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5036]], %[[#r5036]] +# CHECK: EXP 0, %[[#r5069]] +# CHECK: %[[#r5070:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5036]], %[[#r5037]] +# CHECK: EXP 0, %[[#r5070]] +# CHECK: %[[#r5071:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5037]], %[[#r5037]] +# CHECK: EXP 0, %[[#r5071]] +# CHECK: %[[#r5072:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5037]], %[[#r5038]] +# CHECK: EXP 0, %[[#r5072]] +# CHECK: %[[#r5073:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5038]], %[[#r5038]] +# CHECK: EXP 0, %[[#r5073]] +# CHECK: %[[#r5074:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5038]], %[[#r5039]] +# CHECK: EXP 0, %[[#r5074]] +# CHECK: %[[#r5075:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5039]], %[[#r5039]] +# CHECK: EXP 0, %[[#r5075]] +# CHECK: %[[#r5076:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5039]], %[[#r5040]] +# CHECK: EXP 0, %[[#r5076]] +# CHECK: %[[#r5077:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5040]], %[[#r5040]] +# CHECK: EXP 0, %[[#r5077]] +# CHECK: %[[#r5078:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5040]], %[[#r5041]] +# CHECK: EXP 0, %[[#r5078]] +# CHECK: %[[#r5079:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5041]], %[[#r5041]] +# CHECK: EXP 0, %[[#r5079]] +# CHECK: %[[#r5080:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5041]], %[[#r5042]] +# CHECK: EXP 0, %[[#r5080]] +# CHECK: %[[#r5081:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5042]], %[[#r5042]] +# CHECK: EXP 0, %[[#r5081]] +# CHECK: %[[#r5082:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5042]], %[[#r5043]] +# CHECK: EXP 0, %[[#r5082]] +# CHECK: %[[#r5083:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5043]], %[[#r5043]] +# CHECK: EXP 0, %[[#r5083]] +# CHECK: %[[#r5084:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5043]], %[[#r5044]] +# CHECK: EXP 0, %[[#r5084]] +# CHECK: %[[#r5085:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5044]], %[[#r5044]] +# CHECK: EXP 0, %[[#r5085]] +# CHECK: %[[#r5086:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5044]], %[[#r5045]] +# CHECK: EXP 0, %[[#r5086]] +# CHECK: %[[#r5087:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5045]], %[[#r5045]] +# CHECK: EXP 0, %[[#r5087]] +# CHECK: %[[#r5088:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5045]], %[[#r5046]] +# CHECK: EXP 0, %[[#r5088]] +# CHECK: %[[#r5089:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5046]], %[[#r5046]] +# CHECK: EXP 0, %[[#r5089]] +# CHECK: %[[#r5090:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5046]], %[[#r5047]] +# CHECK: EXP 0, %[[#r5090]] +# CHECK: %[[#r5091:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5047]], %[[#r5047]] +# CHECK: EXP 0, %[[#r5091]] +# CHECK: %[[#r5092:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5047]], %[[#r5048]] +# CHECK: EXP 0, %[[#r5092]] +# CHECK: %[[#r5093:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5048]], %[[#r5048]] +# CHECK: EXP 0, %[[#r5093]] +# CHECK: %[[#r5094:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5048]], %[[#r5049]] +# CHECK: EXP 0, %[[#r5094]] +# CHECK: %[[#r5095:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5049]], %[[#r5049]] +# CHECK: EXP 0, %[[#r5095]] +# CHECK: %[[#r5096:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5049]], %[[#r5050]] +# CHECK: EXP 0, %[[#r5096]] +# CHECK: 
%[[#r5097:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5050]], %[[#r5050]] +# CHECK: EXP 0, %[[#r5097]] +# CHECK: %[[#r5098:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5050]], %[[#r5051]] +# CHECK: EXP 0, %[[#r5098]] +# CHECK: %[[#r5099:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5051]], %[[#r5051]] +# CHECK: EXP 0, %[[#r5099]] +# CHECK: %[[#r50100:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5051]], %[[#r5052]] +# CHECK: EXP 0, %[[#r50100]] +# CHECK: %[[#r50101:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5052]], %[[#r5052]] +# CHECK: EXP 0, %[[#r50101]] +# CHECK: %[[#r50102:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5052]], %[[#r5053]] +# CHECK: EXP 0, %[[#r50102]] +# CHECK: %[[#r50103:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5053]], %[[#r5053]] +# CHECK: EXP 0, %[[#r50103]] +# CHECK: %[[#r50104:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5053]], %[[#r5054]] +# CHECK: EXP 0, %[[#r50104]] +# CHECK: %[[#r50105:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5054]], %[[#r5054]] +# CHECK: EXP 0, %[[#r50105]] +# CHECK: %[[#r50106:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5054]], %[[#r5055]] +# CHECK: EXP 0, %[[#r50106]] +# CHECK: %[[#r50107:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5055]], %[[#r5055]] +# CHECK: EXP 0, %[[#r50107]] +# CHECK: %[[#r50108:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5055]], %[[#r5056]] +# CHECK: EXP 0, %[[#r50108]] +# CHECK: %[[#r50109:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5056]], %[[#r5056]] +# CHECK: EXP 0, %[[#r50109]] +# CHECK: %[[#r50110:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5056]], %[[#r5057]] +# CHECK: EXP 0, %[[#r50110]] +# CHECK: %[[#r50111:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5057]], %[[#r5057]] +# CHECK: EXP 0, %[[#r50111]] +# CHECK: %[[#r50112:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5057]], %[[#r5058]] +# CHECK: EXP 0, %[[#r50112]] +# CHECK: %[[#r50113:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5058]], %[[#r5058]] +# CHECK: EXP 0, %[[#r50113]] +# CHECK: %[[#r50114:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5058]], %[[#r5059]] +# CHECK: EXP 0, %[[#r50114]] +# CHECK: %[[#r50115:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5059]], %[[#r5059]] +# CHECK: EXP 0, %[[#r50115]] +# CHECK: %[[#r50116:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5059]], %[[#r5060]] +# CHECK: EXP 0, %[[#r50116]] +# CHECK: %[[#r50117:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5060]], %[[#r5060]] +# CHECK: EXP 0, %[[#r50117]] +# CHECK: %[[#r50118:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5060]], %[[#r5061]] +# CHECK: EXP 0, %[[#r50118]] +# CHECK: %[[#r50119:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5061]], %[[#r5061]] +# CHECK: EXP 0, %[[#r50119]] +# CHECK: %[[#r50120:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5061]], %[[#r5062]] +# CHECK: EXP 0, %[[#r50120]] +# CHECK: %[[#r50121:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5062]], %[[#r5062]] +# CHECK: EXP 0, %[[#r50121]] +# CHECK: %[[#r50122:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5062]], %[[#r5063]] +# CHECK: EXP 0, %[[#r50122]] +# CHECK: %[[#r50123:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5063]], %[[#r5063]] +# CHECK: EXP 0, %[[#r50123]] +# CHECK: %[[#r50124:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5063]], %[[#r5064]] +# CHECK: EXP 0, %[[#r50124]] +# CHECK: %[[#r50125:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5064]], %[[#r5064]] +# CHECK: EXP 0, %[[#r50125]] +# CHECK: %[[#r50126:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5064]], %[[#r5065]] +# CHECK: EXP 0, %[[#r50126]] +# CHECK: %[[#r50127:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5065]], %[[#r5065]] +# CHECK: EXP 0, %[[#r50127]] +# CHECK: %[[#r50128:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5065]], %[[#r5066]] +# CHECK: EXP 0, %[[#r50128]] +# CHECK: %[[#r50129:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5066]], %[[#r5066]] +# CHECK: EXP 0, %[[#r50129]] +# CHECK: %[[#r50130:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5066]], %[[#r5067]] +# CHECK: EXP 0, %[[#r50130]] +# CHECK: %[[#r50131:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5067]], 
%[[#r5067]] +# CHECK: EXP 0, %[[#r50131]] +# CHECK: %[[#r50132:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5067]], %[[#r5068]] +# CHECK: EXP 0, %[[#r50132]] +# CHECK: %[[#r50133:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5068]], %[[#r5068]] +# CHECK: EXP 0, %[[#r50133]] + + +--- | + source_filename = ".\main.ll" + define amdgpu_ps void @main() #1 { + ret void + } + attributes #1 = { "target-cpu"="gfx1010" } + !llvm.ident = !{!0} + !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"} +... +--- +name: main +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } + - { reg: '$sgpr1' } + - { reg: '$sgpr2' } + - { reg: '$sgpr3' } + - { reg: '$sgpr4' } + - { reg: '$sgpr5' } + - { reg: '$sgpr6' } + - { reg: '$sgpr7' } + - { reg: '$sgpr8' } + - { reg: '$sgpr8' } + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +body: | + bb.0: + successors: %bb.1 + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1 + + %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1 + %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3 + + %3:vgpr_32 = IMPLICIT_DEF + + + %500:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %501:vgpr_32 = V_MOV_B32_e32 $vgpr1, implicit $exec + %502:vgpr_32 = V_MUL_F32_e32 %500, %500, implicit $exec, implicit $mode + %503:vgpr_32 = V_MUL_F32_e32 %500, %501, implicit $exec, implicit $mode + %504:vgpr_32 = V_MUL_F32_e32 %501, %501, implicit $exec, implicit $mode + %505:vgpr_32 = V_MUL_F32_e32 %502, %502, implicit $exec, implicit $mode + %506:vgpr_32 = V_MUL_F32_e32 %502, %503, implicit $exec, implicit $mode + %507:vgpr_32 = V_MUL_F32_e32 %503, %503, implicit $exec, implicit $mode + %508:vgpr_32 = V_MUL_F32_e32 %503, %504, implicit $exec, implicit $mode + %509:vgpr_32 = V_MUL_F32_e32 %504, %504, implicit $exec, implicit $mode + %5010:vgpr_32 = V_MUL_F32_e32 %505, %505, implicit $exec, implicit $mode + %5011:vgpr_32 = V_MUL_F32_e32 %505, %506, implicit $exec, implicit $mode + %5012:vgpr_32 = V_MUL_F32_e32 %506, %506, implicit $exec, implicit $mode + %5013:vgpr_32 = V_MUL_F32_e32 %506, %507, implicit $exec, implicit $mode + %5014:vgpr_32 = V_MUL_F32_e32 %507, %507, implicit $exec, implicit $mode + %5015:vgpr_32 = V_MUL_F32_e32 %507, %508, implicit $exec, implicit $mode + %5016:vgpr_32 = V_MUL_F32_e32 %508, %508, implicit $exec, implicit $mode + %5017:vgpr_32 = V_MUL_F32_e32 %508, %509, implicit $exec, implicit $mode + %5018:vgpr_32 = V_MUL_F32_e32 %509, %509, implicit $exec, implicit $mode + %5019:vgpr_32 = V_MUL_F32_e32 %5010, %5010, implicit $exec, implicit $mode + %5020:vgpr_32 = V_MUL_F32_e32 %5010, %5011, implicit $exec, implicit $mode + %5021:vgpr_32 = V_MUL_F32_e32 %5011, %5011, implicit $exec, implicit $mode + %5022:vgpr_32 = V_MUL_F32_e32 %5011, %5012, implicit $exec, implicit $mode + %5023:vgpr_32 = V_MUL_F32_e32 %5012, %5012, implicit $exec, implicit $mode + %5024:vgpr_32 = V_MUL_F32_e32 %5012, %5013, implicit $exec, implicit $mode + %5025:vgpr_32 = V_MUL_F32_e32 %5013, %5013, implicit $exec, implicit $mode + %5026:vgpr_32 = V_MUL_F32_e32 %5013, %5014, implicit $exec, implicit $mode + %5027:vgpr_32 = V_MUL_F32_e32 %5014, %5014, implicit $exec, implicit $mode + %5028:vgpr_32 = V_MUL_F32_e32 %5014, %5015, implicit $exec, implicit $mode + %5029:vgpr_32 = V_MUL_F32_e32 %5015, %5015, implicit $exec, implicit $mode + %5030:vgpr_32 = V_MUL_F32_e32 %5015, %5016, implicit $exec, implicit $mode + %5031:vgpr_32 = V_MUL_F32_e32 %5016, %5016, implicit $exec, implicit $mode + %5032:vgpr_32 = V_MUL_F32_e32 
%5016, %5017, implicit $exec, implicit $mode + %5033:vgpr_32 = V_MUL_F32_e32 %5017, %5017, implicit $exec, implicit $mode + %5034:vgpr_32 = V_MUL_F32_e32 %5017, %5018, implicit $exec, implicit $mode + %5035:vgpr_32 = V_MUL_F32_e32 %5018, %5018, implicit $exec, implicit $mode + %5036:vgpr_32 = V_MUL_F32_e32 %5019, %5019, implicit $exec, implicit $mode + %5037:vgpr_32 = V_MUL_F32_e32 %5019, %5020, implicit $exec, implicit $mode + %5038:vgpr_32 = V_MUL_F32_e32 %5020, %5020, implicit $exec, implicit $mode + %5039:vgpr_32 = V_MUL_F32_e32 %5020, %5021, implicit $exec, implicit $mode + %5040:vgpr_32 = V_MUL_F32_e32 %5021, %5021, implicit $exec, implicit $mode + %5041:vgpr_32 = V_MUL_F32_e32 %5021, %5022, implicit $exec, implicit $mode + %5042:vgpr_32 = V_MUL_F32_e32 %5022, %5022, implicit $exec, implicit $mode + %5043:vgpr_32 = V_MUL_F32_e32 %5022, %5023, implicit $exec, implicit $mode + %5044:vgpr_32 = V_MUL_F32_e32 %5023, %5023, implicit $exec, implicit $mode + %5045:vgpr_32 = V_MUL_F32_e32 %5023, %5024, implicit $exec, implicit $mode + %5046:vgpr_32 = V_MUL_F32_e32 %5024, %5024, implicit $exec, implicit $mode + %5047:vgpr_32 = V_MUL_F32_e32 %5024, %5025, implicit $exec, implicit $mode + %5048:vgpr_32 = V_MUL_F32_e32 %5025, %5025, implicit $exec, implicit $mode + %5049:vgpr_32 = V_MUL_F32_e32 %5025, %5026, implicit $exec, implicit $mode + %5050:vgpr_32 = V_MUL_F32_e32 %5026, %5026, implicit $exec, implicit $mode + %5051:vgpr_32 = V_MUL_F32_e32 %5026, %5027, implicit $exec, implicit $mode + %5052:vgpr_32 = V_MUL_F32_e32 %5027, %5027, implicit $exec, implicit $mode + %5053:vgpr_32 = V_MUL_F32_e32 %5027, %5028, implicit $exec, implicit $mode + %5054:vgpr_32 = V_MUL_F32_e32 %5028, %5028, implicit $exec, implicit $mode + %5055:vgpr_32 = V_MUL_F32_e32 %5028, %5029, implicit $exec, implicit $mode + %5056:vgpr_32 = V_MUL_F32_e32 %5029, %5029, implicit $exec, implicit $mode + %5057:vgpr_32 = V_MUL_F32_e32 %5029, %5030, implicit $exec, implicit $mode + %5058:vgpr_32 = V_MUL_F32_e32 %5030, %5030, implicit $exec, implicit $mode + %5059:vgpr_32 = V_MUL_F32_e32 %5030, %5031, implicit $exec, implicit $mode + %5060:vgpr_32 = V_MUL_F32_e32 %5031, %5031, implicit $exec, implicit $mode + %5061:vgpr_32 = V_MUL_F32_e32 %5031, %5032, implicit $exec, implicit $mode + %5062:vgpr_32 = V_MUL_F32_e32 %5032, %5032, implicit $exec, implicit $mode + %5063:vgpr_32 = V_MUL_F32_e32 %5032, %5033, implicit $exec, implicit $mode + %5064:vgpr_32 = V_MUL_F32_e32 %5033, %5033, implicit $exec, implicit $mode + %5065:vgpr_32 = V_MUL_F32_e32 %5033, %5034, implicit $exec, implicit $mode + %5066:vgpr_32 = V_MUL_F32_e32 %5034, %5034, implicit $exec, implicit $mode + %5067:vgpr_32 = V_MUL_F32_e32 %5034, %5035, implicit $exec, implicit $mode + %5068:vgpr_32 = V_MUL_F32_e32 %5035, %5035, implicit $exec, implicit $mode + %5069:vgpr_32 = V_MUL_F32_e32 %5036, %5036, implicit $exec, implicit $mode + %5070:vgpr_32 = V_MUL_F32_e32 %5036, %5037, implicit $exec, implicit $mode + %5071:vgpr_32 = V_MUL_F32_e32 %5037, %5037, implicit $exec, implicit $mode + %5072:vgpr_32 = V_MUL_F32_e32 %5037, %5038, implicit $exec, implicit $mode + %5073:vgpr_32 = V_MUL_F32_e32 %5038, %5038, implicit $exec, implicit $mode + %5074:vgpr_32 = V_MUL_F32_e32 %5038, %5039, implicit $exec, implicit $mode + %5075:vgpr_32 = V_MUL_F32_e32 %5039, %5039, implicit $exec, implicit $mode + %5076:vgpr_32 = V_MUL_F32_e32 %5039, %5040, implicit $exec, implicit $mode + %5077:vgpr_32 = V_MUL_F32_e32 %5040, %5040, implicit $exec, implicit $mode + %5078:vgpr_32 = V_MUL_F32_e32 %5040, 
%5041, implicit $exec, implicit $mode + %5079:vgpr_32 = V_MUL_F32_e32 %5041, %5041, implicit $exec, implicit $mode + %5080:vgpr_32 = V_MUL_F32_e32 %5041, %5042, implicit $exec, implicit $mode + %5081:vgpr_32 = V_MUL_F32_e32 %5042, %5042, implicit $exec, implicit $mode + %5082:vgpr_32 = V_MUL_F32_e32 %5042, %5043, implicit $exec, implicit $mode + %5083:vgpr_32 = V_MUL_F32_e32 %5043, %5043, implicit $exec, implicit $mode + %5084:vgpr_32 = V_MUL_F32_e32 %5043, %5044, implicit $exec, implicit $mode + %5085:vgpr_32 = V_MUL_F32_e32 %5044, %5044, implicit $exec, implicit $mode + %5086:vgpr_32 = V_MUL_F32_e32 %5044, %5045, implicit $exec, implicit $mode + %5087:vgpr_32 = V_MUL_F32_e32 %5045, %5045, implicit $exec, implicit $mode + %5088:vgpr_32 = V_MUL_F32_e32 %5045, %5046, implicit $exec, implicit $mode + %5089:vgpr_32 = V_MUL_F32_e32 %5046, %5046, implicit $exec, implicit $mode + %5090:vgpr_32 = V_MUL_F32_e32 %5046, %5047, implicit $exec, implicit $mode + %5091:vgpr_32 = V_MUL_F32_e32 %5047, %5047, implicit $exec, implicit $mode + %5092:vgpr_32 = V_MUL_F32_e32 %5047, %5048, implicit $exec, implicit $mode + %5093:vgpr_32 = V_MUL_F32_e32 %5048, %5048, implicit $exec, implicit $mode + %5094:vgpr_32 = V_MUL_F32_e32 %5048, %5049, implicit $exec, implicit $mode + %5095:vgpr_32 = V_MUL_F32_e32 %5049, %5049, implicit $exec, implicit $mode + %5096:vgpr_32 = V_MUL_F32_e32 %5049, %5050, implicit $exec, implicit $mode + %5097:vgpr_32 = V_MUL_F32_e32 %5050, %5050, implicit $exec, implicit $mode + %5098:vgpr_32 = V_MUL_F32_e32 %5050, %5051, implicit $exec, implicit $mode + %5099:vgpr_32 = V_MUL_F32_e32 %5051, %5051, implicit $exec, implicit $mode + %50100:vgpr_32 = V_MUL_F32_e32 %5051, %5052, implicit $exec, implicit $mode + %50101:vgpr_32 = V_MUL_F32_e32 %5052, %5052, implicit $exec, implicit $mode + %50102:vgpr_32 = V_MUL_F32_e32 %5052, %5053, implicit $exec, implicit $mode + %50103:vgpr_32 = V_MUL_F32_e32 %5053, %5053, implicit $exec, implicit $mode + %50104:vgpr_32 = V_MUL_F32_e32 %5053, %5054, implicit $exec, implicit $mode + %50105:vgpr_32 = V_MUL_F32_e32 %5054, %5054, implicit $exec, implicit $mode + %50106:vgpr_32 = V_MUL_F32_e32 %5054, %5055, implicit $exec, implicit $mode + %50107:vgpr_32 = V_MUL_F32_e32 %5055, %5055, implicit $exec, implicit $mode + %50108:vgpr_32 = V_MUL_F32_e32 %5055, %5056, implicit $exec, implicit $mode + %50109:vgpr_32 = V_MUL_F32_e32 %5056, %5056, implicit $exec, implicit $mode + %50110:vgpr_32 = V_MUL_F32_e32 %5056, %5057, implicit $exec, implicit $mode + %50111:vgpr_32 = V_MUL_F32_e32 %5057, %5057, implicit $exec, implicit $mode + %50112:vgpr_32 = V_MUL_F32_e32 %5057, %5058, implicit $exec, implicit $mode + %50113:vgpr_32 = V_MUL_F32_e32 %5058, %5058, implicit $exec, implicit $mode + %50114:vgpr_32 = V_MUL_F32_e32 %5058, %5059, implicit $exec, implicit $mode + %50115:vgpr_32 = V_MUL_F32_e32 %5059, %5059, implicit $exec, implicit $mode + %50116:vgpr_32 = V_MUL_F32_e32 %5059, %5060, implicit $exec, implicit $mode + %50117:vgpr_32 = V_MUL_F32_e32 %5060, %5060, implicit $exec, implicit $mode + %50118:vgpr_32 = V_MUL_F32_e32 %5060, %5061, implicit $exec, implicit $mode + %50119:vgpr_32 = V_MUL_F32_e32 %5061, %5061, implicit $exec, implicit $mode + %50120:vgpr_32 = V_MUL_F32_e32 %5061, %5062, implicit $exec, implicit $mode + %50121:vgpr_32 = V_MUL_F32_e32 %5062, %5062, implicit $exec, implicit $mode + %50122:vgpr_32 = V_MUL_F32_e32 %5062, %5063, implicit $exec, implicit $mode + %50123:vgpr_32 = V_MUL_F32_e32 %5063, %5063, implicit $exec, implicit $mode + %50124:vgpr_32 = 
V_MUL_F32_e32 %5063, %5064, implicit $exec, implicit $mode + %50125:vgpr_32 = V_MUL_F32_e32 %5064, %5064, implicit $exec, implicit $mode + %50126:vgpr_32 = V_MUL_F32_e32 %5064, %5065, implicit $exec, implicit $mode + %50127:vgpr_32 = V_MUL_F32_e32 %5065, %5065, implicit $exec, implicit $mode + %50128:vgpr_32 = V_MUL_F32_e32 %5065, %5066, implicit $exec, implicit $mode + %50129:vgpr_32 = V_MUL_F32_e32 %5066, %5066, implicit $exec, implicit $mode + %50130:vgpr_32 = V_MUL_F32_e32 %5066, %5067, implicit $exec, implicit $mode + %50131:vgpr_32 = V_MUL_F32_e32 %5067, %5067, implicit $exec, implicit $mode + %50132:vgpr_32 = V_MUL_F32_e32 %5067, %5068, implicit $exec, implicit $mode + %50133:vgpr_32 = V_MUL_F32_e32 %5068, %5068, implicit $exec, implicit $mode + + + + EXP 0, %500, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %501, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %502, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %503, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %504, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %505, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %506, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %507, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %508, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %509, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5010, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5011, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5012, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5013, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5014, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5015, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5016, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5017, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5018, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5019, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5020, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5021, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5022, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5023, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5024, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5025, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5026, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5027, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5028, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5029, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5030, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5031, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5032, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5033, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5034, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5035, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5036, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5037, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5038, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5039, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5040, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5041, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5042, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5043, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5044, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5045, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5046, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5047, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5048, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5049, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5050, %3, %3, %3, -1, -1, 15, 
implicit $exec + EXP 0, %5051, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5052, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5053, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5054, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5055, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5056, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5057, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5058, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5059, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5060, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5061, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5062, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5063, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5064, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5065, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5066, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5067, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5068, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5069, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5070, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5071, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5072, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5073, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5074, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5075, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5076, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5077, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5078, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5079, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5080, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5081, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5082, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5083, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5084, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5085, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5086, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5087, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5088, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5089, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5090, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5091, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5092, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5093, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5094, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5095, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5096, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5097, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5098, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5099, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50100, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50101, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50102, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50103, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50104, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50105, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50106, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50107, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50108, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50109, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50110, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50111, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50112, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50113, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50114, %3, %3, %3, -1, -1, 15, implicit $exec + 
EXP 0, %50115, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50116, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50117, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50118, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50119, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50120, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50121, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50122, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50123, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50124, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50125, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50126, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50127, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50128, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50129, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50130, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50131, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50132, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50133, %3, %3, %3, -1, -1, 15, implicit $exec + + + S_BRANCH %bb.1 + + ; %8001:vgpr_32 = COPY %8000 + ; %8002:vgpr_32 = COPY %8000 + ; %8003:vgpr_32 = COPY %8000 + ; %8004:vgpr_32 = COPY %8000 + ; %8005:vgpr_32 = COPY %8000 + ; %8006:vgpr_32 = COPY %8000 + ; %8007:vgpr_32 = COPY %8000 + ; %8008:vgpr_32 = COPY %8000 + ; %8009:vgpr_32 = COPY %8000 + ; %8010:vgpr_32 = COPY %8000 + ; %8011:vgpr_32 = COPY %8000 + ; %8012:vgpr_32 = COPY %8000 + ; %8013:vgpr_32 = COPY %8000 + ; %8014:vgpr_32 = COPY %8000 + ; %8015:vgpr_32 = COPY %8000 + ; %8016:vgpr_32 = COPY %8000 + ; %8017:vgpr_32 = COPY %8000 + + ; %9001:vgpr_32 = COPY %8001 + ; %9002:vgpr_32 = COPY %8002 + ; %9003:vgpr_32 = COPY %8003 + ; %9004:vgpr_32 = COPY %8004 + ; %9005:vgpr_32 = COPY %8005 + ; %9006:vgpr_32 = COPY %8006 + ; %9007:vgpr_32 = COPY %8007 + ; %9008:vgpr_32 = COPY %8008 + ; %9009:vgpr_32 = COPY %8009 + ; %9010:vgpr_32 = COPY %8010 + ; %9011:vgpr_32 = COPY %8011 + ; %9012:vgpr_32 = COPY %8012 + ; %9013:vgpr_32 = COPY %8013 + ; %9014:vgpr_32 = COPY %8014 + ; %9015:vgpr_32 = COPY %8015 + ; %9016:vgpr_32 = COPY %8016 + ; %9017:vgpr_32 = COPY %8017 + + bb.1: + + EXP 0, %500, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %501, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %502, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %503, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %504, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %505, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %506, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %507, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %508, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %509, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5010, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5011, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5012, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5013, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5014, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5015, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5016, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5017, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5018, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5019, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5020, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5021, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5022, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5023, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5024, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5025, %3, %3, %3, -1, -1, 
15, implicit $exec + EXP 0, %5026, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5027, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5028, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5029, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5030, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5031, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5032, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5033, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5034, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5035, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5036, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5037, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5038, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5039, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5040, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5041, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5042, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5043, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5044, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5045, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5046, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5047, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5048, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5049, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5050, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5051, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5052, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5053, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5054, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5055, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5056, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5057, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5058, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5059, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5060, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5061, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5062, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5063, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5064, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5065, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5066, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5067, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5068, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5069, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5070, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5071, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5072, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5073, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5074, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5075, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5076, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5077, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5078, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5079, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5080, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5081, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5082, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5083, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5084, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5085, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5086, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5087, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5088, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5089, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5090, 
%3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5091, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5092, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5093, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5094, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5095, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5096, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5097, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5098, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5099, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50100, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50101, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50102, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50103, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50104, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50105, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50106, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50107, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50108, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50109, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50110, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50111, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50112, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50113, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50114, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50115, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50116, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50117, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50118, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50119, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50120, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50121, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50122, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50123, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50124, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50125, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50126, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50127, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50128, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50129, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50130, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50131, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50132, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50133, %3, %3, %3, -1, -1, 15, implicit $exec + + + S_ENDPGM 0 +... + \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/remat/phi.mir b/llvm/test/CodeGen/AMDGPU/remat/phi.mir new file mode 100644 index 0000000000000..5ee563e7a633f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/phi.mir @@ -0,0 +1,709 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat | FileCheck %s + +# Check that none of these defs are moved to their uses since they're used by +# PHIS. 
+# CHECK: bb.0: +# CHECK: %[[#r3000:]]:sgpr_32 = S_MOV_B32 0 +# CHECK: %[[#r3001:]]:sgpr_32 = S_MOV_B32 1 +# CHECK: %[[#r3002:]]:sgpr_32 = S_MOV_B32 2 +# CHECK: %[[#r3003:]]:sgpr_32 = S_MOV_B32 3 +# CHECK: %[[#r3004:]]:sgpr_32 = S_MOV_B32 4 +# CHECK: %[[#r3005:]]:sgpr_32 = S_MOV_B32 5 +# CHECK: %[[#r3006:]]:sgpr_32 = S_MOV_B32 6 +# CHECK: %[[#r3007:]]:sgpr_32 = S_MOV_B32 7 +# CHECK: %[[#r3008:]]:sgpr_32 = S_MOV_B32 8 +# CHECK: %[[#r3009:]]:sgpr_32 = S_MOV_B32 9 +# CHECK: %[[#r3010:]]:sgpr_32 = S_MOV_B32 10 +# CHECK: %[[#r3011:]]:sgpr_32 = S_MOV_B32 11 +# CHECK: %[[#r3012:]]:sgpr_32 = S_MOV_B32 12 +# CHECK: %[[#r3013:]]:sgpr_32 = S_MOV_B32 13 +# CHECK: %[[#r3014:]]:sgpr_32 = S_MOV_B32 14 +# CHECK: %[[#r3015:]]:sgpr_32 = S_MOV_B32 15 +# CHECK: %[[#r3016:]]:sgpr_32 = S_MOV_B32 16 +# CHECK: %[[#r3017:]]:sgpr_32 = S_MOV_B32 17 +# CHECK: %[[#r3018:]]:sgpr_32 = S_MOV_B32 18 +# CHECK: %[[#r3019:]]:sgpr_32 = S_MOV_B32 19 +# CHECK: %[[#r3020:]]:sgpr_32 = S_MOV_B32 20 +# CHECK: %[[#r3021:]]:sgpr_32 = S_MOV_B32 21 +# CHECK: %[[#r3022:]]:sgpr_32 = S_MOV_B32 22 +# CHECK: %[[#r3023:]]:sgpr_32 = S_MOV_B32 23 +# CHECK: %[[#r3024:]]:sgpr_32 = S_MOV_B32 24 +# CHECK: %[[#r3025:]]:sgpr_32 = S_MOV_B32 25 +# CHECK: %[[#r3026:]]:sgpr_32 = S_MOV_B32 26 +# CHECK: %[[#r3027:]]:sgpr_32 = S_MOV_B32 27 +# CHECK: %[[#r3028:]]:sgpr_32 = S_MOV_B32 28 +# CHECK: %[[#r3029:]]:sgpr_32 = S_MOV_B32 29 +# CHECK: %[[#r3030:]]:sgpr_32 = S_MOV_B32 30 +# CHECK: %[[#r3031:]]:sgpr_32 = S_MOV_B32 31 +# CHECK: %[[#r3032:]]:sgpr_32 = S_MOV_B32 32 +# CHECK: %[[#r3033:]]:sgpr_32 = S_MOV_B32 33 +# CHECK: %[[#r3034:]]:sgpr_32 = S_MOV_B32 34 +# CHECK: %[[#r3035:]]:sgpr_32 = S_MOV_B32 35 +# CHECK: %[[#r3036:]]:sgpr_32 = S_MOV_B32 36 +# CHECK: %[[#r3037:]]:sgpr_32 = S_MOV_B32 37 +# CHECK: %[[#r3038:]]:sgpr_32 = S_MOV_B32 38 +# CHECK: %[[#r3039:]]:sgpr_32 = S_MOV_B32 39 +# CHECK: %[[#r3040:]]:sgpr_32 = S_MOV_B32 40 +# CHECK: %[[#r3041:]]:sgpr_32 = S_MOV_B32 41 +# CHECK: %[[#r3042:]]:sgpr_32 = S_MOV_B32 42 +# CHECK: %[[#r3043:]]:sgpr_32 = S_MOV_B32 43 +# CHECK: %[[#r3044:]]:sgpr_32 = S_MOV_B32 44 +# CHECK: %[[#r3045:]]:sgpr_32 = S_MOV_B32 45 +# CHECK: %[[#r3046:]]:sgpr_32 = S_MOV_B32 46 +# CHECK: %[[#r3047:]]:sgpr_32 = S_MOV_B32 47 +# CHECK: %[[#r3048:]]:sgpr_32 = S_MOV_B32 48 +# CHECK: %[[#r3049:]]:sgpr_32 = S_MOV_B32 49 +# CHECK: %[[#r3050:]]:sgpr_32 = S_MOV_B32 50 +# CHECK: %[[#r3051:]]:sgpr_32 = S_MOV_B32 51 +# CHECK: %[[#r3052:]]:sgpr_32 = S_MOV_B32 52 +# CHECK: %[[#r3053:]]:sgpr_32 = S_MOV_B32 53 +# CHECK: %[[#r3054:]]:sgpr_32 = S_MOV_B32 54 +# CHECK: %[[#r3055:]]:sgpr_32 = S_MOV_B32 55 +# CHECK: %[[#r3056:]]:sgpr_32 = S_MOV_B32 56 +# CHECK: %[[#r3057:]]:sgpr_32 = S_MOV_B32 57 +# CHECK: %[[#r3058:]]:sgpr_32 = S_MOV_B32 58 +# CHECK: %[[#r3059:]]:sgpr_32 = S_MOV_B32 59 +# CHECK: %[[#r3060:]]:sgpr_32 = S_MOV_B32 60 +# CHECK: %[[#r3061:]]:sgpr_32 = S_MOV_B32 61 +# CHECK: %[[#r3062:]]:sgpr_32 = S_MOV_B32 62 +# CHECK: %[[#r3063:]]:sgpr_32 = S_MOV_B32 63 +# CHECK: %[[#r3064:]]:sgpr_32 = S_MOV_B32 64 +# CHECK: %[[#r3065:]]:sgpr_32 = S_MOV_B32 65 +# CHECK: %[[#r3066:]]:sgpr_32 = S_MOV_B32 66 +# CHECK: %[[#r3067:]]:sgpr_32 = S_MOV_B32 67 +# CHECK: %[[#r3068:]]:sgpr_32 = S_MOV_B32 68 +# CHECK: %[[#r3069:]]:sgpr_32 = S_MOV_B32 69 +# CHECK: %[[#r3070:]]:sgpr_32 = S_MOV_B32 70 +# CHECK: %[[#r3071:]]:sgpr_32 = S_MOV_B32 71 +# CHECK: %[[#r3072:]]:sgpr_32 = S_MOV_B32 72 +# CHECK: %[[#r3073:]]:sgpr_32 = S_MOV_B32 73 +# CHECK: %[[#r3074:]]:sgpr_32 = S_MOV_B32 74 +# CHECK: %[[#r3075:]]:sgpr_32 = S_MOV_B32 75 +# CHECK: %[[#r3076:]]:sgpr_32 = S_MOV_B32 76 +# 
CHECK: %[[#r3077:]]:sgpr_32 = S_MOV_B32 77 +# CHECK: %[[#r3078:]]:sgpr_32 = S_MOV_B32 78 +# CHECK: %[[#r3079:]]:sgpr_32 = S_MOV_B32 79 +# CHECK: %[[#r3080:]]:sgpr_32 = S_MOV_B32 80 +# CHECK: %[[#r3081:]]:sgpr_32 = S_MOV_B32 81 +# CHECK: %[[#r3082:]]:sgpr_32 = S_MOV_B32 82 +# CHECK: %[[#r3083:]]:sgpr_32 = S_MOV_B32 83 +# CHECK: %[[#r3084:]]:sgpr_32 = S_MOV_B32 84 +# CHECK: %[[#r3085:]]:sgpr_32 = S_MOV_B32 85 +# CHECK: %[[#r3086:]]:sgpr_32 = S_MOV_B32 86 +# CHECK: %[[#r3087:]]:sgpr_32 = S_MOV_B32 87 +# CHECK: %[[#r3088:]]:sgpr_32 = S_MOV_B32 88 +# CHECK: %[[#r3089:]]:sgpr_32 = S_MOV_B32 89 +# CHECK: %[[#r3090:]]:sgpr_32 = S_MOV_B32 90 +# CHECK: %[[#r3091:]]:sgpr_32 = S_MOV_B32 91 +# CHECK: %[[#r3092:]]:sgpr_32 = S_MOV_B32 92 +# CHECK: %[[#r3093:]]:sgpr_32 = S_MOV_B32 93 +# CHECK: %[[#r3094:]]:sgpr_32 = S_MOV_B32 94 +# CHECK: %[[#r3095:]]:sgpr_32 = S_MOV_B32 95 +# CHECK: %[[#r3096:]]:sgpr_32 = S_MOV_B32 96 +# CHECK: %[[#r3097:]]:sgpr_32 = S_MOV_B32 97 +# CHECK: %[[#r3098:]]:sgpr_32 = S_MOV_B32 98 +# CHECK: %[[#r3099:]]:sgpr_32 = S_MOV_B32 99 +# CHECK: bb.1: +# CHECK: bb.2: + + +--- | + source_filename = ".\main.ll" + define amdgpu_ps void @main() #1 { + ret void + } + attributes #1 = { "target-cpu"="gfx1010" } + !llvm.ident = !{!0} + !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"} +... +--- +name: main +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } + - { reg: '$sgpr1' } + - { reg: '$sgpr2' } + - { reg: '$sgpr3' } + - { reg: '$sgpr4' } + - { reg: '$sgpr5' } + - { reg: '$sgpr6' } + - { reg: '$sgpr7' } + - { reg: '$sgpr8' } + - { reg: '$sgpr8' } + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1 + + %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1 + ; undef %0.sub0:sgpr_64 = COPY $sgpr0 + ; undef %0.sub1:sgpr_64 = COPY $sgpr1 + + %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3 + ; undef %1.sub0:sgpr_128 = COPY $sgpr4 + ; undef %1.sub1:sgpr_128 = COPY $sgpr5 + ; undef %1.sub2:sgpr_128 = COPY $sgpr6 + ; undef %1.sub3:sgpr_128 = COPY $sgpr7 + + + %2000:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2001:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2002:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2003:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2004:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2005:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2006:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2007:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2008:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2009:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2010:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2011:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2012:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2013:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2014:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2015:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2016:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2017:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2018:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2019:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2020:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2021:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2022:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2023:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2024:vgpr_32 = 
V_MOV_B32_e32 $vgpr0, implicit $exec + %2025:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2026:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2027:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2028:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2029:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2030:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2031:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2032:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2033:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2034:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2035:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2036:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2037:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2038:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2039:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2040:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2041:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2042:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2043:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2044:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2045:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2046:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2047:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2048:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2049:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2050:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2051:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2052:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2053:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2054:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2055:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2056:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2057:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2058:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2059:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2060:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2061:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2062:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2063:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2064:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2065:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2066:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2067:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2068:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2069:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2070:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2071:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2072:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2073:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2074:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2075:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2076:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2077:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2078:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2079:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2080:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2081:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2082:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2083:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2084:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2085:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2086:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2087:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2088:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit 
$exec + %2089:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2090:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2091:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2092:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2093:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2094:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2095:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2096:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2097:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2098:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2099:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %3000:sgpr_32 = S_MOV_B32 0 + %3001:sgpr_32 = S_MOV_B32 1 + %3002:sgpr_32 = S_MOV_B32 2 + %3003:sgpr_32 = S_MOV_B32 3 + %3004:sgpr_32 = S_MOV_B32 4 + %3005:sgpr_32 = S_MOV_B32 5 + %3006:sgpr_32 = S_MOV_B32 6 + %3007:sgpr_32 = S_MOV_B32 7 + %3008:sgpr_32 = S_MOV_B32 8 + %3009:sgpr_32 = S_MOV_B32 9 + %3010:sgpr_32 = S_MOV_B32 10 + %3011:sgpr_32 = S_MOV_B32 11 + %3012:sgpr_32 = S_MOV_B32 12 + %3013:sgpr_32 = S_MOV_B32 13 + %3014:sgpr_32 = S_MOV_B32 14 + %3015:sgpr_32 = S_MOV_B32 15 + %3016:sgpr_32 = S_MOV_B32 16 + %3017:sgpr_32 = S_MOV_B32 17 + %3018:sgpr_32 = S_MOV_B32 18 + %3019:sgpr_32 = S_MOV_B32 19 + %3020:sgpr_32 = S_MOV_B32 20 + %3021:sgpr_32 = S_MOV_B32 21 + %3022:sgpr_32 = S_MOV_B32 22 + %3023:sgpr_32 = S_MOV_B32 23 + %3024:sgpr_32 = S_MOV_B32 24 + %3025:sgpr_32 = S_MOV_B32 25 + %3026:sgpr_32 = S_MOV_B32 26 + %3027:sgpr_32 = S_MOV_B32 27 + %3028:sgpr_32 = S_MOV_B32 28 + %3029:sgpr_32 = S_MOV_B32 29 + %3030:sgpr_32 = S_MOV_B32 30 + %3031:sgpr_32 = S_MOV_B32 31 + %3032:sgpr_32 = S_MOV_B32 32 + %3033:sgpr_32 = S_MOV_B32 33 + %3034:sgpr_32 = S_MOV_B32 34 + %3035:sgpr_32 = S_MOV_B32 35 + %3036:sgpr_32 = S_MOV_B32 36 + %3037:sgpr_32 = S_MOV_B32 37 + %3038:sgpr_32 = S_MOV_B32 38 + %3039:sgpr_32 = S_MOV_B32 39 + %3040:sgpr_32 = S_MOV_B32 40 + %3041:sgpr_32 = S_MOV_B32 41 + %3042:sgpr_32 = S_MOV_B32 42 + %3043:sgpr_32 = S_MOV_B32 43 + %3044:sgpr_32 = S_MOV_B32 44 + %3045:sgpr_32 = S_MOV_B32 45 + %3046:sgpr_32 = S_MOV_B32 46 + %3047:sgpr_32 = S_MOV_B32 47 + %3048:sgpr_32 = S_MOV_B32 48 + %3049:sgpr_32 = S_MOV_B32 49 + %3050:sgpr_32 = S_MOV_B32 50 + %3051:sgpr_32 = S_MOV_B32 51 + %3052:sgpr_32 = S_MOV_B32 52 + %3053:sgpr_32 = S_MOV_B32 53 + %3054:sgpr_32 = S_MOV_B32 54 + %3055:sgpr_32 = S_MOV_B32 55 + %3056:sgpr_32 = S_MOV_B32 56 + %3057:sgpr_32 = S_MOV_B32 57 + %3058:sgpr_32 = S_MOV_B32 58 + %3059:sgpr_32 = S_MOV_B32 59 + %3060:sgpr_32 = S_MOV_B32 60 + %3061:sgpr_32 = S_MOV_B32 61 + %3062:sgpr_32 = S_MOV_B32 62 + %3063:sgpr_32 = S_MOV_B32 63 + %3064:sgpr_32 = S_MOV_B32 64 + %3065:sgpr_32 = S_MOV_B32 65 + %3066:sgpr_32 = S_MOV_B32 66 + %3067:sgpr_32 = S_MOV_B32 67 + %3068:sgpr_32 = S_MOV_B32 68 + %3069:sgpr_32 = S_MOV_B32 69 + %3070:sgpr_32 = S_MOV_B32 70 + %3071:sgpr_32 = S_MOV_B32 71 + %3072:sgpr_32 = S_MOV_B32 72 + %3073:sgpr_32 = S_MOV_B32 73 + %3074:sgpr_32 = S_MOV_B32 74 + %3075:sgpr_32 = S_MOV_B32 75 + %3076:sgpr_32 = S_MOV_B32 76 + %3077:sgpr_32 = S_MOV_B32 77 + %3078:sgpr_32 = S_MOV_B32 78 + %3079:sgpr_32 = S_MOV_B32 79 + %3080:sgpr_32 = S_MOV_B32 80 + %3081:sgpr_32 = S_MOV_B32 81 + %3082:sgpr_32 = S_MOV_B32 82 + %3083:sgpr_32 = S_MOV_B32 83 + %3084:sgpr_32 = S_MOV_B32 84 + %3085:sgpr_32 = S_MOV_B32 85 + %3086:sgpr_32 = S_MOV_B32 86 + %3087:sgpr_32 = S_MOV_B32 87 + %3088:sgpr_32 = S_MOV_B32 88 + %3089:sgpr_32 = S_MOV_B32 89 + %3090:sgpr_32 = S_MOV_B32 90 + %3091:sgpr_32 = S_MOV_B32 91 + %3092:sgpr_32 = S_MOV_B32 92 + %3093:sgpr_32 = S_MOV_B32 93 + %3094:sgpr_32 = S_MOV_B32 94 + 
%3095:sgpr_32 = S_MOV_B32 95 + %3096:sgpr_32 = S_MOV_B32 96 + %3097:sgpr_32 = S_MOV_B32 97 + %3098:sgpr_32 = S_MOV_B32 98 + %3099:sgpr_32 = S_MOV_B32 99 + + + %8000:vgpr_32 = IMPLICIT_DEF + %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %116:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + + %8001:vgpr_32 = COPY %8000 + %8002:vgpr_32 = COPY %8000 + %8003:vgpr_32 = COPY %8000 + %8004:vgpr_32 = COPY %8000 + %8005:vgpr_32 = COPY %8000 + %8006:vgpr_32 = COPY %8000 + %8007:vgpr_32 = COPY %8000 + %8008:vgpr_32 = COPY %8000 + %8009:vgpr_32 = COPY %8000 + %8010:vgpr_32 = COPY %8000 + %8011:vgpr_32 = COPY %8000 + %8012:vgpr_32 = COPY %8000 + %8013:vgpr_32 = COPY %8000 + %8014:vgpr_32 = COPY %8000 + %8015:vgpr_32 = COPY %8000 + %8016:vgpr_32 = COPY %8000 + %8017:vgpr_32 = COPY %8000 + + %9001:vgpr_32 = COPY %8001 + %9002:vgpr_32 = COPY %8002 + %9003:vgpr_32 = COPY %8003 + %9004:vgpr_32 = COPY %8004 + %9005:vgpr_32 = COPY %8005 + %9006:vgpr_32 = COPY %8006 + %9007:vgpr_32 = COPY %8007 + %9008:vgpr_32 = COPY %8008 + %9009:vgpr_32 = COPY %8009 + %9010:vgpr_32 = COPY %8010 + %9011:vgpr_32 = COPY %8011 + %9012:vgpr_32 = COPY %8012 + %9013:vgpr_32 = COPY %8013 + %9014:vgpr_32 = COPY %8014 + %9015:vgpr_32 = COPY %8015 + %9016:vgpr_32 = COPY %8016 + %9017:vgpr_32 = COPY %8017 + + S_BRANCH %bb.2 + + bb.2: + %5000:sgpr_32 = PHI %3000, %bb.0, %8001, %bb.1 + %5001:sgpr_32 = PHI %3001, %bb.0, %8001, %bb.1 + %5002:sgpr_32 = PHI %3002, %bb.0, %8001, %bb.1 + %5003:sgpr_32 = PHI %3003, %bb.0, %8001, %bb.1 + %5004:sgpr_32 = PHI %3004, %bb.0, %8001, %bb.1 + %5005:sgpr_32 = PHI %3005, %bb.0, %8001, %bb.1 + %5006:sgpr_32 = PHI %3006, %bb.0, %8001, %bb.1 + %5007:sgpr_32 = PHI %3007, %bb.0, %8001, %bb.1 + %5008:sgpr_32 = PHI %3008, %bb.0, %8001, %bb.1 + %5009:sgpr_32 = PHI %3009, %bb.0, %8001, %bb.1 + %5010:sgpr_32 = PHI %3010, %bb.0, %8001, %bb.1 + %5011:sgpr_32 = PHI %3011, %bb.0, %8001, %bb.1 + %5012:sgpr_32 = PHI %3012, %bb.0, %8001, %bb.1 + %5013:sgpr_32 = PHI %3013, %bb.0, %8001, %bb.1 + %5014:sgpr_32 = PHI %3014, %bb.0, %8001, %bb.1 + %5015:sgpr_32 = PHI %3015, %bb.0, %8001, %bb.1 + %5016:sgpr_32 = PHI %3016, %bb.0, %8001, %bb.1 + %5017:sgpr_32 = PHI %3017, %bb.0, %8001, %bb.1 + %5018:sgpr_32 = PHI %3018, %bb.0, %8001, %bb.1 + %5019:sgpr_32 = PHI %3019, %bb.0, %8001, %bb.1 + %5020:sgpr_32 = PHI %3020, %bb.0, %8001, %bb.1 + %5021:sgpr_32 = PHI %3021, %bb.0, %8001, %bb.1 + %5022:sgpr_32 = PHI %3022, %bb.0, %8001, %bb.1 + %5023:sgpr_32 = PHI %3023, %bb.0, %8001, %bb.1 + %5024:sgpr_32 = PHI %3024, %bb.0, %8001, %bb.1 + %5025:sgpr_32 = PHI %3025, %bb.0, %8001, %bb.1 + %5026:sgpr_32 = PHI %3026, %bb.0, %8001, %bb.1 + %5027:sgpr_32 = PHI %3027, %bb.0, %8001, %bb.1 + %5028:sgpr_32 = PHI %3028, %bb.0, %8001, %bb.1 + %5029:sgpr_32 = PHI %3029, %bb.0, %8001, %bb.1 + %5030:sgpr_32 = PHI %3030, %bb.0, %8001, %bb.1 + %5031:sgpr_32 = PHI %3031, %bb.0, %8001, %bb.1 + %5032:sgpr_32 = PHI %3032, %bb.0, %8001, %bb.1 + %5033:sgpr_32 = PHI %3033, %bb.0, %8001, %bb.1 + %5034:sgpr_32 = PHI %3034, %bb.0, %8001, %bb.1 + %5035:sgpr_32 = PHI %3035, %bb.0, %8001, %bb.1 + %5036:sgpr_32 = PHI %3036, %bb.0, %8001, %bb.1 + %5037:sgpr_32 = PHI %3037, %bb.0, %8001, %bb.1 + %5038:sgpr_32 = PHI %3038, %bb.0, %8001, %bb.1 + %5039:sgpr_32 = PHI %3039, %bb.0, %8001, %bb.1 + %5040:sgpr_32 = PHI %3040, %bb.0, %8001, %bb.1 + %5041:sgpr_32 = PHI %3041, %bb.0, %8001, %bb.1 + %5042:sgpr_32 = PHI %3042, 
%bb.0, %8001, %bb.1 + %5043:sgpr_32 = PHI %3043, %bb.0, %8001, %bb.1 + %5044:sgpr_32 = PHI %3044, %bb.0, %8001, %bb.1 + %5045:sgpr_32 = PHI %3045, %bb.0, %8001, %bb.1 + %5046:sgpr_32 = PHI %3046, %bb.0, %8001, %bb.1 + %5047:sgpr_32 = PHI %3047, %bb.0, %8001, %bb.1 + %5048:sgpr_32 = PHI %3048, %bb.0, %8001, %bb.1 + %5049:sgpr_32 = PHI %3049, %bb.0, %8001, %bb.1 + %5050:sgpr_32 = PHI %3050, %bb.0, %8001, %bb.1 + %5051:sgpr_32 = PHI %3051, %bb.0, %8001, %bb.1 + %5052:sgpr_32 = PHI %3052, %bb.0, %8001, %bb.1 + %5053:sgpr_32 = PHI %3053, %bb.0, %8001, %bb.1 + %5054:sgpr_32 = PHI %3054, %bb.0, %8001, %bb.1 + %5055:sgpr_32 = PHI %3055, %bb.0, %8001, %bb.1 + %5056:sgpr_32 = PHI %3056, %bb.0, %8001, %bb.1 + %5057:sgpr_32 = PHI %3057, %bb.0, %8001, %bb.1 + %5058:sgpr_32 = PHI %3058, %bb.0, %8001, %bb.1 + %5059:sgpr_32 = PHI %3059, %bb.0, %8001, %bb.1 + %5060:sgpr_32 = PHI %3060, %bb.0, %8001, %bb.1 + %5061:sgpr_32 = PHI %3061, %bb.0, %8001, %bb.1 + %5062:sgpr_32 = PHI %3062, %bb.0, %8001, %bb.1 + %5063:sgpr_32 = PHI %3063, %bb.0, %8001, %bb.1 + %5064:sgpr_32 = PHI %3064, %bb.0, %8001, %bb.1 + %5065:sgpr_32 = PHI %3065, %bb.0, %8001, %bb.1 + %5066:sgpr_32 = PHI %3066, %bb.0, %8001, %bb.1 + %5067:sgpr_32 = PHI %3067, %bb.0, %8001, %bb.1 + %5068:sgpr_32 = PHI %3068, %bb.0, %8001, %bb.1 + %5069:sgpr_32 = PHI %3069, %bb.0, %8001, %bb.1 + %5070:sgpr_32 = PHI %3070, %bb.0, %8001, %bb.1 + %5071:sgpr_32 = PHI %3071, %bb.0, %8001, %bb.1 + %5072:sgpr_32 = PHI %3072, %bb.0, %8001, %bb.1 + %5073:sgpr_32 = PHI %3073, %bb.0, %8001, %bb.1 + %5074:sgpr_32 = PHI %3074, %bb.0, %8001, %bb.1 + %5075:sgpr_32 = PHI %3075, %bb.0, %8001, %bb.1 + %5076:sgpr_32 = PHI %3076, %bb.0, %8001, %bb.1 + %5077:sgpr_32 = PHI %3077, %bb.0, %8001, %bb.1 + %5078:sgpr_32 = PHI %3078, %bb.0, %8001, %bb.1 + %5079:sgpr_32 = PHI %3079, %bb.0, %8001, %bb.1 + %5080:sgpr_32 = PHI %3080, %bb.0, %8001, %bb.1 + %5081:sgpr_32 = PHI %3081, %bb.0, %8001, %bb.1 + %5082:sgpr_32 = PHI %3082, %bb.0, %8001, %bb.1 + %5083:sgpr_32 = PHI %3083, %bb.0, %8001, %bb.1 + %5084:sgpr_32 = PHI %3084, %bb.0, %8001, %bb.1 + %5085:sgpr_32 = PHI %3085, %bb.0, %8001, %bb.1 + %5086:sgpr_32 = PHI %3086, %bb.0, %8001, %bb.1 + %5087:sgpr_32 = PHI %3087, %bb.0, %8001, %bb.1 + %5088:sgpr_32 = PHI %3088, %bb.0, %8001, %bb.1 + %5089:sgpr_32 = PHI %3089, %bb.0, %8001, %bb.1 + %5090:sgpr_32 = PHI %3090, %bb.0, %8001, %bb.1 + %5091:sgpr_32 = PHI %3091, %bb.0, %8001, %bb.1 + %5092:sgpr_32 = PHI %3092, %bb.0, %8001, %bb.1 + %5093:sgpr_32 = PHI %3093, %bb.0, %8001, %bb.1 + %5094:sgpr_32 = PHI %3094, %bb.0, %8001, %bb.1 + %5095:sgpr_32 = PHI %3095, %bb.0, %8001, %bb.1 + %5096:sgpr_32 = PHI %3096, %bb.0, %8001, %bb.1 + %5097:sgpr_32 = PHI %3097, %bb.0, %8001, %bb.1 + %5098:sgpr_32 = PHI %3098, %bb.0, %8001, %bb.1 + %5099:sgpr_32 = PHI %3099, %bb.0, %8001, %bb.1 + + + %3:vgpr_32 = IMPLICIT_DEF + + %6000:vgpr_32 = V_MOV_B32_e32 %5000, implicit $exec + %6001:vgpr_32 = V_MOV_B32_e32 %5001, implicit $exec + %6002:vgpr_32 = V_MOV_B32_e32 %5002, implicit $exec + %6003:vgpr_32 = V_MOV_B32_e32 %5003, implicit $exec + %6004:vgpr_32 = V_MOV_B32_e32 %5004, implicit $exec + %6005:vgpr_32 = V_MOV_B32_e32 %5005, implicit $exec + %6006:vgpr_32 = V_MOV_B32_e32 %5006, implicit $exec + %6007:vgpr_32 = V_MOV_B32_e32 %5007, implicit $exec + %6008:vgpr_32 = V_MOV_B32_e32 %5008, implicit $exec + %6009:vgpr_32 = V_MOV_B32_e32 %5009, implicit $exec + %6010:vgpr_32 = V_MOV_B32_e32 %5010, implicit $exec + %6011:vgpr_32 = V_MOV_B32_e32 %5011, implicit $exec + %6012:vgpr_32 = V_MOV_B32_e32 %5012, implicit $exec + 
%6013:vgpr_32 = V_MOV_B32_e32 %5013, implicit $exec + %6014:vgpr_32 = V_MOV_B32_e32 %5014, implicit $exec + %6015:vgpr_32 = V_MOV_B32_e32 %5015, implicit $exec + %6016:vgpr_32 = V_MOV_B32_e32 %5016, implicit $exec + %6017:vgpr_32 = V_MOV_B32_e32 %5017, implicit $exec + %6018:vgpr_32 = V_MOV_B32_e32 %5018, implicit $exec + %6019:vgpr_32 = V_MOV_B32_e32 %5019, implicit $exec + %6020:vgpr_32 = V_MOV_B32_e32 %5020, implicit $exec + %6021:vgpr_32 = V_MOV_B32_e32 %5021, implicit $exec + %6022:vgpr_32 = V_MOV_B32_e32 %5022, implicit $exec + %6023:vgpr_32 = V_MOV_B32_e32 %5023, implicit $exec + %6024:vgpr_32 = V_MOV_B32_e32 %5024, implicit $exec + %6025:vgpr_32 = V_MOV_B32_e32 %5025, implicit $exec + %6026:vgpr_32 = V_MOV_B32_e32 %5026, implicit $exec + %6027:vgpr_32 = V_MOV_B32_e32 %5027, implicit $exec + %6028:vgpr_32 = V_MOV_B32_e32 %5028, implicit $exec + %6029:vgpr_32 = V_MOV_B32_e32 %5029, implicit $exec + %6030:vgpr_32 = V_MOV_B32_e32 %5030, implicit $exec + %6031:vgpr_32 = V_MOV_B32_e32 %5031, implicit $exec + %6032:vgpr_32 = V_MOV_B32_e32 %5032, implicit $exec + %6033:vgpr_32 = V_MOV_B32_e32 %5033, implicit $exec + %6034:vgpr_32 = V_MOV_B32_e32 %5034, implicit $exec + %6035:vgpr_32 = V_MOV_B32_e32 %5035, implicit $exec + %6036:vgpr_32 = V_MOV_B32_e32 %5036, implicit $exec + %6037:vgpr_32 = V_MOV_B32_e32 %5037, implicit $exec + %6038:vgpr_32 = V_MOV_B32_e32 %5038, implicit $exec + %6039:vgpr_32 = V_MOV_B32_e32 %5039, implicit $exec + %6040:vgpr_32 = V_MOV_B32_e32 %5040, implicit $exec + %6041:vgpr_32 = V_MOV_B32_e32 %5041, implicit $exec + %6042:vgpr_32 = V_MOV_B32_e32 %5042, implicit $exec + %6043:vgpr_32 = V_MOV_B32_e32 %5043, implicit $exec + %6044:vgpr_32 = V_MOV_B32_e32 %5044, implicit $exec + %6045:vgpr_32 = V_MOV_B32_e32 %5045, implicit $exec + %6046:vgpr_32 = V_MOV_B32_e32 %5046, implicit $exec + %6047:vgpr_32 = V_MOV_B32_e32 %5047, implicit $exec + %6048:vgpr_32 = V_MOV_B32_e32 %5048, implicit $exec + %6049:vgpr_32 = V_MOV_B32_e32 %5049, implicit $exec + %6050:vgpr_32 = V_MOV_B32_e32 %5050, implicit $exec + %6051:vgpr_32 = V_MOV_B32_e32 %5051, implicit $exec + %6052:vgpr_32 = V_MOV_B32_e32 %5052, implicit $exec + %6053:vgpr_32 = V_MOV_B32_e32 %5053, implicit $exec + %6054:vgpr_32 = V_MOV_B32_e32 %5054, implicit $exec + %6055:vgpr_32 = V_MOV_B32_e32 %5055, implicit $exec + %6056:vgpr_32 = V_MOV_B32_e32 %5056, implicit $exec + %6057:vgpr_32 = V_MOV_B32_e32 %5057, implicit $exec + %6058:vgpr_32 = V_MOV_B32_e32 %5058, implicit $exec + %6059:vgpr_32 = V_MOV_B32_e32 %5059, implicit $exec + %6060:vgpr_32 = V_MOV_B32_e32 %5060, implicit $exec + %6061:vgpr_32 = V_MOV_B32_e32 %5061, implicit $exec + %6062:vgpr_32 = V_MOV_B32_e32 %5062, implicit $exec + %6063:vgpr_32 = V_MOV_B32_e32 %5063, implicit $exec + %6064:vgpr_32 = V_MOV_B32_e32 %5064, implicit $exec + %6065:vgpr_32 = V_MOV_B32_e32 %5065, implicit $exec + %6066:vgpr_32 = V_MOV_B32_e32 %5066, implicit $exec + %6067:vgpr_32 = V_MOV_B32_e32 %5067, implicit $exec + %6068:vgpr_32 = V_MOV_B32_e32 %5068, implicit $exec + %6069:vgpr_32 = V_MOV_B32_e32 %5069, implicit $exec + %6070:vgpr_32 = V_MOV_B32_e32 %5070, implicit $exec + %6071:vgpr_32 = V_MOV_B32_e32 %5071, implicit $exec + %6072:vgpr_32 = V_MOV_B32_e32 %5072, implicit $exec + %6073:vgpr_32 = V_MOV_B32_e32 %5073, implicit $exec + %6074:vgpr_32 = V_MOV_B32_e32 %5074, implicit $exec + %6075:vgpr_32 = V_MOV_B32_e32 %5075, implicit $exec + %6076:vgpr_32 = V_MOV_B32_e32 %5076, implicit $exec + %6077:vgpr_32 = V_MOV_B32_e32 %5077, implicit $exec + %6078:vgpr_32 = V_MOV_B32_e32 %5078, 
implicit $exec + %6079:vgpr_32 = V_MOV_B32_e32 %5079, implicit $exec + %6080:vgpr_32 = V_MOV_B32_e32 %5080, implicit $exec + %6081:vgpr_32 = V_MOV_B32_e32 %5081, implicit $exec + %6082:vgpr_32 = V_MOV_B32_e32 %5082, implicit $exec + %6083:vgpr_32 = V_MOV_B32_e32 %5083, implicit $exec + %6084:vgpr_32 = V_MOV_B32_e32 %5084, implicit $exec + %6085:vgpr_32 = V_MOV_B32_e32 %5085, implicit $exec + %6086:vgpr_32 = V_MOV_B32_e32 %5086, implicit $exec + %6087:vgpr_32 = V_MOV_B32_e32 %5087, implicit $exec + %6088:vgpr_32 = V_MOV_B32_e32 %5088, implicit $exec + %6089:vgpr_32 = V_MOV_B32_e32 %5089, implicit $exec + %6090:vgpr_32 = V_MOV_B32_e32 %5090, implicit $exec + %6091:vgpr_32 = V_MOV_B32_e32 %5091, implicit $exec + %6092:vgpr_32 = V_MOV_B32_e32 %5092, implicit $exec + %6093:vgpr_32 = V_MOV_B32_e32 %5093, implicit $exec + %6094:vgpr_32 = V_MOV_B32_e32 %5094, implicit $exec + %6095:vgpr_32 = V_MOV_B32_e32 %5095, implicit $exec + %6096:vgpr_32 = V_MOV_B32_e32 %5096, implicit $exec + %6097:vgpr_32 = V_MOV_B32_e32 %5097, implicit $exec + %6098:vgpr_32 = V_MOV_B32_e32 %5098, implicit $exec + %6099:vgpr_32 = V_MOV_B32_e32 %5099, implicit $exec + EXP 0, %6000, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6001, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6002, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6003, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6004, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6005, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6006, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6007, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6008, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6009, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6010, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6011, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6012, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6013, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6014, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6015, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6016, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6017, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6018, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6019, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6020, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6021, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6022, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6023, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6024, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6025, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6026, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6027, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6028, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6029, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6030, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6031, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6032, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6033, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6034, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6035, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6036, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6037, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6038, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6039, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6040, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6041, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6042, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6043, %3, %3, %3, -1, -1, 15, 
implicit $exec + EXP 0, %6044, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6045, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6046, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6047, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6048, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6049, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6050, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6051, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6052, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6053, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6054, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6055, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6056, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6057, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6058, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6059, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6060, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6061, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6062, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6063, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6064, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6065, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6066, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6067, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6068, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6069, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6070, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6071, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6072, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6073, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6074, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6075, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6076, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6077, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6078, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6079, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6080, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6081, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6082, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6083, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6084, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6085, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6086, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6087, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6088, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6089, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6090, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6091, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6092, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6093, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6094, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6095, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6096, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6097, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6098, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6099, %3, %3, %3, -1, -1, 15, implicit $exec + + + S_ENDPGM 0 +... 
+ \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/remat/phi_pacifist.ll b/llvm/test/CodeGen/AMDGPU/remat/phi_pacifist.ll new file mode 100644 index 0000000000000..3369486e0323a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/phi_pacifist.ll @@ -0,0 +1,88 @@ +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -amdgpu-enable-hot-block-remat -amdgpu-remat-enable-sub-exp-remat + +; Regression test for PHI being sinked to uses as a pacifist. +; Just checking that the test does not crash. + +; ModuleID = 'reduced.ll' +source_filename = "reduced.ll" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +target triple = "amdgcn--amdpal" + +define amdgpu_ps void @_amdgpu_ps_main(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, i32 %arg5, float %arg6, float %arg7, float %arg8, <2 x half> %arg9, i1 %arg10) #0 { +bb: + br label %bb19 + +bb11: ; preds = %bb19 + %i = bitcast i32 %i21 to float + %i12 = bitcast i32 %i23 to float + %i13 = fmul float 0.000000e+00, %i26 + %i14 = fmul float %i13, 0.000000e+00 + %i15 = fmul float %i12, %i + %i16 = fadd float %i15, %i14 + %i17 = select i1 false, float 0.000000e+00, float %i16 + %i18 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %arg4, float %arg8) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> %i18, <2 x half> %arg9, i1 false, i1 false) + ret void + +bb19: ; preds = %bb19, %bb + %i20 = phi i32 [ 0, %bb19 ], [ %arg5, %bb ] + %i21 = phi i32 [ %i35, %bb19 ], [ 0, %bb ] + %i22 = phi i32 [ %i38, %bb19 ], [ 0, %bb ] + %i23 = phi i32 [ %i60, %bb19 ], [ 0, %bb ] + %i24 = phi i32 [ %i61, %bb19 ], [ 0, %bb ] + %i25 = phi i32 [ %i62, %bb19 ], [ 0, %bb ] + %i26 = phi float [ %i39, %bb19 ], [ 0.000000e+00, %bb ] + %i27 = phi i32 [ %i49, %bb19 ], [ 0, %bb ] + %i28 = phi i32 [ %i50, %bb19 ], [ 0, %bb ] + %i29 = phi i32 [ %i51, %bb19 ], [ 0, %bb ] + %i30 = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32.v8i32(i32 1, i32 %i20, i32 0, <8 x i32> zeroinitializer, i32 0, i32 0) + %i31 = extractelement <4 x float> %i30, i64 0 + %i32 = fmul float %arg1, %i31 + %i33 = bitcast i32 %i22 to float + %i34 = fmul float %arg, %i32 + %i35 = select i1 %arg10, i32 %arg5, i32 %i21 + %i36 = fadd float 0.000000e+00, %i33 + %i37 = bitcast float %i36 to i32 + %i38 = select i1 %arg10, i32 %i22, i32 %i37 + %i39 = fadd float %i26, 1.000000e+00 + %i40 = bitcast i32 %i27 to float + %i41 = bitcast i32 %i28 to float + %i42 = bitcast i32 %i29 to float + %i43 = fadd float 0.000000e+00, %i40 + %i44 = fadd float 0.000000e+00, %i41 + %i45 = fadd float 0.000000e+00, %i42 + %i46 = bitcast float %i43 to i32 + %i47 = bitcast float %i44 to i32 + %i48 = bitcast float %i45 to i32 + %i49 = select i1 %arg10, i32 %i27, i32 %i46 + %i50 = select i1 %arg10, i32 %i28, i32 %i47 + %i51 = select i1 %arg10, i32 %i29, i32 %i48 + %i52 = fmul float %i34, %arg7 + %i53 = bitcast i32 %i24 to float + %i54 = bitcast i32 %i25 to float + %i55 = fadd float %arg6, %i53 + %i56 = fadd float %arg2, %i54 + %i57 = bitcast float %i52 to i32 + %i58 = bitcast float %i55 to i32 + %i59 = bitcast float %i56 to i32 + %i60 = select i1 %arg10, i32 %i57, i32 %i23 + %i61 = select i1 %arg10, i32 %i58, i32 %i24 + %i62 = select i1 %arg10, i32 %i59, i32 %i25 + %i63 = sitofp i32 %i20 to float + %i64 = fcmp olt float %arg3, %i63 + br i1 %i64, label %bb11, label %bb19 +} + +; Function Attrs: 
nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) +declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #2 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read) +declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32.v8i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #3 + +attributes #0 = { "target-features"=",+wavefrontsize64,+cumode,-xnack" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } +attributes #3 = { nocallback nofree nosync nounwind willreturn memory(read) } diff --git a/llvm/test/CodeGen/AMDGPU/remat/phi_pacifist.mir b/llvm/test/CodeGen/AMDGPU/remat/phi_pacifist.mir new file mode 100644 index 0000000000000..e9a8486bfa6b1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/phi_pacifist.mir @@ -0,0 +1,372 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-sub-exp-remat | FileCheck %s + +# Regression test for PHI being sinked to uses as a pacifist. + +# CHECK: bb.2.bb19: +# CHECK: PHI +# CHECK-NEXT: PHI +# CHECK-NEXT: PHI +# CHECK-NEXT: PHI +# CHECK-NEXT: PHI +# CHECK-NEXT: PHI +# CHECK-NEXT: PHI +# CHECK-NEXT: PHI +# CHECK-NEXT: PHI +# CHECK-NEXT: PHI +# CHECK-NEXT: PHI + +--- | + ; ModuleID = 'C:\llvm-project\llvm\test\CodeGen\AMDGPU\remat\phi_pacifist.ll' + source_filename = "reduced.ll" + target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" + target triple = "amdgcn" + + define amdgpu_ps void @_amdgpu_ps_main(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, i32 %arg5, float %arg6, float %arg7, float %arg8, <2 x half> %arg9, i1 %arg10) #0 { + bb: + br label %bb19, !amdgpu.uniform !0 + + bb11: ; preds = %bb19 + %i21.lcssa = phi i32 [ %i21, %bb19 ] + %i23.lcssa = phi i32 [ %i23, %bb19 ] + %i26.lcssa = phi float [ %i26, %bb19 ] + %.lcssa = phi i64 [ %0, %bb19 ] + call void @llvm.amdgcn.end.cf.i64(i64 %.lcssa) + %i = bitcast i32 %i21.lcssa to float + %i12 = bitcast i32 %i23.lcssa to float + %i13 = fmul float 0.000000e+00, %i26.lcssa + %i18 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %arg4, float %arg8) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> %i18, <2 x half> %arg9, i1 false, i1 false) + ret void + + bb19: ; preds = %bb19, %bb + %phi.broken = phi i64 [ %0, %bb19 ], [ 0, %bb ] + %i20 = phi i32 [ %arg5, %bb ], [ 0, %bb19 ] + %i21 = phi i32 [ 0, %bb ], [ %i35, %bb19 ] + %i22 = phi i32 [ 0, %bb ], [ %i38, %bb19 ] + %i23 = phi i32 [ 0, %bb ], [ %i60, %bb19 ] + %i24 = phi i32 [ 0, %bb ], [ %i61, %bb19 ] + %i25 = phi i32 [ 0, %bb ], [ %i62, %bb19 ] + %i26 = phi float [ 0.000000e+00, %bb ], [ %i39, %bb19 ] + %i27 = phi i32 [ 0, %bb ], [ %i49, %bb19 ] + %i28 = phi i32 [ 0, %bb ], [ %i50, %bb19 ] + %i29 = phi i32 [ 0, %bb ], [ %i51, %bb19 ] + %i30 = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32.v8i32(i32 1, i32 %i20, i32 0, <8 x i32> zeroinitializer, i32 0, i32 0) + %i31 = extractelement <4 x float> %i30, i64 0 + %i32 = fmul float %arg1, %i31 + %i33 = bitcast i32 
%i22 to float + %i34 = fmul float %arg, %i32 + %i35 = select i1 %arg10, i32 %arg5, i32 %i21 + %i36 = fadd float 0.000000e+00, %i33 + %i37 = bitcast float %i36 to i32 + %i38 = select i1 %arg10, i32 %i22, i32 %i37 + %i39 = fadd float %i26, 1.000000e+00 + %i40 = bitcast i32 %i27 to float + %i41 = bitcast i32 %i28 to float + %i42 = bitcast i32 %i29 to float + %i43 = fadd float 0.000000e+00, %i40 + %i44 = fadd float 0.000000e+00, %i41 + %i45 = fadd float 0.000000e+00, %i42 + %i46 = bitcast float %i43 to i32 + %i47 = bitcast float %i44 to i32 + %i48 = bitcast float %i45 to i32 + %i49 = select i1 %arg10, i32 %i27, i32 %i46 + %i50 = select i1 %arg10, i32 %i28, i32 %i47 + %i51 = select i1 %arg10, i32 %i29, i32 %i48 + %i52 = fmul float %i34, %arg7 + %i53 = bitcast i32 %i24 to float + %i54 = bitcast i32 %i25 to float + %i55 = fadd float %arg6, %i53 + %i56 = fadd float %arg2, %i54 + %i57 = bitcast float %i52 to i32 + %i58 = bitcast float %i55 to i32 + %i59 = bitcast float %i56 to i32 + %i60 = select i1 %arg10, i32 %i57, i32 %i23 + %i61 = select i1 %arg10, i32 %i58, i32 %i24 + %i62 = select i1 %arg10, i32 %i59, i32 %i25 + %i63 = sitofp i32 %i20 to float + %i64 = fcmp olt float %arg3, %i63 + %0 = call i64 @llvm.amdgcn.if.break.i64(i1 %i64, i64 %phi.broken) + %1 = call i1 @llvm.amdgcn.loop.i64(i64 %0) + br i1 %1, label %bb11, label %bb19 + } + + ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) + declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 + + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) + declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #2 + + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read) + declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32.v8i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #3 + + ; Function Attrs: nocallback nofree nounwind willreturn memory(none) + declare i64 @llvm.amdgcn.if.break.i64(i1, i64) #4 + + ; Function Attrs: nocallback nofree nounwind willreturn + declare i1 @llvm.amdgcn.loop.i64(i64) #5 + + ; Function Attrs: nocallback nofree nounwind willreturn + declare void @llvm.amdgcn.end.cf.i64(i64) #5 + + attributes #0 = { "target-cpu"="gfx1010" "target-features"=",+wavefrontsize64,+cumode,-xnack" } + attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx1010" } + attributes #2 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) "target-cpu"="gfx1010" } + attributes #3 = { nocallback nofree nosync nounwind willreturn memory(read) "target-cpu"="gfx1010" } + attributes #4 = { nocallback nofree nounwind willreturn memory(none) } + attributes #5 = { nocallback nofree nounwind willreturn } + + !0 = !{} + +... 
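+# The CHECK lines above verify that all eleven PHIs stay grouped at the top of
+# bb.2.bb19: the pass's "pacifist" sinking of instructions down to their uses
+# must not apply to PHIs, since a PHI placed below a non-PHI instruction is not
+# valid MIR.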
+--- +name: _amdgpu_ps_main +alignment: 1 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +noPhis: false +isSSA: true +noVRegs: false +hasFakeUses: false +callsEHReturn: false +callsUnwindInit: false +hasEHContTarget: false +hasEHScopes: false +hasEHFunclets: false +isOutlined: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: false +registers: + - { id: 0, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 1, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 2, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 3, class: sreg_64, preferred-register: '', flags: [ ] } + - { id: 4, class: sreg_64, preferred-register: '', flags: [ ] } + - { id: 5, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 6, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 7, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 8, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 9, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 10, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 11, class: sgpr_32, preferred-register: '', flags: [ ] } + - { id: 12, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 13, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 14, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 15, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 16, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 17, class: sgpr_32, preferred-register: '', flags: [ ] } + - { id: 18, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 19, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 20, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 21, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 22, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 23, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 24, class: sreg_64, preferred-register: '', flags: [ ] } + - { id: 25, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 26, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 27, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 28, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 29, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 30, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 31, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 32, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 33, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 34, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 35, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 36, class: sgpr_32, preferred-register: '', flags: [ ] } + - { id: 37, class: sreg_64_xexec, preferred-register: '', flags: [ ] } + - { id: 38, class: sreg_64, preferred-register: '', flags: [ ] } + - { id: 39, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 40, class: sgpr_32, preferred-register: '', flags: [ ] } + - { id: 41, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 42, class: sreg_64_xexec, preferred-register: '$vcc', flags: [ ] } + - { id: 43, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 44, class: sreg_32, preferred-register: '', flags: [ ] } + - { id: 45, class: sgpr_256, preferred-register: '', 
flags: [ ] } + - { id: 46, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 47, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 48, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 49, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 50, class: sreg_64_xexec, preferred-register: '', flags: [ ] } + - { id: 51, class: sgpr_32, preferred-register: '', flags: [ ] } + - { id: 52, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 53, class: sreg_64_xexec, preferred-register: '', flags: [ ] } + - { id: 54, class: sgpr_32, preferred-register: '', flags: [ ] } + - { id: 55, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 56, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 57, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 58, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 59, class: sreg_64_xexec, preferred-register: '', flags: [ ] } + - { id: 60, class: sreg_64_xexec, preferred-register: '', flags: [ ] } + - { id: 61, class: sreg_64_xexec, preferred-register: '', flags: [ ] } + - { id: 62, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 63, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 64, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 65, class: sreg_64_xexec, preferred-register: '', flags: [ ] } + - { id: 66, class: sreg_64_xexec, preferred-register: '', flags: [ ] } + - { id: 67, class: sreg_64_xexec, preferred-register: '', flags: [ ] } + - { id: 68, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 69, class: sreg_64, preferred-register: '$vcc', flags: [ ] } + - { id: 70, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 71, class: sreg_32, preferred-register: '', flags: [ ] } + - { id: 72, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 73, class: sreg_32, preferred-register: '', flags: [ ] } + - { id: 74, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 75, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 76, class: vgpr_32, preferred-register: '', flags: [ ] } +liveins: + - { reg: '$vgpr0', virtual-reg: '%25' } + - { reg: '$vgpr1', virtual-reg: '%26' } + - { reg: '$vgpr2', virtual-reg: '%27' } + - { reg: '$vgpr3', virtual-reg: '%28' } + - { reg: '$vgpr4', virtual-reg: '%29' } + - { reg: '$vgpr5', virtual-reg: '%30' } + - { reg: '$vgpr6', virtual-reg: '%31' } + - { reg: '$vgpr7', virtual-reg: '%32' } + - { reg: '$vgpr8', virtual-reg: '%33' } + - { reg: '$vgpr9', virtual-reg: '%34' } + - { reg: '$vgpr10', virtual-reg: '%35' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + isCalleeSavedInfoValid: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +entry_values: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: + explicitKernArgSize: 0 + maxKernArgAlign: 4 + ldsSize: 0 + gdsSize: 0 + dynLDSAlign: 1 + isEntryFunction: true + isChainFunction: false + noSignedZerosFPMath: false + memoryBound: false + waveLimiter: false + hasSpilledSGPRs: false + hasSpilledVGPRs: false + scratchRSrcReg: 
'$sgpr100_sgpr101_sgpr102_sgpr103' + frameOffsetReg: '$fp_reg' + stackPtrOffsetReg: '$sgpr32' + bytesInStackArgArea: 0 + returnsVoid: true + argumentInfo: + privateSegmentWaveByteOffset: { reg: '$sgpr0' } + psInputAddr: 2047 + psInputEnable: 2047 + maxMemoryClusterDWords: 8 + mode: + ieee: false + dx10-clamp: true + fp32-input-denormals: true + fp32-output-denormals: true + fp64-fp16-input-denormals: true + fp64-fp16-output-denormals: true + highBitsOf32BitAddress: 0 + occupancy: 20 + vgprForAGPRCopy: '' + sgprForEXECCopy: '$sgpr104_sgpr105' + longBranchReservedReg: '' + hasInitWholeWave: false +body: | + bb.0.bb: + successors: %bb.2(0x80000000) + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 + + %35:vgpr_32 = COPY $vgpr10 + %34:vgpr_32 = COPY $vgpr9 + %33:vgpr_32 = COPY $vgpr8 + %32:vgpr_32 = COPY $vgpr7 + %31:vgpr_32 = COPY $vgpr6 + %30:vgpr_32 = COPY $vgpr5 + %29:vgpr_32 = COPY $vgpr4 + %28:vgpr_32 = COPY $vgpr3 + %27:vgpr_32 = COPY $vgpr2 + %26:vgpr_32 = COPY $vgpr1 + %25:vgpr_32 = COPY $vgpr0 + %41:vgpr_32 = V_AND_B32_e64 1, %35, implicit $exec + %42:sreg_64_xexec = V_CMP_EQ_U32_e64 1, killed %41, implicit $exec + %39:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %38:sreg_64 = S_MOV_B64 0 + %76:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %51:sgpr_32 = S_MOV_B32 0 + %45:sgpr_256 = REG_SEQUENCE %51, %subreg.sub0, %51, %subreg.sub1, %51, %subreg.sub2, %51, %subreg.sub3, %51, %subreg.sub4, %51, %subreg.sub5, %51, %subreg.sub6, %51, %subreg.sub7 + S_BRANCH %bb.2 + + bb.1.bb11: + SI_END_CF %24, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + %70:vgpr_32 = nofpexcept V_CVT_PKRTZ_F16_F32_e64 0, %29, 0, %33, 0, 0, implicit $mode, implicit $exec + %71:sreg_32 = IMPLICIT_DEF + %72:vgpr_32 = COPY %71 + %73:sreg_32 = IMPLICIT_DEF + %74:vgpr_32 = COPY %73 + EXP 0, killed %70, %34, %72, %74, 0, -1, 0, implicit $exec + S_ENDPGM 0 + + bb.2.bb19: + successors: %bb.1(0x04000000), %bb.2(0x7c000000) + + %4:sreg_64 = PHI %38, %bb.0, %24, %bb.2 + %5:vgpr_32 = PHI %30, %bb.0, %76, %bb.2 + %6:vgpr_32 = PHI %39, %bb.0, %15, %bb.2 + %7:vgpr_32 = PHI %39, %bb.0, %16, %bb.2 + %8:vgpr_32 = PHI %39, %bb.0, %21, %bb.2 + %9:vgpr_32 = PHI %39, %bb.0, %22, %bb.2 + %10:vgpr_32 = PHI %39, %bb.0, %23, %bb.2 + %75:vgpr_32 = PHI %76, %bb.0, %55, %bb.2 + %12:vgpr_32 = PHI %39, %bb.0, %18, %bb.2 + %13:vgpr_32 = PHI %39, %bb.0, %19, %bb.2 + %14:vgpr_32 = PHI %39, %bb.0, %20, %bb.2 + %46:vgpr_32 = IMAGE_LOAD_V1_V2_nsa_gfx10 %5, %76, %45, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) + %48:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %26, 0, killed %46, 0, 0, implicit $mode, implicit $exec + %49:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %25, 0, killed %48, 0, 0, implicit $mode, implicit $exec + %15:vgpr_32 = V_CNDMASK_B32_e64 0, %6, 0, %30, %42, implicit $exec + %52:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %7, 0, 0, implicit $mode, implicit $exec + %16:vgpr_32 = V_CNDMASK_B32_e64 0, killed %52, 0, %7, %42, implicit $exec + %55:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 1065353216, 0, %75, 0, 0, implicit $mode, implicit $exec + %56:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %12, 0, 0, implicit $mode, implicit $exec + %57:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %13, 0, 0, implicit $mode, implicit $exec + %58:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %14, 0, 0, implicit $mode, implicit $exec + %18:vgpr_32 = V_CNDMASK_B32_e64 0, killed %56, 0, %12, %42, implicit $exec + %19:vgpr_32 = V_CNDMASK_B32_e64 0, killed %57, 0, %13, %42, 
implicit $exec + %20:vgpr_32 = V_CNDMASK_B32_e64 0, killed %58, 0, %14, %42, implicit $exec + %62:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %49, 0, %32, 0, 0, implicit $mode, implicit $exec + %63:vgpr_32 = nofpexcept V_ADD_F32_e64 0, %31, 0, %9, 0, 0, implicit $mode, implicit $exec + %64:vgpr_32 = nofpexcept V_ADD_F32_e64 0, %27, 0, %10, 0, 0, implicit $mode, implicit $exec + %21:vgpr_32 = V_CNDMASK_B32_e64 0, %8, 0, killed %62, %42, implicit $exec + %22:vgpr_32 = V_CNDMASK_B32_e64 0, %9, 0, killed %63, %42, implicit $exec + %23:vgpr_32 = V_CNDMASK_B32_e64 0, %10, 0, killed %64, %42, implicit $exec + %68:vgpr_32 = V_CVT_F32_I32_e64 %5, 0, 0, implicit $mode, implicit $exec + %69:sreg_64 = nofpexcept V_CMP_LT_F32_e64 0, %28, 0, killed %68, 0, implicit $mode, implicit $exec + %24:sreg_64 = SI_IF_BREAK killed %69, %4, implicit-def dead $scc + SI_LOOP %24, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.1 + +... diff --git a/llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir b/llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir new file mode 100644 index 0000000000000..9f5d402340329 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir @@ -0,0 +1,565 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-remat-enable-hot-block-remat-aggressive -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat | FileCheck %s + +# Check that the buffer loads have been moved to the use and the lanes are reduced +# correctly. +# +# CHECK: bb.2: +#========================================================================== +# X4_IMM, Using .x +# CHECK: %[[#reg0:]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %{{.+}}, 0, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg0]], %{{.+}}, 0, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg0]], %{{.+}}, 4, 0 +# X4_IMM, Using .xy +# CHECK: %[[#reg1:]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM %{{.+}}, 16, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg1]].sub0, %{{.+}}, 16, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg1]].sub1, %{{.+}}, 20, 0 +# X4_IMM, Using .xyz +# CHECK: %[[#reg2:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 32, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg2]].sub0, %{{.+}}, 32, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg2]].sub1, %{{.+}}, 36, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg2]].sub2, %{{.+}}, 40, 0 +# X4_IMM, Using .yz +# CHECK: %[[#reg3:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 48, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg3]].sub1, %{{.+}}, 48, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg3]].sub2, %{{.+}}, 52, 0 +# X4_IMM, Using .yzw +# CHECK: %[[#reg4:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 64, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg4]].sub1, %{{.+}}, 64, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg4]].sub2, %{{.+}}, 68, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg4]].sub3, %{{.+}}, 72, 0 +#========================================================================== +# X8_IMM, Using .x +# CHECK: %[[#reg5:]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %{{.+}}, 80, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg5]], %{{.+}}, 80, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg5]], %{{.+}}, 84, 0 +# X8_IMM, Using .xy +# CHECK: %[[#reg6:]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM %{{.+}}, 96, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg6]].sub0, %{{.+}}, 96, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg6]].sub1, %{{.+}}, 100, 0 +# X8_IMM, Using .xyz +# CHECK: %[[#reg7:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 112, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM 
%[[#reg7]].sub0, %{{.+}}, 112, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg7]].sub1, %{{.+}}, 116, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg7]].sub2, %{{.+}}, 120, 0 +# X8_IMM, Using .xyzw +# CHECK: %[[#reg8:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 128, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub0, %{{.+}}, 128, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub1, %{{.+}}, 132, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub2, %{{.+}}, 136, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub3, %{{.+}}, 140, 0 +# X8_IMM, Using .xyzw + 5th dword +# CHECK: %[[#reg9:]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %{{.+}}, 144, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub0, %{{.+}}, 144, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub1, %{{.+}}, 148, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub2, %{{.+}}, 152, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub3, %{{.+}}, 156, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub4, %{{.+}}, 160, 0 +#========================================================================== +# X16_IMM, Using .xy and .zw +# CHECK: %[[#reg10:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 160, 0 +# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg10]].sub0_sub1, %{{.+}}, 160, 0 +# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg10]].sub2_sub3, %{{.+}}, 164, 0 +#========================================================================== +# X4_SGPR, Using .x +# CHECK: %[[#reg11:]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %{{.+}}, %{{.+}}, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg11]], %{{.+}}, 176, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg11]], %{{.+}}, 180, 0 +# X8_SGPR, Using .xy +# CHECK: %[[#reg12:]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_SGPR %{{.+}}, %{{.+}}, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg12]].sub0, %{{.+}}, 192, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg12]].sub1, %{{.+}}, 196, 0 +# X16_SGPR, Using .xy + .zw +# CHECK: %[[#reg13:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR %{{.+}}, %{{.+}}, 0 +# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg13]].sub0_sub1, %{{.+}}, 208, 0 +# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg13]].sub2_sub3, %{{.+}}, 216, 0 +#========================================================================== +# +# +# CHECK: %[[#reg14:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 224, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg14]], %{{.+}}, 224, 0 +# CHECK: %[[#reg15:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 240, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg15]], %{{.+}}, 240, 0 +# CHECK: %[[#reg16:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 256, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg16]], %{{.+}}, 256, 0 +# CHECK: %[[#reg17:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 272, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg17]], %{{.+}}, 272, 0 +# CHECK: %[[#reg18:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 288, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg18]], %{{.+}}, 288, 0 +# CHECK: %[[#reg19:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 304, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg19]], %{{.+}}, 304, 0 +# CHECK: %[[#reg20:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 320, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg20]], %{{.+}}, 320, 0 +# CHECK: %[[#reg21:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 336, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg21]], %{{.+}}, 336, 0 +# CHECK: %[[#reg22:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 352, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg22]], %{{.+}}, 352, 0 +# CHECK: %[[#reg23:]]:sgpr_128 = 
S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 368, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg23]], %{{.+}}, 368, 0 +# CHECK: %[[#reg24:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 384, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg24]], %{{.+}}, 384, 0 +# CHECK: %[[#reg25:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 400, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg25]], %{{.+}}, 400, 0 +# CHECK: %[[#reg26:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 416, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg26]], %{{.+}}, 416, 0 +# CHECK: %[[#reg27:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 432, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg27]], %{{.+}}, 432, 0 +# CHECK: %[[#reg28:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 448, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg28]], %{{.+}}, 448, 0 +# CHECK: %[[#reg29:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 464, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg29]], %{{.+}}, 464, 0 +# CHECK: %[[#reg30:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 480, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg30]], %{{.+}}, 480, 0 +# CHECK: %[[#reg31:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 496, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg31]], %{{.+}}, 496, 0 +# CHECK: %[[#reg32:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 512, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg32]], %{{.+}}, 512, 0 +# CHECK: %[[#reg33:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 528, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg33]], %{{.+}}, 528, 0 +# CHECK: %[[#reg34:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 544, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg34]], %{{.+}}, 544, 0 +# CHECK: %[[#reg35:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 560, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg35]], %{{.+}}, 560, 0 +# CHECK: %[[#reg36:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 576, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg36]], %{{.+}}, 576, 0 +# CHECK: %[[#reg37:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 592, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg37]], %{{.+}}, 592, 0 +# CHECK: %[[#reg38:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 608, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg38]], %{{.+}}, 608, 0 +# CHECK: %[[#reg39:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 624, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg39]], %{{.+}}, 624, 0 +# CHECK: %[[#reg40:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 640, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg40]], %{{.+}}, 640, 0 +# CHECK: %[[#reg41:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 656, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg41]], %{{.+}}, 656, 0 +# CHECK: %[[#reg42:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 672, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg42]], %{{.+}}, 672, 0 +# CHECK: %[[#reg43:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 688, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg43]], %{{.+}}, 688, 0 +# CHECK: %[[#reg44:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 704, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg44]], %{{.+}}, 704, 0 +# CHECK: %[[#reg45:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 720, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg45]], %{{.+}}, 720, 0 +# CHECK: %[[#reg46:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 736, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg46]], %{{.+}}, 736, 0 +# CHECK: %[[#reg47:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 752, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg47]], %{{.+}}, 752, 0 +# CHECK: %[[#reg48:]]:sgpr_128 = 
S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 768, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg48]], %{{.+}}, 768, 0 +# CHECK: %[[#reg49:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 784, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg49]], %{{.+}}, 784, 0 +# CHECK: %[[#reg50:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 800, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg50]], %{{.+}}, 800, 0 +# CHECK: %[[#reg51:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 816, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg51]], %{{.+}}, 816, 0 +# CHECK: %[[#reg52:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 832, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg52]], %{{.+}}, 832, 0 +# CHECK: %[[#reg53:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 848, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg53]], %{{.+}}, 848, 0 +# CHECK: %[[#reg54:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 864, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg54]], %{{.+}}, 864, 0 +# CHECK: %[[#reg55:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 880, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg55]], %{{.+}}, 880, 0 +# CHECK: %[[#reg56:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 896, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg56]], %{{.+}}, 896, 0 +# CHECK: %[[#reg57:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 912, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg57]], %{{.+}}, 912, 0 +# CHECK: %[[#reg58:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 928, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg58]], %{{.+}}, 928, 0 +# CHECK: %[[#reg59:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 944, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg59]], %{{.+}}, 944, 0 +# CHECK: %[[#reg60:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 960, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg60]], %{{.+}}, 960, 0 +# CHECK: %[[#reg61:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 976, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg61]], %{{.+}}, 976, 0 +# CHECK: %[[#reg62:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 992, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg62]], %{{.+}}, 992, 0 +# CHECK: %[[#reg63:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 1008, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg63]], %{{.+}}, 1008, 0 + + +--- | + source_filename = ".\main.ll" + define amdgpu_ps void @main() #1 { + ret void + } + attributes #1 = { "target-cpu"="gfx1010" } + !llvm.ident = !{!0} + !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"} +... 
+--- +name: main +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } + - { reg: '$sgpr1' } + - { reg: '$sgpr2' } + - { reg: '$sgpr3' } + - { reg: '$sgpr4' } + - { reg: '$sgpr5' } + - { reg: '$sgpr6' } + - { reg: '$sgpr7' } + - { reg: '$sgpr8' } + - { reg: '$sgpr8' } + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $vgpr0, $vgpr1 + + %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1 + %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3 + %2:sgpr_128 = REG_SEQUENCE $sgpr8, %subreg.sub0, $sgpr9, %subreg.sub1, $sgpr10, %subreg.sub2, $sgpr11, %subreg.sub3 + + ; X4_IMM + %3000:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 0, 0 + %3001:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 16, 0 + %3002:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 32, 0 + %3003:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 48, 0 + %3004:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 64, 0 + + ; X8_IMM + %3005:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 80, 0 + %3006:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 96, 0 + %3007:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 112, 0 + %3008:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 128, 0 + %3009:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 144, 0 + + ; X16_IMM + %30010:sgpr_512 = S_BUFFER_LOAD_DWORDX16_IMM %2:sgpr_128, 160, 0 + + ; X4_SGPR + %50:sgpr_32 = COPY $sgpr0 + %30011:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR %2:sgpr_128, %50, 0 + + ; X8_SGPR + %51:sgpr_32 = COPY $sgpr1 + %30012:sgpr_256 = S_BUFFER_LOAD_DWORDX8_SGPR %2:sgpr_128, %51, 0 + + ; X16_SGPR + %52:sgpr_32 = COPY $sgpr2 + %30013:sgpr_512 = S_BUFFER_LOAD_DWORDX16_SGPR %2:sgpr_128, %52, 0 + + %30014:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 224, 0 + %30015:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 240, 0 + %30016:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 256, 0 + %30017:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 272, 0 + %30018:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 288, 0 + %30019:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 304, 0 + %30020:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 320, 0 + %30021:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 336, 0 + %30022:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 352, 0 + %30023:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 368, 0 + %30024:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 384, 0 + %30025:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 400, 0 + %30026:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 416, 0 + %30027:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 432, 0 + %30028:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 448, 0 + %30029:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 464, 0 + %30030:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 480, 0 + %30031:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 496, 0 + %30032:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 512, 0 + %30033:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 528, 0 + %30034:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 544, 0 + %30035:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 560, 0 + %30036:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 576, 0 + %30037:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 592, 0 + %30038:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 608, 0 + %30039:sgpr_128 = 
S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 624, 0 + %30040:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 640, 0 + %30041:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 656, 0 + %30042:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 672, 0 + %30043:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 688, 0 + %30044:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 704, 0 + %30045:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 720, 0 + %30046:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 736, 0 + %30047:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 752, 0 + %30048:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 768, 0 + %30049:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 784, 0 + %30050:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 800, 0 + %30051:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 816, 0 + %30052:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 832, 0 + %30053:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 848, 0 + %30054:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 864, 0 + %30055:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 880, 0 + %30056:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 896, 0 + %30057:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 912, 0 + %30058:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 928, 0 + %30059:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 944, 0 + %30060:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 960, 0 + %30061:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 976, 0 + %30062:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 992, 0 + %30063:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 1008, 0 + + %100:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %101:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %102:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %103:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %104:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %105:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %106:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %107:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %108:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %109:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1010:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1011:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1012:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1013:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1014:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1015:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1016:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1017:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1018:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1019:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1020:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1021:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1022:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1023:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1024:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1025:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1026:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1027:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1028:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1029:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1030:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1031:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1032:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1033:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1034:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1035:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1036:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1037:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1038:vgpr_32 = V_MOV_B32_e32 
0, implicit $exec + %1039:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1040:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1041:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1042:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1043:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1044:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1045:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1046:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1047:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1048:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1049:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1050:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1051:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1052:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1053:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1054:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1055:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1056:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1057:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1058:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1059:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1060:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1061:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1062:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1063:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + + + %8000:vgpr_32 = IMPLICIT_DEF + %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %116:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + %8001:vgpr_32 = COPY %8000 + S_BRANCH %bb.2 + + bb.2: + + %3:vgpr_32 = IMPLICIT_DEF + ;========================================================================== + ; X4_IMM, Using .x + S_BUFFER_STORE_DWORD_IMM %3000.sub0, %1:sgpr_128, 0, 0 + S_BUFFER_STORE_DWORD_IMM %3000.sub0, %1:sgpr_128, 4, 0 ; Do it a second time, since the lane reduction triggers on clone, and clone only happens when there are multiple uses. 
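+    ; Per the CHECK lines at the top of this file, these two .x-only uses should
+    ; end up fed by a rematerialized single-lane S_BUFFER_LOAD_DWORD_IMM at
+    ; offset 0 instead of the original DWORDX4 load.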
+ + ; X4_IMM, Using .xy + S_BUFFER_STORE_DWORD_IMM %3001.sub0, %1:sgpr_128, 16, 0 + S_BUFFER_STORE_DWORD_IMM %3001.sub1, %1:sgpr_128, 20, 0 + + ; X4_IMM, Using .xyz + S_BUFFER_STORE_DWORD_IMM %3002.sub0, %1:sgpr_128, 32, 0 + S_BUFFER_STORE_DWORD_IMM %3002.sub1, %1:sgpr_128, 36, 0 + S_BUFFER_STORE_DWORD_IMM %3002.sub2, %1:sgpr_128, 40, 0 + + ; X4_IMM, Using .yz + S_BUFFER_STORE_DWORD_IMM %3003.sub1, %1:sgpr_128, 48, 0 + S_BUFFER_STORE_DWORD_IMM %3003.sub2, %1:sgpr_128, 52, 0 + + ; X4_IMM, Using .yzw + S_BUFFER_STORE_DWORD_IMM %3004.sub1, %1:sgpr_128, 64, 0 + S_BUFFER_STORE_DWORD_IMM %3004.sub2, %1:sgpr_128, 68, 0 + S_BUFFER_STORE_DWORD_IMM %3004.sub3, %1:sgpr_128, 72, 0 + + ;========================================================================== + ; X8_IMM, Using .x + S_BUFFER_STORE_DWORD_IMM %3005.sub0, %1:sgpr_128, 80, 0 + S_BUFFER_STORE_DWORD_IMM %3005.sub0, %1:sgpr_128, 84, 0 + + ; X8_IMM, Using .xy + S_BUFFER_STORE_DWORD_IMM %3006.sub0, %1:sgpr_128, 96, 0 + S_BUFFER_STORE_DWORD_IMM %3006.sub1, %1:sgpr_128, 100, 0 + + ; X8_IMM, Using .xyz + S_BUFFER_STORE_DWORD_IMM %3007.sub0, %1:sgpr_128, 112, 0 + S_BUFFER_STORE_DWORD_IMM %3007.sub1, %1:sgpr_128, 116, 0 + S_BUFFER_STORE_DWORD_IMM %3007.sub2, %1:sgpr_128, 120, 0 + + ; X8_IMM, Using .xyzw + S_BUFFER_STORE_DWORD_IMM %3008.sub0, %1:sgpr_128, 128, 0 + S_BUFFER_STORE_DWORD_IMM %3008.sub1, %1:sgpr_128, 132, 0 + S_BUFFER_STORE_DWORD_IMM %3008.sub2, %1:sgpr_128, 136, 0 + S_BUFFER_STORE_DWORD_IMM %3008.sub3, %1:sgpr_128, 140, 0 + + ; X8_IMM, Using .xyzw + 5th dword + S_BUFFER_STORE_DWORD_IMM %3009.sub0, %1:sgpr_128, 144, 0 + S_BUFFER_STORE_DWORD_IMM %3009.sub1, %1:sgpr_128, 148, 0 + S_BUFFER_STORE_DWORD_IMM %3009.sub2, %1:sgpr_128, 152, 0 + S_BUFFER_STORE_DWORD_IMM %3009.sub3, %1:sgpr_128, 156, 0 + S_BUFFER_STORE_DWORD_IMM %3009.sub4, %1:sgpr_128, 160, 0 + + ;========================================================================== + ; X16_IMM, Using .xy and .zw + S_BUFFER_STORE_DWORDX2_IMM %30010.sub0_sub1, %1:sgpr_128, 160, 0 + S_BUFFER_STORE_DWORDX2_IMM %30010.sub2_sub3, %1:sgpr_128, 164, 0 + + ;========================================================================== + ; X4_SGPR, Using .x + S_BUFFER_STORE_DWORD_IMM %30011.sub0, %1:sgpr_128, 176, 0 + S_BUFFER_STORE_DWORD_IMM %30011.sub0, %1:sgpr_128, 180, 0 + + ; X8_SGPR, Using .xy + S_BUFFER_STORE_DWORD_IMM %30012.sub0, %1:sgpr_128, 192, 0 + S_BUFFER_STORE_DWORD_IMM %30012.sub1, %1:sgpr_128, 196, 0 + + ; X16_SGPR, Using .xy + .zw + S_BUFFER_STORE_DWORDX2_IMM %30013.sub0_sub1, %1:sgpr_128, 208, 0 + S_BUFFER_STORE_DWORDX2_IMM %30013.sub2_sub3, %1:sgpr_128, 216, 0 + + ;========================================================================== + S_BUFFER_STORE_DWORDX4_IMM killed %30014:sgpr_128, %1:sgpr_128, 224, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30015:sgpr_128, %1:sgpr_128, 240, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30016:sgpr_128, %1:sgpr_128, 256, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30017:sgpr_128, %1:sgpr_128, 272, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30018:sgpr_128, %1:sgpr_128, 288, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30019:sgpr_128, %1:sgpr_128, 304, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30020:sgpr_128, %1:sgpr_128, 320, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30021:sgpr_128, %1:sgpr_128, 336, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30022:sgpr_128, %1:sgpr_128, 352, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30023:sgpr_128, %1:sgpr_128, 368, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30024:sgpr_128, %1:sgpr_128, 384, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30025:sgpr_128, 
%1:sgpr_128, 400, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30026:sgpr_128, %1:sgpr_128, 416, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30027:sgpr_128, %1:sgpr_128, 432, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30028:sgpr_128, %1:sgpr_128, 448, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30029:sgpr_128, %1:sgpr_128, 464, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30030:sgpr_128, %1:sgpr_128, 480, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30031:sgpr_128, %1:sgpr_128, 496, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30032:sgpr_128, %1:sgpr_128, 512, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30033:sgpr_128, %1:sgpr_128, 528, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30034:sgpr_128, %1:sgpr_128, 544, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30035:sgpr_128, %1:sgpr_128, 560, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30036:sgpr_128, %1:sgpr_128, 576, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30037:sgpr_128, %1:sgpr_128, 592, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30038:sgpr_128, %1:sgpr_128, 608, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30039:sgpr_128, %1:sgpr_128, 624, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30040:sgpr_128, %1:sgpr_128, 640, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30041:sgpr_128, %1:sgpr_128, 656, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30042:sgpr_128, %1:sgpr_128, 672, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30043:sgpr_128, %1:sgpr_128, 688, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30044:sgpr_128, %1:sgpr_128, 704, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30045:sgpr_128, %1:sgpr_128, 720, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30046:sgpr_128, %1:sgpr_128, 736, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30047:sgpr_128, %1:sgpr_128, 752, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30048:sgpr_128, %1:sgpr_128, 768, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30049:sgpr_128, %1:sgpr_128, 784, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30050:sgpr_128, %1:sgpr_128, 800, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30051:sgpr_128, %1:sgpr_128, 816, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30052:sgpr_128, %1:sgpr_128, 832, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30053:sgpr_128, %1:sgpr_128, 848, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30054:sgpr_128, %1:sgpr_128, 864, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30055:sgpr_128, %1:sgpr_128, 880, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30056:sgpr_128, %1:sgpr_128, 896, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30057:sgpr_128, %1:sgpr_128, 912, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30058:sgpr_128, %1:sgpr_128, 928, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30059:sgpr_128, %1:sgpr_128, 944, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30060:sgpr_128, %1:sgpr_128, 960, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30061:sgpr_128, %1:sgpr_128, 976, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30062:sgpr_128, %1:sgpr_128, 992, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30063:sgpr_128, %1:sgpr_128, 1008, 0 + + EXP 0, killed %100, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %101, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %102, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %103, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %104, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %105, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %106, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %107, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %108, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %109, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1010, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1011, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1012, 
%3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1013, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1014, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1015, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1016, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1017, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1018, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1019, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1020, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1021, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1022, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1023, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1024, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1025, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1026, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1027, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1028, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1029, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1030, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1031, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1032, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1033, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1034, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1035, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1036, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1037, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1038, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1039, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1040, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1041, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1042, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1043, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1044, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1045, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1046, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1047, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1048, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1049, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1050, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1051, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1052, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1053, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1054, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1055, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1056, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1057, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1058, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1059, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1060, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1061, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1062, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1063, %3, %3, %3, -1, -1, 15, implicit $exec + + + S_ENDPGM 0 +... 
+ + + + diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir new file mode 100644 index 0000000000000..69875261b74e9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir @@ -0,0 +1,451 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat | FileCheck %s + +# Check that the loads have been moved next to their uses +# CHECK: bb.2: +# CHECK: %[[#reg0:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 0, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg0]], %{{.+}}, 0, 0 +# CHECK: %[[#reg1:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 16, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg1]], %{{.+}}, 16, 0 +# CHECK: %[[#reg2:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 32, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg2]], %{{.+}}, 32, 0 +# CHECK: %[[#reg3:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 48, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg3]], %{{.+}}, 48, 0 +# CHECK: %[[#reg4:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 64, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg4]], %{{.+}}, 64, 0 +# CHECK: %[[#reg5:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 80, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg5]], %{{.+}}, 80, 0 +# CHECK: %[[#reg6:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 96, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg6]], %{{.+}}, 96, 0 +# CHECK: %[[#reg7:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 112, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg7]], %{{.+}}, 112, 0 +# CHECK: %[[#reg8:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 128, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg8]], %{{.+}}, 128, 0 +# CHECK: %[[#reg9:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 144, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg9]], %{{.+}}, 144, 0 +# CHECK: %[[#reg10:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 160, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg10]], %{{.+}}, 160, 0 +# CHECK: %[[#reg11:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 176, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg11]], %{{.+}}, 176, 0 +# CHECK: %[[#reg12:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 192, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg12]], %{{.+}}, 192, 0 +# CHECK: %[[#reg13:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 208, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg13]], %{{.+}}, 208, 0 +# CHECK: %[[#reg14:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 224, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg14]], %{{.+}}, 224, 0 +# CHECK: %[[#reg15:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 240, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg15]], %{{.+}}, 240, 0 +# CHECK: %[[#reg16:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 256, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg16]], %{{.+}}, 256, 0 +# CHECK: %[[#reg17:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 272, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg17]], %{{.+}}, 272, 0 +# CHECK: %[[#reg18:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 288, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg18]], %{{.+}}, 288, 0 +# CHECK: %[[#reg19:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 304, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg19]], %{{.+}}, 304, 0 +# CHECK: %[[#reg20:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 320, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg20]], %{{.+}}, 320, 0 +# CHECK: %[[#reg21:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 336, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg21]], %{{.+}}, 336, 0 +# CHECK: %[[#reg22:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 352, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg22]], %{{.+}}, 352, 0 +# CHECK: 
%[[#reg23:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 368, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg23]], %{{.+}}, 368, 0 +# CHECK: %[[#reg24:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 384, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg24]], %{{.+}}, 384, 0 +# CHECK: %[[#reg25:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 400, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg25]], %{{.+}}, 400, 0 +# CHECK: %[[#reg26:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 416, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg26]], %{{.+}}, 416, 0 +# CHECK: %[[#reg27:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 432, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg27]], %{{.+}}, 432, 0 +# CHECK: %[[#reg28:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 448, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg28]], %{{.+}}, 448, 0 +# CHECK: %[[#reg29:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 464, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg29]], %{{.+}}, 464, 0 +# CHECK: %[[#reg30:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 480, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg30]], %{{.+}}, 480, 0 +# CHECK: %[[#reg31:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 496, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg31]], %{{.+}}, 496, 0 +# CHECK: %[[#reg32:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 512, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg32]], %{{.+}}, 512, 0 +# CHECK: %[[#reg33:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 528, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg33]], %{{.+}}, 528, 0 +# CHECK: %[[#reg34:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 544, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg34]], %{{.+}}, 544, 0 +# CHECK: %[[#reg35:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 560, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg35]], %{{.+}}, 560, 0 +# CHECK: %[[#reg36:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 576, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg36]], %{{.+}}, 576, 0 +# CHECK: %[[#reg37:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 592, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg37]], %{{.+}}, 592, 0 +# CHECK: %[[#reg38:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 608, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg38]], %{{.+}}, 608, 0 +# CHECK: %[[#reg39:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 624, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg39]], %{{.+}}, 624, 0 +# CHECK: %[[#reg40:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 640, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg40]], %{{.+}}, 640, 0 +# CHECK: %[[#reg41:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 656, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg41]], %{{.+}}, 656, 0 +# CHECK: %[[#reg42:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 672, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg42]], %{{.+}}, 672, 0 +# CHECK: %[[#reg43:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 688, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg43]], %{{.+}}, 688, 0 +# CHECK: %[[#reg44:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 704, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg44]], %{{.+}}, 704, 0 +# CHECK: %[[#reg45:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 720, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg45]], %{{.+}}, 720, 0 +# CHECK: %[[#reg46:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 736, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg46]], %{{.+}}, 736, 0 +# CHECK: %[[#reg47:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 752, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg47]], %{{.+}}, 752, 0 +# CHECK: %[[#reg48:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 768, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg48]], %{{.+}}, 768, 0 +# CHECK: %[[#reg49:]]:sgpr_128 = S_LOAD_DWORDX4_IMM 
%{{.+}}, 784, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg49]], %{{.+}}, 784, 0 +# CHECK: %[[#reg50:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 800, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg50]], %{{.+}}, 800, 0 +# CHECK: %[[#reg51:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 816, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg51]], %{{.+}}, 816, 0 +# CHECK: %[[#reg52:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 832, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg52]], %{{.+}}, 832, 0 +# CHECK: %[[#reg53:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 848, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg53]], %{{.+}}, 848, 0 +# CHECK: %[[#reg54:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 864, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg54]], %{{.+}}, 864, 0 +# CHECK: %[[#reg55:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 880, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg55]], %{{.+}}, 880, 0 +# CHECK: %[[#reg56:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 896, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg56]], %{{.+}}, 896, 0 +# CHECK: %[[#reg57:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 912, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg57]], %{{.+}}, 912, 0 +# CHECK: %[[#reg58:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 928, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg58]], %{{.+}}, 928, 0 +# CHECK: %[[#reg59:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 944, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg59]], %{{.+}}, 944, 0 +# CHECK: %[[#reg60:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 960, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg60]], %{{.+}}, 960, 0 +# CHECK: %[[#reg61:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 976, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg61]], %{{.+}}, 976, 0 +# CHECK: %[[#reg62:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 992, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg62]], %{{.+}}, 992, 0 +# CHECK: %[[#reg63:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 1008, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg63]], %{{.+}}, 1008, 0 + + +--- | + source_filename = ".\main.ll" + define amdgpu_ps void @main() #1 { + ret void + } + attributes #1 = { "target-cpu"="gfx1010" } + !llvm.ident = !{!0} + !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"} +... 
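+# Every S_LOAD_DWORDX4_IMM below is defined in bb.0 but only consumed by a single +# S_BUFFER_STORE_DWORDX4_IMM in bb.2, so the pass is expected to sink each load next to +# its store; the CHECK pairs above verify that placement.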
+--- +name: main +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } + - { reg: '$sgpr1' } + - { reg: '$sgpr2' } + - { reg: '$sgpr3' } + - { reg: '$sgpr4' } + - { reg: '$sgpr5' } + - { reg: '$sgpr6' } + - { reg: '$sgpr7' } + - { reg: '$sgpr8' } + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1 + + %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1 + ; undef %0.sub0:sgpr_64 = COPY $sgpr0 + ; undef %0.sub1:sgpr_64 = COPY $sgpr1 + + %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3 + ; undef %1.sub0:sgpr_128 = COPY $sgpr4 + ; undef %1.sub1:sgpr_128 = COPY $sgpr5 + ; undef %1.sub2:sgpr_128 = COPY $sgpr6 + ; undef %1.sub3:sgpr_128 = COPY $sgpr7 + + %3000:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 0, 0 + %3001:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 16, 0 + %3002:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 32, 0 + %3003:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 48, 0 + %3004:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 64, 0 + %3005:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 80, 0 + %3006:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 96, 0 + %3007:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 112, 0 + %3008:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 128, 0 + %3009:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 144, 0 + %30010:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 160, 0 + %30011:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 176, 0 + %30012:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 192, 0 + %30013:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 208, 0 + %30014:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 224, 0 + %30015:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 240, 0 + %30016:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 256, 0 + %30017:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 272, 0 + %30018:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 288, 0 + %30019:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 304, 0 + %30020:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 320, 0 + %30021:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 336, 0 + %30022:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 352, 0 + %30023:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 368, 0 + %30024:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 384, 0 + %30025:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 400, 0 + %30026:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 416, 0 + %30027:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 432, 0 + %30028:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 448, 0 + %30029:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 464, 0 + %30030:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 480, 0 + %30031:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 496, 0 + %30032:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 512, 0 + %30033:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 528, 0 + %30034:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 544, 0 + %30035:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 560, 0 + %30036:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 576, 0 + %30037:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 592, 0 + %30038:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 608, 0 + %30039:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 624, 0 + %30040:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 640, 0 + %30041:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 656, 0 + %30042:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 672, 0 + %30043:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 688, 0 + %30044:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 704, 0 + %30045:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 720, 0 +
%30046:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 736, 0 + %30047:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 752, 0 + %30048:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 768, 0 + %30049:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 784, 0 + %30050:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 800, 0 + %30051:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 816, 0 + %30052:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 832, 0 + %30053:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 848, 0 + %30054:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 864, 0 + %30055:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 880, 0 + %30056:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 896, 0 + %30057:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 912, 0 + %30058:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 928, 0 + %30059:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 944, 0 + %30060:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 960, 0 + %30061:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 976, 0 + %30062:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 992, 0 + %30063:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 1008, 0 + + %100:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %101:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %102:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %103:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %104:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %105:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %106:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %107:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %108:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %109:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1010:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1011:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1012:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1013:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1014:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1015:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1016:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1017:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1018:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1019:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1020:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1021:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1022:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1023:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1024:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1025:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1026:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1027:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1028:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1029:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1030:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1031:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1032:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1033:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1034:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1035:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1036:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1037:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1038:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1039:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1040:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1041:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1042:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1043:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1044:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1045:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1046:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1047:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1048:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1049:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1050:vgpr_32 = 
V_MOV_B32_e32 0, implicit $exec + %1051:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1052:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1053:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1054:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1055:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1056:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1057:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1058:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1059:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1060:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1061:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1062:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1063:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + + + %8000:vgpr_32 = IMPLICIT_DEF + %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %116:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + %8001:vgpr_32 = COPY %8000 + S_BRANCH %bb.2 + + bb.2: + + %3:vgpr_32 = IMPLICIT_DEF + S_BUFFER_STORE_DWORDX4_IMM killed %3000:sgpr_128, %1:sgpr_128, 0, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3001:sgpr_128, %1:sgpr_128, 16, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3002:sgpr_128, %1:sgpr_128, 32, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3003:sgpr_128, %1:sgpr_128, 48, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3004:sgpr_128, %1:sgpr_128, 64, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3005:sgpr_128, %1:sgpr_128, 80, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3006:sgpr_128, %1:sgpr_128, 96, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3007:sgpr_128, %1:sgpr_128, 112, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3008:sgpr_128, %1:sgpr_128, 128, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3009:sgpr_128, %1:sgpr_128, 144, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30010:sgpr_128, %1:sgpr_128, 160, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30011:sgpr_128, %1:sgpr_128, 176, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30012:sgpr_128, %1:sgpr_128, 192, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30013:sgpr_128, %1:sgpr_128, 208, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30014:sgpr_128, %1:sgpr_128, 224, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30015:sgpr_128, %1:sgpr_128, 240, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30016:sgpr_128, %1:sgpr_128, 256, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30017:sgpr_128, %1:sgpr_128, 272, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30018:sgpr_128, %1:sgpr_128, 288, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30019:sgpr_128, %1:sgpr_128, 304, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30020:sgpr_128, %1:sgpr_128, 320, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30021:sgpr_128, %1:sgpr_128, 336, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30022:sgpr_128, %1:sgpr_128, 352, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30023:sgpr_128, %1:sgpr_128, 368, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30024:sgpr_128, %1:sgpr_128, 384, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30025:sgpr_128, %1:sgpr_128, 400, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30026:sgpr_128, %1:sgpr_128, 416, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30027:sgpr_128, %1:sgpr_128, 432, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30028:sgpr_128, %1:sgpr_128, 448, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30029:sgpr_128, %1:sgpr_128, 464, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30030:sgpr_128, %1:sgpr_128, 480, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30031:sgpr_128, %1:sgpr_128, 496, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30032:sgpr_128, %1:sgpr_128, 512, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30033:sgpr_128, %1:sgpr_128, 528, 0 + 
S_BUFFER_STORE_DWORDX4_IMM killed %30034:sgpr_128, %1:sgpr_128, 544, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30035:sgpr_128, %1:sgpr_128, 560, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30036:sgpr_128, %1:sgpr_128, 576, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30037:sgpr_128, %1:sgpr_128, 592, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30038:sgpr_128, %1:sgpr_128, 608, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30039:sgpr_128, %1:sgpr_128, 624, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30040:sgpr_128, %1:sgpr_128, 640, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30041:sgpr_128, %1:sgpr_128, 656, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30042:sgpr_128, %1:sgpr_128, 672, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30043:sgpr_128, %1:sgpr_128, 688, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30044:sgpr_128, %1:sgpr_128, 704, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30045:sgpr_128, %1:sgpr_128, 720, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30046:sgpr_128, %1:sgpr_128, 736, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30047:sgpr_128, %1:sgpr_128, 752, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30048:sgpr_128, %1:sgpr_128, 768, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30049:sgpr_128, %1:sgpr_128, 784, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30050:sgpr_128, %1:sgpr_128, 800, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30051:sgpr_128, %1:sgpr_128, 816, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30052:sgpr_128, %1:sgpr_128, 832, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30053:sgpr_128, %1:sgpr_128, 848, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30054:sgpr_128, %1:sgpr_128, 864, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30055:sgpr_128, %1:sgpr_128, 880, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30056:sgpr_128, %1:sgpr_128, 896, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30057:sgpr_128, %1:sgpr_128, 912, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30058:sgpr_128, %1:sgpr_128, 928, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30059:sgpr_128, %1:sgpr_128, 944, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30060:sgpr_128, %1:sgpr_128, 960, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30061:sgpr_128, %1:sgpr_128, 976, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30062:sgpr_128, %1:sgpr_128, 992, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30063:sgpr_128, %1:sgpr_128, 1008, 0 + + EXP 0, killed %100, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %101, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %102, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %103, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %104, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %105, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %106, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %107, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %108, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %109, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1010, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1011, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1012, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1013, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1014, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1015, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1016, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1017, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1018, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1019, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1020, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1021, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, 
killed %1022, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1023, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1024, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1025, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1026, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1027, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1028, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1029, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1030, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1031, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1032, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1033, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1034, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1035, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1036, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1037, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1038, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1039, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1040, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1041, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1042, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1043, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1044, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1045, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1046, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1047, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1048, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1049, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1050, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1051, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1052, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1053, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1054, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1055, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1056, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1057, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1058, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1059, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1060, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1061, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1062, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1063, %3, %3, %3, -1, -1, 15, implicit $exec + + + S_ENDPGM 0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir b/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir new file mode 100644 index 0000000000000..3a2d61555c0b4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir @@ -0,0 +1,405 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-late-float-vtos -amdgpu-remat-enable-sub-exp-remat | FileCheck %s + +# DEFS +# CHECK: %[[#div00:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni00:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div00]], implicit $exec +# CHECK: %[[#div01:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni01:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div01]], implicit $exec +# CHECK: %[[#div02:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni02:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div02]], implicit $exec +# CHECK: %[[#div03:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni03:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div03]], implicit $exec +# CHECK: %[[#div04:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni04:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div04]], implicit $exec +# CHECK: %[[#div05:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni05:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div05]], implicit $exec +# CHECK: %[[#div06:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni06:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div06]], implicit $exec +# CHECK: %[[#div07:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni07:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div07]], implicit $exec +# CHECK: %[[#div08:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni08:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div08]], implicit $exec +# CHECK: %[[#div09:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni09:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div09]], implicit $exec +# CHECK: %[[#div10:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni10:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div10]], implicit $exec +# CHECK: %[[#div11:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni11:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div11]], implicit $exec +# CHECK: %[[#div12:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni12:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div12]], implicit $exec +# CHECK: %[[#div13:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni13:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div13]], implicit $exec +# CHECK: %[[#div14:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni14:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div14]], implicit $exec +# CHECK: %[[#div15:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni15:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div15]], implicit $exec +# CHECK: %[[#div16:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni16:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div16]], implicit $exec +# CHECK: %[[#div17:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni17:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div17]], implicit $exec +# CHECK: %[[#div18:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni18:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div18]], implicit $exec +# CHECK: %[[#div19:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni19:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div19]], implicit $exec +# CHECK: %[[#div20:]]:vgpr_32 = 
V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni20:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div20]], implicit $exec +# CHECK: %[[#div21:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni21:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div21]], implicit $exec +# CHECK: %[[#div22:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni22:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div22]], implicit $exec +# CHECK: %[[#div23:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni23:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div23]], implicit $exec +# CHECK: %[[#div24:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni24:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div24]], implicit $exec +# CHECK: %[[#div25:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni25:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div25]], implicit $exec +# CHECK: %[[#div26:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni26:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div26]], implicit $exec +# CHECK: %[[#div27:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni27:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div27]], implicit $exec +# CHECK: %[[#div28:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni28:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div28]], implicit $exec +# CHECK: %[[#div29:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni29:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div29]], implicit $exec +# CHECK: %[[#div30:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni30:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div30]], implicit $exec +# CHECK: %[[#div31:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni31:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div31]], implicit $exec +# CHECK: %[[#div32:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni32:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div32]], implicit $exec +# CHECK: %[[#div33:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni33:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div33]], implicit $exec +# CHECK: %[[#div34:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni34:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div34]], implicit $exec +# CHECK: %[[#div35:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni35:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div35]], implicit $exec +# CHECK: %[[#div36:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni36:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div36]], implicit $exec +# CHECK: %[[#div37:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni37:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div37]], implicit $exec +# CHECK: %[[#div38:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni38:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div38]], implicit $exec +# CHECK: %[[#div39:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni39:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div39]], implicit $exec +# CHECK: %[[#div40:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni40:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div40]], implicit $exec +# CHECK: %[[#div41:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni41:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div41]], implicit $exec +# CHECK: %[[#div42:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni42:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div42]], implicit $exec +# CHECK: %[[#div43:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni43:]]:sgpr_32 = 
V_READFIRSTLANE_B32 %[[#div43]], implicit $exec +# CHECK: %[[#div44:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni44:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div44]], implicit $exec +# CHECK: %[[#div45:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni45:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div45]], implicit $exec +# CHECK: %[[#div46:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni46:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div46]], implicit $exec +# CHECK: %[[#div47:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni47:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div47]], implicit $exec +# CHECK: %[[#div48:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni48:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div48]], implicit $exec +# CHECK: %[[#div49:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni49:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div49]], implicit $exec +# CHECK: %[[#div50:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni50:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div50]], implicit $exec +# CHECK: %[[#div51:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni51:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div51]], implicit $exec +# CHECK: %[[#div52:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni52:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div52]], implicit $exec +# CHECK: %[[#div53:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni53:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div53]], implicit $exec +# CHECK: %[[#div54:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni54:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div54]], implicit $exec +# CHECK: %[[#div55:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni55:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div55]], implicit $exec +# CHECK: %[[#div56:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni56:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div56]], implicit $exec +# CHECK: %[[#div57:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni57:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div57]], implicit $exec +# CHECK: %[[#div58:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni58:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div58]], implicit $exec +# CHECK: %[[#div59:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec +# CHECK: %[[#uni59:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div59]], implicit $exec + + +# USERS: +# CHECK: %[[#div_00:]]:vgpr_32 = COPY %[[#uni00]] +#CHECK: EXP 0, %[[#div_00]], +# CHECK: %[[#div_01:]]:vgpr_32 = COPY %[[#uni01]] +#CHECK: EXP 0, %[[#div_01]], +# CHECK: %[[#div_02:]]:vgpr_32 = COPY %[[#uni02]] +#CHECK: EXP 0, %[[#div_02]], +# CHECK: %[[#div_03:]]:vgpr_32 = COPY %[[#uni03]] +#CHECK: EXP 0, %[[#div_03]], +# CHECK: %[[#div_04:]]:vgpr_32 = COPY %[[#uni04]] +#CHECK: EXP 0, %[[#div_04]], +# CHECK: %[[#div_05:]]:vgpr_32 = COPY %[[#uni05]] +#CHECK: EXP 0, %[[#div_05]], +# CHECK: %[[#div_06:]]:vgpr_32 = COPY %[[#uni06]] +#CHECK: EXP 0, %[[#div_06]], +# CHECK: %[[#div_07:]]:vgpr_32 = COPY %[[#uni07]] +#CHECK: EXP 0, %[[#div_07]], +# CHECK: %[[#div_08:]]:vgpr_32 = COPY %[[#uni08]] +#CHECK: EXP 0, %[[#div_08]], +# CHECK: %[[#div_09:]]:vgpr_32 = COPY %[[#uni09]] +#CHECK: EXP 0, %[[#div_09]], +# CHECK: %[[#div_10:]]:vgpr_32 = COPY %[[#uni10]] +#CHECK: EXP 0, %[[#div_10]], +# CHECK: %[[#div_11:]]:vgpr_32 = COPY %[[#uni11]] +#CHECK: EXP 0, %[[#div_11]], +# CHECK: %[[#div_12:]]:vgpr_32 = COPY %[[#uni12]] +#CHECK: EXP 0, %[[#div_12]], +# CHECK: 
%[[#div_13:]]:vgpr_32 = COPY %[[#uni13]] +#CHECK: EXP 0, %[[#div_13]], +# CHECK: %[[#div_14:]]:vgpr_32 = COPY %[[#uni14]] +#CHECK: EXP 0, %[[#div_14]], +# CHECK: %[[#div_15:]]:vgpr_32 = COPY %[[#uni15]] +#CHECK: EXP 0, %[[#div_15]], +# CHECK: %[[#div_16:]]:vgpr_32 = COPY %[[#uni16]] +#CHECK: EXP 0, %[[#div_16]], +# CHECK: %[[#div_17:]]:vgpr_32 = COPY %[[#uni17]] +#CHECK: EXP 0, %[[#div_17]], +# CHECK: %[[#div_18:]]:vgpr_32 = COPY %[[#uni18]] +#CHECK: EXP 0, %[[#div_18]], +# CHECK: %[[#div_19:]]:vgpr_32 = COPY %[[#uni19]] +#CHECK: EXP 0, %[[#div_19]], +# CHECK: %[[#div_20:]]:vgpr_32 = COPY %[[#uni20]] +#CHECK: EXP 0, %[[#div_20]], +# CHECK: %[[#div_21:]]:vgpr_32 = COPY %[[#uni21]] +#CHECK: EXP 0, %[[#div_21]], +# CHECK: %[[#div_22:]]:vgpr_32 = COPY %[[#uni22]] +#CHECK: EXP 0, %[[#div_22]], +# CHECK: %[[#div_23:]]:vgpr_32 = COPY %[[#uni23]] +#CHECK: EXP 0, %[[#div_23]], +# CHECK: %[[#div_24:]]:vgpr_32 = COPY %[[#uni24]] +#CHECK: EXP 0, %[[#div_24]], +# CHECK: %[[#div_25:]]:vgpr_32 = COPY %[[#uni25]] +#CHECK: EXP 0, %[[#div_25]], +# CHECK: %[[#div_26:]]:vgpr_32 = COPY %[[#uni26]] +#CHECK: EXP 0, %[[#div_26]], +# CHECK: %[[#div_27:]]:vgpr_32 = COPY %[[#uni27]] +#CHECK: EXP 0, %[[#div_27]], +# CHECK: %[[#div_28:]]:vgpr_32 = COPY %[[#uni28]] +#CHECK: EXP 0, %[[#div_28]], +# CHECK: %[[#div_29:]]:vgpr_32 = COPY %[[#uni29]] +#CHECK: EXP 0, %[[#div_29]], +# CHECK: %[[#div_30:]]:vgpr_32 = COPY %[[#uni30]] +#CHECK: EXP 0, %[[#div_30]], +# CHECK: %[[#div_31:]]:vgpr_32 = COPY %[[#uni31]] +#CHECK: EXP 0, %[[#div_31]], +# CHECK: %[[#div_32:]]:vgpr_32 = COPY %[[#uni32]] +#CHECK: EXP 0, %[[#div_32]], +# CHECK: %[[#div_33:]]:vgpr_32 = COPY %[[#uni33]] +#CHECK: EXP 0, %[[#div_33]], +# CHECK: %[[#div_34:]]:vgpr_32 = COPY %[[#uni34]] +#CHECK: EXP 0, %[[#div_34]], +# CHECK: %[[#div_35:]]:vgpr_32 = COPY %[[#uni35]] +#CHECK: EXP 0, %[[#div_35]], +# CHECK: %[[#div_36:]]:vgpr_32 = COPY %[[#uni36]] +#CHECK: EXP 0, %[[#div_36]], +# CHECK: %[[#div_37:]]:vgpr_32 = COPY %[[#uni37]] +#CHECK: EXP 0, %[[#div_37]], +# CHECK: %[[#div_38:]]:vgpr_32 = COPY %[[#uni38]] +#CHECK: EXP 0, %[[#div_38]], +# CHECK: %[[#div_39:]]:vgpr_32 = COPY %[[#uni39]] +#CHECK: EXP 0, %[[#div_39]], +# CHECK: %[[#div_40:]]:vgpr_32 = COPY %[[#uni40]] +#CHECK: EXP 0, %[[#div_40]], +# CHECK: %[[#div_41:]]:vgpr_32 = COPY %[[#uni41]] +#CHECK: EXP 0, %[[#div_41]], +# CHECK: %[[#div_42:]]:vgpr_32 = COPY %[[#uni42]] +#CHECK: EXP 0, %[[#div_42]], +# CHECK: %[[#div_43:]]:vgpr_32 = COPY %[[#uni43]] +#CHECK: EXP 0, %[[#div_43]], +# CHECK: %[[#div_44:]]:vgpr_32 = COPY %[[#uni44]] +#CHECK: EXP 0, %[[#div_44]], +# CHECK: %[[#div_45:]]:vgpr_32 = COPY %[[#uni45]] +#CHECK: EXP 0, %[[#div_45]], +# CHECK: %[[#div_46:]]:vgpr_32 = COPY %[[#uni46]] +#CHECK: EXP 0, %[[#div_46]], +# CHECK: %[[#div_47:]]:vgpr_32 = COPY %[[#uni47]] +#CHECK: EXP 0, %[[#div_47]], +# CHECK: %[[#div_48:]]:vgpr_32 = COPY %[[#uni48]] +#CHECK: EXP 0, %[[#div_48]], +# CHECK: %[[#div_49:]]:vgpr_32 = COPY %[[#uni49]] +#CHECK: EXP 0, %[[#div_49]], +# CHECK: %[[#div_50:]]:vgpr_32 = COPY %[[#uni50]] +#CHECK: EXP 0, %[[#div_50]], +# CHECK: %[[#div_51:]]:vgpr_32 = COPY %[[#uni51]] +#CHECK: EXP 0, %[[#div_51]], +# CHECK: %[[#div_52:]]:vgpr_32 = COPY %[[#uni52]] +#CHECK: EXP 0, %[[#div_52]], +# CHECK: %[[#div_53:]]:vgpr_32 = COPY %[[#uni53]] +#CHECK: EXP 0, %[[#div_53]], +# CHECK: %[[#div_54:]]:vgpr_32 = COPY %[[#uni54]] +#CHECK: EXP 0, %[[#div_54]], +# CHECK: %[[#div_55:]]:vgpr_32 = COPY %[[#uni55]] +#CHECK: EXP 0, %[[#div_55]], +# CHECK: %[[#div_56:]]:vgpr_32 = COPY %[[#uni56]] +#CHECK: EXP 0, %[[#div_56]], 
+# CHECK: %[[#div_57:]]:vgpr_32 = COPY %[[#uni57]] +#CHECK: EXP 0, %[[#div_57]], +# CHECK: %[[#div_58:]]:vgpr_32 = COPY %[[#uni58]] +#CHECK: EXP 0, %[[#div_58]], +# CHECK: %[[#div_59:]]:vgpr_32 = COPY %[[#uni59]] +#CHECK: EXP 0, %[[#div_59]], + + +--- | + source_filename = ".\main.ll" + define amdgpu_ps void @main() #1 { + ret void + } + attributes #1 = { "target-cpu"="gfx1010" } + !llvm.ident = !{!0} + !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"} +... +--- +name: main +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } + - { reg: '$sgpr1' } + - { reg: '$sgpr8' } + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: $sgpr0, $sgpr1, $sgpr8, $vgpr0, $vgpr1 + + %1000:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1001:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1002:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1003:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1004:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1005:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1006:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1007:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1008:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1009:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1010:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1011:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1012:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1013:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1014:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1015:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1016:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1017:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1018:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1019:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1020:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1021:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1022:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1023:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1024:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1025:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1026:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1027:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1028:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1029:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1030:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1031:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1032:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1033:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1034:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1035:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1036:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1037:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1038:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1039:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1040:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1041:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1042:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1043:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1044:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1045:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1046:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1047:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1048:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1049:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1050:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1051:vgpr_32 = 
V_MOV_B32_e32 $sgpr0, implicit $exec + %1052:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1053:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1054:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1055:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1056:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1057:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1058:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1059:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %1059, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %116:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + %99:vgpr_32 = COPY %1058 + S_BRANCH %bb.2 + + bb.2: + %1:vgpr_32 = IMPLICIT_DEF + EXP 0, killed %1000, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1001, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1002, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1003, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1004, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1005, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1006, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1007, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1008, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1009, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1010, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1011, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1012, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1013, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1014, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1015, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1016, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1017, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1018, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1019, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1020, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1021, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1022, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1023, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1024, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1025, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1026, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1027, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1028, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1029, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1030, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1031, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1032, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1033, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1034, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1035, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1036, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1037, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1038, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1039, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1040, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1041, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1042, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1043, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 
0, killed %1044, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1045, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1046, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1047, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1048, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1049, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1050, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1051, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1052, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1053, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1054, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1055, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1056, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1057, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1058, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1059, %1, %1, %1, -1, -1, 15, implicit $exec + S_ENDPGM 0 +...
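+# The DEFS/USERS checks above expect each uniform V_MOV_B32_e32 of $sgpr0 from bb.0 to get a +# V_READFIRSTLANE_B32 into an SGPR, with a VGPR COPY of that scalar recreated at its EXP use in +# bb.2, so the long live ranges are carried in SGPRs instead of VGPRs.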