From b6eb3b304f0a193a3660d921eae1401ed85ff1b2 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Mon, 17 Mar 2025 20:24:04 -0700 Subject: [PATCH 01/25] Added rematerialize pass and test. --- .../include/llvm/CodeGen/TargetRegisterInfo.h | 8 + llvm/lib/CodeGen/TargetRegisterInfo.cpp | 91 + llvm/lib/Target/AMDGPU/AMDGPU.h | 4 + .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 4665 +++++++++++++++++ llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp | 2241 ++++++++ llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h | 217 + .../AMDGPU/AMDGPUMirDivergenceAnalysis.cpp | 2767 ++++++++++ .../AMDGPU/AMDGPUMirDivergenceAnalysis.h | 281 + .../AMDGPUMirSyncDependenceAnalysis.cpp | 511 ++ .../AMDGPU/AMDGPUMirSyncDependenceAnalysis.h | 98 + .../AMDGPUOccupancyAndLatencyHelper.cpp | 188 + .../AMDGPU/AMDGPUOccupancyAndLatencyHelper.h | 74 + llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp | 1790 +++++++ llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h | 197 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 1 + llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h | 106 + llvm/lib/Target/AMDGPU/CMakeLists.txt | 6 + llvm/lib/Target/AMDGPU/GCNRegPressure.h | 4 + llvm/lib/Target/AMDGPU/SIInstrInfo.h | 3 + .../CodeGen/AMDGPU/remat/vector_to_scalar.mir | 405 ++ 20 files changed, 13657 insertions(+) create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h create mode 100644 llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index e4fad8f9ec869..974cd8a5f36b4 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -430,6 +430,14 @@ class TargetRegisterInfo : public MCRegisterInfo { LaneBitmask LaneMask, SmallVectorImpl &Indexes) const; + /// Return the set of sub register indexes that minimally cover the given + /// lane mask for the given register class. + /// + /// \returns an empty set if there is no set of covering sub registers. 
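
A standalone sketch of the greedy covering described above, with plain uint32_t masks standing in for LaneBitmask and vector positions standing in for sub-register indexes; greedyLaneCover is a hypothetical name and the snippet is illustrative only.

#include <bitset>
#include <cstdint>
#include <limits>
#include <vector>

// Greedy cover: consider only candidates that stay inside the requested
// lanes, preferring the one covering the most remaining lanes while
// overlapping the fewest lanes already covered; an empty result means the
// request cannot be spanned.
std::vector<unsigned> greedyLaneCover(uint32_t Want,
                                      const std::vector<uint32_t> &IdxMasks) {
  std::vector<unsigned> Result;
  uint32_t Remaining = Want;
  while (Remaining) {
    int Best = -1;
    int BestScore = std::numeric_limits<int>::min();
    for (unsigned I = 0; I < IdxMasks.size(); ++I) {
      uint32_t M = IdxMasks[I];
      if (M & ~Want)
        continue; // would touch lanes outside the request
      int Score = static_cast<int>(std::bitset<32>(M & Remaining).count()) -
                  static_cast<int>(std::bitset<32>(M & ~Remaining).count());
      if (Score > BestScore) {
        BestScore = Score;
        Best = static_cast<int>(I);
      }
    }
    if (Best < 0 || (IdxMasks[Best] & Remaining) == 0)
      return {}; // no progress possible: the lanes cannot be spanned
    Result.push_back(static_cast<unsigned>(Best));
    Remaining &= ~IdxMasks[Best];
  }
  return Result;
}
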
+ std::vector + getMinimalSpanningSubRegIdxSetForLaneMask(const TargetRegisterClass *RC, + LaneBitmask mask) const; + /// The lane masks returned by getSubRegIndexLaneMask() above can only be /// used to determine if sub-registers overlap - they can't be used to /// determine if a set of sub-registers completely cover another diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp index 701a9f8d72a65..d458648fd8bd8 100644 --- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -727,3 +727,94 @@ void TargetRegisterInfo::dumpReg(Register Reg, unsigned SubRegIndex, dbgs() << printReg(Reg, TRI, SubRegIndex) << "\n"; } #endif + +std::vector +TargetRegisterInfo::getMinimalSpanningSubRegIdxSetForLaneMask( + const TargetRegisterClass *RC, LaneBitmask mask) const { + // TODO: this could replace the code it was copied from in SplitKit.cpp + + // First pass: Try to find a perfectly matching subregister index. + // If none exists find the one covering the most lanemask bits. + SmallVector PossibleIndexes; + unsigned BestIdx = 0; + const LaneBitmask avoid = ~mask; + { + unsigned BestCover = 0; + for (unsigned Idx = 1, E = getNumSubRegIndices(); Idx < E; ++Idx) { + // Is this index even compatible with the given class? + if (getSubClassWithSubReg(RC, Idx) != RC) + continue; + LaneBitmask SubRegMask = getSubRegIndexLaneMask(Idx); + // Early exit if we found a perfect match. + if (SubRegMask == mask) { + BestIdx = Idx; + break; + } + + // The index must not cover any lanes outside + if ((SubRegMask & avoid).any()) + continue; + + unsigned PopCount = SubRegMask.getNumLanes(); + PossibleIndexes.push_back(Idx); + if (PopCount > BestCover) { + BestCover = PopCount; + BestIdx = Idx; + } + } + } + + // Abort if we cannot possibly implement the COPY with the given indexes. + if (BestIdx == 0) { + LLVM_DEBUG(dbgs() << "Unable to find minimal spanning sub register(s) for " + << getRegClassName(RC) << " mask " << PrintLaneMask(mask) + << '\n'); + assert(false && "Impossible to span reg class"); + return std::vector(); + } + + std::vector result; + result.push_back(BestIdx); + + // Greedy heuristic: Keep iterating keeping the best covering subreg index + // each time. + mask &= ~(getSubRegIndexLaneMask(BestIdx)); + while (mask.any()) { + BestIdx = 0; + int BestCover = std::numeric_limits::min(); + for (unsigned Idx : PossibleIndexes) { + LaneBitmask SubRegMask = getSubRegIndexLaneMask(Idx); + // Early exit if we found a perfect match. + if (SubRegMask == mask) { + BestIdx = Idx; + break; + } + + // Guaranteed above + assert((SubRegMask & avoid).none()); + + // Try to cover as much of the remaining lanes as possible but as few of + // the already covered lanes as possible. 
+ int Cover = (SubRegMask & mask).getNumLanes() - + (SubRegMask & ~mask).getNumLanes(); + if (Cover > BestCover) { + BestCover = Cover; + BestIdx = Idx; + } + } + + if (BestIdx == 0) { + LLVM_DEBUG(dbgs() << "Unable to find minimal spanning sub register(s) for " + << getRegClassName(RC) << " mask " << PrintLaneMask(mask) + << '\n'); + assert(false && "Impossible to span reg class"); + return std::vector(); + } + + result.push_back(BestIdx); + mask &= ~getSubRegIndexLaneMask(BestIdx); + } + + return result; +} + diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index f5c2b09c84806..24e9bb358d519 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -491,6 +491,10 @@ extern char &GCNRewritePartialRegUsesID; void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &); extern char &AMDGPUWaitSGPRHazardsLegacyID; +void initializeAMDGPUHotBlockRematerializePass(llvm::PassRegistry &); +FunctionPass *createAMDGPUHotBlockRematerializePass(); +extern char &AMDGPUHotBlockRematerializeID; + namespace AMDGPU { enum TargetIndex { TI_CONSTDATA_START, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp new file mode 100644 index 0000000000000..44ebaa2d51bec --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -0,0 +1,4665 @@ +//===-- AMDGPUHotBlockRematerialize.cpp - AMDGPU Hot Block Rematerialize-------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU hot block Rematerialize +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "AMDGPUMirDivergenceAnalysis.h" +#include "AMDGPUSubExpDag.h" +#include "AMDGPUVMemDegreeDAG.h" +#include "AMDGPUOccupancyAndLatencyHelper.h" +#include "GCNRegPressure.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "SIMachineFunctionInfo.h" +#include "AMDGPUMIRUtils.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/RegisterPressure.h" +#include "llvm/CodeGen/SlotIndexes.h" + +#include +#define DEBUG_TYPE "amdgpu-hot-block-remat" + +using namespace llvm; + +static cl::opt TargetOccupancy("amdgpu-remat-target-occupancy"); +static cl::opt EnableAggressive("amdgpu-remat-enable-hot-block-remat-aggressive"); +static cl::opt EnableSubExpAggressive("amdgpu-remat-enable-sub-exp-remat-aggressive"); +static cl::opt EnableSubExpClone("amdgpu-remat-enable-sub-exp-remat-clone"); +static cl::opt EnableVmemDegree("amdgpu-remat-enable-vmem-degree"); +static cl::opt EnableInBlockRemat("amdgpu-remat-enable-in-blk-remat"); +static cl::opt EnableSubExp("amdgpu-remat-enable-sub-exp-remat"); +static cl::opt EnableUniformVectorToScalar("amdgpu-remat-enable-late-float-vtos"); +static cl::opt EnableSubExpMinReg("amdgpu-remat-enable-sub-exp-remat-min-reg"); + +namespace { +typedef DenseSet InstSet; +typedef DenseSet BlockSet; +template +using BlockMap = MapVector; + +// Rematerialize 
in a single pass instead of doing in register allcation. +// If in register allocation, fail to rematerialize will cause spill. +class AMDGPUHotBlockRematerialize : public MachineFunctionPass { + +public: + static char ID; + + DenseSet TotalUniformInsts; + DenseSet SafeToRemoveInsts; + DenseSet DivergentInsts; + void RemoveInst(const MachineInstr *MI) { + TotalUniformInsts.erase(MI); + SafeToRemoveInsts.erase(MI); + DivergentInsts.erase(MI); + } + + AMDGPUHotBlockRematerialize() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "AMDGPU rematerialize"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +typedef AMDGPUHotBlockRematerialize Remat; + +} // end anonymous namespace + +// Util functions. +namespace { + +MachineBasicBlock * +nearest_common_dominator(MachineDominatorTree *DT, + BlockSet &Blocks) { + auto I = Blocks.begin(), E = Blocks.end(); + + MachineBasicBlock *DomB = cast(*(I++)); + while (I != E) { + MachineBasicBlock *B = cast(*(I++)); + DomB = DT->findNearestCommonDominator(DomB, B); + if (DomB == nullptr) + return nullptr; + } + // For split block like: + // bb.42: + // %632.sub2:vreg_128 = V_MOV_B32_e32 %717.sub2:vreg_128, implicit $exec, + // // implicit $exec + // %130:sreg_64 = S_AND_SAVEEXEC_B64 %533:sreg_64, implicitdef $exec, + // implicitdef $scc, implicit $exec + // + // bb.68: + //; predecessors: %bb.42 + // successors: %bb.45(0x40000000), %bb.43(0x40000000); %bb.45(50.00%), + // %bb.43(50.00%) + // + // SI_MASK_BRANCH %bb.43, implicit $exec + // S_BRANCH %bb.45 + // which is from + // bb.42: + //%129:vgpr_32 = V_MOV_B32_e32 killed %548:vgpr_32, implicit $exec, implicit + //$exec %130:sreg_64 = S_AND_SAVEEXEC_B64 %533:sreg_64, implicitdef $exec, + // SI_MASK_BRANCH %bb.43, implicit $exec + // S_BRANCH %bb.45 + // The real common dom is bb.42. + // TODO: use _term version of exec update instructions so don't need this + // anymore. + if (DomB && DomB->pred_size() == 1 && !DomB->empty()) { + // Upstreaming note: This used to be SI_MASK_BRANCH + if (DomB->begin()->getOpcode() == AMDGPU::S_CBRANCH_EXECZ) { + MachineBasicBlock *Pred = *DomB->pred_begin(); + if (Pred->succ_size() == 1 && + (Pred->empty() || !Pred->back().isBranch())) { + DomB = Pred; + } + } + } + + return DomB; +} + +MachineBasicBlock *find_non_loop_dominator(MachineBasicBlock *BB, + MachineDominatorTree *DT, + MachineLoopInfo *LI) { + while (LI->getLoopDepth(BB) > 0) { + MachineDomTreeNode *N = DT->getNode(BB); + if (N == nullptr) + return nullptr; + MachineDomTreeNode *IDom = N->getIDom(); + if (IDom == nullptr) + return nullptr; + + BB = IDom->getBlock(); + } + + return BB; +} + +MachineBasicBlock * +FindInsertBlock(MachineInstr &DefMI, unsigned Reg, MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, MachineLoopInfo *MLI, + const MachineRegisterInfo &MRI, bool bMemBound) { + + BlockSet BBSet; + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + BBSet.insert(UseMI.getParent()); + } + if (BBSet.size() == 0) + return nullptr; + + MachineBasicBlock *BB = *BBSet.begin(); + if (BBSet.size() > 1) { + MachineBasicBlock *BDom = nearest_common_dominator(DT, BBSet); + if (!BDom) + return nullptr; + BB = BDom; + } + // Try to find non loop dominator. 
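
A condensed sketch of the insert-block choice FindInsertBlock makes here, assuming the template parameters stand in for MachineBasicBlock, MachineDominatorTree and MachineLoopInfo; chooseInsertBlock is a hypothetical name. The idea: take the nearest common dominator of all use blocks, then walk up the dominator tree until outside any loop, unless the function is memory bound and staying put is acceptable.

#include <vector>

template <typename BlockT, typename DomTreeT, typename LoopInfoT>
BlockT *chooseInsertBlock(const std::vector<BlockT *> &UseBlocks, DomTreeT &DT,
                          LoopInfoT &LI, bool MemBound) {
  if (UseBlocks.empty())
    return nullptr;
  // Join point of all uses.
  BlockT *BB = UseBlocks.front();
  for (BlockT *Other : UseBlocks) {
    BB = DT.findNearestCommonDominator(BB, Other);
    if (!BB)
      return nullptr;
  }
  // Hoist out of loops unless the function is memory bound, where the extra
  // latency of leaving the def in place is acceptable.
  while (!MemBound && LI.getLoopDepth(BB) > 0) {
    auto *IDom = DT.getNode(BB)->getIDom();
    if (!IDom)
      return nullptr;
    BB = IDom->getBlock();
  }
  return BB;
}
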
+ if (!bMemBound) { + BB = find_non_loop_dominator(BB, DT, MLI); + } + if (!BB) + return nullptr; + + // If BB is already a hot block, move to BB will not help. + // hotBlockRemat will fail it when process BB. + + // Must reachable from DefMI. + if (!llvm::reach_block(DefMI.getParent(), DT, PDT, MLI, BB)) + return nullptr; + + return BB; +} + +bool IsSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) { + unsigned OpNum = DefMI->getNumOperands(); + + // Only move DefMI which all operand is unique def. + for (unsigned i = 0; i < OpNum; i++) { + MachineOperand &Op = DefMI->getOperand(i); + if (!Op.isReg()) + continue; + if (!MRI.getUniqueVRegDef(Op.getReg()) && + !llvm::IsSub0Sub1SingleDef(Op.getReg(), MRI)) { + return false; + } + } + return true; +} + + +// SGPR has alignment requirment, cannot get accurate reg number. +const unsigned NearTargetRegLimit = 10; +bool nearSgprSpill(unsigned maxSPressure, const GCNSubtarget *ST, MachineFunction &MF) { + unsigned maxSGPR = ST->getAddressableNumSGPRs(); + const SIMachineFunctionInfo *MFI = MF.getInfo(); + unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); + if (ScratchRSrcReg) + maxSGPR -= 4; + + const unsigned AlignmentDelta = 3; + maxSGPR -= AlignmentDelta; + + return maxSPressure > maxSGPR; +} + +struct RematStatus { + unsigned TargetOcc; + unsigned TargetVLimit; + unsigned TargetSLimit; + unsigned MaxVPressure; + unsigned MaxSPressure; + unsigned InputPhysicalVPressure; + unsigned InputPhysicalSPressure; + // More occupancy can help more than latency cost to reach it. + bool bMemBound; + // abs(VTargetOcc-STargetOcc) > 1. + bool bNotBalance; + DenseMap MBBPressureMap; + DenseMap MBBInputLiveMap; + DenseMap MBBOutputLiveMap; + // Collect MBBs which has memory write. When move instructions cross MBB, skip + // mem inst if the MBB has memory write. To make things fast, just check + // mayStore and isBarrier. + DenseSet MemWriteMBBSet; +}; + +unsigned CollectMBBPressure( + MachineBasicBlock &MBB, LiveIntervals *LIS, const MachineRegisterInfo &MRI, + const GCNSubtarget *ST, unsigned &maxVPressure, unsigned &maxSPressure, + RematStatus &status) { + // Skip processing current block if it has only debug instructions + if (MBB.getFirstNonDebugInstr() == MBB.end()) + return ST->getOccupancyWithNumVGPRs(0); + auto BBEnd = MBB.rbegin(); + GCNUpwardRPTracker RPTracker(*LIS); + // R.End doesn't point to the boundary instruction. + // Skip Debug instr. 
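
A minimal restatement of the nearSgprSpill heuristic defined above, with the subtarget numbers passed in explicitly instead of queried from GCNSubtarget; nearSgprSpillSketch is a hypothetical name.

// Illustrative sketch: the addressable SGPR budget is reduced by the
// 4-register scratch descriptor (when present) and a small alignment slack
// before comparing against the peak SGPR pressure.
bool nearSgprSpillSketch(unsigned MaxSPressure, unsigned AddressableSGPRs,
                         bool HasScratchRSrcReg) {
  unsigned Budget = AddressableSGPRs;
  if (HasScratchRSrcReg)
    Budget -= 4;                     // scratch resource descriptor uses 4 SGPRs
  const unsigned AlignmentDelta = 3; // slack for SGPR tuple alignment
  Budget -= AlignmentDelta;
  return MaxSPressure > Budget;      // "near spill" once the peak exceeds the budget
}
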
+ if (!llvm::GetNonDebugMBBEnd(BBEnd, MBB)) + return ST->getOccupancyWithNumVGPRs(0); + + GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[&MBB]; + RPTracker.reset(*BBEnd, &outputLive, true); + + for (auto I = MBB.rbegin(), B = MBB.rend(); I != B;) { + MachineInstr &MI = (*I++); + RPTracker.recede(MI); + if (MI.mayStore() || (MI.isBarrier() && MI.getOpcode() != AMDGPU::S_BRANCH)) + status.MemWriteMBBSet.insert(&MBB); + } + + GCNRegPressure RP = RPTracker.getMaxPressureAndReset(); + unsigned sPressure = RP.getMaxSGPR(); + if (sPressure > maxSPressure) { + maxSPressure = sPressure; + } + if (RP.getVGPRNum(ST->hasGFX90AInsts()) > maxVPressure) { + maxVPressure = RP.getVGPRNum(ST->hasGFX90AInsts()); + } + status.MBBPressureMap[&MBB] = RP; + return RP.getOccupancy(*ST); +} + +unsigned CollectFnPressure( + MachineFunction &MF, LiveIntervals *LIS, const MachineRegisterInfo &MRI, + const GCNSubtarget *ST, unsigned &maxVPressure, unsigned &maxSPressure, + RematStatus &status) { + unsigned TgtOcc = ST->getOccupancyWithLocalMemSize(MF); + // If only have one block, input/ouput virtual live set are empty. + if (MF.size() > 1) { + // Build input output live reg first. + auto *SlotIndexes = LIS->getSlotIndexes(); + DenseMap MBBInputSlotMap; + DenseMap MBBOutputSlotMap; + for (MachineBasicBlock &MBB : MF) { + auto BBBegin = MBB.getFirstNonDebugInstr(); + if (BBBegin != MBB.end()) { + auto SI = SlotIndexes->getInstructionIndex(*BBBegin); + MBBInputSlotMap[&MBB] = SI; + } + + auto BBEnd = MBB.rbegin(); + + // R.End doesn't point to the boundary instruction. + // Skip Debug instr. + if (llvm::GetNonDebugMBBEnd(BBEnd, MBB)) { + auto SI = SlotIndexes->getInstructionIndex(*BBEnd); + MBBOutputSlotMap[&MBB] = SI; + } + } + + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + auto Reg = Register::index2VirtReg(I); + if (!LIS->hasInterval(Reg)) + continue; + + LaneBitmask LiveMask; + const auto &LI = LIS->getInterval(Reg); + + // Skip local live interval to make live input/ouput faster. 
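
A rough standalone illustration of what "local" means in this filter, with Segment and blockOfSlot as hypothetical stand-ins for LiveRange segments and SlotIndexes lookups: an interval whose every segment starts and ends in the same block can never be live across a block boundary, so it contributes nothing to the per-block input/output live sets.

#include <vector>

struct Segment { unsigned StartSlot, EndSlot; }; // stand-in for LiveRange::Segment

// Illustrative sketch: true when the whole live range sits inside one block.
bool isLocalLiveIntervalSketch(const std::vector<Segment> &Segments,
                               unsigned (*blockOfSlot)(unsigned)) {
  if (Segments.empty())
    return true;
  unsigned BB = blockOfSlot(Segments.front().StartSlot);
  for (const Segment &S : Segments)
    if (blockOfSlot(S.StartSlot) != BB || blockOfSlot(S.EndSlot) != BB)
      return false; // crosses a block boundary: not local
  return true;
}
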
+ if (llvm::isLocalLiveInterval(LI, SlotIndexes)) + continue; + + for (auto inputIt : MBBInputSlotMap) { + MachineBasicBlock *MBB = inputIt.first; + auto SI = inputIt.second; + + auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI); + if (LiveMask.any()) + status.MBBInputLiveMap[MBB][Reg] |= LiveMask; + } + + for (auto outputIt : MBBOutputSlotMap) { + MachineBasicBlock *MBB = outputIt.first; + auto SI = outputIt.second; + + auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI); + if (LiveMask.any()) + status.MBBOutputLiveMap[MBB][Reg] |= LiveMask; + } + } + } + + LLVM_DEBUG( + const SIRegisterInfo *SIRI = ST->getRegisterInfo(); + dbgs() << "output live"; for (auto &it + : status.MBBOutputLiveMap) { + unsigned Idx = it.first->getNumber(); + auto LiveReg = it.second; + dbgs() << "MBB" << Idx << ":"; + llvm::dumpLiveSet(LiveReg, SIRI); + } dbgs() << "input live"; + for (auto &it + : status.MBBInputLiveMap) { + unsigned Idx = it.first->getNumber(); + auto LiveReg = it.second; + dbgs() << "MBB" << Idx << ":"; + llvm::dumpLiveSet(LiveReg, SIRI); + }); + + for (auto it = MF.begin(); it != MF.end(); ++it) { + MachineBasicBlock &MBB = *it; + unsigned Occ = CollectMBBPressure(MBB, LIS, MRI, ST, maxVPressure, + maxSPressure, status); + if (TgtOcc > Occ) + TgtOcc = Occ; + } + return TgtOcc; +} +RematStatus +GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS, + const MachineRegisterInfo &MRI, const GCNSubtarget *ST) { + unsigned maxSPressure = 0; + unsigned maxVPressure = 0; + RematStatus status; + unsigned TgtOcc = CollectFnPressure(MF, LIS, MRI, ST, maxVPressure, + maxSPressure, status); + const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second; + if (TgtOcc >= MaxOcc) { + status.TargetOcc = TgtOcc; + status.TargetVLimit = 0; + status.TargetSLimit = 0; + status.MaxVPressure = 0; + status.MaxSPressure = 0; + status.InputPhysicalVPressure = 0; + status.InputPhysicalSPressure = 0; + status.bMemBound = false; + status.bNotBalance = false; + return status; + } + + maxSPressure += RegForVCC; + maxVPressure = std::min(maxVPressure, ST->getMaxNumVGPRs(MF)); + unsigned STgtOcc = ST->getOccupancyWithNumSGPRs(maxSPressure); + unsigned VTgtOcc = ST->getOccupancyWithNumVGPRs(maxVPressure); + + llvm::SchedScore totalScore = llvm::CollectLatency(MF, *ST, MLI); + bool bMemBound = + totalScore.isMemBound(TgtOcc, std::max(STgtOcc, VTgtOcc) - TgtOcc); + + bool bNotBalance = false; + + const unsigned MaxOccupancy = ST->AMDGPUSubtarget::getMaxWavesPerEU(); + // Currently, only sgpr bound can be fixed with remat. + if (STgtOcc < VTgtOcc) { + unsigned bigOcc = std::max(STgtOcc, VTgtOcc); + // Change TgtOcc to bigOcc in case sgpr and vgpr is not balance. + if (bigOcc > TgtOcc) { + TgtOcc = bigOcc; + bNotBalance = true; + if (TgtOcc >= MaxOccupancy) + TgtOcc = MaxOccupancy-1; + } + } + + // Collect input physical pressure. + const SIRegisterInfo *SIRI = ST->getRegisterInfo(); + + unsigned vInputPressure = 0; + uint64_t sInputMask = 0; + for (const auto &livein : MRI.liveins()) { + const Register Reg = livein.first; + const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg); + assert(Reg.isPhysical() && "input must be physical reg"); + unsigned RegSize = RC->getLaneMask().getNumLanes(); + if (SIRI->isVGPR(MRI, Reg)) { + vInputPressure += RegSize; + } else { + unsigned RegIndex = SIRI->getHWRegIndex(Reg); + uint64_t mask = ((1 << RegSize) - 1 ) << RegIndex; + sInputMask |= mask; + } + } + // SGPR need to align to 4 for the 4dowrd/8dword descriptors which cause high + // pressure. 
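
A worked standalone version of the 4-register-group counting that follows; countAlignedSgprInputs is a hypothetical name and the example mask is illustrative.

#include <cstdint>

// Illustrative sketch: every 4-register group touched by a live input SGPR
// costs the full 4 registers, because descriptor operands are 4/8-dword
// aligned.
unsigned countAlignedSgprInputs(uint64_t LiveSgprMask) {
  unsigned Pressure = 0;
  for (uint64_t Group = 0xfull; Group != 0; Group <<= 4)
    if (Group & LiveSgprMask)
      Pressure += 4; // whole group counts as occupied
  return Pressure;
}
// e.g. countAlignedSgprInputs(0b100011) == 8: SGPR0-1 and SGPR5 touch the
// groups 0-3 and 4-7, so both groups are charged.
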
+ unsigned sInputPressure = 0; + uint64_t mask = 0xf; + while (mask != 0) { + if (mask & sInputMask) { + sInputPressure += 4; + } + mask = mask << 4; + } + + + // If balanced, try next occupancy. + TgtOcc = bNotBalance ? TgtOcc : (TgtOcc + 1); + + auto CC = MF.getFunction().getCallingConv(); + bool IsPsCs = CC == CallingConv::AMDGPU_CS || CC == CallingConv::AMDGPU_PS; + // For shader profiles other than ps/cs, set target profile max as 4. + if (!IsPsCs) { + TgtOcc = TgtOcc > 4 ? 4 : TgtOcc; + } + if (TargetOccupancy) + TgtOcc = TargetOccupancy; + + unsigned SLimit = ST->getMaxNumSGPRs(TgtOcc, true); + unsigned VLimit = ST->getMaxNumVGPRs(TgtOcc); + + status.TargetOcc = TgtOcc; + status.TargetVLimit = VLimit; + status.TargetSLimit = SLimit; + status.MaxVPressure = maxVPressure; + status.MaxSPressure = maxSPressure; + status.InputPhysicalVPressure = vInputPressure; + status.InputPhysicalSPressure = sInputPressure; + status.bMemBound = bMemBound; + status.bNotBalance = bNotBalance; + return status; +} + +} // namespace + +// Remat. +namespace { + +struct RematNode { + enum class RematKind { + Candidate, // Not ready yet. + OneDefOneUse, + Clone, + }; + RematNode() + : Reg(0), DefMI(nullptr), Kind(RematKind::Candidate), + InsertPointMI(nullptr), InsertBlock(nullptr), Size(0) {} + RematNode(unsigned R, MachineInstr *MI, unsigned S) + : Reg(R), DefMI(MI), Kind(RematKind::Candidate), InsertPointMI(nullptr), + InsertBlock(nullptr), Size(S) {} + RematNode(const RematNode &N) + : Reg(N.Reg), DefMI(N.DefMI), Kind(N.Kind), + InsertPointMI(N.InsertPointMI), InsertBlock(N.InsertBlock), + Size(N.Size) {} + unsigned Reg; + MachineInstr *DefMI; + MachineBasicBlock *InsertBlock; + union { + MachineInstr *InsertPointMI; + unsigned UserCount; + }; + RematKind Kind; + unsigned Size; +}; + +struct BlockLiveInfo { + MachineBasicBlock *BB; + unsigned maxSReg; + unsigned maxVReg; + // Input live is the live reg which cross block. + const GCNRPTracker::LiveRegSet inputLive; +}; + +// Skip live reg remated to other block. +void UpdateLiveInfo(MapVector &RematMap, + GCNRPTracker::LiveRegSet &LiveSet, + const GCNRPTracker::LiveRegSet &inputLive, + MachineBasicBlock *CurBB, + DenseMap &RPOTIndexMap) { + for (auto &it : RematMap) { + unsigned Reg = it.first; + // Skip reg not in live set. + if (!LiveSet.count(Reg)) + continue; + // Skip reg already in input set. + // Input set will be taken care in GetReducedSize. + if (inputLive.count(Reg)) + continue; + + auto &Node = it.second; + if (Node.Kind == RematNode::RematKind::OneDefOneUse) { + MachineBasicBlock *InsertBB = Node.InsertBlock; + // If LiveInfo.BB is after InsertBB in Reverse post order, the def is + // still before LiveInfo.BB, it is still live. + unsigned LiveBBIndex = RPOTIndexMap[CurBB]; + unsigned InsertBBIndex = RPOTIndexMap[InsertBB]; + if (LiveBBIndex > InsertBBIndex) { + continue; + } + } + // Already in remat map, don't need to check again, remove from + // candidate. + LiveSet.erase(Reg); + } +} + +int GetSharedReducedSize(InstSet &ReducedInsts, bool bVGPR, + const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI) { + + // Find shared operand in ReducedInsts. 
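
A minimal sketch of the lane-mask bookkeeping GetSharedReducedSize performs below, using plain uint32_t masks in place of LaneBitmask; addOperandAndCountShared is a hypothetical name. Each register operand of a reduced def records its lanes, and lanes already recorded for the same register by another candidate count as shared, so their size can be credited back.

#include <bitset>
#include <cstdint>
#include <map>

// Illustrative sketch: returns how many lanes of (Reg, LaneMask) were already
// recorded by an earlier candidate operand, then merges the mask.
int addOperandAndCountShared(std::map<unsigned, uint32_t> &SeenLanes,
                             unsigned Reg, uint32_t LaneMask) {
  uint32_t &Prev = SeenLanes[Reg];
  int Shared = static_cast<int>(std::bitset<32>(Prev & LaneMask).count());
  Prev |= LaneMask;
  return Shared;
}
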
+ int SharedSize = 0; + DenseMap SharedRegMaskMap; + for (MachineInstr *DefMI : ReducedInsts) { + for (MachineOperand &MO : DefMI->operands()) { + if (MO.isImm()) + continue; + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + if (MO.isTied()) + continue; + Register Reg = MO.getReg(); + + if (Reg == AMDGPU::EXEC) + continue; + if (!Reg.isVirtual()) + continue; + + bool isVGPR = SIRI->isVGPR(MRI, MO.getReg()); + if (bVGPR != isVGPR) { + // Not support mix of v and s when remat now. + continue; + } + + const TargetRegisterClass *OpRC = MRI.getRegClass(Reg); + int MOSize = SIRI->getRegSizeInBits(*OpRC) >> 5; + unsigned Mask; + if (unsigned SubIdx = MO.getSubReg()) { + OpRC = SIRI->getSubRegisterClass(OpRC, SubIdx); + int SubMOSize = SIRI->getRegSizeInBits(*OpRC) >> 5; + Mask = (1 << SubMOSize) - 1; + } else { + Mask = (1 << MOSize) - 1; + } + auto SharedRegIt = SharedRegMaskMap.find(Reg); + if (SharedRegIt == SharedRegMaskMap.end()) { + SharedRegMaskMap[Reg] = LaneBitmask(Mask); + } else { + unsigned PrevMask = SharedRegIt->second.getAsInteger(); + if (unsigned SharedMask = (PrevMask & Mask)) { + // Some thing is shared. + for (int i = 0; i < MOSize; i++) { + if (SharedMask & (1 << i)) { + SharedSize += 1; + } + } + } + LaneBitmask MoMask = LaneBitmask(Mask | PrevMask); + SharedRegMaskMap[Reg] = MoMask; + } + } + } + return SharedSize; +} + +int GetReducedSize(MapVector &RematMap, bool bVGPR, + GCNRPTracker::LiveRegSet &CanidateSet, + InstSet &ReducedInsts, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + BlockLiveInfo &LiveInfo, + DenseMap &RPOTIndexMap) { + int ReducedSize = 0; + for (auto &it : RematMap) { + unsigned Reg = it.first; + + if (!CanidateSet.count(Reg)) + continue; + + bool bReduced = false; + auto &Node = it.second; + if (Node.Kind == RematNode::RematKind::OneDefOneUse) { + MachineBasicBlock *InsertBB = Node.InsertBlock; + // If LiveInfo.BB is before InsertBB in Reverse post order, the def is + // moved after LiveInfo.BB, it is not live anymore. + unsigned LiveBBIndex = RPOTIndexMap[LiveInfo.BB]; + unsigned InsertBBIndex = RPOTIndexMap[InsertBB]; + if (LiveBBIndex < InsertBBIndex) + bReduced = true; + } else { + // Clone. + bReduced = true; + // If has use in LiveInfo.BB, could not reduce from input live. + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + if (UseMI.getParent() == LiveInfo.BB) { + bReduced = false; + break; + } + } + } + if (bReduced) { + ReducedSize += Node.Size; + ReducedInsts.insert(Node.DefMI); + } + + // Already in remat map, don't need to check again, remove from candidate. + CanidateSet.erase(Reg); + } + + return ReducedSize; +} + +int RematGain(MachineInstr *DefMI, unsigned Reg, + GCNRPTracker::LiveRegSet &CandidateRegSet, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + bool bVGPR) { + int rematSize = SIRI->getRegSizeInBits(*MRI.getRegClass(Reg)); + for (MachineOperand &MO : DefMI->operands()) { + if (MO.isImm()) + continue; + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + if (MO.isTied()) + continue; + + if (MO.getReg() == AMDGPU::EXEC) + continue; + + // Don't move user of VCC. + if (MO.getReg() == AMDGPU::VCC) { + rematSize = 0; + break; + } + Register Reg = MO.getReg(); + + // Don't move physical register use. + if (Reg.isPhysical()) { + rematSize = 0; + break; + } + + bool isVGPR = SIRI->isVGPR(MRI, Reg); + if (bVGPR != isVGPR) { + // Not support mix of v and s when remat now. + // TODO: count possible pressure change here. 
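
A simplified standalone sketch of the gain model in RematGain, covering only register-use operands and omitting the aggressive-mode handling of multi-use inputs; OperandInfo and rematGainSketch are hypothetical names. The saving starts at the def's size, each single-def input is charged back because it becomes live across the hot region instead, and any unsafe operand zeroes the gain.

#include <vector>

struct OperandInfo {
  unsigned SizeInBits; // size of the (sub)register the operand reads
  bool SingleDef;      // operand has a unique def
  bool Unsafe;         // physical reg, VCC use, mixed VGPR/SGPR, ...
};

// Illustrative sketch; gain is in bits, callers divide by 32 (>> 5) to get
// dwords, as the pass does above.
int rematGainSketch(unsigned DefSizeInBits, const std::vector<OperandInfo> &Ops) {
  int Gain = static_cast<int>(DefSizeInBits);
  for (const OperandInfo &Op : Ops) {
    if (Op.Unsafe || !Op.SingleDef)
      return 0; // cannot profitably remat this def
    if (Gain <= static_cast<int>(Op.SizeInBits))
      return 0; // input as large as what remains: no net win
    Gain -= static_cast<int>(Op.SizeInBits);
  }
  return Gain;
}
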
+ rematSize = 0; + break; + } + bool bSingleDef = MRI.hasOneDef(Reg); + if (!bSingleDef) { + bSingleDef = llvm::IsSub0Sub1SingleDef(Reg, MRI); + } + + if (bSingleDef) { + // The reg might share with other candidates, but not check it here. + // Count share reg in GetReducedSize. + if (EnableAggressive) { + // In case of aggressive remat, treat multi use reg as shared reg and + // ignore size of shared reg. + if (!MRI.hasOneNonDBGUse(Reg)) + continue; + } + const TargetRegisterClass *OpRC = MRI.getRegClass(Reg); + if (unsigned SubIdx = MO.getSubReg()) { + if (OpRC) + OpRC = SIRI->getSubRegisterClass(OpRC, SubIdx); + } + int inputSize = SIRI->getRegSizeInBits(*OpRC); + // If input not live in hotspot, move it cross hotspot should have + // less reg then DefMi. + if (rematSize > inputSize) { + rematSize -= inputSize; + continue; + } + } + + rematSize = 0; + break; + } + return rematSize; +} + +void BuildRematCandiates(std::vector &Candidates, + GCNRPTracker::LiveRegSet &CandidateRegSet, + DenseSet &PinnedRegSet, + const MachineRegisterInfo &MRI, + const SIInstrInfo *SIII, const SIRegisterInfo *SIRI, + bool bVGPR) { + + for (auto liveRegIt : CandidateRegSet) { + unsigned Reg = liveRegIt.first; + // Skip unsafe reg. + if (PinnedRegSet.count(Reg)) + continue; + + bool isVGPR = SIRI->isVGPR(MRI, Reg); + if (isVGPR != bVGPR) + continue; + bool bSafeCandidate = true; + MachineInstr *MI = MRI.getUniqueVRegDef(Reg); + if (MI) { + if (bVGPR) { + // Only remat valu now. + if (!SIII->isVALU(MI->getOpcode()) && MI->getOpcode() != AMDGPU::COPY) + bSafeCandidate = false; + if (MI->getOpcode() == AMDGPU::COPY) { + // Make sure src is unique define. + if (MI->getOperand(1).isReg() && + nullptr == MRI.getUniqueVRegDef(MI->getOperand(1).getReg())) + bSafeCandidate = false; + } else { + // Skip convergent valu. + if (MI->isConvergent()) + bSafeCandidate = false; + } + } + // Skip inst has more than 1 def. + if (MI->getDesc().NumDefs > 1) + bSafeCandidate = false; + } else { + bSafeCandidate = false; + } + + if (bSafeCandidate) { + int gain = RematGain(MI, Reg, CandidateRegSet, MRI, SIRI, bVGPR); + if (gain > 0) { + Candidates.emplace_back(RematNode(Reg, MI, gain >> 5)); + } else { + bSafeCandidate = false; + } + } + // Save unsafe reg. + if (!bSafeCandidate) + PinnedRegSet.insert(Reg); + } + + // Sort by gain. + std::sort(Candidates.begin(), Candidates.end(), + [](RematNode &i, RematNode &j) { return i.Size > j.Size; }); +} + +// For case like +// %477:sreg_32_xm0 = S_AND_B32 %472.sub0:sreg_64_xexec, %304:sreg_32_xm0, implicit-def dead $scc; xb.uniform +// S_CMP_EQ_U32 %302:sreg_32_xm0, %475:sreg_32_xm0, implicit-def $scc; xb.uniform +// %2489:sreg_32_xm0 = S_CSELECT_B32 %477:sreg_32_xm0, 16, implicit killed $scc; xb.uniform +// Sink S_AND right before S_CSELECT will overwrite SCC. +// To avoid it, skip case when DefMI and UseMI has implicit define use. 
+bool isImplicitDefUse(MachineInstr *DefMI, MachineInstr *UseMI) { + if (DefMI->getDesc().NumImplicitDefs == 0) + return false; + + auto *TRI = DefMI->getMF()->getSubtarget().getRegisterInfo(); + for (MachineOperand &def : DefMI->implicit_operands()) { + if (!def.isReg()) + continue; + if (def.isUse()) + continue; + unsigned Reg = def.getReg(); + if (UseMI->readsRegister(Reg, TRI)) + return true; + } + return false; +} + +void AddOneDefOneUseCandidate(RematNode &Node, + std::vector &RematList, + MachineRegisterInfo &MRI, int &rematCnt, + MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, + MachineLoopInfo *MLI, bool bVGPR, + bool bMemBound) { + unsigned Reg = Node.Reg; + MachineInstr *DefMI = Node.DefMI; + + unsigned size = Node.Size; + MachineInstr *UseMI = &*MRI.use_nodbg_instructions(Reg).begin(); + MachineBasicBlock *InsertBB = UseMI->getParent(); + + // For VGPR, always move next to the only user to avoid wqm or exec issue. + // But doing this will cause issue when DefMI is in wqm but single user not in + // wqm. Disable VGPR remat for now. + // TODO: make sure single user don't need wqm. + if (!bVGPR) { + if (MachineBasicBlock *NewInsertBB = + FindInsertBlock(*DefMI, Reg, DT, PDT, MLI, MRI, bMemBound)) { + if (InsertBB != NewInsertBB) { + InsertBB = NewInsertBB; + // If can find a non-loop insert block, go to the insert block. + if (DefMI->getParent() != InsertBB) { + if (!InsertBB->empty()) { + auto it = InsertBB->getFirstNonPHI(); + it = skipDebugInstructionsForward(it, InsertBB->end()); + if (it == InsertBB->end()) + UseMI = nullptr; + else + UseMI = &*it; + } + } + } + } + } + + if (bVGPR) { + // Don't count reg in same block for valu. + if (UseMI->getParent() == DefMI->getParent()) + return; + } + + // Skip case when DefMI has implicit define which used by UseMI. + if (isImplicitDefUse(DefMI, UseMI)) { + return; + } + + Node.InsertBlock = InsertBB; + Node.InsertPointMI = UseMI; + Node.Kind = RematNode::RematKind::OneDefOneUse; + RematList.emplace_back(Node); + rematCnt += size; +} + +void AddCloneCandidate(std::vector &cloneList, + std::vector &RematList, + DenseSet &PinnedRegSet, + MachineRegisterInfo &MRI, int &rematCnt, + SlotIndexes *SlotIndexes, MachineFunction &MF) { + // Group user in same blocks. + std::vector UserSetList(cloneList.size()); + + for (int i = 0; i < cloneList.size(); i++) { + auto *Node = cloneList[i]; + unsigned Reg = Node->Reg; + MachineInstr *DefMI = Node->DefMI; + // Group user in same blocks. + BlockSet &UserSet = UserSetList[i]; + + for (auto useIt = MRI.use_instr_nodbg_begin(Reg); + useIt != MRI.use_instr_nodbg_end();) { + MachineInstr &UseMI = *(useIt++); + UserSet.insert(UseMI.getParent()); + } + + if (UserSet.size() == 1) { + // All users are in same block with DefMI. + if (*UserSet.begin() == DefMI->getParent()) { + // Mark cannot remat for now. + // TODO: try to split if is bigger than 4 and only used once per + // channel. + PinnedRegSet.insert(Reg); + continue; + } + } + + int size = Node->Size; + size <<= 16; + // Pack userSet size to size. + size |= UserSet.size(); + Node->UserCount = size; + } + + std::sort(cloneList.begin(), cloneList.end(), + // Sort based on userSet size. 
+ [](const RematNode *a, const RematNode *b) { + static constexpr int mask = 0xffff; + return (a->UserCount & mask) < (b->UserCount & mask); + }); + + for (RematNode *Node : cloneList) { + Node->Kind = RematNode::RematKind::Clone; + RematList.emplace_back(*Node); + rematCnt += Node->Size; + } +} + +int FilterRematCandiates(std::vector &Candidates, + std::vector &RematList, + DenseSet &PinnedRegSet, + MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, MachineLoopInfo *MLI, + MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + MachineFunction &MF, SlotIndexes *SlotIndexes, + bool bVGPR, bool bMemBound) { + int rematCnt = 0; + // Work one def one use first. + for (auto &Node : Candidates) { + unsigned Reg = Node.Reg; + if (!MRI.hasOneNonDBGUse(Reg)) { + continue; + } + MachineInstr *DefMI = Node.DefMI; + if (!IsSafeToMove(DefMI, MRI)) { + PinnedRegSet.insert(Reg); + continue; + } + + AddOneDefOneUseCandidate(Node, RematList, MRI, rematCnt, DT, PDT, MLI, + bVGPR, bMemBound); + } + + if (!bVGPR) { + std::vector cloneList; + // Try multi use case. + for (auto &Node : Candidates) { + unsigned Reg = Node.Reg; + if (MRI.hasOneNonDBGUse(Reg)) { + continue; + } + MachineInstr *DefMI = Node.DefMI; + if (!IsSafeToMove(DefMI, MRI)) { + PinnedRegSet.insert(Reg); + continue; + } + + // Clone for each user. + cloneList.emplace_back(&Node); + } + + AddCloneCandidate(cloneList, RematList, PinnedRegSet, MRI, rematCnt, + SlotIndexes, MF); + } + + return rematCnt; +} + +void updateUsers(unsigned Reg, unsigned NewReg, bool bSubRegDef, + SmallVector &userMIs) { + for (MachineInstr *UseMI : userMIs) { + for (MachineOperand &MO : UseMI->operands()) { + if (!MO.isReg()) + continue; + if (MO.getReg() == Reg) { + MO.setReg(NewReg); + if (bSubRegDef) + MO.setSubReg(0); + } + } + } +} + +DenseMap reduceClonedMBBs( + unsigned Reg, BlockMap> &userBlocks, + DenseSet &UserMBBSet, + std::vector &hotBlocks, MachineDominatorTree *pDT) { + // Collect hot blocks which Exp is live in. + DenseSet hotBlockSet; + for (BlockLiveInfo &hotBlock : hotBlocks) { + if (hotBlock.inputLive.count(Reg)) { + hotBlockSet.insert(hotBlock.BB); + } + } + + + // For userBlocks which dominate all hotBlocks, don't need to clone because + // the value not cross hotBlocks when later blocks are cloned. + // For userBlocks which dominated by all hotBlocks, they could share clones + // because once after hot block, the pressure is OK. + DenseSet afterHotRangeMBBs; + for (MachineBasicBlock *MBB : UserMBBSet) { + // Always clone in hot block. + if (hotBlockSet.count(MBB)) + continue; + + bool bDomAllHotBlocks = true; + bool bDomedByAllHotBlocks = true; + for (MachineBasicBlock *hotMBB : hotBlockSet) { + if (!pDT->dominates(MBB, hotMBB)) { + bDomAllHotBlocks = false; + } + if (!pDT->dominates(hotMBB, MBB)) { + bDomedByAllHotBlocks = false; + } + if (!bDomAllHotBlocks && !bDomedByAllHotBlocks) { + break; + } + } + if (bDomAllHotBlocks) { + userBlocks.erase(MBB); + } else if (bDomedByAllHotBlocks) { + afterHotRangeMBBs.insert(MBB); + } + } + + // Split after hotRange block set by domtree. 
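
A compact standalone restatement of the classification just performed, where dominates() is a placeholder for the MachineDominatorTree query and classifyUserBlock is a hypothetical name: a user block that dominates every hot block keeps using the original value, a block dominated by every hot block sits after the pressure peak and can share one clone, and everything else (including the hot blocks themselves) gets its own clone.

#include <functional>
#include <vector>

enum class CloneAction { NoCloneNeeded, CloneInBlock, ShareAfterHotRange };

// Illustrative sketch over block ids rather than MachineBasicBlock pointers.
CloneAction classifyUserBlock(
    unsigned UserBB, const std::vector<unsigned> &HotBlocks,
    const std::function<bool(unsigned, unsigned)> &dominates) {
  bool DomsAllHot = true, DomedByAllHot = true;
  for (unsigned Hot : HotBlocks) {
    if (UserBB == Hot)
      return CloneAction::CloneInBlock; // always clone inside a hot block
    DomsAllHot &= dominates(UserBB, Hot);
    DomedByAllHot &= dominates(Hot, UserBB);
  }
  if (DomsAllHot)
    return CloneAction::NoCloneNeeded;      // value seen before any hot range
  if (DomedByAllHot)
    return CloneAction::ShareAfterHotRange; // past the peak: one shared clone
  return CloneAction::CloneInBlock;
}
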
+ DenseMap DomMap; + if (!afterHotRangeMBBs.empty()) { + for (auto it : afterHotRangeMBBs) { + MachineBasicBlock *MBB = it; + for (auto it2 : afterHotRangeMBBs) { + MachineBasicBlock *MBB2 = it2; + if (MBB == MBB2) + continue; + if (pDT->dominates(MBB, MBB2)) { + auto &Dom = DomMap[MBB]; + Dom.insert(MBB2); + auto &Dom2 = DomMap[MBB2]; + Dom.insert(Dom2.begin(), Dom2.end()); + } + } + } + for (auto it : afterHotRangeMBBs) { + MachineBasicBlock *MBB = it; + auto &Dom = DomMap[MBB]; + for (MachineBasicBlock *domedMBB : Dom) { + // Remove domedMBB. + DomMap.erase(domedMBB); + UserMBBSet.erase(domedMBB); + } + } + } + + return DomMap; +} + +// Look for an earlier insert point if the InstructionToMove +// writes to scc and scc is live at the CurrentInsertPoint. +static MachineBasicBlock::iterator AdjustInsertPointToAvoidSccSmash( + MachineInstr *InstructionToMove, + MachineBasicBlock *MBB, + MachineBasicBlock::iterator CurrentInsertPoint, + MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII +) +{ + const bool WillSmashScc = InstructionToMove->modifiesRegister(AMDGPU::SCC, SIRI); + if (WillSmashScc) + { + CurrentInsertPoint = llvm::FindOrCreateInsertionPointForSccDef(MBB, + CurrentInsertPoint, + SIRI, + SIII, + &MRI + ); + } + + return CurrentInsertPoint; +} + +// Look for an earlier insert point if the SubExp +// writes to scc and scc is live at the CurrentInsertPoint. +static MachineBasicBlock::iterator AdjustInsertPointForSubExpToAvoidSccSmash( + const SubExp &SubExpToMove, + MachineBasicBlock *MBB, + MachineBasicBlock::iterator CurrentInsertPoint, + MachineRegisterInfo& MRI, + const SIRegisterInfo* SIRI, + const SIInstrInfo* SIII +) +{ + const bool WillSmashScc = SubExpToMove.modifiesRegister(AMDGPU::SCC, SIRI); + if (WillSmashScc) + { + CurrentInsertPoint = llvm::FindOrCreateInsertionPointForSccDef(MBB, + CurrentInsertPoint, + SIRI, + SIII, + &MRI + ); + } + + return CurrentInsertPoint; +} + +// Return trun if moving MI to Location will smash a live scc value. +static bool WillSmashSccAtLocation( + MachineInstr* MI, + MachineBasicBlock* MBB, + MachineBasicBlock::iterator Location +) +{ + // It is ok to pass nullptr to `modifiesRegister` for TRI here since + // SCC has no subreg/suprereg relationships. + return MI->modifiesRegister(AMDGPU::SCC, nullptr) + && llvm::IsSccLiveAt(MBB, Location); +} + +void ApplyCloneRemat(Remat *Remat, + RematNode &Node, std::vector &hotBlocks, + MachineDominatorTree *pDT, MachineRegisterInfo &MRI, + SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, MachineFunction &MF) { + unsigned Reg = Node.Reg; + + MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); + auto DefOp = DefMI->getOperand(0); + const MCInstrDesc &Desc = DefMI->getDesc(); + const TargetRegisterClass *RC = MRI.getRegClass(Reg); + // When the unique def has subReg, just create newReg for the subReg part. + bool bSubRegDef = false; + if (DefOp.getSubReg() != 0) { + RC = SIRI->getSubRegisterClass(RC, DefOp.getSubReg()); + bSubRegDef = true; + } + const DebugLoc DL = DefMI->getDebugLoc(); + unsigned OpNum = DefMI->getNumOperands(); + + Node.Kind = RematNode::RematKind::Clone; + + // Group user in same blocks. 
+ BlockMap> UserMap; + DenseSet UserMBBSet; + for (auto useIt = MRI.use_instr_nodbg_begin(Reg); + useIt != MRI.use_instr_nodbg_end();) { + MachineInstr &UseMI = *(useIt++); + UserMap[UseMI.getParent()].emplace_back(&UseMI); + UserMBBSet.insert(UseMI.getParent()); + } + + DenseMap DomMap = + reduceClonedMBBs(Reg, UserMap, UserMBBSet, hotBlocks, pDT); + + for (auto useIt : UserMap) { + MachineBasicBlock *MBB = useIt.first; + // Skip same block uses. + if (MBB == DefMI->getParent()) { + continue; + } + // Skip MBB which share clone from other MBBs. + if (UserMBBSet.count(MBB) == 0) + continue; + + unsigned NewReg = MRI.createVirtualRegister(RC); + auto NewDef = BuildMI(MF, DL, Desc).addDef(NewReg); + for (unsigned i = 1; i < OpNum; i++) { + NewDef = NewDef.add(DefMI->getOperand(i)); + } + + MachineInstr *InsertPointMI = useIt.second.front(); + SlotIndex lastSlot = SlotIndexes->getInstructionIndex(*InsertPointMI); + + for (MachineInstr *UseMI : useIt.second) { + SlotIndex slot = SlotIndexes->getInstructionIndex(*UseMI); + if (lastSlot > slot) { + lastSlot = slot; + InsertPointMI = UseMI; + } + } + + MachineBasicBlock::iterator InsertPoint = AdjustInsertPointToAvoidSccSmash( + DefMI, InsertPointMI->getParent(), InsertPointMI, MRI, SIRI, SIII + ); + + for (MachineMemOperand *MO : DefMI->memoperands()) { + NewDef->addMemOperand(MF, MO); + } + + MBB->insert(InsertPoint, NewDef); + + SlotIndexes->insertMachineInstrInMaps(*NewDef); + + SmallVector &userMIs = useIt.second; + updateUsers(Reg, NewReg, bSubRegDef, userMIs); + + // update users in dom MBBs. + auto domMapIt = DomMap.find(MBB); + if (domMapIt != DomMap.end()) { + for (MachineBasicBlock *UpdateMBB : domMapIt->second) { + SmallVector &userMIs = UserMap[UpdateMBB]; + updateUsers(Reg, NewReg, bSubRegDef, userMIs); + } + } + + llvm::removeUnusedLanes(*NewDef.getInstr(), MRI, SIRI, SIII, SlotIndexes); + } + if (MRI.use_empty(Reg)) { + SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI); + Remat->RemoveInst(DefMI); + DefMI->eraseFromParent(); + } +} + +void ApplyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI, + SlotIndexes *slotIndexes, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { + MachineInstr *DefMI = Node.DefMI; + MachineInstr *InsertPointMI = Node.InsertPointMI; + MachineBasicBlock* MBB = nullptr; + + // Find a valid insert point. + MachineBasicBlock::iterator InsertPoint; + if (InsertPointMI) { + InsertPoint = InsertPointMI->getIterator(); + MBB = InsertPointMI->getParent(); + } else { + InsertPoint = Node.InsertBlock->getFirstTerminator(); + MBB = Node.InsertBlock; + } + + InsertPoint = AdjustInsertPointToAvoidSccSmash( + DefMI, MBB, InsertPoint, MRI, SIRI, SIII + ); + + // Move instruction to new location. + DefMI->removeFromParent(); + InsertPoint->getParent()->insert(InsertPoint, DefMI); + + // Update slot index. + slotIndexes->removeSingleMachineInstrFromMaps(*DefMI); + slotIndexes->insertMachineInstrInMaps(*DefMI); +} + +void ApplyRemat(Remat *Remat, MapVector &RematMap, + std::vector &hotBlocks, + MachineDominatorTree *pDT, SlotIndexes *slotIndexes, + MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, MachineFunction &MF) { + std::vector UpdateList; + for (auto &it : RematMap) { + UpdateList.emplace_back(it.second); + } + // Sort update list with slotIndex to make sure def moved before use. + // If use moved before def, it might not be the first use anymore. 
+ std::sort(UpdateList.begin(), UpdateList.end(), + [&slotIndexes](RematNode &i, RematNode &j) { + SlotIndex a = slotIndexes->getInstructionIndex(*i.DefMI); + SlotIndex b = slotIndexes->getInstructionIndex(*j.DefMI); + return a < b; + }); + + for (RematNode &Node : UpdateList) { + if (Node.Kind == RematNode::RematKind::OneDefOneUse) { + ApplyOneDefOneUseRemat(Node, MRI, slotIndexes, SIRI, SIII); + } else if (Node.Kind == RematNode::RematKind::Clone) { + ApplyCloneRemat(Remat, Node, hotBlocks, pDT, MRI, slotIndexes, SIRI, SIII, MF); + } + } +} + +void dumpRematMap(MapVector &RematMap, + const SIRegisterInfo *SIRI) { + dbgs() << "\n rematMap: \n"; + for (auto it : RematMap) { + int Reg = it.first; + dbgs() << printReg(Reg, SIRI); + dbgs() << "\n"; + } +} + +int DebugBlockIndex = 42; + +void dumpHotBlock(const GCNRPTracker::LiveRegSet &LiveSet, + MapVector &VRematMap, + MapVector &SRematMap, int BlockIndex, + const SIRegisterInfo *SIRI) { + if (DebugBlockIndex != BlockIndex) + return; + llvm::dumpLiveSet(LiveSet, SIRI); + dumpRematMap(VRematMap, SIRI); + dumpRematMap(SRematMap, SIRI); +} + +void dumpCandidates(std::vector &RematCandidates, int BlockIndex, + const SIRegisterInfo *SIRI) { + if (DebugBlockIndex != BlockIndex) + return; + dbgs() << "\n Candidates: \n"; + unsigned TotalSize = 0; + for (RematNode &Node : RematCandidates) { + dbgs() << printReg(Node.Reg, SIRI) << " size:" << Node.Size; + dbgs() << "\n"; + TotalSize += Node.Size; + } + dbgs() << "Total Size:" << TotalSize << "\n"; +} + +} // namespace + +bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, + LiveIntervals *LIS, MachineDominatorTree *pDT, + MachinePostDominatorTree *pPDT, bool &bNearTarget) { + const GCNSubtarget *ST = &MF.getSubtarget(); + + const SIInstrInfo *SIII = ST->getInstrInfo(); + const SIRegisterInfo *SIRI = ST->getRegisterInfo(); + + ReversePostOrderTraversal RPOT(&MF); + DenseMap RPOTIndexMap; + for (MachineBasicBlock *MBB : RPOT) { + RPOTIndexMap[MBB] = RPOTIndexMap.size(); + } + + auto &MRI = MF.getRegInfo(); + + bool bUpdated = false; + RematStatus status = GetRematStatus(MF, MLI, LIS, MRI, ST); + + const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second; + if (status.TargetOcc >= MaxOcc) + return false; + + unsigned VLimit = status.TargetVLimit; + unsigned SLimit = status.TargetSLimit; + + int rematSCnt = status.MaxSPressure - SLimit; + // when agressive sgpr remat, reserve some for allocation lost. + if (EnableAggressive) + rematSCnt += NearTargetRegLimit; + + bool bSGPRSpill = false; + if (rematSCnt > 0) { + bSGPRSpill = nearSgprSpill(status.MaxSPressure, ST, MF); + } + + bool bForceRematSgpr = bSGPRSpill | status.bNotBalance; + + // If bound by lds, skip. + if (status.TargetOcc > ST->getOccupancyWithLocalMemSize(MF) && + !bForceRematSgpr) + return false; + + MachineBasicBlock *EntryMBB = &MF.front(); + + auto *SlotIndexes = LIS->getSlotIndexes(); + + // Reg which already marked remat. + MapVector VRematMap; + MapVector SRematMap; + // Reg which cannot move around to remat. + DenseSet PinnedRegSet; + std::vector hotBlocks; + for (auto it = po_begin(EntryMBB); it != po_end(EntryMBB); it++) { + MachineBasicBlock *MBB = *it; + auto &RP = status.MBBPressureMap[MBB]; + // ignore block not hot. + if (RP.getVGPRNum(ST->hasGFX90AInsts()) < status.TargetVLimit && + (RP.getMaxSGPR() + RegForVCC + status.InputPhysicalSPressure) < + status.TargetSLimit) + continue; + // Collect reg pressure. 
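
A minimal restatement of the hot-block filter applied just above; BlockPressure and isHotBlock are hypothetical names, and the VCC reservation and physical SGPR inputs are taken as parameters.

// Illustrative sketch: mirrors the early "continue" that skips blocks whose
// pressure stays below the per-occupancy limits.
struct BlockPressure {
  unsigned VGPR;    // RP.getVGPRNum(...)
  unsigned MaxSGPR; // RP.getMaxSGPR()
};

bool isHotBlock(const BlockPressure &RP, unsigned TargetVLimit,
                unsigned TargetSLimit, unsigned RegForVcc,
                unsigned InputPhysicalSPressure) {
  return RP.VGPR >= TargetVLimit ||
         RP.MaxSGPR + RegForVcc + InputPhysicalSPressure >= TargetSLimit;
}
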
+ unsigned maxVPressure = 0; + unsigned maxSPressure = 0; + const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[MBB]; + + const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[MBB]; + LLVM_DEBUG( + dumpHotBlock(inputLive, VRematMap, SRematMap, MBB->getNumber(), SIRI)); + + GCNDownwardRPTracker Tracker(*LIS); + + Tracker.reset(*MBB->begin(), &inputLive); + + for (MachineInstr &MI : *MBB) { + if (MI.isDebugInstr()) + continue; + Tracker.advance(); + auto LISLR = Tracker.getLiveRegs(); + // Update live set for things already remated. + UpdateLiveInfo(VRematMap, LISLR, inputLive, MBB, RPOTIndexMap); + UpdateLiveInfo(SRematMap, LISLR, inputLive, MBB, RPOTIndexMap); + + const GCNRPTracker::LiveRegSet &liveSet = LISLR; + unsigned VPressure = 0; + unsigned SPressure = 0; + CollectLiveSetPressure(liveSet, MRI, SIRI, VPressure, SPressure); + if (maxVPressure < VPressure) + maxVPressure = VPressure; + if (maxSPressure < SPressure) + maxSPressure = SPressure; + } + maxSPressure += RegForVCC + status.InputPhysicalSPressure; + if (maxVPressure <= VLimit && maxSPressure <= SLimit) + continue; + + // Build block live info. + // Use outputLive for EntryMBB. + BlockLiveInfo LiveInfo = {MBB, maxSPressure, maxVPressure, + MBB != EntryMBB ? inputLive : outputLive}; + // Skip entry block when save hotBlock to reduce clone because not clone in + // entry block. + if (MBB != EntryMBB) + hotBlocks.emplace_back(LiveInfo); + GCNRPTracker::LiveRegSet CandidateRegs = LiveInfo.inputLive; + + // Update reg pressure based on remat list. + InstSet VReducedInsts; + InstSet SReducedInsts; + int VReduced = + GetReducedSize(VRematMap, /*bVGPR*/ true, CandidateRegs, VReducedInsts, + MRI, SIRI, LiveInfo, RPOTIndexMap); + int SReduced = + GetReducedSize(SRematMap, /*bVGPR*/ false, CandidateRegs, SReducedInsts, + MRI, SIRI, LiveInfo, RPOTIndexMap); + + // Calculate size need to be remat. + int rematVCnt = maxVPressure - VReduced - VLimit; + int rematSCnt = maxSPressure - SReduced - SLimit; + + bool bSGPRSpill = false; + if (rematSCnt > 0) { + bSGPRSpill = nearSgprSpill(maxSPressure, ST, MF); + } + bool bForceRematSgpr = bSGPRSpill | status.bNotBalance; + // Try to add candidates into remat list. + + int newRematSCnt = 0; + if (rematSCnt > 0) { + // Build candidate nodes. + std::vector SRematCandidates; + BuildRematCandiates(SRematCandidates, CandidateRegs, PinnedRegSet, MRI, + SIII, SIRI, /*bVGPR*/ false); + + LLVM_DEBUG(dumpCandidates(SRematCandidates, MBB->getNumber(), SIRI)); + std::vector SRematList; + // Filter candidates. + newRematSCnt = + FilterRematCandiates(SRematCandidates, SRematList, PinnedRegSet, pDT, + pPDT, MLI, MRI, SIRI, MF, SlotIndexes, + /*bVGPR*/ false, status.bMemBound); + if (newRematSCnt > rematSCnt) { + // Has enough remat node to cover rematCnt. + int rematCnt = 0; + for (RematNode &Node : SRematList) { + SRematMap[Node.Reg] = Node; + rematCnt += Node.Size; + if (rematCnt > rematSCnt && !EnableAggressive) + break; + } + newRematSCnt = 0; + } else { + + for (RematNode &Node : SRematList) { + SReducedInsts.insert(Node.DefMI); + } + // Check shared size. + int SharedReducedSize = + GetSharedReducedSize(SReducedInsts, /*bVGPR*/ false, MRI, SIRI); + if (((newRematSCnt + SharedReducedSize) + (int)NearTargetRegLimit) >= + rematSCnt) { + for (RematNode &Node : SRematList) { + SRematMap[Node.Reg] = Node; + } + } else { + if (!bForceRematSgpr) { + return false; + } else { + for (RematNode &Node : SRematList) { + SRematMap[Node.Reg] = Node; + } + // Find local one def one use candidates. 
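
A simplified standalone restatement of the SGPR budget decision made just above; enoughSgprRemat is a hypothetical name. The collected candidates are committed only if their summed gain, plus operands shared between candidates and a small near-target slack, covers the pressure still above the target SGPR limit, unless an imminent SGPR spill or an SGPR/VGPR imbalance forces remat anyway.

// Illustrative sketch of the budget check; the real code also updates the
// remat maps and falls back to local one-def-one-use candidates when forced.
bool enoughSgprRemat(int NeededSgprs, int CandidateGain, int SharedReducedSize,
                     int NearTargetSlack, bool ForceRemat) {
  if (CandidateGain > NeededSgprs)
    return true; // gain alone covers the excess
  if (CandidateGain + SharedReducedSize + NearTargetSlack >= NeededSgprs)
    return true; // close enough to the target
  return ForceRemat; // spill or imbalance: remat anyway
}
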
+ for (MachineInstr &MI : *MBB) { + if (MI.isDebugInstr()) + continue; + if (MI.getDesc().NumDefs != 1) + continue; + MachineOperand &DstMO = MI.getOperand(0); + Register Reg = DstMO.getReg(); + if (!SIRI->isSGPRReg(MRI, Reg)) + continue; + if (!MRI.hasOneNonDBGUse(Reg)) + continue; + if (!MRI.hasOneDef(Reg)) + continue; + if (Reg.isPhysical()) + continue; + MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(Reg); + if (UseMI.getParent() != MBB) + continue; + int gain = RematGain(&MI, Reg, CandidateRegs, MRI, SIRI, /*bVGPR*/false); + if (gain > 0) { + // Skip case when DefMI has implicit define which used by UseMI. + if (isImplicitDefUse(&MI, &UseMI)) { + continue; + } + RematNode Node = {Reg, &MI, (unsigned)gain >> 5}; + Node.InsertPointMI = &UseMI; + Node.Kind = RematNode::RematKind::OneDefOneUse; + SRematMap[Reg] = Node; + SharedReducedSize += Node.Size; + } + } + } + } + newRematSCnt = rematSCnt - newRematSCnt - SharedReducedSize; + } + } + // If works, continue. + + // Collect live range from hot inst. + // find common live range in hot insts. + // Remat these common live range. + // Apply the remat. + + int newRematVCnt = 0; + if (rematVCnt > 0) { + // TODO: V remat. + } + + bool bNeedSRemat = rematSCnt > 0; + bool bNeedVRemat = rematVCnt > 0; + // If sgpr spill, always do remat. + bool bSRematOK = + (newRematSCnt <= 0 && !SRematMap.empty()) || + bForceRematSgpr; + bool bVRematOK = + (status.bNotBalance || newRematVCnt <= 0) && !VRematMap.empty(); + if (bNeedSRemat && bNeedVRemat) { + if (bVRematOK && bSRematOK) { + bUpdated = true; + } else if (bSGPRSpill) { + bUpdated = true; + } + } else if (bNeedSRemat) { + if (bSRematOK) { + bUpdated = true; + } + } else if (bNeedVRemat) { + if (bVRematOK) { + bUpdated = true; + } + } + // TODO: what to do when cannot reach target? + if (newRematSCnt > 0) { + if (newRematSCnt <= NearTargetRegLimit) { + bNearTarget = true; + } else { + if (!bSGPRSpill) + return false; + } + } + } + + if (SRematMap.empty() && VRematMap.empty()) { + return bUpdated; + } + + if (!SRematMap.empty()) { + bUpdated = true; + ApplyRemat(Remat, SRematMap, hotBlocks, pDT, SlotIndexes, MRI, SIRI, SIII, MF); + LLVM_DEBUG(llvm::dbgs() << "after hotremat"; MF.print(dbgs());); + } + + // Balance between vector and scalar if possible. + return bUpdated; +} + +namespace { +bool isPhyRegUniqueDef(unsigned Reg, const MachineRegisterInfo &MRI) { + DenseSet DefMIs; + for (MachineInstr &DefMI : MRI.def_instructions(Reg)) { + // skip implicit def. + if (DefMI.getOpcode() == AMDGPU::IMPLICIT_DEF) + continue; + DefMIs.insert(&DefMI); + } + return DefMIs.size() == 1; +} + +static bool IsImplicitUseOfReg(const MachineOperand &MO, unsigned Reg) +{ + if (!MO.isImplicit() || !MO.isUse() || !MO.isReg()) + { + return false; + } + + return MO.getReg() == Reg; +} + +static bool IsImplicitDefOfReg(const MachineOperand &MO, unsigned Reg) +{ + if (!MO.isImplicit() || !MO.isDef() || !MO.isReg()) + { + return false; + } + + return MO.getReg() == Reg; +} + +static bool IsSafeRematCandidateUser(const MachineInstr *UseMI, const SIInstrInfo *SIII) +{ + // Make sure UseMI is not wqm like sample. + if (SIII->isWQM(UseMI->getOpcode())) + return false; + if (UseMI->getOpcode() == AMDGPU::PHI) + return false; + + return true; +} + +static bool isConvergent(Remat *Remat, const MachineInstr &MI) { + return MI.isConvergent() && + // This flag is set on readfirstlane's to indicate that they + // are redundant (the value being read is already uniform). 
+ // Normally, readfirstlanes are convergent, because different exec + // will cause a different value to be read; a known uniform + // readfirstlane is safe to move or clone and not actually convergent. + !Remat->TotalUniformInsts.count(&MI); +} + +bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, bool bSink) { + if (Reg.isPhysical()) + return false; + bool bVGPR = SIRI->isVGPR(MRI, Reg); + + MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); + if (!DefMI) + return false; + if (DefMI->getOpcode() == AMDGPU::PHI) + return false; + + // Skip convergent. + if (isConvergent(Remat, *DefMI)) + return false; + + // Skip inst has more than 1 def. + if (DefMI->getDesc().NumDefs > 1) + return false; + + unsigned OpNum = DefMI->getNumOperands(); + + // Only move DefMI which all operand is unique def. + for (unsigned i = 0; i < OpNum; i++) { + MachineOperand &Op = DefMI->getOperand(i); + if (!Op.isReg()) + continue; + Register OpReg = Op.getReg(); + if (IsImplicitUseOfReg(Op, AMDGPU::EXEC) || IsImplicitUseOfReg(Op, AMDGPU::EXEC_LO)) + continue; + if (IsImplicitUseOfReg(Op, AMDGPU::M0) && isPhyRegUniqueDef(OpReg, MRI)) + continue; + // Alow unused scc define. + if (Op.isImplicit() && Op.isDead() && Op.isDef()) + continue; + if (OpReg.isPhysical()) + return false; + if (!MRI.getUniqueVRegDef(OpReg) && !llvm::IsSub0Sub1SingleDef(OpReg, MRI)) { + return false; + } + } + + if (bVGPR && bSink) { + // Skip mem related inst. + if (DefMI->mayLoadOrStore()) { + return false; + } + + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + if (!IsSafeRematCandidateUser(&UseMI, SIII)) + return false; + } + } + + return true; +} + +std::vector buildSubExpFromCandidates( + Remat *Remat, + GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, + const MachineRegisterInfo &MRI, SlotIndexes *slotIndexes, + GCNRPTracker::LiveRegSet &unUsedPassThrus, + bool bAllowPartialUseInSubExp) { + InstSet CandidateDefs; + DenseSet RemovedCandidates; + std::vector CandidateRegs; + CandidateRegs.reserve(Candidates.size()); + for (auto it : Candidates) { + unsigned Reg = it.first; + CandidateRegs.emplace_back(Reg); + } + // Sort candidate by defMI order to make sure defMI has dependent check after + // all its dependent node. + std::sort(CandidateRegs.begin(), CandidateRegs.end(), + [&MRI, &slotIndexes](const unsigned a, unsigned b) { + MachineInstr *MIa = MRI.getUniqueVRegDef(a); + + MachineInstr *MIb = MRI.getUniqueVRegDef(b); + // Later instr first. + return !SlotIndex::isEarlierInstr( + slotIndexes->getInstructionIndex(*MIa), + slotIndexes->getInstructionIndex(*MIb)); + }); + + // If Candidate def has user in MBB, add it when allow partial candidates. + // And the subExp has the define could only be clone, cannot move cross blocks + // because user in MBB. + DenseSet PartialCandidates; + LLVM_DEBUG(dbgs() << "\nCandidate Defs:\n";); + for (unsigned Reg : CandidateRegs) { + MachineInstr *MI = MRI.getUniqueVRegDef(Reg); + bool bHasNoCandidatesSameBlockUser = false; + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + if (UseMI.getParent() == MI->getParent()) { + if (UseMI.getNumExplicitDefs() == 1) { + // Skip user which already in Candidates. 
+ unsigned UserDefReg = UseMI.getOperand(0).getReg(); + if (Candidates.count(UserDefReg) > 0 && + RemovedCandidates.count(UserDefReg) == 0) + continue; + } + if (!bAllowPartialUseInSubExp) + bHasNoCandidatesSameBlockUser = true; + else + PartialCandidates.insert(MI); + break; + } + } + if (bHasNoCandidatesSameBlockUser) { + RemovedCandidates.insert(Reg); + continue; + } + LLVM_DEBUG(MI->dump()); + CandidateDefs.insert(MI); + } + LLVM_DEBUG(dbgs() << "\nCandidate Defs End\n";); + + if (CandidateDefs.empty()) + return std::vector(); + for (unsigned Reg : RemovedCandidates) { + unUsedPassThrus[Reg] = Candidates[Reg]; + Candidates.erase(Reg); + } + + // iterate MBB backward. + // add inst which only used for candidate defines. + for (auto it = MBB->rbegin(); it != MBB->rend(); it++) { + MachineInstr &MI = *it; + if (CandidateDefs.count(&MI) > 0) { + continue; + } + + if (isConvergent(Remat, MI)) + continue; + // Skip if MI is not safe to move. + if (MI.getNumDefs() != 1) { + // allow to move unused implicit def. + bool bDeadImplictDef = false; + for (MachineOperand &MO : MI.implicit_operands()) { + if (!MO.isReg()) + continue; + if (!MO.isDef()) + continue; + bDeadImplictDef = MO.isDead(); + } + if (!bDeadImplictDef) + continue; + } + + unsigned Reg = -1; + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + if (!MO.isDef()) + continue; + Reg = MO.getReg(); + break; + } + + if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*bSink*/true)) + continue; + + // If all users of MI are in candidate defs, add MI into candidate defs. + // If part of user of MI is in candidate defs, add MI into candidate defs + // when allow partialUse. + bool bAllUserInCandidate = true; + bool bHasCandidateUser = false; + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + if (CandidateDefs.count(&UseMI) == 0) + bAllUserInCandidate = false; + else + bHasCandidateUser = true; + } + if (!bHasCandidateUser) + continue; + if (!bAllUserInCandidate) { + if (!bAllowPartialUseInSubExp) + continue; + PartialCandidates.insert(&MI); + } + + CandidateDefs.insert(&MI); + } + + // Collect input for CandidateDefs. + GCNRPTracker::LiveRegSet CandidateInput; + for (MachineInstr *MI : CandidateDefs) { + for (MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + Register Reg = MO.getReg(); + if (MO.isImplicit() && Reg.isPhysical()) + continue; + + MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); + assert((DefMI || llvm::IsSub0Sub1SingleDef(Reg, MRI)) && + "UseMI should be safe to move"); + if (DefMI && CandidateDefs.count(DefMI) > 0) + continue; + // Add to input. + CandidateInput[Reg] |= llvm::getRegMask(MO, MRI); + } + } + + // Build defs in order. + std::vector defs; + defs.reserve(CandidateDefs.size()); + for (MachineInstr &MI : *MBB) { + MachineInstr *pMI = &MI; + if (CandidateDefs.count(pMI) == 0) + continue; + defs.emplace_back(pMI); + } + + LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; for (MachineInstr *MI + : defs) { + MI->dump(); + } dbgs() << "\nFinished Candidate Defs End\n";); + + // Build SubExp with CandidateDefs as Nodes, CandidateInput as input + // Candidates as output. 
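+  // The dag splits the candidate defs into independent sub-expressions. When
+  // partial use inside the block is allowed, a sub-expression that contains a
+  // partially used def can only be cloned, not moved, since the original def
+  // must stay here for its remaining in-block users.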
+ ExpDag dag(MRI, SIRI, SIII, /*bJoinInput*/ true); + dag.build(CandidateInput, Candidates, defs); + if (bAllowPartialUseInSubExp) { + for (auto &subExp : dag.SubExps) { + for (auto *MI : subExp.SUnits) { + if (PartialCandidates.count(MI)) { + subExp.bCloneOnly = true; + break; + } + } + } + } + return dag.SubExps; +} + + +std::vector buildSubExpFromCandidatesTopBottom( + Remat* Remat, + GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, + const MachineRegisterInfo &MRI, SlotIndexes *slotIndexes) { + InstSet CandidateDefs; + + LLVM_DEBUG(dbgs() << "\nCandidate Defs:\n";); + for (auto it : Candidates) { + unsigned Reg = it.first; + MachineInstr *MI = MRI.getUniqueVRegDef(Reg); + + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + if (isConvergent(Remat, UseMI)) + continue; + MachineBasicBlock *UseMBB = UseMI.getParent(); + if (UseMBB == MI->getParent()) + continue; + assert(UseMBB == MBB && "block mismatch"); + // If all operands in CandidateRegs, add to candidateDefs. + bool bHasOpRegNotInCandidates = false; + for (MachineOperand &MO : UseMI.operands()) { + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + Register OpReg = MO.getReg(); + if (MO.isImplicit() && OpReg.isPhysical()) + continue; + if (Candidates.count(OpReg) == 0) { + bHasOpRegNotInCandidates = true; + break; + } + } + if (bHasOpRegNotInCandidates) + continue; + + LLVM_DEBUG(UseMI.dump()); + CandidateDefs.insert(&UseMI); + } + } + LLVM_DEBUG(dbgs() << "\nCandidate Defs End\n";); + + if (CandidateDefs.empty()) + return std::vector(); + + // iterate MBB. + GCNRPTracker::LiveRegSet LocalCandidates = Candidates; + // add inst which only used by candidate defines. + for (auto it = MBB->begin(); it != MBB->end(); it++) { + MachineInstr &MI = *it; + if (CandidateDefs.count(&MI) > 0) { + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + if (!MO.isDef()) + continue; + Register Reg = MO.getReg(); + if (Reg.isPhysical()) + continue; + LocalCandidates[Reg]; + } + continue; + } + + // Skip if MI is not safe to move. + if (isConvergent(Remat, MI)) + continue; + + if (MI.getNumDefs() != 1) + continue; + + if (MI.mayLoadOrStore()) { + continue; + } + + unsigned Reg = -1; + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + if (!MO.isDef()) + continue; + Reg = MO.getReg(); + break; + } + + // Still use bsink to skip mem load/store. + // if (!isSafeCandidate(Reg, MRI, SIRI, SIII, /*bSink*/true)) + // continue; + + // If all user of MI is in candidate defs, add MI into candidate defs. + bool bAllOperandInCandidate = true; + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + Register OpReg = MO.getReg(); + if (LocalCandidates.count(OpReg)) + continue; + + if (MO.isImplicit() && + (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO)) + continue; + if (OpReg.isPhysical()) { + bAllOperandInCandidate = false; + break; + } + MachineInstr *OpMI = MRI.getUniqueVRegDef(OpReg); + if (!OpMI) { + bAllOperandInCandidate = false; + break; + } + if (CandidateDefs.count(OpMI) == 0) { + bAllOperandInCandidate = false; + break; + } + if (MO.isTied()) + continue; + } + if (!bAllOperandInCandidate) + continue; + LLVM_DEBUG(llvm::dbgs() << "Add local candidates:"; + pressure::print_reg(Reg, MRI, SIRI, llvm::dbgs());); + LocalCandidates[Reg]; + CandidateDefs.insert(&MI); + } + + // Collect input for CandidateDefs. 
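+  // Inputs are the registers a candidate def reads that are not produced by
+  // another candidate def; implicit EXEC uses and physical registers are
+  // ignored.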
+ GCNRPTracker::LiveRegSet CandidateInput; + for (MachineInstr *MI : CandidateDefs) { + for (MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + Register Reg = MO.getReg(); + if (MO.isImplicit() && (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)) + continue; + if (Reg.isPhysical()) + continue; + MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); + if (!DefMI) { + // Skip local def which is not unique. + if (MO.isTied()) + continue; + if (Candidates.count(Reg) == 0 && LocalCandidates.count(Reg) != 0) + continue; + } + assert((DefMI || llvm::IsSub0Sub1SingleDef(Reg, MRI)) && + "UseMI should be safe to move"); + if (DefMI && CandidateDefs.count(DefMI) > 0) + continue; + // Add to input. + CandidateInput[Reg] = llvm::getRegMask(MO, MRI); + } + } + + // Build defs in order. + std::vector defs; + defs.reserve(CandidateDefs.size()); + for (MachineInstr &MI : *MBB) { + MachineInstr *pMI = &MI; + if (CandidateDefs.count(pMI) == 0) + continue; + defs.emplace_back(pMI); + } + + LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; for (MachineInstr *MI + : defs) { + MI->dump(); + } dbgs() << "\nFinished Candidate Defs End\n";); + + LLVM_DEBUG(dbgs() << "\nLocalCandidates:\n"; for (auto it + : LocalCandidates) { + pressure::print_reg(it.first, MRI, SIRI, llvm::dbgs()); + } dbgs() << "\nLocalCandidates End\n";); + // Make sure all input reg are uniqueDef. + // Input is Candidates, output is? + // Build SubExp with CandidateDefs as Nodes, CandidateInput as input + // Candidates as output. + ExpDag dag(MRI, SIRI, SIII, /*bJoinInput*/ true); + dag.build(Candidates, LocalCandidates, defs); + return dag.SubExps; +} + + +void print_vreg(Register Reg, const MachineRegisterInfo &MRI) { + if (Reg.isVirtual()) { + StringRef Name = MRI.getVRegName(Reg); + if (Name != "") { + dbgs() << '%' << Name; + } else { + dbgs() << '%' << Register::virtReg2Index(Reg); + } + } +} + +MachineBasicBlock *FindTargetBlock(unsigned Reg, MachineBasicBlock *FromBB, + const MachineRegisterInfo &MRI, + MachineDominatorTree *pDT) { + BlockSet userBlocks; + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + MachineBasicBlock *UserBB = UseMI.getParent(); + // Skip current BB. + if (UserBB != FromBB) + userBlocks.insert(UserBB); + else + // When has user in FromBB, userBlock will be FromBB. + return nullptr; + } + if (userBlocks.empty()) + return nullptr; + MachineBasicBlock *userBlock = nearest_common_dominator(pDT, userBlocks); + if (!pDT->dominates(FromBB, userBlock)) { + return nullptr; + } + if (userBlock == FromBB) + return nullptr; + return userBlock; +} + +void ApplySubExpMoveNearUser(SubExp &Exp, const MachineRegisterInfo &MRI, + MachineDominatorTree *pDT, + SlotIndexes *slotIndexes, + const SIInstrInfo *SIII, + const SIRegisterInfo *SIRI) { + // Move from bottom. + MachineBasicBlock *FromBB = Exp.FromBB; + for (auto it = Exp.SUnits.rbegin(); it != Exp.SUnits.rend(); it++) { + MachineInstr *DefMI = *it; + if (DefMI->getNumExplicitDefs() != 1) + continue; + + unsigned Reg = DefMI->getOperand(0).getReg(); + MachineBasicBlock *ToBB = FindTargetBlock(Reg, FromBB, MRI, pDT); + if (!ToBB) + continue; + + // Do not overwrite a live scc. + MachineBasicBlock::iterator InsertPoint = ToBB->SkipPHIsAndLabels(ToBB->begin()); + if (WillSmashSccAtLocation(DefMI, ToBB, InsertPoint)) + continue; + + DefMI->removeFromParent(); + assert(!llvm::isExecUpdateForControlFlow(*InsertPoint) && "invalid insert point"); + ToBB->insert(InsertPoint, DefMI); + // Debug insts don't need slot index. 
+ if (DefMI->isDebugInstr()) + continue; + // Update slot index. + slotIndexes->removeSingleMachineInstrFromMaps(*DefMI); + slotIndexes->insertMachineInstrInMaps(*DefMI); + } +} + + +void ApplySubExpMoveNearDefine(SubExp &Exp, MachineRegisterInfo &MRI, + MachineDominatorTree *pDT, + SlotIndexes *slotIndexes, + const SIInstrInfo *SIII, + const SIRegisterInfo *SIRI) { + // Move from top. + // Find lowest input def. + MachineBasicBlock *ToBB = Exp.ToBB; + assert(!ToBB->empty() && "ToBB have instructions for define of input nodes"); + auto Terminator = ToBB->getFirstTerminator(); + if (Terminator == ToBB->end() && ToBB->succ_size() == 1) { + MachineInstr &EndMI = *ToBB->rbegin(); + if (SIII->isSchedulingBoundary(EndMI, ToBB, *ToBB->getParent())) + // Insert before the scheduling boundary instruction. + Terminator = EndMI.getIterator(); + else + // No boundary so just insert inst at the end of the block. + Terminator = ToBB->end(); + } + + Terminator = AdjustInsertPointForSubExpToAvoidSccSmash( + Exp, ToBB, Terminator, MRI, SIRI, SIII + ); + + for (auto it = Exp.SUnits.begin(); it != Exp.SUnits.end(); it++) { + MachineInstr *DefMI = *it; + if (DefMI->getNumExplicitDefs() != 1) + continue; + if (SIII->isEXP(DefMI->getOpcode())) + continue; + if (DefMI->mayStore()) + continue; + // Find def for DefMI operands as insert point. + DefMI->removeFromParent(); + ToBB->insert(Terminator, DefMI); + + // Debug insts don't need slot index. + if (DefMI->isDebugInstr()) + continue; + // Update slot index. + slotIndexes->removeSingleMachineInstrFromMaps(*DefMI); + slotIndexes->insertMachineInstrInMaps(*DefMI); + } +} + +DenseSet buildCloneSet(ExpDag &dag, + DenseSet &dagBottoms, + GCNRPTracker::LiveRegSet &usedOutput) { + DenseSet copySet; + for (auto it = dag.SUnits.rbegin(); it != dag.SUnits.rend(); it++) { + SUnit &SU = *it; + // Skip non-inst node. + if (!SU.isInstr()) + continue; + MachineInstr *MI = SU.getInstr(); + if (dagBottoms.find(&SU) != dagBottoms.end()) { + bool bUsed = false; + // For bottom SU, if in usedOutput, add to copySet; + for (MachineOperand &DefMO : MI->defs()) { + if (!DefMO.isReg()) + continue; + unsigned Reg = DefMO.getReg(); + if (usedOutput.count(Reg) > 0) { + bUsed = true; + break; + } + } + if (bUsed) { + copySet.insert(MI); + continue; + } + // bottom SU may still have succNode when it used both inExp and outExp. + // So continue check succNode. + } + + // If any SuccNode is in copySet, add to copySet. + bool bSuccCopied = false; + for (SDep &SucDep : SU.Succs) { + SUnit *SucSU = SucDep.getSUnit(); + MachineInstr *SuccMI = SucSU->getInstr(); + if (copySet.count(SuccMI) > 0) { + bSuccCopied = true; + break; + } + } + if (bSuccCopied) + copySet.insert(MI); + } + return copySet; +} + +void updateUsers(SmallVector &userMIs, + DenseMap &RegMap) { + + for (MachineInstr *UserMI : userMIs) { + for (MachineOperand &MO : UserMI->uses()) { + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + auto it = RegMap.find(Reg); + if (it == RegMap.end()) + continue; + unsigned NewReg = it->second; + MO.setReg(NewReg); + } + } +} + +struct HotBlock { + MachineBasicBlock *MBB = nullptr; + GCNRPTracker::LiveRegSet inputLive; + std::pair maxPressures; + // Info about vmemLd. + int vmemLdInputSize; + int vmemLdOutputSize; +}; + +DenseMap reduceClonedMBBs( + SubExp &Exp, + MapVector> &userBlocks, + DenseMap &userBlocksLiveRegs, + std::vector &hotBlocks, MachineDominatorTree *pDT) { + // Collect hot blocks which Exp is live in. 
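+  // A block is hot for this expression if any of the expression's bottom
+  // (output) registers is live into it; those blocks always keep their own
+  // clone.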
+ DenseSet hotBlockSet; + for (HotBlock &hotBlock : hotBlocks) { + for (unsigned Reg : Exp.BottomRegs) { + if (hotBlock.inputLive.count(Reg)) { + hotBlockSet.insert(hotBlock.MBB); + break; + } + } + } + + // For userBlocks which dominate all hotBlocks, don't need to clone because + // the value not cross hotBlocks when later blocks are cloned. + // For userBlocks which dominated by all hotBlocks, they could share clones + // because once after hot block, the pressure is OK. + DenseSet afterHotRangeMBBs; + for (auto it : userBlocksLiveRegs) { + MachineBasicBlock *MBB = it.first; + // Always clone in hot block. + if (hotBlockSet.count(MBB)) + continue; + + bool bDomAllHotBlocks = true; + bool bDomedByAllHotBlocks = true; + for (MachineBasicBlock *hotMBB : hotBlockSet) { + if (!pDT->dominates(MBB, hotMBB)) { + bDomAllHotBlocks = false; + } + if (!pDT->dominates(hotMBB, MBB)) { + bDomedByAllHotBlocks = false; + } + if (!bDomAllHotBlocks && !bDomedByAllHotBlocks) { + break; + } + } + if (bDomAllHotBlocks) { + userBlocks.erase(MBB); + } else if (bDomedByAllHotBlocks) { + afterHotRangeMBBs.insert(MBB); + } + } + + // Split after hotRange block set by domtree. + DenseMap DomMap; + if (!afterHotRangeMBBs.empty()) { + for (auto it : afterHotRangeMBBs) { + MachineBasicBlock *MBB = it; + for (auto it2 : afterHotRangeMBBs) { + MachineBasicBlock *MBB2 = it2; + if (MBB == MBB2) + continue; + if (pDT->dominates(MBB, MBB2)) { + auto &Dom = DomMap[MBB]; + Dom.insert(MBB2); + auto &Dom2 = DomMap[MBB2]; + Dom.insert(Dom2.begin(), Dom2.end()); + } + } + } + for (auto it : afterHotRangeMBBs) { + MachineBasicBlock *MBB = it; + auto &usedOutput = userBlocksLiveRegs[MBB]; + auto &Dom = DomMap[MBB]; + for (MachineBasicBlock *domedMBB : Dom) { + // Merge domed use to MBB use. + mergeLiveRegSet(usedOutput, userBlocksLiveRegs[domedMBB]); + // Remove domedMBB. + DomMap.erase(domedMBB); + userBlocksLiveRegs.erase(domedMBB); + } + } + } + + return DomMap; +} + +void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, + MachineDominatorTree *pDT, + MachineRegisterInfo &MRI, + SlotIndexes *slotIndexes, const SIInstrInfo *SIII, + const SIRegisterInfo *SIRI) { + MapVector> userBlocks; + DenseMap userBlocksLiveRegs; + for (unsigned Reg : Exp.BottomRegs) { + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + MachineBasicBlock *UserBB = UseMI.getParent(); + // Skip current BB. + if (UserBB == Exp.FromBB) + continue; + + userBlocks[UserBB].emplace_back(&UseMI); + auto &userLives = userBlocksLiveRegs[UserBB]; + for (MachineOperand &MO : UseMI.uses()) { + if (!MO.isReg()) + continue; + unsigned UseReg = MO.getReg(); + if (Reg != UseReg) + continue; + userLives[Reg] |= getRegMask(MO, MRI); + } + } + } + // Build dag for SubExp to help remove unused inst when clone. + ExpDag dag(MRI, SIRI, SIII, /*bJoinInput*/ true); + dag.build(Exp.inputLive, Exp.outputLive, Exp.SUnits); + DenseSet dagBottoms; + for (SUnit &SU : dag.SUnits) { + if (!SU.isInstr()) + continue; + if (SU.NumSuccs == 0) { + dagBottoms.insert(&SU); + } else { + MachineInstr *MI = SU.getInstr(); + // Add SU which def value in Exp.outputLive. + for (MachineOperand &DefMO : MI->defs()) { + if (!DefMO.isReg()) + continue; + unsigned Reg = DefMO.getReg(); + if (Exp.BottomRegs.count(Reg) > 0) { + dagBottoms.insert(&SU); + break; + } + } + } + } + + // For userBlocks which dominate all hotBlocks, don't need to clone because + // the value not cross hotBlocks when later blocks are cloned. 
+ // For userBlocks which dominated by all hotBlocks, they could share clones + // because once after hot block, the pressure is OK. + DenseMap DomMap = + reduceClonedMBBs(Exp, userBlocks, userBlocksLiveRegs, hotBlocks, pDT); + + // Sort to make stable order. + std::sort(userBlocks.begin(), userBlocks.end(), + [](std::pair>& it0, + std::pair>& it1) { + return it0.first->getNumber() < it1.first->getNumber(); + }); + + const bool bModifiesScc = Exp.modifiesRegister(AMDGPU::SCC, SIRI); + + // Clone for each userBlocks. Not share clone thru dom tree which cannot help + // reg pressure. + for (auto it : userBlocks) { + MachineBasicBlock *MBB = it.first; + // Skip MBB which share clone from other MBBs. + if (userBlocksLiveRegs.count(MBB) == 0) + continue; + auto &usedOutput = userBlocksLiveRegs[MBB]; + auto copySet = buildCloneSet(dag, dagBottoms, usedOutput); + // Clone to MBB. + // Create new regs first. + DenseMap RegMap; + auto insertPtr = MBB->getFirstNonPHI(); + // If Exp has scc read/write, make sure MBB not have scc in liveins. + if (bModifiesScc && llvm::IsSccLiveAt(MBB, insertPtr)) + continue; + MachineFunction *MF = MBB->getParent(); + for (auto it = Exp.SUnits.begin(); it != Exp.SUnits.end(); it++) { + MachineInstr *DefMI = *it; + // Not clone if already in MBB. + if (DefMI->getParent() == MBB) + continue; + // Not clone if not used for MBB. + if (copySet.count(DefMI) == 0) + continue; + + auto ClonedMI = + BuildMI(*MBB, insertPtr, DefMI->getDebugLoc(), DefMI->getDesc()); + + for (MachineOperand &Def : DefMI->defs()) { + Register Reg = Def.getReg(); + if (Reg.isPhysical()) { + if (Def.isImplicit()) + continue; + ClonedMI.addDef(Reg, 0, Def.getSubReg()); + } else { + unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg)); + RegMap[Reg] = NewReg; + ClonedMI.addDef(NewReg, 0, Def.getSubReg()); + } + } + + for (MachineOperand &MO : DefMI->uses()) { + if (MO.isReg()) { + Register Reg = MO.getReg(); + if (Reg.isPhysical()) { + if (MO.isImplicit()) + continue; + ClonedMI.addReg(Reg, 0, MO.getSubReg()); + } else { + auto it = RegMap.find(Reg); + if (it == RegMap.end()) { + ClonedMI.addReg(Reg, 0, MO.getSubReg()); + } else { + ClonedMI.addReg(it->second, 0, MO.getSubReg()); + } + } + } else { + ClonedMI.add(MO); + } + } + + MachineInstr *NewDef = ClonedMI.getInstr(); + slotIndexes->insertMachineInstrInMaps(*NewDef); + // Set mem operand + for (MachineMemOperand *MO : DefMI->memoperands()) { + NewDef->addMemOperand(*MF, MO); + } + } + + // update users in MBB. + SmallVector &userMIs = it.second; + updateUsers(userMIs, RegMap); + + // update users in dom MBBs. + auto domMapIt = DomMap.find(MBB); + if (domMapIt != DomMap.end()) { + for (MachineBasicBlock *UpdateMBB : domMapIt->second) { + SmallVector &userMIs = userBlocks[UpdateMBB]; + updateUsers(userMIs, RegMap); + } + } + } +} + + +void ApplySubExpCloneNearUserInBlock( + SubExp &Exp, + DenseMap &inBlockHotVInstMap, + DenseMap &inBlockHotSInstMap, + MachineRegisterInfo &MRI, SlotIndexes *slotIndexes, const SIInstrInfo *SIII, + const SIRegisterInfo *SIRI) { + MachineBasicBlock *MBB = Exp.FromBB; + MachineFunction *MF = MBB->getParent(); + MachineInstr *hotVMI = inBlockHotVInstMap[MBB]; + MachineInstr *hotSMI = inBlockHotSInstMap[MBB]; + // Exp is build with hotVMI or hotSMI, cannot mix. 
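+  // Clones are placed right before the first use that follows the hot slot,
+  // so the original values can die before the pressure peak.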
+ assert(!(hotVMI && hotSMI) && "cannot mix hot MI"); + MachineInstr *hotMI = hotVMI; + if (!hotMI) { + hotMI = hotSMI; + } + + SlotIndex hotSlot = slotIndexes->getInstructionIndex(*hotMI).getBaseIndex(); + const bool bModifiesScc = Exp.modifiesRegister(AMDGPU::SCC, SIRI); + + for (unsigned Reg : Exp.BottomRegs) { + + SmallVector useMIs; + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + MachineBasicBlock *UserBB = UseMI.getParent(); + // Skip current BB. + if (UserBB != Exp.FromBB) + continue; + // Skip inst in Exp. + if (Exp.BottomRoots.find(&UseMI) != Exp.BottomRoots.end()) + continue; + SlotIndex useSlot = + slotIndexes->getInstructionIndex(UseMI).getBaseIndex(); + // Only clone for use after hot slot. + if (useSlot < hotSlot) + continue; + + // Do not overwrite a live scc. + if (bModifiesScc && llvm::IsSccLiveAt(UserBB, &UseMI)) + continue; + + useMIs.emplace_back(&UseMI); + } + if (useMIs.empty()) + continue; + DenseMap RegMap; + + std::sort(useMIs.begin(), useMIs.end(), + [&slotIndexes](const MachineInstr *MIa, const MachineInstr *MIb) { + return slotIndexes->getInstructionIndex(*MIa).getBaseIndex() < + slotIndexes->getInstructionIndex(*MIb).getBaseIndex(); + }); + auto insertPtr = useMIs.front()->getIterator(); + + for (auto it = Exp.SUnits.begin(); it != Exp.SUnits.end(); it++) { + MachineInstr *DefMI = *it; + auto ClonedMI = + BuildMI(*MBB, insertPtr, DefMI->getDebugLoc(), DefMI->getDesc()); + + for (MachineOperand &Def : DefMI->defs()) { + Register Reg = Def.getReg(); + if (Reg.isPhysical()) { + ClonedMI.addDef(Reg, 0, Def.getSubReg()); + } else { + unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg)); + RegMap[Reg] = NewReg; + ClonedMI.addDef(NewReg, 0, Def.getSubReg()); + } + } + + for (MachineOperand &MO : DefMI->uses()) { + if (MO.isReg()) { + if (MO.isImplicit()) { + continue; + } + Register Reg = MO.getReg(); + if (Reg.isPhysical()) { + ClonedMI.addReg(Reg, 0, MO.getSubReg()); + } else { + auto it = RegMap.find(Reg); + if (it == RegMap.end()) { + ClonedMI.addReg(Reg, 0, MO.getSubReg()); + } else { + ClonedMI.addReg(it->second, 0, MO.getSubReg()); + } + } + } else { + ClonedMI.add(MO); + } + } + + MachineInstr *NewDef = ClonedMI.getInstr(); + slotIndexes->insertMachineInstrInMaps(*NewDef); + // Set mem operand + for (MachineMemOperand *MO : DefMI->memoperands()) { + NewDef->addMemOperand(*MF, MO); + } + } + // TODO: only clone to cross hot range. 
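+    // Rewrite the collected uses to read the freshly cloned registers.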
+ for (MachineInstr *UseMI : useMIs) { + for (MachineOperand &MO : UseMI->uses()) { + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + auto it = RegMap.find(Reg); + if (it == RegMap.end()) + continue; + unsigned NewReg = it->second; + MO.setReg(NewReg); + } + } + } +} + +bool isInLiveSet(unsigned Reg, LaneBitmask mask, + const GCNRPTracker::LiveRegSet &live) { + auto it = live.find(Reg); + if (it == live.end()) + return false; + + LaneBitmask liveMask = it->second; + return (liveMask | mask) == liveMask; +} + +unsigned getPacifistLevel(unsigned Reg, + DenseMap &pacifistLevels, + const MachineRegisterInfo &MRI) { + unsigned level = 0; + for (MachineInstr &MI : MRI.def_instructions(Reg)) { + auto it = pacifistLevels.find(&MI); + if (it == pacifistLevels.end()) + continue; + level = it->second; + } + return level; +} + +bool hasInBlockDef(unsigned Reg, MachineBasicBlock *MBB, + const MachineRegisterInfo &MRI) { + for (MachineInstr &def : MRI.def_instructions(Reg)) { + if (def.getParent() != MBB) + continue; + return true; + } + return false; +} + +MachineInstr *getInBlockUniqueDef(unsigned Reg, MachineBasicBlock *MBB, + const GCNRPTracker::LiveRegSet &inputLive, + const GCNRPTracker::LiveRegSet &outputLive, + const MachineRegisterInfo &MRI) { + MachineInstr *DefMI = nullptr; + // If live as input for MBB, cannot be unique def. + if (inputLive.count(Reg)) + return DefMI; + for (MachineInstr &def : MRI.def_instructions(Reg)) { + if (def.getParent() != MBB) + continue; + if (DefMI) { + // Not unique. + DefMI = nullptr; + break; + } + DefMI = &def; + } + return DefMI; +} + +bool isPassThru(unsigned Reg, const GCNRPTracker::LiveRegSet &inputLive, + const GCNRPTracker::LiveRegSet &outputLive) { + return inputLive.count(Reg) && outputLive.count(Reg); +} + +// Instructions which only use imm/passThru reg/output only reg will not kill any +// live reg, so name them pacifist here. +bool collectPacifist(MachineInstr &MI, + const GCNRPTracker::LiveRegSet &inputLive, + const GCNRPTracker::LiveRegSet &outputLive, + const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI) { + // If has implicit def, not move. + if (MI.getDesc().NumImplicitDefs != 0) + return false; + + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + + Register Reg = MO.getReg(); + if (MO.isImplicit() && (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)) + continue; + if (Reg.isPhysical()) + return false; + // The def for reg must be unique def in block or pass thru which not has + // def in block. If not, it is not safe to move. + if (!(nullptr != getInBlockUniqueDef(Reg, MI.getParent(), inputLive, + outputLive, MRI) || + (isPassThru(Reg, inputLive, outputLive) && + !hasInBlockDef(Reg, MI.getParent(), MRI)))) + return false; + + LaneBitmask mask = llvm::getRegMask(MO, MRI); + + if (isInLiveSet(Reg, mask, outputLive)) + continue; + + return false; + } + bool bHasDef = false; + for (MachineOperand &MO : MI.defs()) { + Register Reg = MO.getReg(); + + if (Reg.isPhysical()) + return false; + + if (nullptr == getInBlockUniqueDef(Reg, MI.getParent(), inputLive, outputLive, MRI)) + return false; + + bHasDef = true; + } + // If no def, it will not increase pressure, don't mark it. 
+ return bHasDef; +} + +static MachineInstr* findFirstAliasingLoadOrStoreInMBB( + MachineInstr &MI, + MachineBasicBlock &MBB, + AliasAnalysis *AA +) +{ + if (MI.mayLoadOrStore()) + { + for (MachineBasicBlock::iterator I = MI.getIterator(), E = MBB.end(); I != E; ++I) + { + const bool UseTBAA = false; + if (MI.mayAlias(AA, *I, UseTBAA)) + { + return &*I; + } + } + } + + return nullptr; +} + +static MachineInstr *findPacifistInsertPoint(MachineInstr &MI, MachineBasicBlock &MBB, MachineRegisterInfo &MRI, + AliasAnalysis *AA, + SlotIndexes *slotIndexes) { + + SmallVector users; + + // We cannot move the pacifist instruction past any memory + // op with which it aliases. Find the first instruction + // that aliases the pacifist MI (if any) and add it to the list + // of users. The sort() below will select the earliest user instruction. + if (MachineInstr* AliasMI = findFirstAliasingLoadOrStoreInMBB(MI, MBB, AA)) { + users.push_back(AliasMI); + } + + for (MachineOperand &MO : MI.defs()) { + unsigned Reg = MO.getReg(); + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) + { + if (&MBB != UseMI.getParent()) + continue; + users.emplace_back(&UseMI); + } + } + if (users.empty()) + return nullptr; + + std::sort(users.begin(), users.end(), + [&slotIndexes](const MachineInstr *MIa, MachineInstr *MIb) { + // Early instr first. + return SlotIndex::isEarlierInstr( + slotIndexes->getInstructionIndex(*MIa), + slotIndexes->getInstructionIndex(*MIb)); + }); + return users.front(); +} + +// Pacifist inst will only add pressure since they don't kill. +// Try to hold them as late as possible in a MBB to help pressure. +bool tryHoldPacifist(MachineBasicBlock &MBB, LiveIntervals *LIS, + MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, AliasAnalysis *AA, + RematStatus &status) +{ + const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[&MBB]; + const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[&MBB]; + + SmallVector pacifistList; + LLVM_DEBUG(dbgs() << "pacifist begin\n"); + for (MachineInstr &MI : MBB) { + if (MI.isDebugInstr()) + continue; + if (collectPacifist(MI, inputLive, outputLive, MRI, SIRI)) { + pacifistList.emplace_back(&MI); + LLVM_DEBUG(MI.dump()); + } + } + LLVM_DEBUG(dbgs() << "pacifist end\n"); + + SlotIndexes *slotIndexes = LIS->getSlotIndexes(); + bool bUpdated = false; + + // Move pacifist to its first user. + for (MachineInstr *MI : pacifistList) { + MachineInstr *firstUser = findPacifistInsertPoint(*MI, MBB, MRI, AA, slotIndexes); + if (firstUser == MI) + continue; + if (firstUser == MI->getNextNode()) + continue; + + auto insertPoint = MBB.getFirstInstrTerminator(); + if (firstUser) { + insertPoint = firstUser->getIterator(); + } else { + // When there's no terminator. + if (insertPoint == MBB.end()) + insertPoint--; + else + // BRANCH may have exec update before it. + insertPoint--; + + insertPoint = llvm::skipDebugInstructionsBackward(insertPoint, MBB.instr_begin()); + + while ((insertPoint->definesRegister(AMDGPU::EXEC, SIRI) || + insertPoint->definesRegister(AMDGPU::EXEC_LO, SIRI)) && + insertPoint != MI->getIterator()) + { + insertPoint--; + insertPoint = llvm::skipDebugInstructionsBackward(insertPoint, MBB.instr_begin()); + } + if (insertPoint == MI->getIterator()) + continue; + } + // Do not overwrite a live scc. 
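+    // Skip the move if MI writes SCC and SCC is live at the chosen insert
+    // point.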
+ if (WillSmashSccAtLocation(MI, &MBB, insertPoint)) + continue; + MI->removeFromParent(); + MBB.insert(insertPoint, MI); + + LIS->handleMove(*MI); + bUpdated = true; + } + + return bUpdated; +} + +DenseMap +collectUniformVgprs(Remat *Remat, MachineFunction &MF, MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI) { + DenseMap UniformMap; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (MI.isDebugInstr()) + continue; + if (!Remat->TotalUniformInsts.count(&MI)) + continue; + if (MI.getNumDefs() != 1) + continue; + unsigned dstIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst); + if (dstIdx == -1) + continue; + MachineOperand &DstMO = MI.getOperand(dstIdx); + if (DstMO.getSubReg() != 0) + continue; + if (DstMO.isTied()) + continue; + unsigned Reg = DstMO.getReg(); + if (MRI.getUniqueVRegDef(Reg) == nullptr) + continue; + + auto *VRC = SIRI->getRegClassForReg(MRI, Reg); + if (SIRI->isSGPRClass(VRC)) + continue; + // TODO: Support more reg class. + if (VRC != &AMDGPU::VGPR_32RegClass) + continue; + + UniformMap[Reg] = &MI; + } + } + return UniformMap; +} + +// Try insert readfirstlane on uniform vgpr to turn it in sgpr and save vgpr +// pressure. +bool collectVToSCrossHotSpot( + MachineBasicBlock &MBB, RematStatus &status, + DenseMap &UniformMap, + SmallMapVector &VToSMap, LiveIntervals *LIS, + MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII) { + unsigned VLimit = status.TargetVLimit; + unsigned SLimit = status.TargetSLimit; + auto& ST = MBB.getParent()->getSubtarget(); + + GCNDownwardRPTracker Tracker(*LIS); + + bool bUpdated = false; + const auto inputLive = status.MBBInputLiveMap[&MBB]; + Tracker.reset(*MBB.begin(), &inputLive); + for (MachineInstr &MI : MBB) { + if (MI.isDebugInstr()) { + continue; + } + + unsigned VPressure = Tracker.getPressure().getVGPRNum(ST.hasGFX90AInsts()); + unsigned SPressure = Tracker.getPressure().getMaxSGPR(); + + SPressure += RegForVCC; + + Tracker.advance(); + // Sgpr bound, vtos cannot help. + if (SPressure > SLimit) + return false; + + if (VPressure <= VLimit) { + continue; + } + + // Try to make all possible vtos to reduce vpressure. + int VExtra = VPressure - VLimit; + + const GCNRPTracker::LiveRegSet &CurLives = Tracker.getLiveRegs(); + for (auto it : CurLives) { + unsigned Reg = it.first; + auto UniformIt = UniformMap.find(Reg); + if (UniformIt == UniformMap.end()) + continue; + VToSMap[UniformIt->first] = UniformIt->second; + VExtra--; + bUpdated = true; + } + + } + return bUpdated; +} + +// Return true if the user is outside of the def's loop. 
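+// Used to reject V->S rematerialization across a loop boundary, where branch
+// divergence can make a value that is uniform inside the loop non-uniform at
+// uses outside of it.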
+static bool IsCrossLoopUse(MachineInstr *Def, MachineInstr *User, MachineLoopInfo *MLI)
+{
+  MachineLoop *L = MLI->getLoopFor(Def->getParent());
+  return L && !L->contains(User->getParent());
+}
+
+bool rematUniformVgprToSgpr(
+    Remat *Remat,
+    MachineFunction &MF, RematStatus &status,
+    DenseMap &MBBPressureMap,
+    std::vector<HotBlock> &hotBlocks, LiveIntervals *LIS, MachineRegisterInfo &MRI,
+    const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, MachineLoopInfo *MLI) {
+  DenseMap<unsigned, MachineInstr *> UniformVgprMap =
+      collectUniformVgprs(Remat, MF, MRI, SIRI);
+
+  SmallMapVector VToSMap;
+
+  for (auto &hotBlock : hotBlocks) {
+    MachineBasicBlock &MBB = *hotBlock.MBB;
+    collectVToSCrossHotSpot(MBB, status, UniformVgprMap, VToSMap, LIS, MRI,
+                            SIRI, SIII);
+  }
+
+  if (VToSMap.empty())
+    return false;
+  SlotIndexes *slotIndexes = LIS->getSlotIndexes();
+  const MCInstrDesc &ReadFirstLaneDesc = SIII->get(AMDGPU::V_READFIRSTLANE_B32);
+  for (auto it : VToSMap) {
+    unsigned Reg = it.first;
+    MachineInstr *MI = it.second;
+
+    auto *VRC = SIRI->getRegClassForReg(MRI, Reg);
+    // TODO: support bigger vgpr to sgpr.
+    if (VRC != &AMDGPU::VGPR_32RegClass)
+      continue;
+    auto *NewRC = SIRI->getEquivalentSGPRClass(VRC);
+    unsigned newDst = MRI.createVirtualRegister(NewRC);
+
+    auto ReadFirstLane =
+        BuildMI(MF, MI->getDebugLoc(), ReadFirstLaneDesc, newDst);
+    SmallVector<MachineInstr *> userMIs;
+    for (MachineInstr &userMI : MRI.use_nodbg_instructions(Reg)) {
+      // Do not replace v->s across loops. Even if the value is uniform,
+      // branch divergence can cause a uniform value in a loop to become
+      // non-uniform when used outside the loop.
+      if (IsSafeRematCandidateUser(&userMI, SIII) && !IsCrossLoopUse(MI, &userMI, MLI))
+        userMIs.emplace_back(&userMI);
+    }
+
+    // Finish the readfirstlane and insert it right after the defining MI.
+    ReadFirstLane.addReg(Reg);
+    MachineInstr *VToSMI = ReadFirstLane.getInstr();
+    Remat->TotalUniformInsts.insert(VToSMI);
+    Remat->SafeToRemoveInsts.insert(VToSMI);
+    MachineBasicBlock *MBB = MI->getParent();
+    MBB->insertAfter(MI->getIterator(), VToSMI);
+    slotIndexes->insertMachineInstrInMaps(*VToSMI);
+
+    for (MachineInstr *userMI : userMIs) {
+      const auto &Desc = userMI->getDesc();
+      bool bIllegal = false;
+      for (unsigned i = 0; i < userMI->getNumOperands(); i++) {
+        MachineOperand &MO = userMI->getOperand(i);
+        if (!MO.isReg())
+          continue;
+        if (MO.isDef())
+          continue;
+        if (MO.getReg() != Reg)
+          continue;
+        if (i >= Desc.getNumOperands()) {
+          bIllegal = true;
+          break;
+        }
+
+        MO.setReg(newDst);
+        if (userMI->getDesc().operands()[i].RegClass != -1) {
+          if (!SIII->isOperandLegal(*userMI, i, &MO)) {
+            SIII->legalizeOperands(*userMI);
+            // In case legalizeOperands did not help, legalize with a mov.
+            if (userMI->getDesc().operands()[i].RegClass != -1 &&
+                !SIII->isOperandLegal(*userMI, i)) {
+              SIII->legalizeOpWithMove(*userMI, i);
+            }
+          }
+        } else {
+          // No register class constraint on this operand; nothing to legalize.
+        }
+      }
+      if (bIllegal)
+        continue;
+
+      auto rit = userMI->getReverseIterator();
+      rit++;
+      auto endIt = userMI->getParent()->rend();
+      while (rit != endIt && !rit->isDebugInstr() && !slotIndexes->hasIndex(*rit))
+        slotIndexes->insertMachineInstrInMaps(*(rit++));
+    }
+  }
+
+  return true;
+}
+
+bool collectRematableHotReg(
+    MachineInstr &MI, const GCNRPTracker::LiveRegSet &hotLive,
+    GCNRPTracker::LiveRegSet &pureHotRematSet,
+    DenseMap<MachineInstr *, unsigned> &pureHotRematLevels, unsigned &DefReg,
+    const GCNRPTracker::LiveRegSet &inputLive,
+    const GCNRPTracker::LiveRegSet &outputLive, const MachineRegisterInfo &MRI,
+    const SIRegisterInfo *SIRI) {
+  // Ignore instructions that have no def or more than one def.
+ if (MI.getDesc().getNumDefs() != 1) + return false; + + DefReg = MI.defs().begin()->getReg(); + + unsigned level = 0; + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + + Register Reg = MO.getReg(); + + // If user is in same MI like + // %4:vgpr_32 = V_MAD_LEGACY_F32 %2:vgpr_32, %3:vgpr_32, %4:vgpr_32 + // remat it will not help. + if (Reg == DefReg) { + return false; + } + + if (MO.isImplicit() && (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)) + continue; + if (Reg.isPhysical()) + return false; + + if (nullptr == + getInBlockUniqueDef(Reg, MI.getParent(), inputLive, outputLive, MRI)) + return false; + + LaneBitmask mask = llvm::getRegMask(MO, MRI); + + if (isInLiveSet(Reg, mask, hotLive)) + continue; + + if (isInLiveSet(Reg, mask, pureHotRematSet)) { + unsigned regLevel = getPacifistLevel(Reg, pureHotRematLevels, MRI); + level = std::max(level, regLevel); + continue; + } + + return false; + } + + for (MachineOperand &MO : MI.defs()) { + Register Reg = MO.getReg(); + + if (Reg.isPhysical()) + return false; + + if (nullptr == + getInBlockUniqueDef(Reg, MI.getParent(), inputLive, outputLive, MRI)) + return false; + + LaneBitmask mask = llvm::getRegMask(MO, MRI); + pureHotRematSet[Reg] |= mask; + } + + pureHotRematLevels[&MI] = level + 1; + // If no def, it will not increase pressure, don't mark it. + return true; +} + +bool tryRemat(MachineBasicBlock &MBB, MachineInstr *hotMI, + std::vector &inBlockCloneSubExps, bool bVGPR, + const GCNRPTracker::LiveRegSet &inputLive, + const GCNRPTracker::LiveRegSet &outputLive, + DenseSet &hotSet, int vDistance, int sDistance, + unsigned VLimit, unsigned SLimit, + const DenseSet &MemWriteMBBSet, + LiveIntervals *LIS, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII) { + auto &ST = MBB.getParent()->getSubtarget(); + const auto &SI = LIS->getInstructionIndex(*hotMI).getBaseIndex(); + const auto LISLR = llvm::getLiveRegs(SI, *LIS, MRI); + + GCNRPTracker::LiveRegSet hotLive = LISLR; + + GCNRPTracker::LiveRegSet pureHotRematSet; + std::vector pureHotRematList; + DenseMap pureHotRematLevels; + + GCNRPTracker::LiveRegSet outputSet; + LLVM_DEBUG(dbgs() << "pure hot remat begin\n"); + // Find reg which could remat from other reg in liveSet. + const unsigned kMaxRematLevel = 6; + GCNDownwardRPTracker Tracker(*LIS); + Tracker.reset(*MBB.begin(), &inputLive); + for (auto it = MBB.begin(); it != MBB.end(); it++) { + MachineInstr &MI = *it; + const GCNRegPressure &RP = Tracker.getPressure(); + + if (MI.isDebugInstr()) + continue; + + // Igonre inst in hot range. + if (RP.getVGPRNum(ST.hasGFX90AInsts()) > VLimit || RP.getMaxSGPR() > SLimit) { + Tracker.advance(); + continue; + } + + // Stop at hotMI. + if (&MI == hotMI) + break; + + Tracker.advance(); + + unsigned DefReg = 0; + if (collectRematableHotReg(MI, hotLive, pureHotRematSet, pureHotRematLevels, + DefReg, inputLive, outputLive, MRI, SIRI)) { + unsigned level = pureHotRematLevels[&MI]; + if (level >= kMaxRematLevel) + continue; + + // If the def reg is in hot reg. + // Add to output. + if (hotLive.find(DefReg) != hotLive.end()) { + bool bUserIsHot = false; + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(DefReg)) { + if (UseMI.getParent() != &MBB) + continue; + if (0 == hotSet.count(&UseMI)) + continue; + + const auto &useSI = LIS->getInstructionIndex(UseMI).getBaseIndex(); + // When has a hot user after hotMI, remat it may not help. 
+ if (useSI > SI) { + bUserIsHot = true; + break; + } + } + + if (bUserIsHot) + continue; + outputSet[DefReg]; + LLVM_DEBUG(dbgs() << "hotRemat:"); + LLVM_DEBUG(MI.getOperand(0).dump()); + // remove it from hotLive to avoid it as input when build dag. + hotLive.erase(DefReg); + } + pureHotRematList.emplace_back(&MI); + LLVM_DEBUG(dbgs() << "level:" << level); + LLVM_DEBUG(MI.dump()); + } + } + + LLVM_DEBUG(dbgs() << "pure hot remat end\n"); + + // Create input/output for pure hot remat. + // Input is things hot reg in level 1 and output is things level > 1. + // Build SubExp with pureHotRematList as Nodes, hotLive as input + // rematHot as output. + // Not join input when build ExpDag to get small subExps. + ExpDag dag(MRI, SIRI, SIII, /*bJoinInput*/ false); + dag.build(hotLive, outputSet, pureHotRematList); + // Find best subExp add to inBlockCloneSubExps. + // Sort by size of subExp. + std::sort(dag.SubExps.begin(), dag.SubExps.end(), + [](const SubExp &a, const SubExp &b) { + return a.SUnits.size() < b.SUnits.size(); + }); + std::vector cloneSubExps; + int distance = bVGPR ? vDistance : sDistance; + for (SubExp &subExp : dag.SubExps) { + if (subExp.bNotSafeToCopy) + continue; + if (bVGPR) { + if (subExp.vOutputSize == 0) + continue; + } else { + if (subExp.sOutputSize == 0) + continue; + } + if (!subExp.isSafeToMove(MRI, /*bMoveUp*/ false)) + continue; + // Not clone big subExp. + if (subExp.SUnits.size() > 10) + continue; + // Do not allow remat in the block when the expression has a memory op and + // the block has a write. We could allow this in some cases with better + // analysis. + if (subExp.bHasMemInst && MemWriteMBBSet.count(&MBB)) + continue; + if (bVGPR) { + distance -= subExp.vOutputSize; + } else { + distance -= subExp.sOutputSize; + } + cloneSubExps.emplace_back(subExp); + if (distance <= 0) + break; + } + if (distance <= 0) { + inBlockCloneSubExps.insert(inBlockCloneSubExps.end(), cloneSubExps.begin(), + cloneSubExps.end()); + } + return distance <= 0; +} + +// Try to remat live reg in hot spot from other live reg in hot spot. +// +bool tryRematInHotSpot( + MachineBasicBlock &MBB, RematStatus &status, int vDistance, int sDistance, + int vSaved, int sSaved, std::vector &inBlockCloneSubExps, + DenseMap &inBlockHotVInstMap, + DenseMap &inBlockHotSInstMap, + LiveIntervals *LIS, const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { + unsigned VLimit = status.TargetVLimit; + unsigned SLimit = status.TargetSLimit; + + auto& ST = MBB.getParent()->getSubtarget(); + const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[&MBB]; + + const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[&MBB]; + + // Collect reg pressure. + unsigned maxLocalVPressure = 0; + unsigned maxLocalSPressure = 0; + // Build a DAG or only on demand? 
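+  // Single pass over the block with a downward RP tracker: among the
+  // instructions that exceed the target limits, remember the one with the
+  // highest VGPR pressure and the one with the highest SGPR pressure.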
+ MachineInstr *hotVMI = nullptr; + MachineInstr *hotSMI = nullptr; + DenseSet hotSet; + + GCNDownwardRPTracker Tracker(*LIS); + + Tracker.reset(*MBB.begin(), &inputLive); + for (auto it = MBB.begin(); it != MBB.end(); it++) { + MachineInstr &MI = *it; + if (MI.isDebugInstr()) { + continue; + } + + unsigned VPressure = Tracker.getPressure().getVGPRNum(ST.hasGFX90AInsts()); + unsigned SPressure = Tracker.getPressure().getMaxSGPR(); + + SPressure += RegForVCC; + + VPressure -= vSaved; + SPressure -= sSaved; + Tracker.advance(); + + if (VPressure <= VLimit && SPressure <= SLimit) { + continue; + } + hotSet.insert(&MI); + if (maxLocalVPressure < VPressure) { + maxLocalVPressure = VPressure; + hotVMI = &MI; + } + if (maxLocalSPressure < SPressure) { + maxLocalSPressure = SPressure; + hotSMI = &MI; + } + } + + inBlockHotVInstMap[&MBB] = hotVMI; + inBlockHotSInstMap[&MBB] = hotSMI; + if (vDistance > 0 && hotVMI) { + // Use hotVMI when apply. + inBlockHotSInstMap[&MBB] = nullptr; + if (tryRemat(MBB, hotVMI, inBlockCloneSubExps, /*bVGPR*/ true, inputLive, + outputLive, hotSet, vDistance, sDistance, VLimit, SLimit, + status.MemWriteMBBSet, + LIS, MRI, SIRI, SIII)) + return true; + } + + if (sDistance > 0 && hotSMI) { + // Use hotSMI when apply. + inBlockHotSInstMap[&MBB] = hotSMI; + inBlockHotVInstMap[&MBB] = nullptr; + return tryRemat(MBB, hotSMI, inBlockCloneSubExps, /*bVGPR*/ false, + inputLive, outputLive, hotSet, vDistance, sDistance, VLimit, + SLimit, status.MemWriteMBBSet, + LIS, MRI, SIRI, SIII); + } + return false; +} +// Sort subExpCandidates to make sure deeper subExp apply first. +// If subExp0 use result of subExp1, subExp0 is deeper than subExp1. +// When apply subExp1 before subExp0, new clone of subExp0 which use result of +// subExp1 will have old reg of subExp1. And reg pressure will not be reduced. +void sortSubExpCandidates(std::vector &subExpCandidates) { + MapVector> inputMap; + MapVector> outputMap; + struct SortNode { + SubExp Exp; + unsigned Depth; + bool bDepthDirty; + SmallDenseSet Preds; + SmallDenseSet Succs; + }; + + { + SmallVector RegSortStorage; + for (SubExp &Exp : subExpCandidates) { + RegSortStorage.assign(Exp.TopRegs.begin(), Exp.TopRegs.end()); + std::sort(RegSortStorage.begin(), RegSortStorage.end()); + for (auto it : RegSortStorage) { + unsigned Reg = it; + inputMap[Reg].insert(&Exp); + } + + RegSortStorage.assign(Exp.BottomRegs.begin(), Exp.BottomRegs.end()); + std::sort(RegSortStorage.begin(), RegSortStorage.end()); + for (auto it : RegSortStorage) { + unsigned Reg = it; + outputMap[Reg].insert(&Exp); + } + } + } + + MapVector sortMap; + for (auto it : inputMap) { + unsigned Reg = it.first; + auto outIt = outputMap.find(Reg); + if (outIt == outputMap.end()) + continue; + auto &inExps = it.second; + auto &outExps = outIt->second; + for (SubExp *inExp : inExps) { + for (SubExp *outExp : outExps) { + if (inExp->bHoist != outExp->bHoist) { + // Different direction. + // If output (def) move up, input(use) move down, nothing happens. + if (outExp->bHoist) + continue; + // Canot input(use) move up, output(def) move down. + // Choose the exp which save more. + int inExpGain = inExp->vOutputSize - inExp->vInputSize; + int outExpGain = outExp->vInputSize - inExp->vOutputSize; + if (inExpGain >= outExpGain) { + outExp->SUnits.clear(); + } else { + inExp->SUnits.clear(); + } + continue; + } + // Link outExp to inExp. 
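+        // The edge direction makes the expression that must be applied first
+        // the deeper node: the consumer when sinking, the producer when
+        // hoisting. E.g. if exp A consumes a register produced by exp B and
+        // both sink, B gets depth 0, A gets depth 1, and A is applied first.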
+ if (inExp->bHoist) { + sortMap[outExp].Preds.insert(inExp); + sortMap[inExp].Succs.insert(outExp); + } else { + sortMap[inExp].Preds.insert(outExp); + sortMap[outExp].Succs.insert(inExp); + } + } + } + } + + if (sortMap.empty()) + return; + + SmallVector WorkList; + for (SubExp &Exp : subExpCandidates) { + SortNode &Node = sortMap[&Exp]; + Node.Depth = 0; + Node.Exp = Exp; + Node.bDepthDirty = !Node.Preds.empty(); + if (!Node.bDepthDirty) + WorkList.emplace_back(&Exp); + } + // Calc depth. + while (!WorkList.empty()) { + SubExp *Exp = WorkList.pop_back_val(); + SortNode &Node = sortMap[Exp]; + for (SubExp *Succ : Node.Succs) { + SortNode &SuccNode = sortMap[Succ]; + SuccNode.Depth = std::max(SuccNode.Depth, Node.Depth + 1); + bool bAllPrevClean = true; + for (SubExp *Prev : SuccNode.Preds) { + SortNode &PrevNode = sortMap[Prev]; + if (PrevNode.bDepthDirty) { + bAllPrevClean = false; + break; + } + } + if (bAllPrevClean) { + SuccNode.bDepthDirty = false; + WorkList.push_back(Succ); + } + } + } + + std::vector nodes; + for (auto &it : sortMap) { + SortNode &node = it.second; + nodes.emplace_back(&node); + } + + struct sorter { + bool operator()(const SortNode *a, const SortNode *b) { + return a->Depth > b->Depth; + } + }; + + // subExp deeper should be apply first. + std::sort(nodes.begin(), nodes.end(), sorter()); + + subExpCandidates.clear(); + for (auto &node : nodes) { + subExpCandidates.emplace_back(node->Exp); + } +} + +// Compare pressure, return ture if maxV0/maxS0 pressure is higher than maxV1/maxS1. +bool pressureHigher(unsigned maxV0, unsigned maxS0, unsigned maxV1, + unsigned maxS1, const GCNSubtarget *ST) { + unsigned VTgtOcc0 = ST->getOccupancyWithNumVGPRs(maxV0); + unsigned VTgtOcc1 = ST->getOccupancyWithNumVGPRs(maxV1); + unsigned STgtOcc0 = ST->getOccupancyWithNumSGPRs(maxS0); + unsigned STgtOcc1 = ST->getOccupancyWithNumSGPRs(maxS1); + unsigned Occ0 = std::min(VTgtOcc0, STgtOcc0); + unsigned Occ1 = std::min(VTgtOcc1, STgtOcc1); + // big occupancy is low pressure. + if (Occ0 > Occ1) + return false; + if (Occ0 < Occ1) + return true; + // When sgpr bound, big sgpr is high pressure. + if (VTgtOcc0 > STgtOcc0 && VTgtOcc1 > STgtOcc1) { + return maxS0 > maxS1; + } + // When vgpr bound or mix, vgpr higher is higher pressure. + return maxV0 > maxV1; +} + +// Return true if the subExp can help pressure for passThrus. +bool canHelpPressureWhenSink(SubExp &subExp, const GCNRPTracker::LiveRegSet &passThrus, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, const MachineLoopInfo *MLI, + MachineDominatorTree *pDT, bool bCanClone,bool bSgprBound) { + LLVM_DEBUG(subExp.dump(MRI, SIRI)); + if (!subExp.isSafeToMove(MRI, /*bMoveUp*/ false)) + return false; + + // Update input size to ignore lives in which already in + // passThrus. + for (auto it : subExp.inputLive) { + unsigned Reg = it.first; + if (passThrus.count(Reg) == 0) + continue; + unsigned Size = getRegSize(Reg, it.second, MRI, SIRI); + if (SIRI->isVGPR(MRI, Reg)) { + subExp.vInputSize -= Size; + } else { + subExp.sInputSize -= Size; + } + } + + if (subExp.vInputSize > subExp.vOutputSize) + return false; + + if (subExp.sInputSize > subExp.sOutputSize && bSgprBound) + return false; + + if (subExp.sInputSize >= subExp.sOutputSize && + subExp.vInputSize == subExp.vOutputSize) + return false; + + // Try to find a Insert Block. + // Skip multi def output sub exp. + // Collect user blocks, find common dom. 
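+  // The sink target is the nearest common dominator of all out-of-block
+  // users, and it must itself be dominated by FromBB so the def still reaches
+  // every user.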
+ BlockSet userBlocks; + for (unsigned Reg : subExp.BottomRegs) { + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + MachineBasicBlock *UserBB = UseMI.getParent(); + // Skip current BB. + if (UserBB != subExp.FromBB) + userBlocks.insert(UserBB); + } + } + if (userBlocks.empty()) + return false; + MachineBasicBlock *userBlock = nearest_common_dominator(pDT, userBlocks); + if (!pDT->dominates(subExp.FromBB, userBlock)) { + return false; + } + if (userBlock == subExp.FromBB && + // When allow clone, could go clone path if cannot move subExp. + !bCanClone) + return false; + + subExp.ToBB = userBlock; + if (auto *toLoop = MLI->getLoopFor(userBlock)) { + auto *fromLoop = MLI->getLoopFor(subExp.FromBB); + if (!fromLoop || fromLoop->getLoopDepth() < toLoop->getLoopDepth()) + subExp.bMoveIntoLoop = true; + } else if (auto *fromLoop = MLI->getLoopFor(subExp.FromBB)) { + auto *toLoop = MLI->getLoopFor(userBlock); + // not safe to move out of loop. + if (!toLoop || fromLoop->getLoopDepth() > toLoop->getLoopDepth() || + toLoop != fromLoop) + return false; + } + return true; +} + +bool canHelpPressureWhenHoist(SubExp &subExp, const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, + const MachineLoopInfo *MLI, bool bSgprBound) { + if (!subExp.isSafeToMove(MRI, /*bMoveUp*/ true)) + return false; + if (subExp.vInputSize < subExp.vOutputSize) + return false; + if (subExp.sInputSize < subExp.sOutputSize && bSgprBound) + return false; + + if (subExp.sInputSize <= subExp.sOutputSize && + subExp.vInputSize == subExp.vOutputSize) + return false; + + // Try to find a Insert Block. + // Skip multi def output sub exp. + // Collect user blocks, find common dom. + BlockSet defBlocks; + for (unsigned Reg : subExp.TopRegs) { + MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); + if (!DefMI) + continue; + defBlocks.insert(DefMI->getParent()); + } + if (defBlocks.size() != 1) + return false; + MachineBasicBlock *defBlock = *defBlocks.begin(); + subExp.ToBB = defBlock; + // Not do same block hoist. + if (subExp.ToBB == subExp.FromBB) + return false; + + if (auto *toLoop = MLI->getLoopFor(defBlock)) { + auto *fromLoop = MLI->getLoopFor(subExp.FromBB); + // TODO: enable move into loop when hoist. + if (!fromLoop || fromLoop->getLoopDepth() < toLoop->getLoopDepth()) + return false; + } else if (auto *fromLoop = MLI->getLoopFor(subExp.FromBB)) { + auto *toLoop = MLI->getLoopFor(defBlock); + // not safe to move out of loop. + if (!toLoop || fromLoop->getLoopDepth() > toLoop->getLoopDepth() || + toLoop != fromLoop) + return false; + } + return true; +} + +SmallVector> +groupPassThruByDefBlock(Remat *Remat, + const GCNRPTracker::LiveRegSet &passThrus, + GCNRPTracker::LiveRegSet &usedPassThrus, + MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII) { + MapVector Candidates; + + // Group safe candidates by define block. + for (auto it : passThrus) { + unsigned Reg = it.first; + // Skip used pass thru reg to avoid count it twice for different hot block. + if (usedPassThrus.count(Reg)) + continue; + LLVM_DEBUG(print_vreg(Reg, MRI)); + LLVM_DEBUG(if (SIRI->isSGPRReg(MRI, Reg)) dbgs() << " sgpr "; + else dbgs() << " vgpr ";); + if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*bSink*/ true)) { + LLVM_DEBUG(dbgs() << " is not safe\n"); + continue; + } + LLVM_DEBUG(dbgs() << " is safe\n"); + // DefMI is already checked in isSafeCandidate. 
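+    // That check guarantees getUniqueVRegDef does not return null here.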
+ MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); + + GCNRPTracker::LiveRegSet &DefInMBB = Candidates[DefMI->getParent()]; + DefInMBB[Reg] = it.second; + } + + llvm::SmallVector> result = Candidates.takeVector(); + + LLVM_DEBUG(llvm::dbgs() << "Before sort candidates\n"; for (auto it + : result) { + MachineBasicBlock *MBB = it.first; + auto &defInMBB = it.second; + MBB->dump(); + llvm::dumpLiveSet(defInMBB, SIRI); + } llvm::dbgs() << "end of candidates\n";); + + std::sort(result.begin(), result.end(), + [](std::pair &it0, + std::pair &it1) { + return it0.first->getNumber() < it1.first->getNumber(); + }); + + LLVM_DEBUG(llvm::dbgs() << "After sort candidates\n"; for (auto it + : result) { + MachineBasicBlock *MBB = it.first; + auto &defInMBB = it.second; + MBB->dump(); + llvm::dumpLiveSet(defInMBB, SIRI); + } llvm::dbgs() << "end of candidates\n";); + + return result; +} + +// collect pass thru regs of MBB. +GCNRPTracker::LiveRegSet +collectPassThrus(MachineBasicBlock *MBB, + const GCNRPTracker::LiveRegSet &inputLive, + const GCNRPTracker::LiveRegSet &outputLive, + const GCNRPTracker::LiveRegSet &usedPassThrus, + const GCNRPTracker::LiveRegSet &liveRegCandidates, + MachineRegisterInfo &MRI, bool bCanClone) { + GCNRPTracker::LiveRegSet passThrus; + llvm::mergeLiveRegSet(passThrus, inputLive); + llvm::andLiveRegSet(passThrus, outputLive); + + // Remove reg which not in liveRegCandidates. + GCNRPTracker::LiveRegSet tmpPassThrus = passThrus; + for (auto it : tmpPassThrus) { + unsigned Reg = it.first; + if (!liveRegCandidates.count(Reg)) { + passThrus.erase(Reg); + } + } + tmpPassThrus = passThrus; + // Remove reg which has read/write in MBB. + for (auto it : tmpPassThrus) { + unsigned Reg = it.first; + DenseSet DefMBBs; + for (MachineInstr &DefMI : MRI.def_instructions(Reg)) { + MachineBasicBlock *MBB = DefMI.getParent(); + DefMBBs.insert(MBB); + } + DenseSet UseMBBs; + // Allow use for pass thru if clone is OK. + if (!bCanClone) { + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + MachineBasicBlock *UserMBB = UseMI.getParent(); + UseMBBs.insert(UserMBB); + } + } + bool bW = DefMBBs.count(MBB) > 0; + bool bR = UseMBBs.count(MBB) > 0; + + bool bPassThru = !bW && !bR; + if (!bPassThru) + passThrus.erase(Reg); + } + return passThrus; +} +// Try to build a free subExp which all input is passThrus. +SubExp buildFreeSubExp(Remat *Remat, SubExp &subExp, GCNRPTracker::LiveRegSet &passThrus, + MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) { + SubExp freeExp; + // Try to split the subExp to find a help case. + // Scan all inst in subExp, propagate free inst which input is from + // passThrus. + SmallDenseSet freeRegs; + SmallDenseSet freeInstUseRegs; + SmallVector freeInsts; + for (MachineInstr *MI : subExp.SUnits) { + bool bIsFree = true; + // Check all use regs are free. + for (MachineOperand &MO : MI->uses()) { + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (MO.isImplicit() && Reg == AMDGPU::EXEC) + continue; + if (MRI.getUniqueVRegDef(Reg) == nullptr) { + bIsFree = false; + break; + } + // Skip local pass thrus unless it is free. + if (passThrus.count(Reg) && subExp.TopRegs.count(Reg)) + continue; + if (freeRegs.count(Reg)) + continue; + bIsFree = false; + break; + } + // Check def is unique. + for (MachineOperand &MO : MI->defs()) { + unsigned Reg = MO.getReg(); + if (MRI.getUniqueVRegDef(Reg) == nullptr) { + bIsFree = false; + break; + } + } + if (!bIsFree) + continue; + // Save inst as free inst. + freeInsts.emplace_back(MI); + // Save def as free reg. 
+ for (MachineOperand &MO : MI->defs()) { + unsigned Reg = MO.getReg(); + freeRegs.insert(Reg); + } + // Save use regs as free use reg. + for (MachineOperand &MO : MI->uses()) { + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + + freeInstUseRegs.insert(Reg); + } + } + // Then remove local inst has no output use. + for (MachineInstr *MI : freeInsts) { + bool bIsFreeUsed = false; + for (MachineOperand &MO : MI->defs()) { + unsigned Reg = MO.getReg(); + // Used as freeInst or output. + bIsFreeUsed |= + freeInstUseRegs.count(Reg) > 0 || subExp.BottomRegs.count(Reg); + } + if (!bIsFreeUsed) + continue; + freeExp.SUnits.emplace_back(MI); + } + if (freeExp.SUnits.empty()) { + // mark has terminator to make it unsafe. + freeExp.bHasTerminatorInst = true; + return freeExp; + } + // Build BottomRegs and TopRegs for freeExp. + // BottomRegs is freeRegs in subExp.BottomRegs. + for (unsigned freeReg : freeRegs) { + if (subExp.BottomRegs.count(freeReg)) + freeExp.BottomRegs.insert(freeReg); + } + // TopRegs is freeInstUseRegs in subExp.TopRegs. + for (unsigned freeInstUseReg : freeInstUseRegs) { + if (subExp.TopRegs.count(freeInstUseReg)) + freeExp.TopRegs.insert(freeInstUseReg); + } + freeExp.FromBB = subExp.FromBB; + freeExp.ToBB = subExp.ToBB; + // must be clone since is partial of subExp. + freeExp.bCloneOnly = true; + + // Calc reg for freeExp. + for (unsigned Reg : freeExp.TopRegs) { + freeExp.inputLive[Reg]; + } + + for (unsigned Reg : freeExp.BottomRegs) { + freeExp.outputLive[Reg]; + } + + CollectLiveSetPressure(freeExp.inputLive, MRI, SIRI, freeExp.vInputSize, + freeExp.sInputSize); + CollectLiveSetPressure(freeExp.outputLive, MRI, SIRI, freeExp.vOutputSize, + freeExp.sOutputSize); + return freeExp; +} + +std::vector buildSubExpCandidates( + Remat *Remat, + SmallVector> + &Candidates, + GCNRPTracker::LiveRegSet &passThrus, MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, + const MachineLoopInfo *MLI, SlotIndexes *slotIndexes, + MachineDominatorTree *pDT, bool bCanClone, bool bSgprBound, + GCNRPTracker::LiveRegSet &unUsedPassThrus, + DenseSet &MemWriteMBBSet, + bool bAllowPartialUseInSubExp) { + std::vector subExpCandidates; + // Build exp dag on define blocks. + // Save profit candidates into list. + for (auto &it : Candidates) { + MachineBasicBlock *DefMBB = it.first; + // Try to remove out reg def sub exp from DefMBB. + GCNRPTracker::LiveRegSet &DefInMBB = it.second; + // Go up on the dag until reach share node. + auto subExps = + buildSubExpFromCandidates(Remat, DefInMBB, DefMBB, SIRI, SIII, MRI, + slotIndexes, unUsedPassThrus, bAllowPartialUseInSubExp); + for (SubExp &subExp : subExps) { + if (subExp.bHasMemInst) { + // Skip when memory ld/st inst need to cross MBB which write memory. + // TODO: check all MBBs in between FromBB and ToBB not write memory. + // Currently just skip when any memory write exist. 
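+        // The dominance checks below only allow the move when every writing
+        // block either dominates FromBB or is dominated by ToBB, i.e. no
+        // writer can sit between the old and the new position of the defs.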
+ if (!MemWriteMBBSet.empty()) { + MachineBasicBlock *FromBB = subExp.FromBB; + MachineBasicBlock *ToBB = subExp.ToBB; + if (subExp.bHoist) { + FromBB = subExp.ToBB; + ToBB = subExp.FromBB; + } + bool bCrossMemWriteMBB = false; + for (MachineBasicBlock *MemMBB : MemWriteMBBSet) { + if (pDT->dominates(ToBB, MemMBB)) + continue; + if (pDT->dominates(MemMBB, FromBB)) + continue; + bCrossMemWriteMBB = true; + break; + } + if (bCrossMemWriteMBB) + continue; + } + } + if (!canHelpPressureWhenSink(subExp, passThrus, MRI, SIRI, SIII, MLI, pDT, + bCanClone, bSgprBound)) { + if (bAllowPartialUseInSubExp && subExp.isSafeToMove(MRI, /*bMoveUp*/ false)) { + SubExp freeSubExp = buildFreeSubExp(Remat, subExp, passThrus, MRI, SIRI); + if (canHelpPressureWhenSink(freeSubExp, passThrus, MRI, SIRI, SIII, MLI, pDT, + bCanClone, bSgprBound)) { + subExpCandidates.emplace_back(freeSubExp); + } + } + continue; + } + + subExpCandidates.emplace_back(subExp); + } + } + return subExpCandidates; +} + +std::pair +calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates, + GCNRPTracker::LiveRegSet &inputLive, + GCNRPTracker::LiveRegSet &outputLive, bool bVOutBound, + bool bSOutBound, bool bCanClone, MachineDominatorTree *pDT, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) { + int vgpr = 0; + int sgpr = 0; + MachineBasicBlock *MBB = hotBB.MBB; + // Sink saving. + for (SubExp &Exp : subExpCandidates) { + if (Exp.bHoist) { + // ToMBB -> MBB -> FromMBB. + // If ToMBB not dom hot block, reg will not live in MBB. + if (!pDT->dominates(Exp.ToBB, MBB)) + continue; + } else { + // If FromBB not dom hot block, reg will not live in MBB. + if (!pDT->dominates(Exp.FromBB, MBB)) + continue; + // When subExp is from hotBB, check output instead of input. + if (Exp.FromBB == MBB) { + if (bVOutBound && Exp.vOutputSize < Exp.vInputSize) + continue; + if (bSOutBound && Exp.sOutputSize < Exp.sInputSize) + continue; + vgpr += Exp.vInputSize; + vgpr -= Exp.vOutputSize; + sgpr += Exp.sInputSize; + sgpr -= Exp.sOutputSize; + continue; + } + } + int vgprDiff = 0; + int sgprDiff = 0; + MachineBasicBlock *ToMBB = Exp.ToBB; + // If subExp is to hotBB, it is crossing output instead of input. + GCNRPTracker::LiveRegSet &crossLive = MBB == ToMBB ? outputLive : inputLive; + + bool bClone = false; + GCNRPTracker::LiveRegSet newInput; + if (!Exp.bMoveIntoLoop) { + if (Exp.bHoist) { + // If FromBB dom hot block, it will not change live for MBB. + if (Exp.FromBB != MBB && pDT->dominates(Exp.FromBB, MBB)) + continue; + } else { + // If ToBB dom hot block, it will not change live for MBB. + if (ToMBB != MBB && pDT->dominates(ToMBB, MBB)) { + if (bCanClone && !Exp.bNotSafeToCopy) { + bClone = true; + } else { + continue; + } + } + } + + for (auto outIt : Exp.outputLive) { + unsigned Reg = outIt.first; + LaneBitmask outMask = outIt.second; + LaneBitmask MBBBeginMask; + if (crossLive.find(Reg) != crossLive.end()) + MBBBeginMask = crossLive[Reg]; + // Check mask which live in both BeginSlot and exp output when sink to + // kill the output. Check mask which not live in BeginSlot but live in + // exp output when hoist to live the output. + LaneBitmask profitMask = + Exp.bHoist ? (outMask & (~MBBBeginMask)) : (outMask & MBBBeginMask); + if (MBBBeginMask.any()) { + unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI); + LLVM_DEBUG(std::string movStr = + Exp.bHoist ? "output hoist:" : "output sink:"; + dbgs() << movStr << Register::virtReg2Index(Reg) + << " " << Size); + // Exp out live at block input. 
+ // It will descrease live for MBB when sink and increase when hoist. + if (SIRI->isVGPR(MRI, Reg)) { + LLVM_DEBUG(dbgs() << "v\n"); + if (Exp.bHoist) + vgprDiff += Size; + else + vgprDiff -= Size; + } else { + LLVM_DEBUG(dbgs() << "s\n"); + if (Exp.bHoist) + sgprDiff += Size; + else + sgprDiff -= Size; + } + } + } + + for (auto inIt : Exp.inputLive) { + unsigned Reg = inIt.first; + LaneBitmask inMask = inIt.second; + LaneBitmask MBBBeginMask; + if (crossLive.find(Reg) != crossLive.end()) + MBBBeginMask = crossLive[Reg]; + // Check mask which not live in BeginSlot but live in exp input when + // sink to live the input. Check mask which live in both BeginSlot and + // exp output when hoist to kill the input. + LaneBitmask profitMask = + Exp.bHoist ? (inMask & MBBBeginMask) : (inMask & (~MBBBeginMask)); + if (profitMask.any()) { + // Update input live to avoid count same input more than once. + newInput[Reg] |= inMask; + // Exp in not live at block input. + // It will increase live for MBB. + unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI); + + LLVM_DEBUG(std::string movStr = + Exp.bHoist ? "input hoist:" : "input sink:"; + dbgs() << movStr << Register::virtReg2Index(Reg) + << " " << Size); + if (SIRI->isVGPR(MRI, Reg)) { + LLVM_DEBUG(dbgs() << "v\n"); + if (Exp.bHoist) + vgprDiff -= Size; + else + vgprDiff += Size; + } else { + LLVM_DEBUG(dbgs() << "s\n"); + if (Exp.bHoist) + sgprDiff -= Size; + else + sgprDiff += Size; + } + } + } + } else { + // When sink into loop, the input will live for every block inside loop. + // The output will only lived between to blocks and the use blocks. + // If MBB dominate any user of output live reg, it will still live in + // MBB. So cannot count that output live reg as profit. + // Hoist into loop is not supported now. + for (auto outIt : Exp.outputLive) { + unsigned Reg = outIt.first; + bool bDomUser = false; + for (MachineInstr &MI : MRI.use_nodbg_instructions(Reg)) { + MachineBasicBlock *UserMBB = MI.getParent(); + if (pDT->dominates(MBB, UserMBB)) { + bDomUser = true; + break; + } + } + if (bDomUser) + continue; + + LaneBitmask outMask = outIt.second; + LaneBitmask MBBBeginMask; + if (inputLive.find(Reg) != inputLive.end()) + MBBBeginMask = inputLive[Reg]; + LaneBitmask profitMask = outMask & MBBBeginMask; + if (MBBBeginMask.any()) { + unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI); + LLVM_DEBUG(dbgs() << "move:" << Register::virtReg2Index(Reg) + << " " << Size); + // Exp out live at block input. + // It will descrease live for MBB. + if (SIRI->isVGPR(MRI, Reg)) { + LLVM_DEBUG(dbgs() << "v\n"); + vgprDiff -= Size; + } else { + LLVM_DEBUG(dbgs() << "s\n"); + sgprDiff -= Size; + } + } + } + + for (auto inIt : Exp.inputLive) { + unsigned Reg = inIt.first; + LaneBitmask inMask = inIt.second; + LaneBitmask MBBBeginMask; + if (inputLive.find(Reg) != inputLive.end()) + MBBBeginMask = inputLive[Reg]; + // Check mask which not live in BeginSlot but live in exp input. + LaneBitmask profitMask = inMask & (~MBBBeginMask); + if (profitMask.any()) { + // Update input live to avoid count same input more than once. + newInput[Reg] |= inMask; + // Exp in not live at block input. + // It will increase live for MBB. 
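+ // getRegSize converts the profiting lane mask into 32-bit register units,
+ // so the saving is counted per lane rather than per virtual register.
+ // E.g. (made-up numbers) a 128-bit VGPR tuple has 4 lanes; if only one of
+ // them is newly live here, profitMask has a single lane set and Size is 1,
+ // not 4.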
+ unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI); + + LLVM_DEBUG(dbgs() << "add:" << Register::virtReg2Index(Reg) + << " " << Size); + if (SIRI->isVGPR(MRI, Reg)) { + LLVM_DEBUG(dbgs() << "v\n"); + vgprDiff += Size; + } else { + LLVM_DEBUG(dbgs() << "s\n"); + sgprDiff += Size; + } + } + } + } + + if (bVOutBound && vgprDiff > 0) + continue; + + if (bSOutBound && sgprDiff > 0) + continue; + llvm::mergeLiveRegSet(crossLive, newInput); + vgpr += vgprDiff; + sgpr += sgprDiff; + if (bClone) + Exp.bCloneOnly = true; + } + + return std::make_pair(vgpr, sgpr); +} + +void addExpCandidates(std::vector &subExpCandidates, + std::vector &subExps, + GCNRPTracker::LiveRegSet &usedRegs) { + subExpCandidates.insert(subExpCandidates.end(), subExps.begin(), + subExps.end()); + for (auto &Exp : subExps) { + if (Exp.bHoist) { + for (auto &Reg : Exp.TopRegs) { + usedRegs[Reg]; + } + } else { + for (auto &Reg : Exp.BottomRegs) { + usedRegs[Reg]; + } + } + } +} + +bool tryToAddSubExps( + Remat *Remat, + HotBlock &hotBB, RematStatus &status, std::vector &subExpCandidates, + std::vector &inBlockCloneSubExps, + DenseMap &inBlockHotVInstMap, + DenseMap &inBlockHotSInstMap, + SmallVector> + Candidates, + int vgpr, int sgpr, const GCNRPTracker::LiveRegSet &savingInputLive, + const GCNRPTracker::LiveRegSet &savingOutputLive, + GCNRPTracker::LiveRegSet &passThrus, GCNRPTracker::LiveRegSet &usedRegs, + MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, const MachineLoopInfo *MLI, + SlotIndexes *slotIndexes, LiveIntervals *LIS, MachineDominatorTree *pDT, + bool bCanClone, bool bVOutBound, bool bSOutBound, + GCNRPTracker::LiveRegSet &unUsedPassThrus, bool bAllowPartialUseInSubExp) { + std::vector partialSubExps = buildSubExpCandidates(Remat, + Candidates, passThrus, MRI, SIRI, SIII, MLI, slotIndexes, pDT, bCanClone, + bSOutBound, unUsedPassThrus, status.MemWriteMBBSet, + bAllowPartialUseInSubExp); + + GCNRPTracker::LiveRegSet tmpSavingInputLive = savingInputLive; + GCNRPTracker::LiveRegSet tmpSavingOutputLive = savingOutputLive; + std::pair curSaving = calculateSaving( + hotBB, partialSubExps, tmpSavingInputLive, tmpSavingOutputLive, + bVOutBound, bSOutBound, bCanClone, pDT, MRI, SIRI); + const int VLimit = status.TargetVLimit; + const int SLimit = status.TargetSLimit; + + vgpr += curSaving.first; + sgpr += curSaving.second; + + if (vgpr <= VLimit && sgpr <= SLimit) { + // nrmSubExps can help reach target occupancy, add it to + // subExpCandidates. + addExpCandidates(subExpCandidates, partialSubExps, usedRegs); + return true; + } + + if (EnableSubExpAggressive) { + // Build candidates from passThrus but not used in partialSubExps. + GCNRPTracker::LiveRegSet sinkUsedRegs; + for (auto &Exp : partialSubExps) { + for (auto &Reg : Exp.BottomRegs) { + sinkUsedRegs[Reg]; + } + } + MapVector HoistCandidates; + for (auto &it : hotBB.inputLive) { + unsigned Reg = it.first; + // Skip reg which already used for sink exp. + if (sinkUsedRegs.count(Reg)) + continue; + if (usedRegs.count(Reg)) + continue; + // Skip unsafe reg. + if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*bSink*/ false)) { + LLVM_DEBUG(dbgs() << " is not safe to hoist\n"); + continue; + } + // DefMI is already checked in isSafeCandidate. + MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); + MachineBasicBlock *DefMBB = DefMI->getParent(); + DenseSet UseMBBSet; + // Make sure all uses not in Def block are in same block. 
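+ // A hoist candidate needs a unique block to pull the computation toward:
+ // below, regs whose uses outside the def block are spread over more than
+ // one block are rejected (UseMBBSet.size() != 1), and the remaining ones
+ // are grouped by that single use block so a top-down subExp can be built
+ // there.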
+ for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + MachineBasicBlock *UseMBB = UseMI.getParent(); + if (UseMBB == DefMBB) + continue; + UseMBBSet.insert(UseMBB); + } + + if (UseMBBSet.size() != 1) + continue; + MachineBasicBlock *UseMBB = *UseMBBSet.begin(); + GCNRPTracker::LiveRegSet &UseInMBB = HoistCandidates[UseMBB]; + UseInMBB[Reg] = getRegMask(DefMI->getOperand(0), MRI); + } + + SlotIndexes *slotIndexes = LIS->getSlotIndexes(); + // Build exp dag on define blocks. + std::vector hoistSubExpCandidates; + // Save profit candidates into list. + for (auto it : HoistCandidates) { + MachineBasicBlock *UseMBB = it.first; + // Try to remove out reg def sub exp from DefMBB. + GCNRPTracker::LiveRegSet &UseInMBB = it.second; + // Go up on the dag until reach share node. + auto subExps = buildSubExpFromCandidatesTopBottom(Remat, UseInMBB, UseMBB, SIRI, + SIII, MRI, slotIndexes); + for (SubExp &subExp : subExps) { + if (!canHelpPressureWhenHoist(subExp, MRI, SIRI, SIII, MLI, bSOutBound)) + continue; + subExp.bHoist = true; + hoistSubExpCandidates.emplace_back(subExp); + } + } + + std::pair hoistSaving = calculateSaving( + hotBB, hoistSubExpCandidates, tmpSavingInputLive, tmpSavingOutputLive, + bVOutBound, bSOutBound, bCanClone, pDT, MRI, SIRI); + + int hoistVgpr = vgpr + hoistSaving.first; + int hoistSgpr = sgpr + hoistSaving.second; + + if ((hoistVgpr <= VLimit && hoistSgpr <= SLimit) || + // If status not balance, do the remat even cannot reach target. + // TODO: check the result not help even one occupancy. + (!hoistSubExpCandidates.empty() && !status.bNotBalance && + TargetOccupancy != 0)) { + // nrmSubExps can help reach target occupancy, add it to + // subExpCandidates. + addExpCandidates(subExpCandidates, partialSubExps, usedRegs); + addExpCandidates(subExpCandidates, hoistSubExpCandidates, usedRegs); + + return true; + } + } + + if (EnableVmemDegree && + // Only expect vmem when last tryToAddSubExps. + // If not, bAllowPartialUseInSubExp will no chance to be true. + (bAllowPartialUseInSubExp || + !EnableSubExpAggressive)) { + // Assume vmemLdSize could be optimized by not parallel. + if (((vgpr - hotBB.vmemLdInputSize) <= VLimit || + (vgpr - hotBB.vmemLdOutputSize) <= VLimit) && + sgpr <= SLimit) { + // nrmSubExps can help reach target occupancy, add it to + // subExpCandidates. + addExpCandidates(subExpCandidates, partialSubExps, usedRegs); + return true; + } + } + + int vDistance = vgpr - (int)VLimit; + int sDistance = status.TargetOcc > 4 ? (sgpr - (int)SLimit) : 0; + int vSaved = hotBB.maxPressures.first - vgpr; + int sSaved = hotBB.maxPressures.second - sgpr; + // Try to add inBlockCloneSubExps. + if (!tryRematInHotSpot(*hotBB.MBB, status, vDistance, sDistance, vSaved, + sSaved, inBlockCloneSubExps, inBlockHotVInstMap, + inBlockHotSInstMap, LIS, MRI, SIRI, SIII)) { + // return false always when not allow partialUseInSubExp, it will try again + // with partialUseInSubExp enabled. + if (!bAllowPartialUseInSubExp) + return false; + // If status not balance, do the remat even cannot reach target. + // TODO: check the result not help even one occupancy. + if (!status.bNotBalance && TargetOccupancy == 0) + return false; + } + // nrmSubExps can help reach target occupancy, add it to + // subExpCandidates. + addExpCandidates(subExpCandidates, partialSubExps, usedRegs); + return true; +} + +// Remat passthru regs per hot block. +// Reason to do it per block is to make sure passthru reuse is precise. 
+// If try remat on all hot blocks together, the passthru might be on one block, +// but the reuse in on another block which the reg is not passthru there. +bool perBlockPassthruRemat(Remat *Remat, + std::vector &hotBlocks, + RematStatus &status, + GCNRPTracker::LiveRegSet &liveRegCandidates, + const GCNSubtarget *ST, LiveIntervals *LIS, + const MachineLoopInfo *MLI, + MachineDominatorTree *pDT, MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII) { + bool bUpdated = false; + bool bCanClone = EnableSubExpClone | + EnableSubExpAggressive; + + SlotIndexes *slotIndexes = LIS->getSlotIndexes(); + // Sort hot blocks by pressure first. + // The hot block with higher pressure is easier to fail. + // If fail, fail fast. It it works, save the subExpCandidates. The + // subExpCandidates may help other hotblocks. + std::sort(hotBlocks.begin(), hotBlocks.end(), + [&ST](const HotBlock &a, const HotBlock &b) { + return pressureHigher(a.maxPressures.first, a.maxPressures.second, + b.maxPressures.first, b.maxPressures.second, + ST); + }); + + std::vector subExpCandidates; + // For inBlock remat clone. + std::vector inBlockCloneSubExps; + DenseMap inBlockHotVInstMap; + DenseMap inBlockHotSInstMap; + + // Save used passThrus to avoid use same reg on different MBB. + GCNRPTracker::LiveRegSet usedPassThrus; + // Save moved regs to avoid use same reg hoist and sink. + GCNRPTracker::LiveRegSet usedRegs; + + const int VLimit = status.TargetVLimit; + const int SLimit = status.TargetSLimit; + // Collect passthru for hot block. + // Try remat on it. + for (auto &it : hotBlocks) { + MachineBasicBlock *MBB = it.MBB; + + const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[MBB]; + const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[MBB]; + + it.inputLive = inputLive; + + // Add pressure by 1 to consider spill to vgpr. + const int PressureDelta = -1; + int vgpr = it.maxPressures.first - PressureDelta; + int sgpr = it.maxPressures.second; + bool bVOutBound = vgpr > VLimit; + bool bSOutBound = sgpr > SLimit; + // savingInputLive is used to calculate saving which will be modified to + // avoid count same input multiple times. + GCNRPTracker::LiveRegSet savingInputLive = inputLive; + GCNRPTracker::LiveRegSet savingOutputLive = outputLive; + std::pair curSaving = + calculateSaving(it, subExpCandidates, savingInputLive, savingOutputLive, + bVOutBound, bSOutBound, bCanClone, pDT, MRI, SIRI); + + vgpr += curSaving.first; + sgpr += curSaving.second; + + if (vgpr <= VLimit && sgpr <= SLimit) + continue; + + // Collect pass thru regs. + GCNRPTracker::LiveRegSet passThrus = + collectPassThrus(MBB, inputLive, outputLive, usedPassThrus, + liveRegCandidates, MRI, bCanClone); + + // Group pass thru regs by def MBB. + SmallVector> + Candidates = + groupPassThruByDefBlock(Remat, passThrus, usedPassThrus, MRI, SIRI, SIII); + // unUsedPassThrus used to collect passThru which is skipped when build + // subExp. + GCNRPTracker::LiveRegSet unusedPassThrus; + // Build exp dag on define blocks. + bool bAllowPartialUseInSubExp = false; + if (tryToAddSubExps(Remat, it, status, subExpCandidates, inBlockCloneSubExps, + inBlockHotVInstMap, inBlockHotSInstMap, Candidates, + vgpr, sgpr, savingInputLive, savingOutputLive, + passThrus, usedRegs, MRI, SIRI, SIII, MLI, slotIndexes, + LIS, pDT, bCanClone, bVOutBound, bSOutBound, + unusedPassThrus, bAllowPartialUseInSubExp)) { + // Remove unusedPassThrus from passThrus first. 
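+ // Book-keeping for cross-block reuse: passThru regs that the subExp build
+ // skipped (unusedPassThrus) are subtracted so later hot blocks may still
+ // consider them, while everything actually taken is merged into
+ // usedPassThrus, which is fed back into the candidate collection so the
+ // same reg is not reused for a different MBB.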
+ llvm::andNotLiveRegSet(passThrus, unusedPassThrus);
+ llvm::mergeLiveRegSet(usedPassThrus, passThrus);
+ continue;
+ }
+ // If cloning is not possible, there is no point trying partialUseInSubExp,
+ // which requires cloning.
+ if (!bCanClone)
+ return false;
+
+ // A partial-use subExp may result in a large ALU count because of cloning.
+ // Only try it when aggressive remat is enabled.
+ if (!EnableSubExpAggressive)
+ return false;
+
+ bAllowPartialUseInSubExp = true;
+ if (!tryToAddSubExps(Remat, it, status, subExpCandidates, inBlockCloneSubExps,
+ inBlockHotVInstMap, inBlockHotSInstMap, Candidates,
+ vgpr, sgpr, savingInputLive, savingOutputLive,
+ passThrus, usedRegs, MRI, SIRI, SIII, MLI, slotIndexes,
+ LIS, pDT, bCanClone, bVOutBound, bSOutBound,
+ unusedPassThrus, bAllowPartialUseInSubExp)) {
+ return false;
+ }
+ // Merge all passThrus once tryToAddSubExps is allowed to use
+ // partialUseInSubExp.
+ llvm::mergeLiveRegSet(usedPassThrus, passThrus);
+ }
+
+ // Apply changes.
+ {
+ // Sort subExpCandidates to make sure input uses are applied before output
+ // uses when a reg is both an input and an output of subExps.
+ LLVM_DEBUG(for (SubExp &Exp : subExpCandidates) { Exp.dump(MRI, SIRI); });
+ sortSubExpCandidates(subExpCandidates);
+
+ for (SubExp &Exp : subExpCandidates) {
+ // Skip exps which were cleared during sorting because of a hoist/sink
+ // conflict.
+ if (Exp.SUnits.empty())
+ continue;
+ LLVM_DEBUG(Exp.dump(MRI, SIRI));
+ if (Exp.bHoist) {
+ ApplySubExpMoveNearDefine(Exp, MRI, pDT, slotIndexes, SIII, SIRI);
+ } else {
+ if (Exp.bCloneOnly)
+ ApplySubExpCloneNearUser(Exp, hotBlocks, pDT, MRI, slotIndexes, SIII,
+ SIRI);
+ else
+ ApplySubExpMoveNearUser(Exp, MRI, pDT, slotIndexes, SIII, SIRI);
+ }
+ }
+
+ for (SubExp &Exp : inBlockCloneSubExps) {
+ ApplySubExpCloneNearUserInBlock(Exp, inBlockHotVInstMap,
+ inBlockHotSInstMap, MRI, slotIndexes,
+ SIII, SIRI);
+ }
+ // Try to see what occupancy could be reached, then decide a target.
+ // Apply remat.
+ bUpdated = !subExpCandidates.empty();
+ }
+
+ return bUpdated;
+}
+
+int getVMemLdSize(MachineBasicBlock &MBB, const SIInstrInfo *SIII,
+ const SIRegisterInfo *SIRI, const MachineRegisterInfo &MRI) {
+ int vmemLdSize = 0;
+ // Collect VMEM load size when split is enabled.
+ for (MachineInstr &MI : MBB) {
+ bool bIsHighLatency = SIII->isHighLatencyInstruction(MI);
+ if (!bIsHighLatency)
+ continue;
+ if (!(MI.mayLoad() &&
+ // Skip cases like atomics which do not return a value.
+ MI.getNumDefs() > 0))
+ continue;
+ // A VMEM load.
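+ // The destination operand's lane mask gives the load's VGPR footprint in
+ // 32-bit units, and the sizes are summed over the whole block. E.g. a
+ // BUFFER_LOAD_DWORDX4 with a full mask contributes 4 to vmemLdSize.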
+ MachineOperand &Dst = MI.getOperand(0); + LaneBitmask mask = llvm::getRegMask(Dst, MRI); + unsigned size = llvm::getRegSize(Dst.getReg(), mask, MRI, SIRI); + vmemLdSize += size; + } + return vmemLdSize; +} + +} // namespace + +bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS, + MachineDominatorTree *pDT, MachinePostDominatorTree *pPDT, + AliasAnalysis *AA) +{ + if (MF.size() < 2) + return false; + const GCNSubtarget *ST = &MF.getSubtarget(); + + const SIInstrInfo *SIII = ST->getInstrInfo(); + const SIRegisterInfo *SIRI = ST->getRegisterInfo(); + + auto &MRI = MF.getRegInfo(); + + RematStatus status = GetRematStatus(MF, MLI, LIS, MRI, ST); + + const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second; + if (status.TargetOcc >= MaxOcc) + return false; + + unsigned VLimit = status.TargetVLimit; + unsigned SLimit = status.TargetSLimit; + + int rematVCnt = status.MaxVPressure - VLimit; + int rematSCnt = status.MaxSPressure - SLimit; + + bool bSGPRSpill = false; + if (rematSCnt > 0) { + bSGPRSpill = nearSgprSpill(status.MaxSPressure, ST, MF); + } + + // If bound by lds, skip. + if ((status.TargetOcc + 1) > ST->getOccupancyWithLocalMemSize(MF) && + !bSGPRSpill) + return false; + + bool bBothOutLimit = rematVCnt > 0 && rematSCnt > 0; + // TODO: use check wqm and support vreg remat. + bool bCheckWQM = MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS; + rematVCnt = bCheckWQM & false; + + // Remat on every hot block. + + // Collect all hot blocks. + std::vector hotBlocks; + for (MachineBasicBlock &MBB : MF) { + // Collect reg pressure. + auto &RP = status.MBBPressureMap[&MBB]; + unsigned maxLocalVPressure = RP.getVGPRNum(ST->hasGFX90AInsts()); + unsigned maxLocalSPressure = RP.getMaxSGPR(); + + maxLocalSPressure += RegForVCC; + + if (!EnableInBlockRemat) { + if (maxLocalVPressure <= VLimit && maxLocalSPressure <= SLimit) + continue; + } + + // Move inst which input is imm/pass thru reg/out reg to help pressure. + if (tryHoldPacifist(MBB, LIS, MRI, SIRI, SIII, AA, status)) { + maxLocalVPressure = 0; + maxLocalSPressure = 0; + CollectMBBPressure(MBB, LIS, MRI, ST, maxLocalVPressure, + maxLocalSPressure, status); + + maxLocalSPressure += RegForVCC; + + } + if (maxLocalVPressure <= VLimit && maxLocalSPressure <= SLimit) + continue; + + // When both vgpr sgpr out limit, only help vgpr. + if (bBothOutLimit && maxLocalVPressure <= VLimit) + continue; + GCNRPTracker::LiveRegSet liveSet; + hotBlocks.push_back({ &MBB, liveSet,std::make_pair(maxLocalVPressure, maxLocalSPressure), 0, 0 }); + } + // Collect vmemLdInput/OutputSize. + if (EnableVmemDegree) { + DenseMap outputVMemLdSizeMap; + for (auto it : hotBlocks) { + MachineBasicBlock *MBB = it.MBB; + // Collect vmemLd when enable split. 
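+ // A sketch of the model used here: vmemLdOutputSize is the footprint of
+ // high-latency loads issued in the block itself, while vmemLdInputSize is
+ // inherited from the block's single predecessor, whose loads are presumably
+ // still in flight on entry. tryToAddSubExps later treats that size as
+ // pressure that could be recovered by serializing the loads instead of
+ // keeping them all live at once.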
+ int vmemLdSize = getVMemLdSize(*MBB, SIII, SIRI, MRI); + if (vmemLdSize) { + outputVMemLdSizeMap[MBB] = vmemLdSize; + } + } + for (auto &it : hotBlocks) { + MachineBasicBlock *MBB = it.MBB; + + auto oit = outputVMemLdSizeMap.find(MBB); + if (oit != outputVMemLdSizeMap.end()) + it.vmemLdOutputSize = oit->second; + + if (MBB->pred_size() != 1) + continue; + + MachineBasicBlock *Pred = *MBB->pred_begin(); + oit = outputVMemLdSizeMap.find(Pred); + if (oit != outputVMemLdSizeMap.end()) { + it.vmemLdInputSize = oit->second; + } else { + if (Pred->getFirstTerminator() != Pred->end()) + continue; + if (Pred->empty()) + continue; + bool bIsHighLatency = SIII->isHighLatencyInstruction(Pred->back()); + if (!bIsHighLatency) + continue; + int vmemLdSize = getVMemLdSize(*Pred, SIII, SIRI, MRI); + it.vmemLdInputSize = vmemLdSize; + } + } + } + + if (EnableUniformVectorToScalar) { + if (rematUniformVgprToSgpr(Remat, MF, status, status.MBBPressureMap, hotBlocks, LIS, MRI, + SIRI, SIII, MLI)) { + // Rebuild LIS. + LIS->reanalyze(MF); + status = GetRematStatus(MF, MLI, LIS, MRI, ST); + bool bSgprSpilled = nearSgprSpill(status.MaxSPressure, ST, MF); + if (bSgprSpilled) { + bool bNearTarget = false; + hotBlockRemat(Remat, MF, MLI, LIS, pDT, pPDT, bNearTarget); + // Rebuild LIS. + LIS->reanalyze(MF); + status = GetRematStatus(MF, MLI, LIS, MRI, ST); + } + + for (auto &it : hotBlocks) { + MachineBasicBlock *MBB = it.MBB; + + // Update pressure. + auto &RP = status.MBBPressureMap[MBB]; + unsigned maxLocalVPressure = RP.getVGPRNum(ST->hasGFX90AInsts()); + unsigned maxLocalSPressure = RP.getMaxSGPR(); + + maxLocalSPressure += RegForVCC; + it.maxPressures.first = maxLocalVPressure; + it.maxPressures.second = maxLocalSPressure; + } + } + } + + // Collect all live reg which cross hot blocks. + GCNRPTracker::LiveRegSet liveRegCandidates; + for (auto it : hotBlocks) { + MachineBasicBlock *MBB = it.MBB; + + const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[MBB]; + + const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[MBB]; + + llvm::mergeLiveRegSet(liveRegCandidates, inputLive); + llvm::mergeLiveRegSet(liveRegCandidates, outputLive); + } + + // Check min VGPR bound. 
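+ // A rough sketch of this optional step: llvm::getRegBound computes, per
+ // block, a bound on the pressure reachable by rescheduling alone. Blocks
+ // already under the target limits are set aside in PressureUnderLimitSet;
+ // for the rest, maxPressures is lowered to the bound where the bound is
+ // smaller, so the remat decisions below work from the tighter number.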
+ BlockSet PressureUnderLimitSet; + if (EnableSubExpMinReg) { + for (auto &it : hotBlocks) { + MachineBasicBlock *MBB = it.MBB; + unsigned MaxLocalVGPR = 0; + unsigned MaxLocalSGPR = 0; + llvm::getRegBound(MBB, MRI, SIRI, SIII, LIS, MaxLocalVGPR, MaxLocalSGPR); + + if (MaxLocalVGPR < VLimit && MaxLocalSGPR < SLimit) { + PressureUnderLimitSet.insert(MBB); + } else { + if (MaxLocalVGPR < it.maxPressures.first) + it.maxPressures = std::make_pair(MaxLocalVGPR, it.maxPressures.second); + if (MaxLocalSGPR < it.maxPressures.second) + it.maxPressures = std::make_pair(it.maxPressures.first, MaxLocalSGPR); + } + } + } + + bool bUpdated = perBlockPassthruRemat(Remat, hotBlocks, status, liveRegCandidates, + ST, LIS, MLI, pDT, MRI, SIRI, SIII); + + return bUpdated; +} + +bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) { + if (MF.size() < 2) + return false; + LiveIntervals *LIS = &getAnalysis().getLIS(); + MachineDominatorTree *DT = &getAnalysis().getDomTree(); + MachinePostDominatorTree *PDT = &getAnalysis().getPostDomTree(); + MachineLoopInfo *MLI = &getAnalysis().getLI(); + AliasAnalysis *AA = &getAnalysis().getAAResults(); + + { + llvm::MirGPUDivergenceAnalysis DA(MF, *DT, *PDT, *MLI); + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (DA.isUniform(&MI)) { + TotalUniformInsts.insert(&MI); + } + } + } + } + + //LLVM_DEBUG(pressure::write_pressure(MF, LIS, R"(D:\Temp\d.json)")); + // For non-cs/ps, set target occ as 4. + bool bNearTarget = false; + bool bFinalUpdated = false; + bool bUpdated = hotBlockRemat(this, MF, MLI, LIS, DT, PDT, bNearTarget); + bFinalUpdated |= bUpdated; + if (EnableSubExp) { + if (bUpdated) { + // Rebuild LIS. + LIS->reanalyze(MF); + } + + bUpdated = GroupRemat(this, MF, MLI, LIS, DT, PDT, AA); + + bFinalUpdated |= bUpdated; + } + return bFinalUpdated; +} + +INITIALIZE_PASS_BEGIN(AMDGPUHotBlockRematerialize, DEBUG_TYPE, + "AMDGPU rematerialize", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) +INITIALIZE_PASS_END(AMDGPUHotBlockRematerialize, DEBUG_TYPE, "AMDGPU rematerialize", + false, false) + +char AMDGPUHotBlockRematerialize::ID = 0; +char &llvm::AMDGPUHotBlockRematerializeID = AMDGPUHotBlockRematerialize::ID; + +FunctionPass *llvm::createAMDGPUHotBlockRematerializePass() { + return new AMDGPUHotBlockRematerialize(); +} + diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp new file mode 100644 index 0000000000000..6f44fec08239c --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp @@ -0,0 +1,2241 @@ +/////////////////////////////////////////////////////////////////////////////// +// // +// AMDGPUMIRUtils.cpp // +// Copyright (C) Microsoft Corporation. All rights reserved. // +// This file is distributed under the University of Illinois Open Source // +// License. See LICENSE.TXT for details. // +// // +// Util functions for llvm MIR Passes. 
// +// // +/////////////////////////////////////////////////////////////////////////////// + +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "SIMachineFunctionInfo.h" + +//#include "dxc/DXIL/DxilMetadataHelper.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/raw_ostream.h" + +#include "llvm/ADT/IntEqClasses.h" +#include "llvm/Support/GraphWriter.h" + +#include "llvm/Support/Debug.h" + +#include "GCNRegPressure.h" +#include "AMDGPUMIRUtils.h" +#include "AMDGPUSubExpDag.h" +#include + +#define DEBUG_TYPE "xb-mir-util" +using namespace llvm; +namespace { +class CFGWithPhi { +public: + CFGWithPhi(MachineFunction &F) : F(F) { + // Collect phi and phi related insts. + MachineRegisterInfo &MRI = F.getRegInfo(); + + for (MachineBasicBlock &BB : F) { + auto &phiInsts = blockToPhiInstsMap[&BB]; + for (MachineInstr &I : BB) { + if (!I.isPHI()) + break; + phiInsts.insert(&I); + unsigned Reg = I.getOperand(0).getReg(); + // Add incoming values. + for (unsigned i=1;igetParent()].insert(DefMI); + } + // Add users. + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + blockToPhiInstsMap[UseMI.getParent()].insert(&UseMI); + } + } + } + } /// Adds custom features for a visualization of the ScheduleDAG. + void addCustomGraphFeatures(llvm::GraphWriter &) const {} + MachineFunction &F; + DenseMap> blockToPhiInstsMap; + void dump(); +}; + +void CFGWithPhi::dump() { +#ifdef DBG + for (MachineBasicBlock &BB : F) { + dbgs() << BB.getName() << "\n"; + auto &phiInsts = blockToPhiInstsMap[&BB]; + for (MachineInstr *I : phiInsts) { + if (!I->isPHI()) + continue; + I->dump(); + } + for (MachineInstr *I : phiInsts) { + if (I->isPHI()) + continue; + I->dump(); + } + } +#endif +} + +} // namespace + +// CFGWithPhi dump. +namespace llvm { + +template <> struct DOTGraphTraits : public DefaultDOTGraphTraits { + + DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} + + static std::string getGraphName(const CFGWithPhi *G) { + return "CFG with Phi graph"; + } + + static std::string getNodeIdentifierLabel(const MachineBasicBlock *Node, + const CFGWithPhi *Graph) { + std::string R; + raw_string_ostream OS(R); + OS << static_cast(Node); + return R; + } + + static std::string getNodeLabel(const MachineBasicBlock *BB, const CFGWithPhi *G) { + enum { MaxColumns = 8000 }; + std::string Str; + raw_string_ostream OS(Str); + + OS << "BB:" << BB->getName(); + auto it = G->blockToPhiInstsMap.find(BB); + if (it != G->blockToPhiInstsMap.end()) { + + auto &phiInsts = it->second; + for (MachineInstr *I : phiInsts) { + if (!I->isPHI()) + continue; + I->print(OS); + OS << "\n"; + } + for (MachineInstr *I : phiInsts) { + if (I->isPHI()) + continue; + I->print(OS); + OS << "\n"; + } + } + std::string OutStr = OS.str(); + if (OutStr[0] == '\n') + OutStr.erase(OutStr.begin()); + + // Process string output to make it nicer... + unsigned ColNum = 0; + unsigned LastSpace = 0; + for (unsigned i = 0; i != OutStr.length(); ++i) { + if (OutStr[i] == '\n') { // Left justify + OutStr[i] = '\\'; + OutStr.insert(OutStr.begin() + i + 1, 'l'); + ColNum = 0; + LastSpace = 0; + } else if (OutStr[i] == ';') { // Delete comments! + unsigned Idx = OutStr.find('\n', i + 1); // Find end of line + OutStr.erase(OutStr.begin() + i, OutStr.begin() + Idx); + --i; + } else if (ColNum == MaxColumns) { // Wrap lines. 
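+ // "\l..." is inserted as a DOT left-justified line break, preferably at the
+ // last seen space so identifiers are not split mid-token.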
+ // Wrap very long names even though we can't find a space. + if (!LastSpace) + LastSpace = i; + OutStr.insert(LastSpace, "\\l..."); + ColNum = i - LastSpace; + LastSpace = 0; + i += 3; // The loop will advance 'i' again. + } else + ++ColNum; + if (OutStr[i] == ' ') + LastSpace = i; + } + return OutStr; + } + static std::string getNodeDescription(const MachineBasicBlock *SU, + const CFGWithPhi *G) { + return SU->getName().str(); + } + + static void addCustomGraphFeatures(CFGWithPhi *G, + GraphWriter &GW) { + return G->addCustomGraphFeatures(GW); + } +}; + +template <> struct GraphTraits { + using NodeRef = MachineBasicBlock *; + using ChildIteratorType = MachineBasicBlock::succ_iterator; + using nodes_iterator = pointer_iterator; + + // static NodeRef getEntryNode(const CFGWithPhi *G) { + // return G->F.getFunctionEntry(); + //} + + static ChildIteratorType child_begin(const NodeRef N) { + return N->succ_begin(); + } + + static ChildIteratorType child_end(const NodeRef N) { return N->succ_end(); } + + static nodes_iterator nodes_begin(const CFGWithPhi *G) { + return nodes_iterator(G->F.begin()); + } + + static nodes_iterator nodes_end(const CFGWithPhi *G) { + return nodes_iterator(G->F.end()); + } +}; + +} // namespace llvm + +namespace llvm { + +unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask, + const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI) { + unsigned Size = SIRI->getRegSizeInBits(*MRI.getRegClass(Reg)); + Size >>= 5; + LaneBitmask mask = Mask; + if (mask.any()) { + if (unsigned maskSize = mask.getNumLanes()) { + if (maskSize < Size) + Size = maskSize; + } + } + return Size; +} + +void CollectLiveSetPressure(const LiveSet &liveSet, + const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, unsigned &VPressure, + unsigned &SPressure) { + VPressure = 0; + SPressure = 0; + for (auto liveIt : liveSet) { + unsigned Reg = liveIt.first; + unsigned Size = getRegSize(Reg, liveIt.second, MRI, SIRI); + if (SIRI->isVGPR(MRI, Reg)) { + VPressure += Size; + } else { + SPressure += Size; + } + } +} + +bool isExecUpdateForControlFlow(llvm::MachineInstr &MI) { + bool isExecUpdate = false; + unsigned opcode = MI.getOpcode(); + if (opcode == AMDGPU::S_MOV_B64 || opcode == AMDGPU::S_MOV_B32 || + opcode == AMDGPU::S_OR_B64_term || opcode == AMDGPU::S_OR_B32_term || + opcode == AMDGPU::S_OR_SAVEEXEC_B64 || + opcode == AMDGPU::S_OR_SAVEEXEC_B32 || opcode == AMDGPU::S_AND_B64 || + opcode == AMDGPU::S_AND_B32 || opcode == AMDGPU::S_ANDN2_B64 || + opcode == AMDGPU::S_ANDN2_B32) { + MachineOperand &Dst = MI.getOperand(0); + if (Dst.getReg() == AMDGPU::EXEC || Dst.getReg() == AMDGPU::EXEC_LO) { + isExecUpdate = true; + } + } + return isExecUpdate; +} + +bool IsSub0Sub1SingleDef(unsigned Reg, const MachineRegisterInfo &MRI) { + // Support multi def for pattern of pointer: + // undef %808.sub0:sgpr_64 = COPY killed %795:sgpr_32 + // %808.sub1:sgpr_64 = S_MOV_B32 0 + bool bHasSub0 = false; + bool bHasSub1 = false; + for (MachineOperand &UserDefMO : MRI.def_operands(Reg)) { + if (unsigned SubReg = UserDefMO.getSubReg()) { + bool bSingleSubReg = false; + switch (SubReg) { + default: + break; + case AMDGPU::sub0: + if (!bHasSub0) { + bHasSub0 = true; + bSingleSubReg = true; + } + break; + case AMDGPU::sub1: + if (!bHasSub1) { + bHasSub1 = true; + bSingleSubReg = true; + } + break; + } + if (!bSingleSubReg) { + bHasSub0 = false; + break; + } + } else { + bHasSub0 = false; + break; + } + } + + return (bHasSub0 && bHasSub1); +} + +LaneBitmask getRegMask(const MachineOperand &MO, 
+ const MachineRegisterInfo &MRI) { + // We don't rely on read-undef flag because in case of tentative schedule + // tracking it isn't set correctly yet. This works correctly however since + // use mask has been tracked before using LIS. + return MO.getSubReg() == 0 + ? MRI.getMaxLaneMaskForVReg(MO.getReg()) + : MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask( + MO.getSubReg()); +} + +void mergeLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet) { + for (auto Reg : inputSet) { + unsigned reg = Reg.first; + LaneBitmask mask = Reg.second; + auto targetReg = targetSet.find(reg); + if (targetReg != targetSet.end()) { + LaneBitmask targetMask = targetReg->second; + mask |= targetMask; + } + targetSet[reg] = mask; + } +} + +void andLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet) { + GCNRPTracker::LiveRegSet AndSet; + for (auto Reg : inputSet) { + unsigned reg = Reg.first; + LaneBitmask mask = Reg.second; + auto targetReg = targetSet.find(reg); + if (targetReg != targetSet.end()) { + LaneBitmask targetMask = targetReg->second; + mask &= targetMask; + AndSet[reg] = mask; + } + } + + targetSet = AndSet; +} + +void andNotLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet) { + for (auto Reg : inputSet) { + unsigned reg = Reg.first; + LaneBitmask mask = Reg.second; + auto targetReg = targetSet.find(reg); + if (targetReg != targetSet.end()) { + LaneBitmask targetMask = targetReg->second; + if ((targetMask | mask) == mask) + targetSet.erase(reg); + else + targetSet[reg] = targetMask & (~mask); + } + } +} + +MachineBasicBlock *split(MachineInstr *Inst) { + + // Create the fall-through block. + MachineBasicBlock *MBB = Inst->getParent(); + MachineFunction *MF = MBB->getParent(); + MachineBasicBlock *SuccMBB = MF->CreateMachineBasicBlock(); + auto MBBIter = ++(MBB->getIterator()); + MF->insert(MBBIter, SuccMBB); + SuccMBB->transferSuccessorsAndUpdatePHIs(MBB); + MBB->addSuccessor(SuccMBB); + + // Splice the code over. + SuccMBB->splice(SuccMBB->end(), MBB, ++Inst->getIterator(), MBB->end()); + + return SuccMBB; +} + +struct Piece { + unsigned Reg; + unsigned offset; + unsigned size; + static SmallVector split(std::bitset<32> mask) { + + SmallVector pieces; + Piece piece = {0, 0, 0}; + for (unsigned i = 0; i < 32; i++) { + if (mask.test(i)) { + if (piece.size == 0) + piece.offset = i; + + piece.size++; + // Make sure no piece bigger than 8. + if (piece.size == 8) { + pieces.emplace_back(piece); + piece.size = 0; + } + } else { + if (piece.size == 0) { + continue; + } + pieces.emplace_back(piece); + piece.size = 0; + } + } + return pieces; + } +}; + +void updateSubReg(MachineOperand &UseMO, const llvm::TargetRegisterClass *NewRC, + unsigned offset, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII) { + unsigned size = NewRC->getLaneMask().getNumLanes(); + if (size == 1) { + UseMO.setSubReg(0); + } else { + const uint32_t SubReg = UseMO.getSubReg(); + LaneBitmask Mask = SIRI->getSubRegIndexLaneMask(SubReg); + + unsigned mask = Mask.getAsInteger() >> offset; + + unsigned NewSubReg = SIRI->getMinimalSpanningSubRegIdxSetForLaneMask( + NewRC, LaneBitmask(mask)) + .front(); + + UseMO.setSubReg(NewSubReg); + } +} + +bool reduceChannel(unsigned offset, MachineInstr &MI, const MCInstrDesc &desc, + MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, SlotIndexes *SlotIndexes) { + MachineOperand &DstMO = MI.getOperand(0); + // Skip case when dst subReg not 0. 
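+ // (The rewrite below replaces the full definition, so a def that already
+ // writes a subreg is not handled.) Overall sketch of reduceChannel, based
+ // on how removeUnusedLanes calls it: the wide S_BUFFER_LOAD is switched to
+ // the narrower desc, the offset is advanced by offset * 4 bytes, users'
+ // subregs are remapped relative to the new base, and the dst reg class is
+ // shrunk. E.g. (made-up case) an S_BUFFER_LOAD_DWORDX4_SGPR whose users
+ // only read sub2/sub3 becomes an S_BUFFER_LOAD_DWORDX2_SGPR with soffset
+ // increased by 8, and the users read sub0/sub1 of the new 64-bit value.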
+ if (DstMO.getSubReg()) { + return false; + } + unsigned Reg = DstMO.getReg(); + + SmallVector UseMOs; + for (MachineOperand &UseMO : MRI.use_nodbg_operands(Reg)) { + UseMOs.emplace_back(&UseMO); + } + + const llvm::TargetRegisterClass *NewRC = + SIRI->getRegClass(desc.operands().front().RegClass); + unsigned size = NewRC->getLaneMask().getNumLanes(); + if (offset > 0) { + // Update offset operand in MI. + MachineOperand *OffsetOp = + SIII->getNamedOperand(MI, AMDGPU::OpName::offset); + + const uint32_t LaneSize = sizeof(uint32_t); + if (OffsetOp) { + if (OffsetOp->isImm()) { + assert(OffsetOp != nullptr); + int64_t Offset = OffsetOp->getImm(); + Offset += offset * LaneSize; + if (!SIII->isLegalMUBUFImmOffset(Offset)) { + return false; + } + OffsetOp->setImm(Offset); + } else { + return false; + } + } else { + OffsetOp = SIII->getNamedOperand(MI, AMDGPU::OpName::soffset); + if (OffsetOp) { + unsigned NewOffsetReg = + MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + auto OffsetAdd = BuildMI(*MI.getParent()->getParent(), MI.getDebugLoc(), + SIII->get(AMDGPU::S_ADD_U32)) + .addDef(NewOffsetReg) + .add(*OffsetOp) + .addImm(offset * LaneSize); + MachineInstr *OffsetAddMI = OffsetAdd.getInstr(); + MachineBasicBlock::iterator InsertPoint = + llvm::FindOrCreateInsertionPointForSccDef( + MI.getParent(), MI, SIRI, SIII, &MRI + ); + MI.getParent()->insert(InsertPoint, OffsetAddMI); + SIII->legalizeOperands(*OffsetAddMI); + OffsetOp->setReg(NewOffsetReg); + OffsetOp->setSubReg(0); + if (SlotIndexes) + SlotIndexes->insertMachineInstrInMaps(*OffsetAddMI); + } else { + return false; + } + } + // Update subReg for users. + for (MachineOperand *UseMO : UseMOs) { + updateSubReg(*UseMO, NewRC, offset, SIRI, SIII); + } + } else if (size == 1) { + // Clear subReg when size is 1. + for (MachineOperand *UseMO : UseMOs) { + UseMO->setSubReg(0); + } + } + + MI.setDesc(desc); + // Mutate reg class of Reg. + MRI.setRegClass(Reg, NewRC); + return true; +} + +bool removeUnusedLanes(llvm::MachineInstr &MI, MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, + SlotIndexes *SlotIndexes) { + bool bImm = false; + switch (MI.getOpcode()) { + default: + break; + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX16_IMM: + bImm = true; + case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR: + case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR: + case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: { + unsigned Reg = MI.getOperand(0).getReg(); + if (!MRI.getUniqueVRegDef(Reg)) + return false; + LaneBitmask dstMask = getRegMask(MI.getOperand(0), MRI); + LaneBitmask UseMask; + for (MachineOperand &MO : MRI.use_operands(Reg)) { + UseMask |= llvm::getRegMask(MO, MRI); + } + + const unsigned fullMask = dstMask.getAsInteger(); + unsigned mask = UseMask.getAsInteger(); + if (mask == fullMask) + return false; + // Split mask when there's gap. Then group mask to 2/4/8. + auto pieces = Piece::split(std::bitset<32>(mask)); + // Now only support 1 piece. + if (pieces.size() != 1) + return false; + auto piece = pieces[0]; + if (piece.size > 8) + return false; + + // TODO: enable offset support when bImm is true. + // Now if break different test when mul LaneSize or not mul for the offset. + if (bImm && piece.offset != 0) + return false; + + switch (piece.size) { + default: + return false; + case 1: + return reduceChannel(piece.offset, MI, + SIII->get(bImm ? 
AMDGPU::S_BUFFER_LOAD_DWORD_IMM + : AMDGPU::S_BUFFER_LOAD_DWORD_SGPR), + MRI, SIRI, SIII, SlotIndexes); + case 2: + return reduceChannel(piece.offset, MI, + SIII->get(bImm ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM + : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR), + MRI, SIRI, SIII, SlotIndexes); + case 3: + if (fullMask == 0xf) + return false; + case 4: + return reduceChannel(piece.offset, MI, + SIII->get(bImm ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM + : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR), + MRI, SIRI, SIII, SlotIndexes); + case 5: + case 6: + case 7: + if (fullMask == 0xff) + return false; + case 8: + return reduceChannel(piece.offset, MI, + SIII->get(bImm ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM + : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR), + MRI, SIRI, SIII, SlotIndexes); + } + + } break; + } + return false; +} + +// LoopInfo contains a mapping from basic block to the innermost loop. Find +// the outermost loop in the loop nest that contains BB. +const MachineLoop *getOutermostLoop(const MachineLoopInfo *LI, + const MachineBasicBlock *BB) { + const MachineLoop *L = LI->getLoopFor(BB); + if (L) { + while (const MachineLoop *Parent = L->getParentLoop()) + L = Parent; + } + return L; +} + +// True if there is a loop which contains both BB1 and BB2. +bool loopContainsBoth(const MachineLoopInfo *LI, const MachineBasicBlock *BB1, + const MachineBasicBlock *BB2) { + const MachineLoop *L1 = getOutermostLoop(LI, BB1); + const MachineLoop *L2 = getOutermostLoop(LI, BB2); + return L1 != nullptr && L1 == L2; +} + +bool reach_block(MachineBasicBlock *FromBB, MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, MachineLoopInfo *LI, + MachineBasicBlock *ToBB) { + if (FromBB == ToBB) { + return true; + } + + if (DT->dominates(FromBB, ToBB)) { + return true; + } + + if (PDT->dominates(ToBB, FromBB)) { + return true; + } + + if (loopContainsBoth(LI, ToBB, FromBB)) { + return true; + } + // TODO: cover case hotBB in loop, + // one block in that loop dom BB or + // BB post dom one block in that loop. + return false; +} + +// If BB can reach hotMBBs. +bool reach_blocks(MachineBasicBlock *BB, MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, MachineLoopInfo *LI, + DenseSet &hotMBBs) { + bool bCross = false; + for (MachineBasicBlock *hotBB : hotMBBs) { + if (reach_block(BB, DT, PDT, LI, hotBB)) { + bCross = true; + break; + } + } + return bCross; +} + +} + +namespace llvm { +void viewCFGWithPhi(llvm::MachineFunction &F) { +#ifdef DBG + CFGWithPhi G(F); + ViewGraph(const_cast(&G), F.getName(), false, F.getName()); + G.dump(); +#endif +} +} // namespace llvm + +namespace llvm { +bool GetNonDebugMBBEnd(MachineBasicBlock::reverse_iterator &BBEnd, + MachineBasicBlock &MBB) { + // R.End doesn't point to the boundary instruction. + // Skip Debug instr. + while (BBEnd != MBB.rend() && BBEnd->isDebugInstr()) + BBEnd++; + return BBEnd != MBB.rend(); +} +} // namespace llvm + +// Helper functions to write jason. 
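+// A rough shape of the document these helpers emit, pieced together from
+// write_function/write_block below (field order and trailing commas follow
+// the helpers; "..." marks elided entries):
+// {
+//  "name": "...",
+//  "blocks": [ { "name": "...", "id": 0, "begin_slot": "...", "end_slot": "...",
+//                "instructions": [...], "input_nodes": [...], "inst_nodes": [...],
+//                "preds": [...], "succs": [...] }, ],
+//  "defines": [...], "uses": [...], ...
+// }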
+namespace { +void json_name(StringRef Val, raw_ostream &os) { os << "\"" << Val << "\":"; } + +template +void json_pair(StringRef Val, write_fn &fn, raw_ostream &os) { + json_name(Val, os); + os << "\""; + fn(); + os << "\""; +} + +template +void json_obj_pair(StringRef Val, write_fn &fn, raw_ostream &os) { + json_name(Val, os); + + fn(); +} + +template +void json_array(StringRef Val, write_fn &fn, raw_ostream &os) { + json_name(Val, os); + os << "["; + fn(); + os << "]"; +} +} // namespace + +namespace llvm { +namespace pressure { + +void write_inst(MachineInstr &MI, const SlotIndexes *SlotIndexes, + const SIInstrInfo *SIII, raw_ostream &os) { + os << "{"; + SlotIndex Slot = SlotIndexes->getInstructionIndex(MI); + auto writeSlot = [&Slot, &os]() { Slot.print(os); }; + + json_pair("slot_index", writeSlot, os); + + os << ","; + + auto writeOpcode = [&MI, &SIII, &os]() { + os << SIII->getName(MI.getOpcode()); + }; + + json_pair("opcode", writeOpcode, os); + + os << ","; + + auto writeAsm = [&MI, &SIII, &os]() { + MI.print(os, /*IsStandalone*/ true, /*SkipOpers*/ false, + /*SkipDebugLoc*/ true, /*AddNewLine*/ false, SIII); + }; + json_pair("asm", writeAsm, os); + + os << "}"; +} + +void print_reg(Register Reg, const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, raw_ostream &os) { + if (Reg.isVirtual()) { + StringRef Name = MRI.getVRegName(Reg); + if (Name != "") { + os << '%' << Name; + } else { + os << '%' << Register::virtReg2Index(Reg); + } + } else if (Reg < SIRI->getNumRegs()) { + os << '$'; + printLowerCase(SIRI->getName(Reg), os); + } else { + llvm_unreachable("invalid reg"); + } +} + +void write_reg(unsigned Reg, unsigned SubReg, const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, raw_ostream &os) { + os << "{"; + + auto writeReg = [&MRI, &SIRI, &Reg, &os]() { print_reg(Reg, MRI, SIRI, os); }; + json_pair("reg", writeReg, os); + + os << ","; + + auto writeSubReg = [&SubReg, &os]() { os << SubReg; }; + + json_pair("sub_reg", writeSubReg, os); + + os << ","; + auto writeIsSgpr = [&Reg, &MRI, &SIRI, &os]() { + if (SIRI->isSGPRReg(MRI, Reg)) + os << "true"; + else + os << "false"; + }; + json_obj_pair("is_sgpr", writeIsSgpr, os); + os << "}"; +} + +unsigned get_reg_size(unsigned Reg, const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI) { + return SIRI->getRegClassForReg(MRI, Reg)->getLaneMask().getNumLanes(); +} + +void write_live(unsigned Reg, LaneBitmask Mask, const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, raw_ostream &os) { + if (Mask.none()) { + unsigned size = get_reg_size(Reg, MRI, SIRI); + Mask = LaneBitmask((1 << size) - 1); + } + unsigned mask = Mask.getAsInteger(); + for (unsigned i = 0; i <= Mask.getHighestLane(); i++) { + if (mask & (1 << i)) { + write_reg(Reg, i, MRI, SIRI, os); + os << ",\n"; + } + } +} + +void write_dag_input_node(unsigned ID, unsigned reg, unsigned mask, + const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, raw_ostream &os) { + os << "{"; + auto writeID = [&ID, &os]() { os << ID; }; + + json_pair("ID", writeID, os); + + os << ","; + + auto writeReg = [®, &MRI, &SIRI, &os]() { print_reg(reg, MRI, SIRI, os); }; + + json_pair("reg", writeReg, os); + + os << ","; + + auto writeMask = [&mask, &os]() { os << mask; }; + + json_pair("mask", writeMask, os); + + os << "},\n"; +} + +void write_dag_inst_node(unsigned ID, SlotIndex Slot, + GCNRPTracker::LiveRegSet LiveReg, + const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, SUnit *SU, + raw_ostream &os) { + os << "{"; + auto writeID = [&ID, &os]() { os 
<< ID; }; + + json_pair("ID", writeID, os); + + os << ","; + + auto writeSlot = [&Slot, &os]() { Slot.print(os); }; + + json_pair("slot_index", writeSlot, os); + + os << ","; + + auto writeRegs = [&LiveReg, &MRI, &SIRI, &os]() { + for (auto it : LiveReg) { + unsigned Reg = it.first; + LaneBitmask Mask = it.second; + write_live(Reg, Mask, MRI, SIRI, os); + } + }; + json_array("regs", writeRegs, os); + + os << ","; + + auto writePreds = [&SU, &os]() { + for (auto &Pred : SU->Preds) { + + os << Pred.getSUnit()->NodeNum << ","; + } + }; + + json_array("preds", writePreds, os); + + os << "},\n"; +} + +void write_block(MachineBasicBlock &Blk, LiveIntervals *LIS, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, raw_ostream &os) { + os << "{\n"; + auto writeName = [&Blk, &os]() { os << Blk.getName(); }; + json_pair("name", writeName, os); + + os << ","; + + auto writeIndex = [&Blk, &os]() { os << Blk.getNumber(); }; + json_pair("id", writeIndex, os); + + os << ","; + + const SlotIndexes *SlotIndexes = LIS->getSlotIndexes(); + + SlotIndex BeginSlot = SlotIndexes->getMBBStartIdx(&Blk); + auto writeSlot = [&BeginSlot, &os]() { BeginSlot.print(os); }; + json_pair("begin_slot", writeSlot, os); + + os << ","; + + SlotIndex EndSlot = SlotIndexes->getMBBEndIdx(&Blk); + auto writeEndSlot = [&EndSlot, &os]() { EndSlot.print(os); }; + json_pair("end_slot", writeEndSlot, os); + + os << ","; + + auto writeInsts = [&Blk, &SlotIndexes, &SIII, &os]() { + for (MachineInstr &MI : Blk) { + if (MI.isDebugInstr()) + continue; + write_inst(MI, SlotIndexes, SIII, os); + os << ",\n"; + } + }; + + json_array("instructions", writeInsts, os); + + os << ","; + + BlockExpDag dag(&Blk, LIS, MRI, SIRI, SIII); + dag.buildWithPressure(); + + const auto StartLiveReg = llvm::getLiveRegs(BeginSlot, *dag.LIS, dag.MRI); + auto writeInputs = [&StartLiveReg, &dag, &os]() { + for (auto it : StartLiveReg) { + unsigned Reg = it.first; + LaneBitmask mask = it.second; + SUnit *SU = dag.InputSUnitMap[Reg]; + // Write Reg and mask to the nodes. + write_dag_input_node(SU->NodeNum, Reg, mask.getAsInteger(), dag.MRI, + dag.SIRI, os); + } + }; + + json_array("input_nodes", writeInputs, os); + + os << ","; + + auto writeNodes = [&SlotIndexes, &dag, &os]() { + for (auto it : dag.MISUnitMap) { + MachineInstr *MI = it.first; + SUnit *SU = it.second; + // Use SlotIndex of MI. + SlotIndex SlotIndex; + if (!MI->isDebugInstr()) + SlotIndex = SlotIndexes->getInstructionIndex(*MI); + GCNRPTracker::LiveRegSet LiveReg = dag.DagPressureMap[SU]; + // Write slot, live to the nodes. 
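+ // Each DAG node record carries the node id, the slot of its MI (left empty
+ // for debug instructions), the live set the pressure DAG computed at that
+ // node, and the ids of its predecessor SUnits, roughly:
+ // {"ID":12,"slot_index":"20r","regs":[...],"preds":[3,7,]}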
+ write_dag_inst_node(SU->NodeNum, SlotIndex, LiveReg, dag.MRI, dag.SIRI, + SU, os); + } + }; + + json_array("inst_nodes", writeNodes, os); + + os << ","; + + auto writePreds = [&Blk, &os]() { + for (MachineBasicBlock *Pred : Blk.predecessors()) { + os << Pred->getNumber() << ","; + } + }; + + json_array("preds", writePreds, os); + + os << ","; + + auto writeSuccs = [&Blk, &os]() { + for (MachineBasicBlock *Succ : Blk.successors()) { + os << Succ->getNumber() << ","; + } + }; + + json_array("succs", writeSuccs, os); + + os << "}"; +} + +void write_define(SlotIndex &Slot, unsigned Reg, unsigned SubReg, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + raw_ostream &os) { + os << "{"; + auto writeSlot = [&Slot, &os]() { Slot.print(os); }; + + json_pair("slot_index", writeSlot, os); + + os << ","; + + auto writeReg = [&MRI, &SIRI, &Reg, &SubReg, &os]() { + write_reg(Reg, SubReg, MRI, SIRI, os); + }; + json_obj_pair("reg", writeReg, os); + + os << "}\n"; + + os << ","; +} + +void write_define(MachineOperand &MO, const SlotIndexes *SlotIndexes, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + raw_ostream &os) { + // Split subReg? MO.getSubReg(); + unsigned Reg = MO.getReg(); + unsigned SubReg = MO.getSubReg(); + MachineInstr *MI = MO.getParent(); + SlotIndex Slot = SlotIndexes->getInstructionIndex(*MI); + if (SubReg == 0) { + unsigned size = get_reg_size(Reg, MRI, SIRI); + for (unsigned i = 0; i < size; i++) { + write_define(Slot, Reg, i, MRI, SIRI, os); + } + } else { + switch (SubReg) { + default: + assert(0 && "SubReg not supported yet."); + write_define(Slot, Reg, SubReg, MRI, SIRI, os); + break; + case AMDGPU::sub0: + write_define(Slot, Reg, 0, MRI, SIRI, os); + break; + case AMDGPU::sub1: + write_define(Slot, Reg, 1, MRI, SIRI, os); + break; + case AMDGPU::sub2: + write_define(Slot, Reg, 2, MRI, SIRI, os); + break; + case AMDGPU::sub3: + write_define(Slot, Reg, 3, MRI, SIRI, os); + break; + case AMDGPU::sub4: + write_define(Slot, Reg, 4, MRI, SIRI, os); + break; + case AMDGPU::sub5: + write_define(Slot, Reg, 5, MRI, SIRI, os); + break; + case AMDGPU::sub6: + write_define(Slot, Reg, 6, MRI, SIRI, os); + break; + case AMDGPU::sub7: + write_define(Slot, Reg, 7, MRI, SIRI, os); + break; + case AMDGPU::sub8: + write_define(Slot, Reg, 8, MRI, SIRI, os); + break; + case AMDGPU::sub9: + write_define(Slot, Reg, 9, MRI, SIRI, os); + break; + case AMDGPU::sub10: + write_define(Slot, Reg, 10, MRI, SIRI, os); + break; + case AMDGPU::sub11: + write_define(Slot, Reg, 11, MRI, SIRI, os); + break; + case AMDGPU::sub12: + write_define(Slot, Reg, 12, MRI, SIRI, os); + break; + case AMDGPU::sub13: + write_define(Slot, Reg, 13, MRI, SIRI, os); + break; + case AMDGPU::sub14: + write_define(Slot, Reg, 14, MRI, SIRI, os); + break; + case AMDGPU::sub15: + write_define(Slot, Reg, 15, MRI, SIRI, os); + break; + case AMDGPU::sub0_sub1: + write_define(Slot, Reg, 0, MRI, SIRI, os); + write_define(Slot, Reg, 1, MRI, SIRI, os); + break; + case AMDGPU::sub2_sub3: + write_define(Slot, Reg, 2, MRI, SIRI, os); + write_define(Slot, Reg, 3, MRI, SIRI, os); + break; + case AMDGPU::sub4_sub5: + write_define(Slot, Reg, 4, MRI, SIRI, os); + write_define(Slot, Reg, 5, MRI, SIRI, os); + break; + case AMDGPU::sub1_sub2: + write_define(Slot, Reg, 1, MRI, SIRI, os); + write_define(Slot, Reg, 2, MRI, SIRI, os); + break; + case AMDGPU::sub0_sub1_sub2: + write_define(Slot, Reg, 0, MRI, SIRI, os); + write_define(Slot, Reg, 1, MRI, SIRI, os); + write_define(Slot, Reg, 2, MRI, SIRI, os); + break; + case 
AMDGPU::sub0_sub1_sub2_sub3: + write_define(Slot, Reg, 0, MRI, SIRI, os); + write_define(Slot, Reg, 1, MRI, SIRI, os); + write_define(Slot, Reg, 2, MRI, SIRI, os); + write_define(Slot, Reg, 3, MRI, SIRI, os); + break; + case AMDGPU::sub2_sub3_sub4_sub5: + write_define(Slot, Reg, 2, MRI, SIRI, os); + write_define(Slot, Reg, 3, MRI, SIRI, os); + write_define(Slot, Reg, 4, MRI, SIRI, os); + write_define(Slot, Reg, 5, MRI, SIRI, os); + break; + case AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7: + write_define(Slot, Reg, 0, MRI, SIRI, os); + write_define(Slot, Reg, 1, MRI, SIRI, os); + write_define(Slot, Reg, 2, MRI, SIRI, os); + write_define(Slot, Reg, 3, MRI, SIRI, os); + write_define(Slot, Reg, 4, MRI, SIRI, os); + write_define(Slot, Reg, 5, MRI, SIRI, os); + write_define(Slot, Reg, 6, MRI, SIRI, os); + write_define(Slot, Reg, 7, MRI, SIRI, os); + break; + } + } +} + +void write_defines(MachineFunction &MF, const SlotIndexes *SlotIndexes, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + raw_ostream &os) { + + for (unsigned i = 0; i < MRI.getNumVirtRegs(); i++) { + auto Reg = Register::index2VirtReg(i); + + for (MachineOperand &MO : MRI.def_operands(Reg)) { + write_define(MO, SlotIndexes, MRI, SIRI, os); + } + } +} + +void write_uses(MachineFunction &MF, const SlotIndexes *SlotIndexes, + + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + raw_ostream &os) { + + for (unsigned i = 0; i < MRI.getNumVirtRegs(); i++) { + auto Reg = Register::index2VirtReg(i); + + for (MachineOperand &MO : MRI.use_nodbg_operands(Reg)) { + // TODO: create write_use if use has more info. + write_define(MO, SlotIndexes, MRI, SIRI, os); + } + } +} + +void write_liveness(SlotIndex Slot, GCNRPTracker::LiveRegSet &LiveSet, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + raw_ostream &os) { + os << "{"; + auto writeSlot = [&Slot, &os]() { Slot.print(os); }; + + json_pair("slot_index", writeSlot, os); + + os << ","; + + auto writeRegs = [&LiveSet, &MRI, &SIRI, &os]() { + for (auto it : LiveSet) { + unsigned Reg = it.first; + LaneBitmask Mask = it.second; + write_live(Reg, Mask, MRI, SIRI, os); + } + }; + json_array("regs", writeRegs, os); + os << "\n},\n"; +} + +void write_segment(const LiveInterval::Segment &S, raw_ostream &os) { + os << "{"; + auto writeBegin = [&S, &os]() { S.start.print(os); }; + + json_pair("begin", writeBegin, os); + + os << ","; + + auto writeEnd = [&S, &os]() { S.end.print(os); }; + + json_pair("end", writeEnd, os); + + os << ","; + + auto writeValNum = [&S, &os]() { + if (S.valno) + os << S.valno->id; + else + os << 0xFFFFFFFF; + }; + + json_pair("val_num", writeValNum, os); + + os << "},\n"; +} + +void write_subrange(const LiveInterval::SubRange &SR, raw_ostream &os) { + os << "{\n"; + auto writeMask = [&SR, &os]() { os << SR.LaneMask.getAsInteger(); }; + + json_pair("mask", writeMask, os); + + os << ","; + + // Segments. 
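+ // Each subrange is emitted as its lane mask plus the segments that lane
+ // subset is live for; write_segment above prints one object per segment,
+ // roughly {"begin":"...","end":"...","val_num":0}.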
+ auto writeSegments = [&SR, &os]() { + for (auto &S : SR.segments) { + write_segment(S, os); + } + }; + + json_array("segments", writeSegments, os); + + os << "\n},\n"; +} + +void write_live_interval(LiveInterval &LI, const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, raw_ostream &os) { + os << "{\n"; + + auto writeReg = [&LI, &MRI, &SIRI, &os]() { + write_reg(LI.reg(), 0, MRI, SIRI, os); + }; + + json_obj_pair("reg", writeReg, os); + + os << ","; + + auto writeSegments = [&LI, &os]() { + for (auto &S : LI.segments) { + write_segment(S, os); + } + }; + + json_array("segments", writeSegments, os); + + os << ","; + + auto writeSubRanges = [&LI, &os]() { + for (auto &SR : LI.subranges()) { + write_subrange(SR, os); + } + }; + + json_array("subranges", writeSubRanges, os); + + os << "},\n"; +} + +std::string get_legal_str(const MDString *MDStr) { + std::string str; + raw_string_ostream Stream(str); + MDStr->print(Stream); + Stream.flush(); + // Remove !. + str = str.substr(1); + // Remove "" + str = str.substr(1); + str.pop_back(); + std::replace(str.begin(), str.end(), '\\', '#'); + return str; +} + +void write_file(const MDNode *FileNode, raw_ostream &os) { + const MDString *FileName = cast(FileNode->getOperand(0).get()); + StringRef fileNameStr = FileName->getString(); + if (fileNameStr.find("__AMDGPU_GPUMAP_") == 0) + return; + if (fileNameStr.find("__AMDGPU_DWARF_") == 0) + return; + + os << "{"; + + std::string str0 = get_legal_str(FileName); + auto writeName = [&str0, &os]() { os << str0; }; + json_pair("filename", writeName, os); + + os << ",\n"; + + const MDString *Content = cast(FileNode->getOperand(1).get()); + std::string str = get_legal_str(Content); + auto writeContent = [&str, &os]() { os << str; }; + json_pair("content", writeContent, os); + os << "\n},\n"; +} + +void write_DIFile(const DIFile *File, raw_ostream &os) { + if (File) { + std::string name = get_legal_str(File->getRawFilename()); + std::string dir = ""; + if (MDString *MDDir = File->getRawDirectory()) + dir = get_legal_str(MDDir); + os << dir << name; + } else { + os << "ArtificialFile"; + } +} + +void write_line_mapping(SlotIndex Slot, DebugLoc DL, raw_ostream &os) { + os << "{"; + + auto writeSlot = [&Slot, &os]() { Slot.print(os); }; + + json_pair("slot_index", writeSlot, os); + + os << ",\n"; + + MDNode *Scope = DL.getScope(); + unsigned line = DL.getLine(); + unsigned col = DL.getCol(); + + auto writeLine = [&line, &os]() { os << line; }; + json_pair("line", writeLine, os); + + os << ",\n"; + + auto writeCol = [&col, &os]() { os << col; }; + json_pair("col", writeCol, os); + + os << ",\n"; + + auto writeFile = [&Scope, &os]() { + const DIFile *File = cast(Scope)->getFile(); + write_DIFile(File, os); + }; + json_pair("file", writeFile, os); + + if (DILocation *inlineDL = DL.getInlinedAt()) { + os << ",\n"; + unsigned inlineLine = inlineDL->getLine(); + auto writeLine = [&inlineLine, &os]() { os << inlineLine; }; + json_pair("inline_line", writeLine, os); + + os << ",\n"; + + unsigned inlineCol = inlineDL->getColumn(); + auto writeCol = [&inlineCol, &os]() { os << inlineCol; }; + json_pair("inline_col", writeCol, os); + + os << ",\n"; + + const MDNode *InlineScope = DL.getInlinedAtScope(); + auto writeFile = [&InlineScope, &os]() { + const DIFile *File = cast(InlineScope)->getFile(); + write_DIFile(File, os); + }; + json_pair("inline_file", writeFile, os); + } + + os << "\n},\n"; +} + +void write_dbg_val(unsigned Reg, const DIVariable *V, const DIExpression *Exp, + const MachineRegisterInfo &MRI, 
const SIRegisterInfo *SIRI, + raw_ostream &os) { + os << "{"; + + auto writeReg = [&MRI, &SIRI, &Reg, &os]() { + const unsigned SubReg = 0; + write_reg(Reg, SubReg, MRI, SIRI, os); + }; + json_obj_pair("reg", writeReg, os); + + os << ",\n"; + + if (V) { + auto writeName = [&V, &os]() { os << V->getName(); }; + json_pair("debug_val_name", writeName, os); + os << ",\n"; + + auto writeFile = [&V, &os]() { + const DIFile *File = V->getFile(); + write_DIFile(File, os); + }; + json_pair("debug_val_file", writeFile, os); + os << ",\n"; + + auto writeLine = [&V, &os]() { os << V->getLine(); }; + json_pair("debug_val_line", writeLine, os); + } + + if (Exp->isValid() && Exp->getNumElements()) { + os << ",\n"; + auto writeV = [&Exp, &os]() { + os << '['; + bool NeedSep = false; + for (auto Op : Exp->expr_ops()) { + if (NeedSep) + os << ", "; + else + NeedSep = true; + os << dwarf::OperationEncodingString(Op.getOp()); + for (unsigned I = 0; I < Op.getNumArgs(); ++I) + os << ' ' << Op.getArg(I); + } + os << "] "; + }; + json_pair("debug_exp", writeV, os); + } + os << "\n},\n"; +} + +void write_dbg_info(MachineFunction &MF, LiveIntervals *LIS, + const MachineRegisterInfo &MRI, const SIInstrInfo *SIII, + const SIRegisterInfo *SIRI, const SlotIndexes *SlotIndexes, + const NamedMDNode *SourceMD, raw_ostream &os) { + os << ",\n"; + + auto writeFiles = [&SourceMD, &os]() { + for (const MDNode *FileNode : SourceMD->operands()) { + write_file(FileNode, os); + } + }; + + json_array("files", writeFiles, os); + + os << ",\n"; + + auto writeLineMapping = [&MF, &SlotIndexes, &os]() { + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (MI.isDebugInstr()) { + continue; + } + const DebugLoc DL = MI.getDebugLoc(); + if (!DL) + continue; + SlotIndex Slot = SlotIndexes->getInstructionIndex(MI); + write_line_mapping(Slot, DL, os); + } + } + }; + + json_array("line_mapping", writeLineMapping, os); + + os << ",\n"; + + auto writeDebugVals = [&MF, &MRI, &SIRI, &os]() { + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (!MI.isDebugValue()) + continue; + + MachineOperand &Reg = MI.getOperand(0); + if (!Reg.isReg()) + continue; + + if (Reg.getReg() == 0) + continue; + + const DIVariable *V = MI.getDebugVariable(); + const DIExpression *Exp = MI.getDebugExpression(); + write_dbg_val(Reg.getReg(), V, Exp, MRI, SIRI, os); + } + } + }; + + json_array("debug_vals", writeDebugVals, os); +} + +void write_function(MachineFunction &MF, LiveIntervals *LIS, + const MachineRegisterInfo &MRI, const SIInstrInfo *SIII, + const SIRegisterInfo *SIRI, raw_ostream &os) { + const SlotIndexes *SlotIndexes = LIS->getSlotIndexes(); + + os << "{\n"; + auto writeName = [&MF, &os]() { os << MF.getName(); }; + json_pair("name", writeName, os); + + os << ",\n"; + + auto writeBlocks = [&MF, &SlotIndexes, &LIS, &MRI, &SIRI, &SIII, &os]() { + for (MachineBasicBlock &MBB : MF) { + write_block(MBB, LIS, MRI, SIRI, SIII, os); + os << ",\n"; + } + }; + + json_array("blocks", writeBlocks, os); + + os << ",\n"; + + auto writeDefines = [&MF, &SlotIndexes, &MRI, &SIRI, &os]() { + write_defines(MF, SlotIndexes, MRI, SIRI, os); + }; + + json_array("defines", writeDefines, os); + + os << ",\n"; + + auto writeUses = [&MF, &SlotIndexes, &MRI, &SIRI, &os]() { + write_uses(MF, SlotIndexes, MRI, SIRI, os); + }; + + json_array("uses", writeUses, os); + + os << ",\n"; + + auto writeLiveness = [&MF, &LIS, &MRI, &SIRI, &os]() { + for (MachineBasicBlock &MBB : MF) + for (MachineInstr &MI : MBB) { + if (MI.isDebugInstr()) + 
continue; + const SlotIndex &SI = LIS->getInstructionIndex(MI).getBaseIndex(); + GCNRPTracker::LiveRegSet LISLR = llvm::getLiveRegs(SI, *LIS, MRI); + write_liveness(SI, LISLR, MRI, SIRI, os); + } + }; + + json_array("liveness", writeLiveness, os); + + os << ",\n"; + + auto writeLiveIntervals = [&MRI, &SIRI, &LIS, &os]() { + for (unsigned i = 0; i < MRI.getNumVirtRegs(); i++) { + auto Reg = Register::index2VirtReg(i); + if (!LIS->hasInterval(Reg)) + continue; + auto &LI = LIS->getInterval(Reg); + write_live_interval(LI, MRI, SIRI, os); + } + }; + + json_array("live_intervals", writeLiveIntervals, os); + +#if 0 // TODO: Do we need this? + // Check debug info. + const Function &F = MF.getFunction(); + const Module *M = F.getParent(); + const NamedMDNode *SourceMD = + M->getNamedMetadata(hlsl::DxilMDHelper::kDxilSourceContentsMDName); + if (SourceMD) { + write_dbg_info(MF, LIS, MRI, SIII, SIRI, SlotIndexes, SourceMD, os); + } +#endif + + os << "\n}"; +} + +void write_pressure(MachineFunction &MF, LiveIntervals *LIS, + const char *Filename) { + int FD = -1; + SmallString<128> TmpFilename(Filename); + std::error_code EC = sys::fs::createUniqueFile(TmpFilename, FD, TmpFilename); + if (EC) { + errs() << "Error: " << EC.message() << "\n"; + return; + } + + raw_fd_ostream O(FD, /*shouldClose=*/true); + + const GCNSubtarget *ST = &MF.getSubtarget(); + const auto *SIII = ST->getInstrInfo(); + const auto *SIRI = ST->getRegisterInfo(); + auto &MRI = MF.getRegInfo(); + write_function(MF, LIS, MRI, SIII, SIRI, O); + O.flush(); + O.close(); +} + +void write_pressure(MachineFunction &MF, LiveIntervals *LIS, raw_ostream &os) { + const GCNSubtarget *ST = &MF.getSubtarget(); + const auto *SIII = ST->getInstrInfo(); + const auto *SIRI = ST->getRegisterInfo(); + auto &MRI = MF.getRegInfo(); + write_function(MF, LIS, MRI, SIII, SIRI, os); + os.flush(); +} + +} // namespace pressure +}// namespace llvm + +namespace { +class ContributionList { +public: + ContributionList(MachineFunction &MF) : MF(MF){}; + void build(); + bool propagateContribution(); + MachineFunction &MF; + DenseMap MIIndexMap; + // Set of inst which contribute to build the key MachineInstr. + DenseMap> MIContributorMap; + // Set of inst which been contributed by the key MachineInstr. 
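+  // (i.e. the direct users of the values defined by the key MachineInstr).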
+ DenseMap> MIContributedToMap; + void writeInst(MachineInstr &MI, const SIInstrInfo *SIII, raw_ostream &os); + void writeBlock(MachineBasicBlock &MBB, const SIInstrInfo *SIII, + raw_ostream &os); + void write(raw_ostream &os); +}; + +void buildMIContribution(MachineInstr &MI, + DenseSet &ContributorSet, + DenseSet &ContributedSet, + const SIRegisterInfo &SIRI, MachineRegisterInfo &MRI) { + for (MachineOperand &UseMO : MI.uses()) { + if (!UseMO.isReg()) + continue; + Register Reg = UseMO.getReg(); + if (Reg.isPhysical()) + continue; + if (UseMO.isImplicit()) { + // if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO || + // Reg == AMDGPU::SCC) + continue; + } + for (MachineInstr &DefMI : MRI.def_instructions(Reg)) { + ContributorSet.insert(&DefMI); + } + } + + for (MachineOperand &DstMO : MI.defs()) { + if (!DstMO.isReg()) + continue; + if (DstMO.isImplicit()) + continue; + Register Reg = DstMO.getReg(); + if (Reg.isPhysical()) + continue; + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + ContributedSet.insert(&UseMI); + } + } +} + +bool ContributionList::propagateContribution() { + bool bUpdated = false; + ReversePostOrderTraversal RPOT(&MF); + for (auto *MBB : RPOT) { + for (auto &MI : *MBB) { + auto &contributors = MIContributorMap[&MI]; + unsigned size = contributors.size(); + DenseSet parentContributors; + for (auto *CMI : contributors) { + auto &pContributors = MIContributorMap[CMI]; + parentContributors.insert(pContributors.begin(), pContributors.end()); + } + contributors.insert(parentContributors.begin(), parentContributors.end()); + bUpdated |= size < contributors.size(); + } + } + return bUpdated; +} + +void ContributionList::build() { + // Build contribution. + auto &MRI = MF.getRegInfo(); + const GCNSubtarget *ST = &MF.getSubtarget(); + const auto *SIRI = ST->getRegisterInfo(); + for (auto &MBB : MF) { + for (auto &MI : MBB) { + auto &contributors = MIContributorMap[&MI]; + auto &contributed = MIContributedToMap[&MI]; + buildMIContribution(MI, contributors, contributed, *SIRI, MRI); + } + } + // propagate contribution. 
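+  // Iterate to a fixed point: every round of propagateContribution() unions a
+  // contributor's own contributors into each user's set, so on convergence
+  // MIContributorMap holds the transitive closure over the def-use chains.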
+ bool bUpdated = true; + while (bUpdated) { + bUpdated = propagateContribution(); + } +} + +void ContributionList::writeInst(MachineInstr &MI, const SIInstrInfo *SIII, + raw_ostream &os) { + os << "\n{\n"; + unsigned ID = MIIndexMap[&MI]; + auto writeSlot = [&ID, &os]() { os << ID; }; + + json_pair("ID", writeSlot, os); + + os << ","; + + auto writeAsm = [&MI, &SIII, &os]() { + MI.print(os, /*IsStandalone*/ true, /*SkipOpers*/ false, + /*SkipDebugLoc*/ true, /*AddNewLine*/ false, SIII); + }; + json_pair("asm", writeAsm, os); + + os << ",\n"; + + auto &contributors = MIContributorMap[&MI]; + auto writeContributor = [&contributors, this, &os]() { + for (auto *MI : contributors) { + unsigned ID = MIIndexMap[MI]; + os << ID << ","; + } + }; + + json_array("contributors", writeContributor, os); + os << ",\n"; + + auto &contributeds = MIContributedToMap[&MI]; + auto writeContributed = [&contributeds, this, &os]() { + for (auto *MI : contributeds) { + unsigned ID = MIIndexMap[MI]; + os << ID << ","; + } + }; + + json_array("contributed", writeContributed, os); + os << "\n}\n"; +} + +void ContributionList::writeBlock(MachineBasicBlock &MBB, + const SIInstrInfo *SIII, raw_ostream &os) { + os << "{\n"; + auto writeName = [&MBB, &os]() { os << MBB.getName(); }; + json_pair("name", writeName, os); + + os << ","; + + auto writeIndex = [&MBB, &os]() { os << MBB.getNumber(); }; + json_pair("id", writeIndex, os); + + os << ",\n"; + + auto writeInsts = [this, &MBB, &SIII, &os]() { + for (MachineInstr &MI : MBB) { + if (MI.isDebugInstr()) + continue; + writeInst(MI, SIII, os); + os << ",\n"; + } + }; + + json_array("instructions", writeInsts, os); + + os << ",\n"; + + auto writePreds = [&MBB, &os]() { + for (MachineBasicBlock *Pred : MBB.predecessors()) { + os << Pred->getNumber() << ","; + } + }; + + json_array("preds", writePreds, os); + + os << ","; + + auto writeSuccs = [&MBB, &os]() { + for (MachineBasicBlock *Succ : MBB.successors()) { + os << Succ->getNumber() << ","; + } + }; + + json_array("succs", writeSuccs, os); + + os << "}"; +} + +void ContributionList::write(raw_ostream &os) { + unsigned ID = 0; + // Build ID for write. + ReversePostOrderTraversal RPOT(&MF); + for (auto *MBB : RPOT) { + for (auto &MI : *MBB) { + MIIndexMap[&MI] = ID++; + } + } + + const GCNSubtarget *ST = &MF.getSubtarget(); + const auto *SIII = ST->getInstrInfo(); + + os << "{\n"; + auto writeName = [this, &os]() { os << MF.getName(); }; + json_pair("name", writeName, os); + + os << ",\n"; + + auto writeBlocks = [this, &SIII, &RPOT, &os]() { + for (auto *MBB : RPOT) { + writeBlock(*MBB, SIII, os); + os << ",\n"; + } + }; + + json_array("blocks", writeBlocks, os); + + os << "\n}"; +} +} // namespace + +namespace llvm { + +void write_contribution_list(llvm::MachineFunction &MF, const char *Filename) { + int FD = -1; + SmallString<128> TmpFilename(Filename); + std::error_code EC = sys::fs::createUniqueFile(TmpFilename, FD, TmpFilename); + if (EC) { + errs() << "Error: " << EC.message() << "\n"; + return; + } + + raw_fd_ostream O(FD, /*shouldClose=*/true); + ContributionList CL(MF); + CL.build(); + + CL.write(O); + + O.flush(); + O.close(); +} +} // namespace llvm + +static bool IsPhysReg(const MachineOperand &Op) +{ + return Op.isReg() && Op.getReg().isPhysical(); +} + +// Sometimes split bb uses physical registers defined in BB, have to add them to +// live-in or the ir is malformed. 
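+
+// Illustrative-only call-site sketch (editorial example, not part of this
+// patch): after splitting a block, the physreg live-ins of the new tail block
+// are rebuilt with the helper defined below. llvm::split() is assumed here to
+// split the block at SplitMI and return the new tail block.
+#if 0
+static void ExampleFixupAfterSplit(MachineInstr *SplitMI)
+{
+    MachineFunction &MF = *SplitMI->getMF();
+    MachineBasicBlock *Tail = llvm::split(SplitMI);
+    // Any physical register used in Tail before being redefined there must be
+    // added to Tail's live-ins, or the MIR verifier rejects the function.
+    llvm::UpdatePhysRegLiveInForBlock(Tail, &MF.getRegInfo());
+}
+#endif
+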
+void llvm::UpdatePhysRegLiveInForBlock(MachineBasicBlock *NewBB, const MachineRegisterInfo *MRI) +{ + // Initialize with current set of liveins. For new blocks this will be empty. + SmallDenseSet DefSet; + for (const MachineBasicBlock::RegisterMaskPair &P : NewBB->liveins()) + { + DefSet.insert(P.PhysReg); + } + + for (auto &MI : *NewBB) + { + // Add all undefined physical registers to the live in set. + for (MachineOperand &Use : MI.operands()) + { + // Only process physreg uses. + if (!IsPhysReg(Use) || !Use.isUse()) continue; + + // Reserved regs do not need to be tracked through live-in sets. + unsigned Reg = Use.getReg(); + if (Use.isImplicit() && MRI && MRI->isReserved(Reg)) continue; + + if (!DefSet.count(Reg)) + NewBB->addLiveIn(Reg); + } + + // Add all physical register defs (exlicit+implicit) to the def register set. + for (MachineOperand &Def : MI.operands()) + { + // Only process physreg defs. + if (!IsPhysReg(Def) || !Def.isDef()) continue; + DefSet.insert(Def.getReg()); + } + } +} + +void llvm::BuildPhysRegLiveInForBlock(MachineBasicBlock *NewBB, + SmallDenseSet &LiveOutSet, + const MachineRegisterInfo *MRI) { + for (auto rit = NewBB->rbegin(); rit != NewBB->rend(); rit++) { + auto &MI = *rit; + // Add all physical register defs (exlicit+implicit) to the def register + // set. + for (MachineOperand &Def : MI.operands()) { + // Only process physreg defs. + if (!IsPhysReg(Def) || !Def.isDef()) + continue; + LiveOutSet.erase(Def.getReg()); + } + // Add all undefined physical registers to the live in set. + for (MachineOperand &Use : MI.operands()) { + // Only process physreg uses. + if (!IsPhysReg(Use) || !Use.isUse()) + continue; + + // Reserved regs do not need to be tracked through live-in sets. + unsigned Reg = Use.getReg(); + if (Use.isImplicit() && MRI && MRI->isReserved(Reg)) + continue; + + if (!LiveOutSet.count(Reg)) + LiveOutSet.insert(Reg); + } + } + for (unsigned Reg : LiveOutSet) { + NewBB->addLiveIn(Reg); + } +} + +MachineReg llvm::CreateVirtualRegForOperand( + MachineOpcode Opcode, + unsigned OpNum, + MachineFunction &MF +) +{ + const TargetSubtargetInfo &ST = MF.getSubtarget(); + const TargetRegisterInfo *TRI = ST.getRegisterInfo(); + const TargetInstrInfo *TII = ST.getInstrInfo(); + const MCInstrDesc &Desc = TII->get(Opcode); + const TargetRegisterClass *RC = TII->getRegClass(Desc, OpNum, TRI, MF); + if (!RC) + { + llvm::report_fatal_error("Unable to create virtual reg for instruction operand"); + } + + MachineRegisterInfo &MRI = MF.getRegInfo(); + return MRI.createVirtualRegister(RC); +} + +MachineReg llvm::CreateVirtualDstReg( + MachineOpcode Opcode, + MachineFunction &MF +) +{ + return llvm::CreateVirtualRegForOperand(Opcode, 0, MF); +} + +// Return true if the MI is a copy of exec. +// If true then sets pDst to the destination register. +bool llvm::IsExecCopy(const MachineInstr &MI, MachineReg Exec, MachineReg *pDst) +{ + enum {DST=0, SRC=1}; + bool FoundCopy = false; + if (MI.getOpcode() == AMDGPU::COPY + || MI.getOpcode() == AMDGPU::S_MOV_B32 + || MI.getOpcode() == AMDGPU::S_MOV_B64) + { + const MachineOperand &Src = MI.getOperand(SRC); + if (Src.isReg() && Src.getReg() == Exec) + { + FoundCopy = true; + } + } +#if 0 // TODO: Delete this. 
+ else if (MI.getOpcode() == AMDGPU::AMDGPU_GET_ENTRY_ACTIVE_MASK_PSEUDO || + MI.getOpcode() == AMDGPU::AMDGPU_GET_ENTRY_ACTIVE_MASK_PSEUDO_32) + { + FoundCopy = true; + } +#endif + + if (FoundCopy) + { + *pDst = MI.getOperand(DST).getReg(); + } + + return FoundCopy; +} + +llvm::MachineRegWithSubReg llvm::GetWqmEntryActiveMask(MachineFunction &MF) +{ + llvm::MachineRegWithSubReg LiveLaneMask = {AMDGPU::NoRegister, AMDGPU::NoSubRegister}; + if (MachineInstr* MI = GetWqmEntryActiveMaskInst(MF)) + { + LiveLaneMask.Reg = MI->getOperand(0).getReg(); + LiveLaneMask.SubReg = MI->getOperand(0).getSubReg(); + } + + return LiveLaneMask; +} + +MachineInstr* llvm::GetWqmEntryActiveMaskInst(MachineFunction &MF) +{ +#if 0 // TODO: Get rid of this + // Look forward in the entry block for the SET_LIVE_LANE_MASK instruction. + // This instruction is added by the SIWholeQuadMode pass. + MachineBasicBlock &MBB = MF.front(); + for (MachineInstr &MI : MBB) + { + if (MI.getOpcode() == AMDGPU::AMDGPU_SET_LIVE_LANE_MASK || + MI.getOpcode() == AMDGPU::AMDGPU_SET_LIVE_LANE_MASK_32) + { + return &MI; + } + } +#endif + + return nullptr; +} + +bool llvm::IsFetchShaderCall(const MachineInstr *MI) +{ +#if 0 // TODO: Get rid of this. + return + MI->getOpcode() == AMDGPU::AMDGPU_CALL_FETCH_SHADER || + MI->getAMDGPUFlag(MachineInstr::AMDGPUMIFlag::FetchShaderCall); +#else + return false; +#endif +} + +bool llvm::IsSccLiveAt(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator MI) { + const TargetRegisterInfo* TRI = MBB->getParent()->getRegInfo().getTargetRegisterInfo(); + for (auto it = MI; it != MBB->end(); ++it) { + const MachineInstr &CurMI = *it; + // Hit use of scc, it is live. + if (CurMI.readsRegister(AMDGPU::SCC, TRI)) + return true; + // Hit def of scc first, not live. + if (CurMI.definesRegister(AMDGPU::SCC, TRI)) + return false; + } + // Reach the end of MBB, check live-ins of MBB successors. + for (const MachineBasicBlock *Succ : MBB->successors()) { + if (Succ->isLiveIn(AMDGPU::SCC)) + return true; + } + return false; +} + +// +// This function is useful for when we need to insert a new +// instruction that defines scc in a block and we need to find +// a location that will not smash the existing value. +// +// Starting at `BeforeInst` it will look backwards to try to find +// a place in the block where scc is dead so we can insert our new +// def there. If no location can be found it will save and restore +// scc around BeforeInst. This way BeforeInst can safely be used +// as the new insert location. +// +MachineBasicBlock::iterator llvm::FindOrCreateInsertionPointForSccDef( + MachineBasicBlock *MBB, + MachineBasicBlock::iterator MI, + const TargetRegisterInfo* TRI, + const SIInstrInfo* TII, + MachineRegisterInfo* MRI, + SccDefInsertPointConstraintFlags Constraints +) +{ + // If SCC is dead at MI when we can use MI as the insert point. + if (!llvm::IsSccLiveAt(MBB, MI)) + { + return MI; + } + + const bool CheckForExecWrite = + Constraints & SccDefInsertPointConstraintFlags::NoExecWrite; + + // Get the starting reverse iterator taking care to handle the MBB->end() case. + MachineBasicBlock::reverse_iterator Start; + if (MI == MBB->end()) + { + Start = MBB->rbegin(); + } + else + { + Start = MI.getReverse(); + } + + // Otherwise, walk backwards through the block looking for a location where + // SCC is dead. 
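+  // If an instruction defines SCC without also reading it, the SCC value just
+  // before that instruction is already dead, so a new def can safely be
+  // inserted right in front of it.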
+ for (MachineBasicBlock::reverse_iterator It = Start, End = MBB->rend(); It != End; ++It) + { + // If the instruction modifies exec then we cannot use it as + // an insertion point (if that is a constraint from the caller). + // The check for EXEC works for both wave64 and wave32 because + // it will also catch writes to the subregisters (e.g. exec_lo). + if (CheckForExecWrite && It->modifiesRegister(AMDGPU::EXEC, TRI)) + { + break; + } + + if (It->modifiesRegister(AMDGPU::SCC, TRI) + && !It->readsRegister(AMDGPU::SCC, TRI)) + { + return It->getIterator(); + } + } + + // If no safe location can be found in the block we can save and restore + // SCC around MI. There is no way to directly read or write SCC so we use + // s_cselect to read the current value of SCC and s_cmp to write the saved + // value back to SCC. + // + // The generated code will look like this; + // + // S_CSELECT_B32 %SavedSCC, -1, 0 # Save SCC + // <----- Newly created safe insert point. + // MI + // S_CMP_LG_U32 %SavedSCC, 0 # Restore SCC + // + unsigned int TmpScc = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + DebugLoc DL = MI->getDebugLoc(); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), TmpScc) + .addImm(-1) + .addImm(0); + BuildMI(*MBB, std::next(MI->getIterator()), DL, TII->get(AMDGPU::S_CMP_LG_U32)) + .addReg(TmpScc, RegState::Kill) + .addImm(0); + + return MI; +} + + +namespace { +bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes, + SmallDenseSet &touchedMBBSet) { + MachineInstr *startMI = Indexes->getInstructionFromIndex(Seg->start); + MachineInstr *endMI = Indexes->getInstructionFromIndex(Seg->end); + // Treat non inst as not local. + if (!startMI || !endMI) + return false; + // is local when parent MBB the same. + bool bSameMBB = startMI->getParent() == endMI->getParent(); + if (!bSameMBB) + return false; + // Collect touched MBB. + MachineBasicBlock *MBB = startMI->getParent(); + touchedMBBSet.insert(MBB); + return true; +} + +bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes, + SmallDenseSet &touchedMBBSet) { + for (const LiveRange::Segment &Seg : Range->segments) { + if (!isLocalSegment(&Seg, Indexes, touchedMBBSet)) + return false; + } + return true; +} + +bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes) { + MachineInstr *startMI = Indexes->getInstructionFromIndex(Seg->start); + MachineInstr *endMI = Indexes->getInstructionFromIndex(Seg->end); + // Treat non inst as not local. + if (!startMI || !endMI) + return false; + // is local when parent MBB the same. + return startMI->getParent() == endMI->getParent(); +} + +bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes) { + for (const LiveRange::Segment &Seg : Range->segments) { + if (!isLocalSegment(&Seg, Indexes)) + return false; + } + return true; +} + +} // namespace + +// In case like float4 v, v.x used and defined in one block, v.y used and define +// in another block, one live interval could touch more than one MBB. +// touchedMBBSet is used for scheduling where local live interval could cross +// multiple regions, need to calculate livereg for each region inside touched +// MBB. 
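+
+// An illustrative-only sketch of that scheduling use (editorial example, not
+// part of this patch), driving the overload defined just below. LIS is assumed
+// to be the pass's LiveIntervals analysis, and the exact SmallDenseSet
+// instantiation is an assumption.
+#if 0
+static void ExampleSkipLocalIntervals(LiveIntervals *LIS,
+                                      const MachineRegisterInfo &MRI) {
+  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+    Register Reg = Register::index2VirtReg(I);
+    if (!LIS->hasInterval(Reg))
+      continue;
+    SmallDenseSet<MachineBasicBlock *, 2> TouchedMBBs;
+    if (llvm::isLocalLiveInterval(LIS->getInterval(Reg), LIS->getSlotIndexes(),
+                                  TouchedMBBs)) {
+      // The interval never crosses a block boundary, so only the regions
+      // inside TouchedMBBs need their live sets recomputed; it contributes
+      // nothing to block live-in/live-out.
+    }
+  }
+}
+#endif
+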
+bool llvm::isLocalLiveInterval( + const LiveInterval &LI, SlotIndexes *Indexes, + SmallDenseSet &touchedMBBSet) { + if (LI.hasSubRanges()) { + for (const auto &S : LI.subranges()) { + if (!isLocalLiveRange(&S, Indexes, touchedMBBSet)) + return false; + } + } + return isLocalLiveRange(&LI, Indexes, touchedMBBSet); +} + + +bool llvm::isLocalLiveInterval( + const LiveInterval &LI, SlotIndexes *Indexes) { + if (LI.hasSubRanges()) { + for (const auto &S : LI.subranges()) { + if (!isLocalLiveRange(&S, Indexes)) + return false; + } + } + return isLocalLiveRange(&LI, Indexes); +} + +// This is used to speed up reg pressure calculation. +// If instruction is moved, the cached liveset will be out of date. +// Before instruction is moved, the value will be correct. +void llvm::buildEndLiveMap( + llvm::LiveIntervals *LIS, llvm::MachineFunction &MF, + const llvm::MachineRegisterInfo &MRI, + llvm::DenseMap + &MBBLiveMap, bool After) { + // When only have one block, end live reg must be empty. + if (MF.size() == 1) + return; + auto *SlotIndexes = LIS->getSlotIndexes(); + DenseMap MBBOutputSlotMap; + for (MachineBasicBlock &MBB : MF) { + auto BBEnd = MBB.rbegin(); + + // R.End doesn't point to the boundary instruction. + // Skip Debug instr. + if (llvm::GetNonDebugMBBEnd(BBEnd, MBB)) { + auto SI = SlotIndexes->getInstructionIndex(*BBEnd); + MBBOutputSlotMap[&MBB] = After ? SI.getDeadSlot() : SI.getBaseIndex(); + } + } + + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + auto Reg = Register::index2VirtReg(I); + if (!LIS->hasInterval(Reg)) + continue; + + LaneBitmask LiveMask; + const auto &LI = LIS->getInterval(Reg); + + // Skip local live interval to make live input/ouput faster. + if (llvm::isLocalLiveInterval(LI, SlotIndexes)) + continue; + + for (auto outputIt : MBBOutputSlotMap) { + MachineBasicBlock *MBB = outputIt.first; + auto SI = outputIt.second; + + auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI); + if (LiveMask.any()) + MBBLiveMap[MBB][Reg] = LiveMask; + } + } +} + +unsigned llvm::GetCurrentVGPRCount(llvm::MachineFunction &MF, const SIRegisterInfo *SIRI) { + auto &MRI = MF.getRegInfo(); + for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + return SIRI->getHWRegIndex(Reg) - SIRI->getHWRegIndex(AMDGPU::VGPR0) + 1; + } + } + return 0; +} + +unsigned llvm::GetCurrentSGPRCount(llvm::MachineFunction &MF, const SIRegisterInfo *SIRI) { + const SIMachineFunctionInfo *MFI = MF.getInfo(); + unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + unsigned MaxSGPR = 0; + for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + // Skip scratch reserved reg, which is a big register that don't really contribute to this stat. + if (ScratchRSrcReg != 0) { + if (SIRI->isSubRegister(ScratchRSrcReg, Reg)) + continue; + } + MaxSGPR = SIRI->getHWRegIndex(Reg) - SIRI->getHWRegIndex(AMDGPU::SGPR0); + break; + } + } + return 1 + llvm::RegForVCC + MaxSGPR; +} + +void llvm::dumpLiveSet(const LiveSet &LiveSet, + const SIRegisterInfo *SIRI) { + + dbgs() << "\n live set: \n"; + for (auto it : LiveSet) { + int Reg = it.first; + dbgs() << printReg(Reg, SIRI); + if (it.second.any()) { + dbgs() << " mask:" << it.second.getAsInteger(); + } + dbgs() << "\n"; + } +} + +// Test if all fast math flags of this Machine Instr are set. This allows +// all non-strict floating-point transforms. 
+bool llvm::isFastMathInst(llvm::MachineInstr &MI) { + // Follow the checks in isFast() in SelectionDAGNodes.h + return MI.getFlag(llvm::MachineInstr::MIFlag::FmNsz) && + MI.getFlag(llvm::MachineInstr::MIFlag::FmArcp) && + MI.getFlag(llvm::MachineInstr::MIFlag::FmNoNans) && + MI.getFlag(llvm::MachineInstr::MIFlag::FmNoInfs) && + MI.getFlag(llvm::MachineInstr::MIFlag::FmContract) && + MI.getFlag(llvm::MachineInstr::MIFlag::FmAfn) && + MI.getFlag(llvm::MachineInstr::MIFlag::FmReassoc); +} +#if 0 +bool llvm::IsLdsSpillSupportedForHwStage(xmd::HwStage Stage) +{ + switch (Stage) + { + case xmd::HwStage::PS: + case xmd::HwStage::CS: + return true; + default: + return false; + } +} +#endif + +MachineBasicBlock::succ_iterator llvm::FindSuccessor(llvm::MachineBasicBlock* MBB, llvm::MachineBasicBlock* Succ) +{ + for (MachineBasicBlock::succ_iterator It = MBB->succ_begin(), End = MBB->succ_end(); It != End; ++It) + { + if (*It == Succ) + { + return It; + } + } + + return MBB->succ_end(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h new file mode 100644 index 0000000000000..16b55c5c94583 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h @@ -0,0 +1,217 @@ +#pragma once + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/MC/LaneBitmask.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/CodeGen/MachineBasicBlock.h" + +namespace llvm { + +class MachineFunction; +class LiveIntervals; +class LiveInterval; +class MachineRegisterInfo; +class SIRegisterInfo; +class SIInstrInfo; +class MachineInstr; +class MachinePostDominatorTree; +class MachineLoopInfo; +class MachineDominatorTree; +class raw_ostream; +class TargetInstrInfo; +class TargetRegisterInfo; + +typedef unsigned MachineReg; +typedef unsigned MachineOpcode; + +constexpr unsigned RegForVCC = 2; +constexpr unsigned VGPR_LIMIT = 256; +// Post RA remat only try to help case when pressue is OK before RA but RA +// result is higher. The diff should not be too much. So just use 4 as threshold +// here. +constexpr unsigned PostRARematThreshHold = 4; + +using LiveSet = llvm::DenseMap; + +unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask, + const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI); +void CollectLiveSetPressure( + const LiveSet &liveSet, + const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI, + unsigned &VPressure, unsigned &SPressure); + +bool isExecUpdateForControlFlow(llvm::MachineInstr &MI); + +bool IsSub0Sub1SingleDef(unsigned Reg, const llvm::MachineRegisterInfo &MRI); + +llvm::LaneBitmask getRegMask(const llvm::MachineOperand &MO, + const llvm::MachineRegisterInfo &MRI); +void andLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet); +void andNotLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet); +void mergeLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet); +llvm::MachineBasicBlock *split(llvm::MachineInstr *I); + +// For inst like S_BUFFER_LOAD_DWORDX16, change to S_BUFFER_LOAD_DWORDX4 if only +// used 4 lanes. 
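+// Illustrative-only driver sketch (editorial example, not part of this patch)
+// for the shrink declared just below; MRI, TRI, TII and the slot indexes are
+// assumed to come from the pass invoking it. make_early_inc_range keeps the
+// iteration valid if an instruction is rewritten in place.
+#if 0
+static bool ExampleShrinkWideLoads(llvm::MachineFunction &MF,
+                                   llvm::MachineRegisterInfo &MRI,
+                                   const llvm::SIRegisterInfo *TRI,
+                                   const llvm::SIInstrInfo *TII,
+                                   llvm::SlotIndexes *SlotIndexes) {
+  bool Changed = false;
+  for (llvm::MachineBasicBlock &MBB : MF)
+    for (llvm::MachineInstr &MI : llvm::make_early_inc_range(MBB))
+      Changed |= llvm::removeUnusedLanes(MI, MRI, TRI, TII, SlotIndexes);
+  return Changed;
+}
+#endif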
+bool removeUnusedLanes(llvm::MachineInstr &MI, llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *TRI, + const llvm::SIInstrInfo *TII, + llvm::SlotIndexes *SlotIndexes); + +bool reach_block(llvm::MachineBasicBlock *FromBB, llvm::MachineDominatorTree *DT, + llvm::MachinePostDominatorTree *PDT, llvm::MachineLoopInfo *LI, + llvm::MachineBasicBlock *ToBB); + + +void viewCFGWithPhi(llvm::MachineFunction &MF); +void write_contribution_list(llvm::MachineFunction &MF, const char *Filename); + +llvm::MachineBasicBlock *CreateNullExportBlock(llvm::MachineFunction &MF, const llvm::SIInstrInfo *TII); + +bool GetNonDebugMBBEnd(llvm::MachineBasicBlock::reverse_iterator &BBEnd, + llvm::MachineBasicBlock &MBB); + +void UpdatePhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB, const llvm::MachineRegisterInfo *MRI); + +void BuildPhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB, + llvm::SmallDenseSet &LiveOutSet, + const llvm::MachineRegisterInfo *MRI); + +MachineReg CreateVirtualRegForOperand( + MachineOpcode Opcode, + unsigned Operand, + llvm::MachineFunction &MF +); + +MachineReg CreateVirtualDstReg( + MachineOpcode Opcode, + llvm::MachineFunction &MF +); + +bool IsExecCopy(const llvm::MachineInstr &MI, MachineReg Exec, MachineReg *pDst); +struct MachineRegWithSubReg { + MachineReg Reg = AMDGPU::NoRegister; + unsigned SubReg = AMDGPU::NoSubRegister; +}; +MachineRegWithSubReg GetWqmEntryActiveMask(llvm::MachineFunction &MF); +llvm::MachineInstr *GetWqmEntryActiveMaskInst(llvm::MachineFunction &MF); + +// Return true if this machine instruction represents a call to the fetch shader. +// We curently have two mechanisims for calling fetch shader: +// 1. The AMDGPU_CALL_FETCH_SHADER pseudo-instruction +// 2. A CALL instruction with the `FetchShaderCall` flag set to true. +bool IsFetchShaderCall(const llvm::MachineInstr* MI); + +bool IsSccLiveAt(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator MI); + + +// An enum used to pass additional constraints to +// `FindOrCreateInsertionPointForSccDef()`. This will further +// constrain the location where the scc def can be inserted. +enum SccDefInsertPointConstraintFlags +{ + None = 0, // No additional constraints. + NoExecWrite = 1, // Should be no modification of exec between BeforeInst and insert point. +}; + +// Look for a safe place to insert an instruction that defines scc. +// +// +// This function is useful for when we need to insert a new +// instruction that defines scc in a block and we need to find +// a location that will not smash the existing value. +// +// Starting at `BeforeInst` it will look backwards to try to find +// a place in the block where scc is dead so we can insert our new +// def there. If no location can be found it will save and restore +// scc around BeforeInst. This way BeforeInst can safely be used +// as the new insert location. +// +llvm::MachineBasicBlock::iterator FindOrCreateInsertionPointForSccDef( + llvm::MachineBasicBlock* MBB, + llvm::MachineBasicBlock::iterator BeforeInst, + const llvm::TargetRegisterInfo* TRI, + const llvm::SIInstrInfo* TII, + llvm::MachineRegisterInfo* MRI, + SccDefInsertPointConstraintFlags Constraints = SccDefInsertPointConstraintFlags::None +); + +// Check if LI live cross basic blocks, save all touched basic block if is +// local. 
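+// Before the liveness helpers below, an illustrative-only call-site sketch for
+// FindOrCreateInsertionPointForSccDef declared above (editorial example, not
+// part of this patch). S_AND_B64 is used because it implicitly defines SCC;
+// the register operands are assumptions.
+#if 0
+static void ExampleInsertSccClobber(llvm::MachineBasicBlock *MBB,
+                                    llvm::MachineBasicBlock::iterator BeforeInst,
+                                    const llvm::TargetRegisterInfo *TRI,
+                                    const llvm::SIInstrInfo *TII,
+                                    llvm::MachineRegisterInfo *MRI,
+                                    llvm::Register Dst, llvm::Register Src0,
+                                    llvm::Register Src1) {
+  // Either returns a point where SCC is already dead, or wraps BeforeInst in
+  // an SCC save/restore so that BeforeInst itself is safe to use.
+  llvm::MachineBasicBlock::iterator InsertPt =
+      llvm::FindOrCreateInsertionPointForSccDef(MBB, BeforeInst, TRI, TII, MRI);
+  llvm::BuildMI(*MBB, InsertPt, BeforeInst->getDebugLoc(),
+                TII->get(llvm::AMDGPU::S_AND_B64), Dst)
+      .addReg(Src0)
+      .addReg(Src1);
+}
+#endif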
+bool isLocalLiveInterval( + const llvm::LiveInterval &LI, llvm::SlotIndexes *Indexes, + llvm::SmallDenseSet &touchedMBBSet); +bool isLocalLiveInterval( + const llvm::LiveInterval &LI, llvm::SlotIndexes *Indexes); + +// build liveRegSet at end of each MBB. +void buildEndLiveMap( + llvm::LiveIntervals *LIS, llvm::MachineFunction &MF, + const llvm::MachineRegisterInfo &MRI, + llvm::DenseMap + &MBBLiveMap, bool After); + +void dumpLiveSet(const LiveSet &LiveSet, + const llvm::SIRegisterInfo *SIRI); + +unsigned GetCurrentVGPRCount(llvm::MachineFunction &MF, const llvm::SIRegisterInfo *SIRI); +unsigned GetCurrentSGPRCount(llvm::MachineFunction &MF, const llvm::SIRegisterInfo *SIRI); + +bool isFastMathInst(llvm::MachineInstr &MI); + +namespace pressure { +void print_reg(llvm::Register Reg, const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI, + llvm::raw_ostream &os); +void write_pressure(llvm::MachineFunction &MF, llvm::LiveIntervals *LIS, + const char *Filename); +void write_pressure(llvm::MachineFunction &MF, llvm::LiveIntervals *LIS, + llvm::raw_ostream &os); +} +// bool IsLdsSpillSupportedForHwStage(xmd::HwStage Stage); + +// Look for the successor `Succ` of the given `MBB`. +// Returns MBB->succ_end() if `Succ` is not a successor of MBB. +llvm::MachineBasicBlock::succ_iterator FindSuccessor(llvm::MachineBasicBlock* MBB, llvm::MachineBasicBlock* Succ); + +// The enum and helper function for v_perm selection mask. +// +// The input byte layout of v_perm is as below: +// +// BYTE in[8] +// in[0] = $src1_BYTE0; +// in[1] = $src1_BYTE1; +// in[2] = $src1_BYTE2; +// in[3] = $src1_BYTE3; +// in[4] = $src0_BYTE0; +// in[5] = $src0_BYTE1; +// in[6] = $src0_BYTE2; +// in[7] = $src0_BYTE3; +// +enum class V_PERM_IN_BYTE_POS { + src1_BYTE0 = 0, + src1_BYTE1, + src1_BYTE2, + src1_BYTE3, + src0_BYTE0, + src0_BYTE1, + src0_BYTE2, + src0_BYTE3 +}; + +// The 4 arguments specify which input byte will be output +// out[0] = Sel_0; +// out[1] = Sel_1; +// out[2] = Sel_2; +// out[3] = Sel_3; +// +constexpr int buildVPermSelectMask(V_PERM_IN_BYTE_POS Sel_0, + V_PERM_IN_BYTE_POS Sel_1, + V_PERM_IN_BYTE_POS Sel_2, + V_PERM_IN_BYTE_POS Sel_3) { + return (((int)Sel_3 << 24) | ((int)Sel_2 << 16) | + ((int)Sel_1 << 8) | (int)Sel_0); +} +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp new file mode 100644 index 0000000000000..ceb22b5ff9243 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp @@ -0,0 +1,2767 @@ +//===- MirDivergenceAnalysis.cpp -- Mir Divergence Analysis Implementation -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is based on Analysis/DivergenceAnalysis.cpp, +// The most important difference is +// introduction of the idea of "Bit-Divergence". +// +// The way booleans are represented in in AMD GPU is a 64-bit uint in a pair of +// scalar registers, where each bit represents a boolean value for one lane. If +// all active lanes have the same bool value (all 1's or all 0's), then we can +// generate a scalar branch, otherwise we must use exec mask to selectively +// execute lanes based on the boolean mask. 
When all values in a boolean mask +// are the same for all active lanes, we call that mask "bit-uniform", +// otherwise we call it "bit-divergent". This differs from the normal concept +// of "uniform" and "divergent", which represents whether the value may be +// different across the 64 lanes. A "bit-divergent" value is still "uniform" in +// the sense that it is the same 64-bit value from the perspective of all the +// lanes, but when used as branch condition, will cause the branch to be +// divergent, which will cause the uses of any values outside of the control +// flow region to be divergent. +// +// The original DA marks everything including bools as divergent or uniform +// based on the propagation of divergent sources. However, booleans in AMDGPU +// are in fact never "divergent". Comparison operations that receive divergent +// operands instead produce "bit-divergent" or "bit-uniform" 64-bit booleans. +// Between the definition of any boolean mask and its use (particularly in +// branches, cndmasks, or anything that specifially consumes booleans), there +// can be any arbitrary number and types of operations performed on it, +// including combining it with other boolean masks via bit operations. +// +// The XDA algorithm is a modified version of the original DA algorithm to +// simultaneously propagate regular divergence and bit-divergence. +// +// First off, XDA identifies all sources of divergence as well as +// bit-divergence and adds them to the worklist. Then, just like with LLVM DA, +// it pops values off of the worklist to propagate (bit-)divergence to all its +// users, unless the user is always (bit-)uniform when given (bit-)divergent +// operand. It's possible for a value to be marked as both divergent and +// bit-divergent, in which case the regular divergence will trump +// bit-divergence. +// +// The important difference in this propagation step is that there are special +// instructions that when given bit-divergent operands, produce divergent +// values and vice versa. +// +// An example is comparison: +// +// v0 = interp ... ; divergent +// v1 = interp ... ; divergent +// s[0:1] = v_cmp v0, v1 ; bit-divergent +// +// v0 and v1 are both divergent, but when propagating them, the v_cmp (and its +// result) is bit-divergent value instead of divergent. +// +// +// An example of the reverse: +// +// v0 = ... ; uniform +// s[0:1] = v_cmp v0, v1 ; bit-divergent +// ... +// branch s[0:1], label ; divergent! +// ... +// v1 = ... ; uniform +// ... +// +// label: +// v3 = phi v0, v1 ; divergent! because of divergent branch. +// +// The boolean value is bit-divergent. When passed to the branch as an operand, +// the branch becomes divergent, whose sync dependency will be computed as +// normal to mark the appropriate values divergent (see description in normal +// DA on how this works). +// +// Another difference is in MIR, some branch will be changed into exec update, +// so only propagate control flow divergent on branch inst will not cover exec +// control flow. +// For case like +// %163:sreg_64_xexec = S_MOV_B64 $exec +//bb.1: +//; predecessors: %bb.1, %bb.0 +// successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), %bb.2(50.00%) +// %162:vreg_512 = PHI %41:vreg_512, %bb.0, %40:vreg_512, %bb.1 +// %167:sgpr_32 = V_READFIRSTLANE_B32 %17:vgpr_32, implicit $exec +// %168:sreg_64 = V_CMP_EQ_U32_e64 %167:sgpr_32, %17:vgpr_32, implicit $exec +// %166:sreg_64 = S_AND_SAVEEXEC_B64 %168:sreg_64, implicit-def $exec, implicit-def $scc, implicit $exec +//... 
+// $exec = S_XOR_B64_term $exec, %166:sreg_64, implicit-def $scc +// S_CBRANCH_EXECNZ %bb.1, implicit $exec +// The ... code after SAVEEXEC will be divergent if %168 is divergent. +// The PHI should be divergent when %40 is inside the ... +// To propagate divergent from %168 to the PHI, need to start the propagate from +// SAVEEXEC which is the control flow by update exec. +// +// +// Original: +// This file implements a general divergence analysis for loop vectorization +// and GPU programs. It determines which branches and values in a loop or GPU +// program are divergent. It can help branch optimizations such as jump +// threading and loop unswitching to make better decisions. +// +// GPU programs typically use the SIMD execution model, where multiple threads +// in the same execution group have to execute in lock-step. Therefore, if the +// code contains divergent branches (i.e., threads in a group do not agree on +// which path of the branch to take), the group of threads has to execute all +// the paths from that branch with different subsets of threads enabled until +// they re-converge. +// +// Due to this execution model, some optimizations such as jump +// threading and loop unswitching can interfere with thread re-convergence. +// Therefore, an analysis that computes which branches in a GPU program are +// divergent can help the compiler to selectively run these optimizations. +// +// This implementation is derived from the Vectorization Analysis of the +// Region Vectorizer (RV). That implementation in turn is based on the approach +// described in +// +// Improving Performance of OpenCL on CPUs +// Ralf Karrenberg and Sebastian Hack +// CC '12 +// +// This DivergenceAnalysis implementation is generic in the sense that it does +// not itself identify original sources of divergence. +// Instead specialized adapter classes, (LoopDivergenceAnalysis) for loops and +// (GPUDivergenceAnalysis) for GPU programs, identify the sources of divergence +// (e.g., special variables that hold the thread ID or the iteration variable). +// +// The generic implementation propagates divergence to variables that are data +// or sync dependent on a source of divergence. +// +// While data dependency is a well-known concept, the notion of sync dependency +// is worth more explanation. Sync dependence characterizes the control flow +// aspect of the propagation of branch divergence. For example, +// +// %cond = icmp slt i32 %tid, 10 +// br i1 %cond, label %then, label %else +// then: +// br label %merge +// else: +// br label %merge +// merge: +// %a = phi i32 [ 0, %then ], [ 1, %else ] +// +// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid +// because %tid is not on its use-def chains, %a is sync dependent on %tid +// because the branch "br i1 %cond" depends on %tid and affects which value %a +// is assigned to. +// +// The sync dependence detection (which branch induces divergence in which join +// points) is implemented in the SyncDependenceAnalysis. +// +// The current DivergenceAnalysis implementation has the following limitations: +// 1. intra-procedural. It conservatively considers the arguments of a +// non-kernel-entry function and the return value of a function call as +// divergent. +// 2. memory as black box. It conservatively considers values loaded from +// generic or local address as divergent. This can be improved by leveraging +// pointer analysis and/or by modelling non-escaping memory objects in SSA +// as done in RV. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUMirDivergenceAnalysis.h" +#include "GCNSubtarget.h" +#include "AMDGPUSubtarget.h" +#include "Utils/AMDGPUAsmUtils.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "TargetInfo/AMDGPUTargetInfo.h" +#include "SIInstrInfo.h" +//#include "llvm/Analysis/Passes.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/Support/Debug.h" +//#include "newbe/cli/newbe_opts.h" // AMDGPU change. +#include "llvm/Support/raw_ostream.h" +#include + +using namespace llvm; + +#define DEBUG_TYPE "mir-divergence-analysis" + +namespace llvm { +bool isAMDGPUOpcodeDivergent(class MachineInstr *MI); +} + +// +// TODO: TableGen these +// +bool llvm::isAMDGPUOpcodeDivergent(class MachineInstr *MI) { + switch (MI->getOpcode()) { + // case R600::INTERP_LOAD_P0: + // case R600::INTERP_PAIR_XY: + // case R600::INTERP_PAIR_ZW: + // case R600::INTERP_VEC_LOAD: + // case R600::INTERP_XY: + // case R600::INTERP_ZW: + case AMDGPU::V_WRITELANE_B32: + + case AMDGPU::V_INTERP_MOV_F32: + case AMDGPU::V_INTERP_MOV_F32_e64: + case AMDGPU::V_INTERP_MOV_F32_e64_vi: + case AMDGPU::V_INTERP_MOV_F32_si: + case AMDGPU::V_INTERP_MOV_F32_vi: + case AMDGPU::V_INTERP_P1LL_F16: + case AMDGPU::V_INTERP_P1LL_F16_vi: + case AMDGPU::V_INTERP_P1LV_F16: + case AMDGPU::V_INTERP_P1LV_F16_vi: + case AMDGPU::V_INTERP_P1_F32: + case AMDGPU::V_INTERP_P1_F32_16bank: + case AMDGPU::V_INTERP_P1_F32_16bank_si: + case AMDGPU::V_INTERP_P1_F32_16bank_vi: + case AMDGPU::V_INTERP_P1_F32_e64: + case AMDGPU::V_INTERP_P1_F32_e64_vi: + case AMDGPU::V_INTERP_P1_F32_si: + case AMDGPU::V_INTERP_P1_F32_vi: + case AMDGPU::V_INTERP_P2_F16: + case AMDGPU::V_INTERP_P2_F16_vi: + case AMDGPU::V_INTERP_P2_F32: + case AMDGPU::V_INTERP_P2_F32_e64: + case AMDGPU::V_INTERP_P2_F32_e64_vi: + case AMDGPU::V_INTERP_P2_F32_si: + case AMDGPU::V_INTERP_P2_F32_vi: + + case AMDGPU::V_MBCNT_HI_U32_B32_e32: + case AMDGPU::V_MBCNT_HI_U32_B32_e32_gfx6_gfx7: + case AMDGPU::V_MBCNT_HI_U32_B32_e64: + case AMDGPU::V_MBCNT_HI_U32_B32_e64_gfx10: + case AMDGPU::V_MBCNT_HI_U32_B32_e64_gfx6_gfx7: + case AMDGPU::V_MBCNT_HI_U32_B32_e64_vi: + case AMDGPU::V_MBCNT_HI_U32_B32_sdwa: + case AMDGPU::V_MBCNT_LO_U32_B32_e32: + case AMDGPU::V_MBCNT_LO_U32_B32_e32_gfx6_gfx7: + case AMDGPU::V_MBCNT_LO_U32_B32_e64: + case AMDGPU::V_MBCNT_LO_U32_B32_e64_gfx10: + case AMDGPU::V_MBCNT_LO_U32_B32_e64_gfx6_gfx7: + case AMDGPU::V_MBCNT_LO_U32_B32_e64_vi: + case AMDGPU::V_MBCNT_LO_U32_B32_sdwa: + + case AMDGPU::BUFFER_ATOMIC_ADD_ADDR64: + case AMDGPU::BUFFER_ATOMIC_ADD_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_ADD_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN: + case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_gfx6_gfx7: + case 
AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_ADDR64: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_AND_ADDR64: + case AMDGPU::BUFFER_ATOMIC_AND_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_AND_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_AND_IDXEN: + case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_vi: + 
case AMDGPU::BUFFER_ATOMIC_AND_OFFEN: + case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_AND_OFFSET: + case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_AND_X2_ADDR64: + case AMDGPU::BUFFER_ATOMIC_AND_X2_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_AND_X2_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_X2_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN: + case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_gfx6_gfx7: + case 
AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_ADDR64: + case AMDGPU::BUFFER_ATOMIC_DEC_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_DEC_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN: + case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_RTN_gfx10: + case 
AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_ADDR64: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_INC_ADDR64: + case AMDGPU::BUFFER_ATOMIC_INC_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_INC_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_INC_IDXEN: + case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_RTN_gfx10: + case 
AMDGPU::BUFFER_ATOMIC_INC_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_INC_OFFEN: + case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_INC_OFFSET: + case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_INC_X2_ADDR64: + case AMDGPU::BUFFER_ATOMIC_INC_X2_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_INC_X2_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_X2_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN: + case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_OR_ADDR64: + case AMDGPU::BUFFER_ATOMIC_OR_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_OR_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_OR_IDXEN: + case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_RTN_gfx6_gfx7: + case 
AMDGPU::BUFFER_ATOMIC_OR_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_OR_OFFEN: + case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_OR_OFFSET: + case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_OR_X2_ADDR64: + case AMDGPU::BUFFER_ATOMIC_OR_X2_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_OR_X2_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_X2_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN: + case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_ADDR64: + case AMDGPU::BUFFER_ATOMIC_SMAX_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_SMAX_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN: + case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_RTN_vi: + case 
AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_ADDR64: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_ADDR64: + case AMDGPU::BUFFER_ATOMIC_SMIN_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_SMIN_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN: + case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_RTN_gfx6_gfx7: + case 
AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_ADDR64: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_ADDR64: + case AMDGPU::BUFFER_ATOMIC_SUB_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_SUB_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN: + case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_RTN_gfx10: + case 
AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_ADDR64: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_ADDR64: + case AMDGPU::BUFFER_ATOMIC_SWAP_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_SWAP_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN: + case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_RTN_gfx10: + case 
AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_ADDR64: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_ADDR64: + case AMDGPU::BUFFER_ATOMIC_UMAX_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_UMAX_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN: + case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_RTN: + case 
AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_ADDR64: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_ADDR64: + case AMDGPU::BUFFER_ATOMIC_UMIN_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_UMIN_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN: + case 
AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_ADDR64: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_ADDR64: + case AMDGPU::BUFFER_ATOMIC_XOR_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_XOR_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_vi: + case 
AMDGPU::BUFFER_ATOMIC_XOR_IDXEN: + case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_ADDR64: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_vi: + + case AMDGPU::IMAGE_ATOMIC_ADD_V1_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_ADD_V1_V1_si: + case AMDGPU::IMAGE_ATOMIC_ADD_V1_V1_vi: + case AMDGPU::IMAGE_ATOMIC_ADD_V2_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_ADD_V2_V1_si: + case AMDGPU::IMAGE_ATOMIC_ADD_V2_V1_vi: + case AMDGPU::IMAGE_ATOMIC_ADD_V1_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_ADD_V1_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_ADD_V1_V2_si: + case AMDGPU::IMAGE_ATOMIC_ADD_V1_V2_vi: + case AMDGPU::IMAGE_ATOMIC_ADD_V2_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_ADD_V2_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_ADD_V2_V2_si: + case AMDGPU::IMAGE_ATOMIC_ADD_V2_V2_vi: + 
case AMDGPU::IMAGE_ATOMIC_ADD_V1_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_ADD_V1_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_ADD_V1_V4_si: + case AMDGPU::IMAGE_ATOMIC_ADD_V1_V4_vi: + case AMDGPU::IMAGE_ATOMIC_ADD_V2_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_ADD_V2_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_ADD_V2_V4_si: + case AMDGPU::IMAGE_ATOMIC_ADD_V2_V4_vi: + case AMDGPU::IMAGE_ATOMIC_AND_V1_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_AND_V1_V1_si: + case AMDGPU::IMAGE_ATOMIC_AND_V1_V1_vi: + case AMDGPU::IMAGE_ATOMIC_AND_V2_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_AND_V2_V1_si: + case AMDGPU::IMAGE_ATOMIC_AND_V2_V1_vi: + case AMDGPU::IMAGE_ATOMIC_AND_V1_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_AND_V1_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_AND_V1_V2_si: + case AMDGPU::IMAGE_ATOMIC_AND_V1_V2_vi: + case AMDGPU::IMAGE_ATOMIC_AND_V2_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_AND_V2_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_AND_V2_V2_si: + case AMDGPU::IMAGE_ATOMIC_AND_V2_V2_vi: + case AMDGPU::IMAGE_ATOMIC_AND_V1_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_AND_V1_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_AND_V1_V4_si: + case AMDGPU::IMAGE_ATOMIC_AND_V1_V4_vi: + case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_si: + case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_vi: + //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_gfx10: + //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_si: + //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_vi: + case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_si: + case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_vi: + //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_gfx10: + //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_nsa_gfx10: + //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_si: + //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_vi: + case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_si: + case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_vi: + //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_gfx10: + //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_nsa_gfx10: + //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_si: + //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_vi: + case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_si: + case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_vi: + case AMDGPU::IMAGE_ATOMIC_DEC_V1_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_DEC_V1_V1_si: + case AMDGPU::IMAGE_ATOMIC_DEC_V1_V1_vi: + case AMDGPU::IMAGE_ATOMIC_DEC_V2_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_DEC_V2_V1_si: + case AMDGPU::IMAGE_ATOMIC_DEC_V2_V1_vi: + case AMDGPU::IMAGE_ATOMIC_DEC_V1_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_DEC_V1_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_DEC_V1_V2_si: + case AMDGPU::IMAGE_ATOMIC_DEC_V1_V2_vi: + case AMDGPU::IMAGE_ATOMIC_DEC_V2_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_DEC_V2_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_DEC_V2_V2_si: + case AMDGPU::IMAGE_ATOMIC_DEC_V2_V2_vi: + case AMDGPU::IMAGE_ATOMIC_DEC_V1_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_DEC_V1_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_DEC_V1_V4_si: + case AMDGPU::IMAGE_ATOMIC_DEC_V1_V4_vi: + case AMDGPU::IMAGE_ATOMIC_DEC_V2_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_DEC_V2_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_DEC_V2_V4_si: + case AMDGPU::IMAGE_ATOMIC_DEC_V2_V4_vi: + case AMDGPU::IMAGE_ATOMIC_INC_V1_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_INC_V1_V1_si: + case AMDGPU::IMAGE_ATOMIC_INC_V1_V1_vi: + case AMDGPU::IMAGE_ATOMIC_INC_V2_V1_gfx10: + case 
AMDGPU::IMAGE_ATOMIC_INC_V2_V1_si: + case AMDGPU::IMAGE_ATOMIC_INC_V2_V1_vi: + case AMDGPU::IMAGE_ATOMIC_INC_V1_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_INC_V1_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_INC_V1_V2_si: + case AMDGPU::IMAGE_ATOMIC_INC_V1_V2_vi: + case AMDGPU::IMAGE_ATOMIC_INC_V2_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_INC_V2_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_INC_V2_V2_si: + case AMDGPU::IMAGE_ATOMIC_INC_V2_V2_vi: + case AMDGPU::IMAGE_ATOMIC_INC_V1_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_INC_V1_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_INC_V1_V4_si: + case AMDGPU::IMAGE_ATOMIC_INC_V1_V4_vi: + case AMDGPU::IMAGE_ATOMIC_INC_V2_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_INC_V2_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_INC_V2_V4_si: + case AMDGPU::IMAGE_ATOMIC_INC_V2_V4_vi: + case AMDGPU::IMAGE_ATOMIC_OR_V1_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_OR_V1_V1_si: + case AMDGPU::IMAGE_ATOMIC_OR_V1_V1_vi: + case AMDGPU::IMAGE_ATOMIC_OR_V2_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_OR_V2_V1_si: + case AMDGPU::IMAGE_ATOMIC_OR_V2_V1_vi: + case AMDGPU::IMAGE_ATOMIC_OR_V1_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_OR_V1_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_OR_V1_V2_si: + case AMDGPU::IMAGE_ATOMIC_OR_V1_V2_vi: + case AMDGPU::IMAGE_ATOMIC_OR_V2_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_OR_V2_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_OR_V2_V2_si: + case AMDGPU::IMAGE_ATOMIC_OR_V2_V2_vi: + case AMDGPU::IMAGE_ATOMIC_OR_V1_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_OR_V1_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_OR_V1_V4_si: + case AMDGPU::IMAGE_ATOMIC_OR_V1_V4_vi: + case AMDGPU::IMAGE_ATOMIC_OR_V2_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_OR_V2_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_OR_V2_V4_si: + case AMDGPU::IMAGE_ATOMIC_OR_V2_V4_vi: + case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V1_si: + case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V1_vi: + case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V1_si: + case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V1_vi: + case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V2_si: + case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V2_vi: + case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V2_si: + case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V2_vi: + case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V4_si: + case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V4_vi: + case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V4_si: + case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V4_vi: + case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V1_si: + case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V1_vi: + case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V1_si: + case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V1_vi: + case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V2_si: + case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V2_vi: + case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V2_si: + case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V2_vi: + case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V4_si: + case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V4_vi: + case 
AMDGPU::IMAGE_ATOMIC_SMIN_V2_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V4_si: + case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V4_vi: + case AMDGPU::IMAGE_ATOMIC_SUB_V1_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_SUB_V1_V1_si: + case AMDGPU::IMAGE_ATOMIC_SUB_V1_V1_vi: + case AMDGPU::IMAGE_ATOMIC_SUB_V2_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_SUB_V2_V1_si: + case AMDGPU::IMAGE_ATOMIC_SUB_V2_V1_vi: + case AMDGPU::IMAGE_ATOMIC_SUB_V1_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_SUB_V1_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SUB_V1_V2_si: + case AMDGPU::IMAGE_ATOMIC_SUB_V1_V2_vi: + case AMDGPU::IMAGE_ATOMIC_SUB_V2_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_SUB_V2_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SUB_V2_V2_si: + case AMDGPU::IMAGE_ATOMIC_SUB_V2_V2_vi: + case AMDGPU::IMAGE_ATOMIC_SUB_V1_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_SUB_V1_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SUB_V1_V4_si: + case AMDGPU::IMAGE_ATOMIC_SUB_V1_V4_vi: + case AMDGPU::IMAGE_ATOMIC_SUB_V2_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_SUB_V2_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SUB_V2_V4_si: + case AMDGPU::IMAGE_ATOMIC_SUB_V2_V4_vi: + case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V1_si: + case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V1_vi: + case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V1_si: + case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V1_vi: + case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V2_si: + case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V2_vi: + case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V2_si: + case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V2_vi: + case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V4_si: + case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V4_vi: + case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V4_si: + case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V4_vi: + case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V1_si: + case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V1_vi: + case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V1_si: + case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V1_vi: + case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V2_si: + case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V2_vi: + case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V2_si: + case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V2_vi: + case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V4_si: + case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V4_vi: + case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V4_si: + case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V4_vi: + case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V1_si: + case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V1_vi: + case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V1_si: + case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V1_vi: + case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V2_si: + case 
AMDGPU::IMAGE_ATOMIC_UMIN_V1_V2_vi: + case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V2_si: + case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V2_vi: + case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V4_si: + case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V4_vi: + case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V4_si: + case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V4_vi: + case AMDGPU::IMAGE_ATOMIC_XOR_V1_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_XOR_V1_V1_si: + case AMDGPU::IMAGE_ATOMIC_XOR_V1_V1_vi: + case AMDGPU::IMAGE_ATOMIC_XOR_V2_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_XOR_V2_V1_si: + case AMDGPU::IMAGE_ATOMIC_XOR_V2_V1_vi: + case AMDGPU::IMAGE_ATOMIC_XOR_V1_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_XOR_V1_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_XOR_V1_V2_si: + case AMDGPU::IMAGE_ATOMIC_XOR_V1_V2_vi: + case AMDGPU::IMAGE_ATOMIC_XOR_V2_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_XOR_V2_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_XOR_V2_V2_si: + case AMDGPU::IMAGE_ATOMIC_XOR_V2_V2_vi: + case AMDGPU::IMAGE_ATOMIC_XOR_V1_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_XOR_V1_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_XOR_V1_V4_si: + case AMDGPU::IMAGE_ATOMIC_XOR_V1_V4_vi: + case AMDGPU::IMAGE_ATOMIC_XOR_V2_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_XOR_V2_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_XOR_V2_V4_si: + case AMDGPU::IMAGE_ATOMIC_XOR_V2_V4_vi: + + case AMDGPU::SI_PS_LIVE: + + case AMDGPU::DS_SWIZZLE_B32: + case AMDGPU::DS_SWIZZLE_B32_gfx10: + case AMDGPU::DS_SWIZZLE_B32_gfx6_gfx7: + case AMDGPU::DS_SWIZZLE_B32_vi: + + return true; + + default: + break; + } + return false; +} + +namespace { +bool hasImmOperandWithVal(const MachineInstr *MI, uint16_t srcNameIdx, + uint16_t srcModNameIdx, uint64_t Val) { + unsigned Op = MI->getOpcode(); + unsigned srcIdx = AMDGPU::getNamedOperandIdx(Op, srcNameIdx); + if (srcIdx == -1) + return false; + const MachineOperand &srcMO = MI->getOperand(srcIdx); + if (srcMO.isImm() && srcMO.getImm() == Val) { + + unsigned modIdx = AMDGPU::getNamedOperandIdx(Op, srcModNameIdx); + if (modIdx == -1) + return true; + + const MachineOperand &modMO = MI->getOperand(modIdx); + if (modMO.getImm() == 0) + return true; + } + return false; +} + +bool isConstant(const MachineInstr *MI) { + unsigned Op = MI->getOpcode(); + switch (Op) { + default: + break; + case AMDGPU::V_OR_B32_e32: + case AMDGPU::V_OR_B32_e64: { + // Check special case or -1, which will get result -1. + const uint64_t kImm = -1; + if (hasImmOperandWithVal(MI, AMDGPU::OpName::src0, + AMDGPU::OpName::src0_modifiers, kImm)) + return true; + if (hasImmOperandWithVal(MI, AMDGPU::OpName::src1, + AMDGPU::OpName::src1_modifiers, kImm)) + return true; + } break; + case AMDGPU::S_OR_B32: + case AMDGPU::S_OR_B64: { + // Check special case or -1, which will get result -1. + const uint64_t kImm = -1; + if (hasImmOperandWithVal(MI, AMDGPU::OpName::src0, + AMDGPU::OpName::src0_modifiers, kImm)) + return true; + if (hasImmOperandWithVal(MI, AMDGPU::OpName::src1, + AMDGPU::OpName::src1_modifiers, kImm)) + return true; + } break; + case AMDGPU::S_AND_B32: + case AMDGPU::S_AND_B64: + case AMDGPU::V_AND_B32_e32: + case AMDGPU::V_AND_B32_e64: { + // Check special case and 0, which will get result 0. 
+    const uint64_t kImm = 0;
+    if (hasImmOperandWithVal(MI, AMDGPU::OpName::src0,
+                             AMDGPU::OpName::src0_modifiers, kImm))
+      return true;
+    if (hasImmOperandWithVal(MI, AMDGPU::OpName::src1,
+                             AMDGPU::OpName::src1_modifiers, kImm))
+      return true;
+  } break;
+  }
+  return false;
+}
+
+bool writeBoolDst(const MachineInstr *MI, const SIRegisterInfo *SIRI,
+                  const MachineRegisterInfo &MRI) {
+  const auto *BoolRC = SIRI->getBoolRC();
+  for (const MachineOperand &MO : MI->operands()) {
+    if (!MO.isReg())
+      continue;
+    if (MO.isUse())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO ||
+        Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO)
+      return true;
+
+    // Check if the written register class overlaps the bool register class.
+    //
+    // Note that this check is insufficient to catch all of the cases where
+    // a "bool" value could be created (for example writing to a register
+    // pair s[0:1], then using s0 as a bool value in wave32).
+    //
+    // The underlying problem is that we have two notions of divergence
+    // (bit divergence and wave divergence) but the algorithm only propagates
+    // wave divergence. Bit divergence matters for bools because it determines
+    // whether a branch is uniform or not (and thus catches cases where a
+    // uniform value is used outside of a divergent control flow region). For
+    // bool values the algorithm treats normally uniform values (i.e. scalar
+    // registers) as divergent in order to try and propagate bit divergence.
+    //
+    // To fix all of the possible bugs here we would need to actually propagate
+    // bit divergence as well as wave divergence. That is a bigger fix; this
+    // check should cover most cases of treating a bool value as divergent.
+    const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg);
+    if (SIRI->getCommonSubClass(BoolRC, RC))
+      return true;
+  }
+  return false;
+}
+
+bool isAlwaysUniformMI(const MachineInstr *MI, const SIInstrInfo *SIII,
+                       const SIRegisterInfo *SIRI,
+                       const MachineRegisterInfo &MRI) {
+  unsigned Op = MI->getOpcode();
+  switch (Op) {
+  default:
+    // Mark all scalar instructions as always uniform unless they write a bool
+    // dst. This does not mean the result is bit uniform; branch/exec region
+    // checks use isBitUniform for that. A bool may live in an sreg and still
+    // be divergent, since it simply packs one bit per lane into a single
+    // 32/64-bit sreg.
+    if (SIII->isScalarUnit(*MI) && !writeBoolDst(MI, SIRI, MRI) &&
+        !MI->isTerminator())
+      return true;
+    break;
+  //case AMDGPU::AMDGPU_MAKE_UNIFORM:
+  //case AMDGPU::AMDGPU_WAVE_READ_LANE_FIRST:
+  case AMDGPU::V_READFIRSTLANE_B32:
+  case AMDGPU::V_READLANE_B32:
+  //case AMDGPU::AMDGPU_WAVE_ACTIVE_BALLOT_W32:
+  //case AMDGPU::AMDGPU_WAVE_ACTIVE_BALLOT_W64:
+    // A bool produced by readfirstlane is a single bit, which means it is
+    // bit uniform.
+    return true;
+  case AMDGPU::S_OR_B32:
+  case AMDGPU::S_OR_B64: {
+    // Check special case or -1, which will get result -1.
+    if (isConstant(MI))
+      return true;
+
+    return !writeBoolDst(MI, SIRI, MRI);
+  } break;
+  case AMDGPU::V_OR_B32_e32:
+  case AMDGPU::V_OR_B32_e64: {
+    // Check special case or -1, which will get result -1.
+    if (isConstant(MI))
+      return true;
+  } break;
+  case AMDGPU::S_AND_B32:
+  case AMDGPU::S_AND_B64: {
+    // Check special case and 0, which will get result 0.
+    if (isConstant(MI))
+      return true;
+
+    return !writeBoolDst(MI, SIRI, MRI);
+  } break;
+  case AMDGPU::V_AND_B32_e32:
+  case AMDGPU::V_AND_B32_e64: {
+    // Check special case and 0, which will get result 0.
+    if (isConstant(MI))
+      return true;
+  } break;
+  }
+  return false;
+}
+
+bool isPhysicalReg(MachineRegisterInfo &MRI, Register reg) {
+  return reg.isPhysical();
+}
+
+bool isRegClass(MachineRegisterInfo &MRI, unsigned reg, unsigned regClassID) {
+  return MRI.getRegClass(reg)->getID() == regClassID;
+}
+
+// Input registers of the MachineFunction that live in vector registers are
+// divergent.
+bool isDivergentInputReg(unsigned Reg, MachineRegisterInfo &MRI,
+                         const SIRegisterInfo *SIRI) {
+  if (isPhysicalReg(MRI, Reg)) {
+    unsigned vir_reg = MRI.getLiveInVirtReg(Reg);
+    if (SIRI->isVGPR(MRI, vir_reg))
+      return true;
+  } else {
+    if (SIRI->isVGPR(MRI, Reg))
+      return true;
+  }
+  return false;
+}
+
+bool isSourceOfDivergence(MachineInstr *MI, MachineRegisterInfo &MRI,
+                          const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
+  //if (MI->getAMDGPUFlag(MachineInstr::AMDGPUMIFlag::IsDivergent))
+  //  return true;
+  if (isAMDGPUOpcodeDivergent(MI))
+    return true;
+
+  if (isAlwaysUniformMI(MI, SIII, SIRI, MRI))
+    return false;
+
+  // If the instruction is neither guaranteed to be uniform nor guaranteed to
+  // be divergent, check whether any of its operands are passed in to the
+  // shader as arguments through vector registers.
+  //
+  // Such operands make it divergent.
+  for (MachineOperand &op : MI->operands()) {
+    if (!op.isReg())
+      continue;
+    if (op.isDef())
+      continue;
+    unsigned reg = op.getReg();
+    if (MRI.isLiveIn(reg)) {
+      if (isDivergentInputReg(reg, MRI, SIRI))
+        return true;
+    }
+  }
+
+  return false;
+}
+
+// For VCC, try to find the nearest define inside the same MBB.
+const MachineInstr *findPhysicalDefineInSameMBB(const MachineInstr *MI,
+                                                unsigned PhyReg) {
+  const MachineBasicBlock *MBB = MI->getParent();
+  auto it = MI->getReverseIterator();
+  for (it++; it != MBB->rend(); it++) {
+    const MachineInstr &TmpMI = *it;
+    for (const MachineOperand &DefMO : TmpMI.operands()) {
+      if (!DefMO.isReg())
+        continue;
+      if (DefMO.isUse())
+        continue;
+      if (DefMO.getReg() == PhyReg)
+        return &TmpMI;
+    }
+  }
+  return nullptr;
+}
+
+bool isWriteExec(const MachineInstr *MI) {
+  for (const MachineOperand &MO : MI->operands()) {
+    if (!MO.isReg())
+      continue;
+    if (MO.isUse())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == AMDGPU::EXEC ||
+        Reg == AMDGPU::EXEC_LO)
+      return true;
+  }
+  return false;
+}
+
+bool isVCndMask(unsigned Opcode) {
+  switch (Opcode) {
+  default:
+    return false;
+  case AMDGPU::V_CNDMASK_B32_e32:
+  case AMDGPU::V_CNDMASK_B32_e64:
+  case AMDGPU::V_CNDMASK_B32_dpp:
+  case AMDGPU::V_CNDMASK_B32_sdwa:
+  case AMDGPU::V_CNDMASK_B64_PSEUDO:
+    return true;
+  }
+}
+
+bool isExecRegionOp(unsigned Op) {
+  switch (Op) {
+  default:
+    return false;
+  case AMDGPU::COPY:
+  case AMDGPU::S_MOV_B32:
+  case AMDGPU::S_MOV_B64:
+    return true;
+  }
+}
+
+bool isRestoreExec(const MachineInstr *MI) {
+  unsigned Op = MI->getOpcode();
+  if (!isExecRegionOp(Op))
+    return false;
+
+  return isWriteExec(MI);
+}
+
+const MachineInstr *
+findExecRegionBeginFromRegionEnd(const MachineInstr *MI,
+                                 const MachineRegisterInfo &MRI) {
+  const MachineOperand &MO = MI->getOperand(1);
+  if (!MO.isReg())
+    return nullptr;
+  unsigned Reg = MO.getReg();
+  const MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
+  if (!Def)
+    return nullptr;
+  // Make sure the def is S_MOV Reg, Exec.
+ if (!isExecRegionOp(Def->getOpcode())) + return nullptr; + const MachineOperand &ExecMO = Def->getOperand(1); + if (!ExecMO.isReg()) + return nullptr; + unsigned ExecReg = ExecMO.getReg(); + if (ExecReg == AMDGPU::EXEC || ExecReg == AMDGPU::EXEC_LO) + return Def; + else + return nullptr; +} + +bool isInsideExecRegion(const MachineInstr &MI, const MachineInstr &RegionBegin, + const MachineInstr &RegionEnd, + const MachineDominatorTree &DT, + const MachinePostDominatorTree &PDT) { + if (!DT.dominates(&RegionBegin, &MI)) + return false; + + const MachineBasicBlock *MBB = MI.getParent(); + const MachineBasicBlock *RegionEndMBB = RegionEnd.getParent(); + if (MBB != RegionEndMBB) { + return PDT.dominates(RegionEndMBB, MBB); + } else { + // MachineLoop through the basic block until we find A or B. + MachineBasicBlock::const_iterator I = MBB->begin(); + for (; I != MI.getIterator() && I != RegionEnd.getIterator(); ++I) + /*empty*/; + + // RegionEnd post-dominates MI if MI is found first in the basic block. + return I == MI.getIterator(); + } +} + +bool isInsideExecRegion(const MachineBasicBlock &MBB, + const MachineInstr &RegionBegin, + const MachineInstr &RegionEnd, + const MachineDominatorTree &DT, + const MachinePostDominatorTree &PDT) { + const MachineBasicBlock *RegionBeginMBB = RegionBegin.getParent(); + const MachineBasicBlock *RegionEndMBB = RegionEnd.getParent(); + if (!DT.dominates(RegionBeginMBB, &MBB)) + return false; + return PDT.dominates(RegionEndMBB, &MBB); +} + +// Map from BB to nearest Exec Region. How to build? Add every MBB unless already has smaller region? +// Then when hit saveExec, propagate leaked users of define inside the exec region. + +} // namespace + +namespace llvm { +// class DivergenceAnalysis +DivergenceAnalysis::DivergenceAnalysis( + const MachineFunction &F, const MachineLoop *RegionLoop, const MachineDominatorTree &DT, + const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI, + SyncDependenceAnalysis &SDA, bool IsLCSSAForm, + // AMDGPU change begin. + DivergentJoinMapTy &JoinMap + // AMDGPU change end. + ) + : F(F), MRI(F.getRegInfo()), RegionLoop(RegionLoop), DT(DT), PDT(PDT), + LI(LI), SDA(SDA), DivergentJoinMap(JoinMap), // AMDGPU change + IsLCSSAForm(IsLCSSAForm) { + const GCNSubtarget *ST = &F.getSubtarget(); + SIRI = ST->getRegisterInfo(); + SIII = ST->getInstrInfo(); +} + +void DivergenceAnalysis::markDivergent(const ValueTy DivVal) { + assert(!isAlwaysUniform(DivVal) && "cannot be a divergent"); + // AMDGPU change begin. + LLVM_DEBUG(const GCNSubtarget *ST = &F.getSubtarget(); + const SIRegisterInfo *SIRI = ST->getRegisterInfo(); + dbgs() << "\t MarkDivergent :"; printReg(DivVal, SIRI);); + //AMDGPU change end. + DivergentValues.insert(DivVal); +} + +// Mir change. +void DivergenceAnalysis::markDivergent(const MachineInstr &I) { + for (const MachineOperand &DstMO : I.defs()) { + unsigned Reg = DstMO.getReg(); + markDivergent(Reg); + } + DivergentInsts.insert(&I); +} + +void DivergenceAnalysis::addUniformOverride(const ValueTy UniVal) { + // TODO: support uniform multi-def. 
+ if (MRI.getUniqueVRegDef(UniVal) == nullptr) + return; + + UniformOverrides.insert(UniVal); +} + +void DivergenceAnalysis::addUniformOverride(const MachineInstr &I) { + for (const MachineOperand &DstMO : I.defs()) { + unsigned Reg = DstMO.getReg(); + addUniformOverride(Reg); + } + UniformOverridesInsts.insert(&I); +} + +bool DivergenceAnalysis::isBitUniform( + const MachineInstr &I, const llvm::MachineOperand &UseMO, + llvm::DenseMap &Processed) const { + if (UseMO.isImm()) { + uint64_t val = UseMO.getImm(); + // 0 and -1 are OK since all lanes are still the same. + if (val == 0 || val == -1) + return true; + else + return false; + } + if (!UseMO.isReg()) + return true; + unsigned Reg = UseMO.getReg(); + // Exec is always bituniform, because all active lanes are 1. + if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO || + // SCC only has 1 bit. Always bituniform. + Reg == AMDGPU::SCC) + return true; + + const MachineInstr *UseMI = nullptr; + if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO) { + // Try to find define of this VCC. + UseMI = findPhysicalDefineInSameMBB(&I, Reg); + } else { + UseMI = MRI.getUniqueVRegDef(Reg); + } + if (!UseMI) { + return false; + } + + bool bResult = isBitUniform(*UseMI, Processed); + Processed[UseMI] = bResult; + return bResult; +} + +bool DivergenceAnalysis::isBitUniform( + const MachineInstr &I, + llvm::DenseMap &Processed) const { + auto it = Processed.find(&I); + if (it != Processed.end()) + return it->second; + // For branch on MIR, need to make sure all activi lanes are the same. + // cmp of uniform value will make sure all active lanes are the same. + // Imm is also the same for all active lanes. + if (isDivergent(I)) + return false; + // Uniform cmp is bit uniform. + if (I.isCompare()) + return true; + if (isConstant(&I)) + return true; + + // Conservatively consider bituniform to be false. + Processed[&I] = false; + + // If all operand is bit uniform, then result is bit uniform. + bool bAllOperandBitUniform = true; + for (const MachineOperand &UseMO : I.uses()) { + if (isBitUniform(I, UseMO, Processed)) + continue; + bAllOperandBitUniform = false; + break; + } + return bAllOperandBitUniform; +} + +bool DivergenceAnalysis::updateTerminator(const MachineInstr &Term) const { + if (Term.getParent()->succ_size() <= 1) + return false; + switch (Term.getOpcode()) { + default: { + if (updateNormalInstruction(Term)) + return true; + llvm::DenseMap Processed; + // Check bit uniform here if not divergent. + return !isBitUniform(Term, Processed); + } + //case AMDGPU::AMDGPU_CALL_INDIRECT: + case AMDGPU::SI_CALL: + return true; + } +} + +bool DivergenceAnalysis::updateNormalInstruction(const MachineInstr &I) const { + // TODO function calls with side effects, etc + if (UniformOverridesInsts.find(&I) != UniformOverridesInsts.end()) + return false; + if (DivergentInsts.find(&I) != DivergentInsts.end()) + return true; + for (const auto &Op : I.uses()) { + if (!Op.isReg()) + continue; + Register Reg = Op.getReg(); + if (Reg.isPhysical()) { + if (Reg == AMDGPU::EXEC || + Reg == AMDGPU::EXEC_LO || + Reg == AMDGPU::SCC) + continue; + else + if (const MachineInstr *DefMI = + findPhysicalDefineInSameMBB(Op.getParent(), Reg)) { + if (isDivergent(*DefMI)) + return true; + } else { + // If cannot find def in same MBB, just treat it as divergent. 
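+        // (e.g. VCC defined in a predecessor block;
+        // findPhysicalDefineInSameMBB only scans backwards within this MBB).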
+ return true; + } + } else { + if (isDivergent(Op.getReg())) + return true; + } + } + return false; +} + +bool DivergenceAnalysis::isTemporalDivergent(const MachineBasicBlock &ObservingBlock, + const ValueTy Val, + const MachineBasicBlock &IncomingBlock) const { // AMDGPU change + const MachineBasicBlock *DefBlock = &IncomingBlock; // AMDGPU change: Take def point as incoming block for constants. + const auto *Inst = MRI.getUniqueVRegDef(Val); + if (Inst == nullptr) + return true; + if (Inst) + DefBlock = Inst->getParent(); + + // check whether any divergent loop carrying Val terminates before control + // proceeds to ObservingBlock + for (const auto *MachineLoop = LI.getLoopFor(DefBlock); // AMDGPU change + MachineLoop != RegionLoop && !MachineLoop->contains(&ObservingBlock); + MachineLoop = MachineLoop->getParentLoop()) { + if (DivergentLoops.find(MachineLoop) != DivergentLoops.end()) + return true; + } + + return false; +} + +// AMDGPU CHANGE BEGIN +static bool HasIncomingUndefValue(const PHINode_ *Phi) { + for (unsigned I = 1, E = Phi->getNumOperands(); I != E; I += 2) { + const MachineOperand &Op = Phi->getOperand(I); + if (Op.isUndef()) + return true; + } + return false; +} + +// For case like +// %163:sreg_64_xexec = S_MOV_B64 $exec +//bb.1: +//; predecessors: %bb.1, %bb.0 +// successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), %bb.2(50.00%) +// %162:vreg_512 = PHI %41:vreg_512, %bb.0, %40:vreg_512, %bb.1 +// %167:sgpr_32 = V_READFIRSTLANE_B32 %17:vgpr_32, implicit $exec +// %168:sreg_64 = V_CMP_EQ_U32_e64 %167:sgpr_32, %17:vgpr_32, implicit $exec +// %166:sreg_64 = S_AND_SAVEEXEC_B64 %168:sreg_64, implicit-def $exec, implicit-def $scc, implicit $exec +//... +// $exec = S_XOR_B64_term $exec, %166:sreg_64, implicit-def $scc +// S_CBRANCH_EXECNZ %bb.1, implicit $exec +// The ... code after SAVEEXEC will be divergent if %168 is divergent. +// Return the SaveExec which affect MI. +// If not exist, return nullptr. +static const MachineInstr * +findSaveExec(const MachineInstr *MI, + const SmallVector &SaveExecs) { + // No save exec. + if (SaveExecs.empty()) + return nullptr; + if (SaveExecs.size() > 1) + llvm::report_fatal_error( + "Not support case where, MBB has more than one SaveExec"); + const MachineInstr *SaveExec = SaveExecs.front(); + const MachineBasicBlock *MBB = SaveExec->getParent(); + // Make sure MI is after SaveExec by check it is not before SaveExec. + // Assume MBB.begin to SaveExec is short here. + bool bIsAfterSaveExec = true; + for (auto it = MBB->begin(); it != SaveExec->getIterator(); it++) { + if (MI == it) { + bIsAfterSaveExec = false; + break; + } + } + // Not affect by save exec. + if (!bIsAfterSaveExec) + return nullptr; + + return SaveExec; +} + +// When a Phi's parent isJoinDivergent,the case make phi divergent is that 2 +// incoming values merged from different path of a divergent branch. +// isJoinDivergentOnlyOnSameIncomingValue will check for all +// combinations of incoming values except the BB with same incoming value, +// because if values are same then even divergent branch is not divergent. +// For example phi a:A, b:B, a:C. +// It will check (A,B) (B,C) but not (A, C) Because A +// and C has same value a. +// If only (A,C) is sharing divergent branch, +// then phi a:A, b:B, a:C is still uniform. +// DivergentJoinMap saving MachineBasicBlock pairs which on different path of a +// divergent branch and joined at one block. 
+// For example, +// A +// / \ +// | \ +// | \ +// B / +// | \ / +// | \ / +// C D +// | / +// \ / +// E +// If A is uniform branch, B is divergent branch. Then only (C, D) will be saved +// in DivergentJoinMap. +// DivergentJoinMap is build with updateDisjointMap in +// SyncDependenceAnalysis.cpp when SyncDependenceAnalysis::join_block is called. +// It will only run on divergent branch, so (A, B) is not in +// DivergentDisjointMap when A is uniform. +static bool isJoinDivergentOnlyOnSameIncomingValue( + const PHINode_ &Phi, const DivergenceAnalysis *pDA, const MachineDominatorTree &DT, + DivergentJoinMapTy &DivergentJoinMap) { + // for phi which join divergent, if the incoming values from divergent + // branch are the same, the phi is still uniform. + // A + // | \ + // | \ + // B \ + // |\ \ + // | \ | + // C D E + // | / / + // \/ / + // \ / + // F + // for phi in F like. + // phi (a:C, a:D, b:E) + // If A is uniform branch, B is non-uniform branch, phi is uniform. + SmallDenseSet ValueToBlockMap; + for (unsigned I = 1, E = Phi.getNumOperands(); I != E; I += 2) { + const MachineOperand &Op = Phi.getOperand(I); + if (!Op.isReg()) + continue; + unsigned Reg = Op.getReg(); + if (pDA->isDivergent(Reg)) + return false; + + ValueToBlockMap.insert(Reg); + } + unsigned NumIncoming = (Phi.getNumOperands() - 1) / 2; + // When there's same incoming value from different incoming block. + // If divergent select is only on same value, then it is still uniform. + if (ValueToBlockMap.size() != NumIncoming) { + // When a phi is on divergent join block, there is incoming block which is + // comeing from different path of a divergent branch. + // Check all combination here. + for (unsigned i = 0; i < NumIncoming; i++) { + MachineBasicBlock *BB0 = Phi.getOperand(2 + 2 * i).getMBB(); + const MachineOperand &MO0 = Phi.getOperand(1 + 2 * i); + for (unsigned j = i + 1; j < NumIncoming; j++) { + MachineBasicBlock *BB1 = Phi.getOperand(2 + 2 * j).getMBB(); + const MachineOperand &MO1 = Phi.getOperand(1 + 2 * j); + // If value match, no divergent. + if (MO0.isImm() && MO1.isImm() && MO0.getImm() == MO1.getImm()) + continue; + if (MO0.isReg() && MO1.isReg() && MO0.getReg() == MO1.getReg() && + MO0.getSubReg() == MO1.getSubReg()) + continue; + + // If BB and BB2 is from divergent disjoint, then they will + // divergent join on phi. + // This is for case like + // A + // / \ + // | \ + // | \ + // B / + // | \ / + // | \ / + // C D + // | / + // \ / + // E + // + // phi(a:C, b:D) + // When nearestCommonDominator is A, but B also can be divergent + // disjoint for C and D. + if (DivergentJoinMap[BB0].count(BB1)) + return false; + } + } + return true; + } else { + return false; + } +} +// AMDGPU CHANGE END + +bool DivergenceAnalysis::updatePHINode(const PHINode_ &Phi) const { + // AMDGPU CHANGE BEGIN + // Do not mark phis with undef as incoming values as uniform. + // When promoting to scalar we will readfirstlane on + // the phi output. If some of the inputs are undef then + // this could replace a well defined vector value with an + // undefined scalar value. + if (HasIncomingUndefValue(&Phi)) + return true; + // AMDGPU CHANGE END + + // joining divergent disjoint path in Phi parent block + if (isJoinDivergent(*Phi.getParent())) { + // AMDGPU CHANGE BEGIN + if (true/*TODO: ENABLE_AGGRESSIVE_UNIFORM_ANALYSIS*/) { + // Continue if the divergent join only on same incoming value. 
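+      // For example, "%p = PHI %a, %bb.C, %a, %bb.D" stays uniform even when
+      // C and D rejoin a divergent branch, since every active lane selects %a.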
+ if (!isJoinDivergentOnlyOnSameIncomingValue(Phi, this, DT, + DivergentJoinMap)) + return true; + } else + // AMDGPU CHANGE END + return true; + } + + // An incoming value could be divergent by itself. + // Otherwise, an incoming value could be uniform within the loop + // that carries its definition but it may appear divergent + // from outside the loop. This happens when divergent loop exits + // drop definitions of that uniform value in different iterations. + // + // for (int i = 0; i < n; ++i) { // 'i' is uniform inside the loop + // if (i % thread_id == 0) break; // divergent loop exit + // } + // int divI = i; // divI is divergent + for (unsigned I = 1, E = Phi.getNumOperands(); I != E; I += 2) { + const MachineOperand &Op = Phi.getOperand(I); + if (!Op.isReg()) + continue; + + unsigned Reg = Op.getReg(); + const MachineOperand &BB = Phi.getOperand(I + 1); + if (isDivergent(Reg) || + isTemporalDivergent(*Phi.getParent(), Reg, *BB.getMBB())) + return true; + + } + + return false; +} + +bool DivergenceAnalysis::updateVCndMask(const MachineInstr &VCndMask) const { + // VCndMask require the Cond bituniform to be uniform. + unsigned Op = VCndMask.getOpcode(); + unsigned src0Idx = AMDGPU::getNamedOperandIdx(Op, AMDGPU::OpName::src0); + unsigned src1Idx = AMDGPU::getNamedOperandIdx(Op, AMDGPU::OpName::src1); + unsigned src2Idx = AMDGPU::getNamedOperandIdx(Op, AMDGPU::OpName::src2); + + const MachineOperand &src0 = VCndMask.getOperand(src0Idx); + const MachineOperand &src1 = VCndMask.getOperand(src1Idx); + + const MachineOperand &cond = VCndMask.getOperand(src2Idx); + + if (isDivergent(src0)) + return true; + + // If src0 == src1, then return src0 divergent. + if (src0.isReg() && src1.isReg() && src0.getReg() == src1.getReg()) { + if (src0.getSubReg() == src1.getSubReg() && + SIII->hasModifiersSet(VCndMask, AMDGPU::OpName::src0_modifiers) == + SIII->hasModifiersSet(VCndMask, AMDGPU::OpName::src1_modifiers)) + return false; + } + + if (isDivergent(src1)) + return true; + + llvm::DenseMap Processed; + return !isBitUniform(VCndMask, cond, Processed); +} + +bool DivergenceAnalysis::inRegion(const MachineInstr &I) const { + return I.getParent() && inRegion(*I.getParent()); +} + +bool DivergenceAnalysis::inRegion(const MachineBasicBlock &BB) const { + return (!RegionLoop && BB.getParent() == &F) || RegionLoop->contains(&BB); +} + +// marks all users of loop-carried values of the loop headed by LoopHeader as +// divergent +void DivergenceAnalysis::taintLoopLiveOuts(const MachineBasicBlock &LoopHeader) { + auto *DivLoop = LI.getLoopFor(&LoopHeader); + assert(DivLoop && "loopHeader is not actually part of a loop"); + + SmallVector TaintStack; + DivLoop->getExitBlocks(TaintStack); + + // Otherwise potential users of loop-carried values could be anywhere in the + // dominance region of DivLoop (including its fringes for phi nodes) + DenseSet Visited; + for (auto *Block : TaintStack) { + Visited.insert(Block); + } + Visited.insert(&LoopHeader); + + while (!TaintStack.empty()) { + auto *UserBlock = TaintStack.back(); + TaintStack.pop_back(); + + // don't spread divergence beyond the region + if (!inRegion(*UserBlock)) + continue; + + assert(!DivLoop->contains(UserBlock) && + "irreducible control flow detected"); + + // phi nodes at the fringes of the dominance region + if (!DT.dominates(&LoopHeader, UserBlock)) { + // all PHI nodes of UserBlock become divergent + pushPHINodes(*UserBlock); + continue; + } + + // taint outside users of values carried by DivLoop + for (auto &I : *UserBlock) { + if 
(isAlwaysUniformMI(&I, SIII, SIRI, MRI)) + continue; + if (isDivergent(I)) + continue; + + for (auto &Op : I.uses()) { + if (!Op.isReg()) + continue; + unsigned OpReg = Op.getReg(); + MachineInstr *OpInst = MRI.getUniqueVRegDef(OpReg); + if (!OpInst) + continue; + if (DivLoop->contains(OpInst->getParent())) { + markDivergent(I); + pushUsers(I); + break; + } + } + } + + // visit all blocks in the dominance region + for (auto *SuccBlock : UserBlock->successors()) { + if (!Visited.insert(SuccBlock).second) { + continue; + } + TaintStack.push_back(SuccBlock); + } + } +} + +void DivergenceAnalysis::pushInstruction(const MachineInstr &I) { + Worklist.push_back(&I); +} +void DivergenceAnalysis::pushPHINodes(const MachineBasicBlock &Block) { + for (const auto &Phi : Block.phis()) { + if (isDivergent(Phi)) + continue; + pushInstruction(Phi); + } +} + +void DivergenceAnalysis::pushUsers(const ValueTy V) { + for (const auto &UserInst : MRI.use_nodbg_instructions(V)) { + + if (isDivergent(UserInst)) + continue; + + // only compute divergent inside loop + if (!inRegion(UserInst)) + continue; + + Worklist.push_back(&UserInst); + } +} +void DivergenceAnalysis::pushUsers(const MachineInstr &I) { + for (const auto &DstMO : I.defs()) { + unsigned Reg = DstMO.getReg(); + pushUsers(Reg); + } +} + +bool DivergenceAnalysis::propagateJoinDivergence(const MachineBasicBlock &JoinBlock, + const MachineLoop *BranchLoop) { + LLVM_DEBUG(dbgs() << "\tpropJoinDiv " << JoinBlock.getName() << "\n"); + + // ignore divergence outside the region + if (!inRegion(JoinBlock)) { + return false; + } + + // push non-divergent phi nodes in JoinBlock to the worklist + pushPHINodes(JoinBlock); + + // JoinBlock is a divergent loop exit + if (BranchLoop && !BranchLoop->contains(&JoinBlock)) { + return true; + } + + // disjoint-paths divergent at JoinBlock + markBlockJoinDivergent(JoinBlock); + return false; +} + +void DivergenceAnalysis::propagateBranchDivergence(const MachineInstr &Term) { + LLVM_DEBUG(dbgs() << "propBranchDiv " << Term.getParent()->getName() << "\n"); + + markDivergent(Term); + + const auto *BranchLoop = LI.getLoopFor(Term.getParent()); + + // whether there is a divergent loop exit from BranchLoop (if any) + bool IsBranchLoopDivergent = false; + + // iterate over all blocks reachable by disjoint from Term within the loop + // also iterates over loop exits that become divergent due to Term. + for (const auto *JoinBlock : SDA.join_blocks(Term)) { + IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop); + } + + // Branch loop is a divergent loop due to the divergent branch in Term + if (IsBranchLoopDivergent) { + assert(BranchLoop); + if (!DivergentLoops.insert(BranchLoop).second) { + return; + } + propagateLoopDivergence(*BranchLoop); + } +} + +void DivergenceAnalysis::propagateLoopDivergence(const MachineLoop &ExitingLoop) { + LLVM_DEBUG(dbgs() << "propLoopDiv " << ExitingLoop.getHeader()->getNumber() << "\n"); + + // don't propagate beyond region + if (!inRegion(*ExitingLoop.getHeader())) + return; + + const auto *BranchLoop = ExitingLoop.getParentLoop(); + + // Uses of loop-carried values could occur anywhere + // within the dominance region of the definition. All loop-carried + // definitions are dominated by the loop header (reducible control). + // Thus all users have to be in the dominance region of the loop header, + // except PHI nodes that can also live at the fringes of the dom region + // (incoming defining value). 
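+  // In LCSSA form every such use goes through an exit phi that the join-block
+  // propagation below already handles, so the explicit taint is only needed
+  // for non-LCSSA input.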
+  if (!IsLCSSAForm)
+    taintLoopLiveOuts(*ExitingLoop.getHeader());
+
+  // whether there is a divergent loop exit from BranchLoop (if any)
+  bool IsBranchLoopDivergent = false;
+
+  // Iterate over all blocks reachable by disjoint paths from exits of
+  // ExitingLoop; this also iterates over loop exits (of BranchLoop) that in
+  // turn become divergent.
+  for (const auto *JoinBlock : SDA.join_blocks(ExitingLoop)) {
+    IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop);
+  }
+
+  // BranchLoop is divergent due to a divergent loop exit in ExitingLoop.
+  if (IsBranchLoopDivergent) {
+    assert(BranchLoop);
+    if (!DivergentLoops.insert(BranchLoop).second) {
+      return;
+    }
+    propagateLoopDivergence(*BranchLoop);
+  }
+}
+
+// For a case like
+// %149:sreg_64_xexec = S_MOV_B64 $exec
+//
+//bb.3:
+//; predecessors: %bb.3, %bb.2
+// successors: %bb.3(0x40000000), %bb.4(0x40000000); %bb.3(50.00%), %bb.4(50.00%)
+//
+// %148:vreg_512 = PHI %56:vreg_512, %bb.2, %55:vreg_512, %bb.3
+// %153:sgpr_32 = V_READFIRSTLANE_B32 %36:vgpr_32, implicit $exec
+// %154:sreg_64 = V_CMP_EQ_U32_e64 %153:sgpr_32, %36:vgpr_32, implicit $exec
+// %152:sreg_64 = S_AND_SAVEEXEC_B64 %154:sreg_64, implicit-def $exec, implicit-def $scc, implicit $exec
+// $m0 = S_MOV_B32 %153:sgpr_32
+// %55:vreg_512 = V_MOVRELD_B32_V16 %148:vreg_512(tied-def 0), -2, 0, implicit $m0, implicit $exec
+// $exec = S_XOR_B64_term $exec, %152:sreg_64, implicit-def $scc
+// S_CBRANCH_EXECNZ %bb.3, implicit $exec
+//
+//bb.4:
+//; predecessors: %bb.3
+// successors: %bb.5(0x80000000); %bb.5(100.00%)
+//
+// $exec = S_MOV_B64 %149:sreg_64_xexec
+//
+// bb.3 is inside the exec region whose exec mask is saved in %149.
+// "%152:sreg_64 = S_AND_SAVEEXEC_B64" updates exec, which causes divergence
+// when %154 is not bituniform. Everything inside the exec region needs to be
+// scanned. Uses outside the region, and phi uses, should be marked as
+// divergent and their users added to the worklist.
+void DivergenceAnalysis::propagateExecControlFlowDivergence(
+    const MachineInstr &SaveExec) {
+  const MachineBasicBlock *MBB = SaveExec.getParent();
+  auto it = ExecRegionMap.find(MBB);
+  if (it == ExecRegionMap.end())
+    return;
+  ExecRegion &Region = *it->second;
+  // Each region only needs to be propagated once.
+  if (Region.bPropagated)
+    return;
+  Region.bPropagated = true;
+  // Scan all MIs in the region. Mark out-of-region and phi uses as divergent
+  // and add their users to the worklist.
+  auto propagateExecDivergence = [this, &Region](const MachineInstr *MI) {
+    for (const auto &DstMO : MI->defs()) {
+      Register Reg = DstMO.getReg();
+      // The only physical defs here are VCC/EXEC/M0. EXEC is always uniform;
+      // assume VCC and M0 do not cross the region.
+      if (Reg.isPhysical())
+        continue;
+      for (const auto &UserInst : MRI.use_nodbg_instructions(Reg)) {
+        if (isDivergent(UserInst))
+          continue;
+
+        // Only propagate to users outside of the region, or to phis, which are
+        // not guarded by the save-exec.
+        if (UserInst.getOpcode() != AMDGPU::PHI &&
+            isInsideExecRegion(UserInst, *Region.begin, *Region.end, DT, PDT)) {
+          continue;
+        }
+        // Writing exec is not divergent.
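+        // (e.g. "$exec = S_XOR_B64_term $exec, %152" in the example above uses
+        // the save-exec result but only updates the lane mask, not a data
+        // value).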
+ if (isWriteExec(&UserInst)) + continue; + + markDivergent(UserInst); + pushUsers(UserInst); + } + } + }; + const MachineBasicBlock *RegionBeginMBB = Region.begin->getParent(); + const MachineBasicBlock *RegionEndMBB = Region.end->getParent(); + if (RegionBeginMBB != RegionEndMBB) { + auto it = Region.begin->getIterator(); + for (it++; it != RegionBeginMBB->end(); it++) { + const MachineInstr &MI = *it; + propagateExecDivergence(&MI); + } + + // All blocks between RegionBeginMBB and RegionEndMBB. + for (const MachineBasicBlock *MBB : Region.blocks) { + for (const MachineInstr &MI : *MBB) { + propagateExecDivergence(&MI); + } + } + + for (auto it = RegionEndMBB->begin(); it != Region.end->getIterator(); + it++) { + const MachineInstr &MI = *it; + propagateExecDivergence(&MI); + } + + } else { + auto it = Region.begin->getIterator(); + for (it++; it != Region.end->getIterator(); it++) { + const MachineInstr &MI = *it; + propagateExecDivergence(&MI); + } + } +} + +void DivergenceAnalysis::compute() { + SmallVector ExecRegions; + // Build exec regions. + // Add VCndMask for non-bituniform caused by input sreg. + for (const MachineBasicBlock &MBB : F) { + for (const MachineInstr &Term : MBB.terminators()) { + if (updateTerminator(Term)) + pushInstruction(Term); + } + + for (const MachineInstr &I : MBB) { + unsigned Opcode = I.getOpcode(); + if (isVCndMask(Opcode)) { + // Cond for CndMask needs bit uniform check. + // Add it to worklist to check bit uniform from input. + pushInstruction(I); + } else if (isRestoreExec(&I)) { + const MachineInstr *RegionBegin = + findExecRegionBeginFromRegionEnd(&I, MRI); + if (RegionBegin) { + ExecRegions.emplace_back(ExecRegion(RegionBegin, &I)); + } + } + } + } + + // Build exec region map. + for (const MachineBasicBlock &MBB : F) { + for (ExecRegion &Region : ExecRegions) { + if (isInsideExecRegion(MBB, *Region.begin, *Region.end, DT, PDT)) { + // Add block to region. + if (&MBB != Region.begin->getParent() && + &MBB != Region.end->getParent()) + Region.blocks.emplace_back(&MBB); + // Update ExecRegionMap. + auto it = ExecRegionMap.find(&MBB); + if (it == ExecRegionMap.end()) { + ExecRegionMap[&MBB] = &Region; + } else { + // When MBB inside multiple regions, save the smallest one. + if (isInsideExecRegion(*Region.begin, *it->second->begin, + *it->second->end, DT, PDT)) { + ExecRegionMap[&MBB] = &Region; + } + } + } + } + } + + for (auto DivVal : DivergentValues) { + LLVM_DEBUG(dbgs() << "\t sourceOfDivergence :"; printReg(DivVal, SIRI); + dbgs() << "\n";); + pushUsers(DivVal); + } + + // propagate divergence + while (!Worklist.empty()) { + const MachineInstr *I= Worklist.back(); + Worklist.pop_back(); + + // maintain uniformity of overrides + if (isAlwaysUniformMI(I, SIII, SIRI, MRI)) { + // If used by terminators, and not bit uniform. + // Add terminator. + SmallVector TermUsers; + for (const auto &DstMO : I->defs()) { + unsigned Reg = DstMO.getReg(); + for (const auto &UserInst : MRI.use_nodbg_instructions(Reg)) { + + if (isDivergent(UserInst)) + continue; + // Only check terminator here. 
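+          // Terminators branch on per-lane mask bits, so even an
+          // always-uniform def can make a branch divergent when its bits
+          // differ between active lanes.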
+          if (!UserInst.isTerminator())
+            continue;
+
+          // only compute divergence inside the region
+          if (!inRegion(UserInst))
+            continue;
+
+          TermUsers.emplace_back(&UserInst);
+        }
+      }
+
+      if (!TermUsers.empty()) {
+        llvm::DenseMap<const MachineInstr *, bool> Processed;
+        if (!isBitUniform(*I, Processed)) {
+          for (const MachineInstr *Term : TermUsers) {
+            Worklist.emplace_back(Term);
+          }
+        }
+      }
+
+      continue;
+    }
+
+    bool WasDivergent = isDivergent(*I);
+    if (WasDivergent)
+      continue;
+
+    // propagate divergence caused by terminator
+    if (I->isTerminator()) {
+      if (updateTerminator(*I)) {
+        // propagate control divergence to affected instructions
+        propagateBranchDivergence(*I);
+        continue;
+      }
+    }
+
+    // update divergence of I due to divergent operands
+    bool DivergentUpd = false;
+    unsigned Opcode = I->getOpcode();
+    switch (Opcode) {
+    default:
+      if (isVCndMask(Opcode)) {
+        DivergentUpd = updateVCndMask(*I);
+      } else {
+        DivergentUpd = updateNormalInstruction(*I);
+        llvm::DenseMap<const MachineInstr *, bool> Processed;
+        if ((DivergentUpd || !isBitUniform(*I, Processed)) && isWriteExec(I)) {
+          // propagate exec control divergence to affected instructions.
+          propagateExecControlFlowDivergence(*I);
+        }
+      }
+      break;
+    case AMDGPU::PHI:
+      DivergentUpd = updatePHINode(*I);
+      break;
+    }
+
+    // propagate value divergence to users
+    if (DivergentUpd) {
+      markDivergent(*I);
+      pushUsers(*I);
+    }
+  }
+}
+
+bool DivergenceAnalysis::isAlwaysUniform(const ValueTy V) const {
+  return UniformOverrides.find(V) != UniformOverrides.end();
+}
+
+bool DivergenceAnalysis::isDivergent(const ValueTy V) const {
+  return DivergentValues.find(V) != DivergentValues.end();
+}
+
+bool DivergenceAnalysis::isDivergent(const MachineOperand &MO) const {
+  if (!MO.isReg())
+    return false;
+  Register Reg = MO.getReg();
+  if (Reg.isPhysical()) {
+    const MachineInstr *MI = MO.getParent();
+    if (MI)
+      return isDivergent(*MI);
+
+  } else {
+    return isDivergent(Reg);
+  }
+  return true;
+}
+
+bool DivergenceAnalysis::isDivergent(const MachineInstr &I) const {
+  if (UniformOverridesInsts.find(&I) != UniformOverridesInsts.end())
+    return false;
+  if (DivergentInsts.find(&I) != DivergentInsts.end())
+    return true;
+  for (const MachineOperand &DstMO : I.defs()) {
+    unsigned Reg = DstMO.getReg();
+    if (isDivergent(Reg))
+      return true;
+  }
+  return false;
+}
+
+void DivergenceAnalysis::print(raw_ostream &OS, const Module_ *) const {
+  // Iterate over all instructions in block order so the output is
+  // deterministic.
+ for (auto &MBB : F) + for (auto &I : MBB) { + if (isDivergent(I)) + OS << "DIVERGENT:" << I ; + // AMDGPU changes begin + else + OS << "UNIFORM:" << I ; + // AMDGPU changes end + } +} + +// class GPUDivergenceAnalysis +MirGPUDivergenceAnalysis::MirGPUDivergenceAnalysis(MachineFunction &F, + const MachineDominatorTree &DT, + const MachinePostDominatorTree &PDT, + const MachineLoopInfo &LI) + : SDA(DT, PDT, LI, /*AMDGPU change*/DivergentJoinMap), + DA(F, nullptr, DT, PDT, LI, SDA, false, /*AMDGPU change*/DivergentJoinMap) { + MachineRegisterInfo &MRI = F.getRegInfo(); + const GCNSubtarget *ST = &F.getSubtarget(); + const SIRegisterInfo *SIRI = ST->getRegisterInfo(); + const SIInstrInfo *SIII = ST->getInstrInfo(); + for (auto &MBB : F) + for (auto &I : MBB) { + if (isSourceOfDivergence(&I, MRI, SIRI, SIII)) { + DA.markDivergent(I); + } else if (isAlwaysUniformMI(&I, SIII, SIRI, MRI)) { + DA.addUniformOverride(I); + } + } + for (auto &ArgIt : F.getRegInfo().liveins()) { + unsigned Reg = ArgIt.first; + if (isDivergentInputReg(Reg, MRI, SIRI)) { + DA.markDivergent(Reg); + } + } + + DA.compute(); +} + +bool MirGPUDivergenceAnalysis::isDivergent(const MachineInstr *I) const { + return DA.isDivergent(*I); +} + +void MirGPUDivergenceAnalysis::print(raw_ostream &OS, const Module_ *mod) const { + OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n"; + DA.print(OS, mod); + OS << "}\n"; +} + +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h new file mode 100644 index 0000000000000..edcf96ec44a4d --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h @@ -0,0 +1,281 @@ +//===- AMDGPUMirDivergenceAnalysis.h - Mir Divergence Analysis -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// \file +// The divergence analysis determines which instructions and branches are +// divergent given a set of divergent source instructions. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "AMDGPUMirSyncDependenceAnalysis.h" +#include "llvm/Pass.h" +#include + +namespace llvm { +class raw_ostream; +class TargetTransformInfo; +class MachineRegisterInfo; +class SIInstrInfo; +class SIRegisterInfo; +class MachineOperand; +class MachineBasicBlock; + +using Module_ = void; +class TargetTransformInfo; +using ValueTy = unsigned; +using PHINode_ = MachineInstr; + +/// \brief Generic divergence analysis for reducible CFGs. +/// +/// This analysis propagates divergence in a data-parallel context from sources +/// of divergence to all users. It requires reducible CFGs. All assignments +/// should be in SSA form. +class DivergenceAnalysis { +public: + /// \brief This instance will analyze the whole function \p F or the loop \p + /// RegionLoop. + /// + /// \param RegionLoop if non-null the analysis is restricted to \p RegionLoop. + /// Otherwise the whole function is analyzed. + /// \param IsLCSSAForm whether the analysis may assume that the IR in the + /// region in in LCSSA form. 
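+  /// \param JoinMap (AMDGPU change) filled by SyncDependenceAnalysis; records
+  /// which pairs of blocks join divergent disjoint paths, used when updating
+  /// PHI nodes.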
+ DivergenceAnalysis(const llvm::MachineFunction &F, const MachineLoop *RegionLoop, + const MachineDominatorTree &DT, const MachinePostDominatorTree &PDT, + const MachineLoopInfo &LI, SyncDependenceAnalysis &SDA, + bool IsLCSSAForm, + // AMDGPU change begin. + DivergentJoinMapTy &JoinMap + // AMDGPU change end. + ); + + /// \brief The loop that defines the analyzed region (if any). + const MachineLoop *getRegionLoop() const { return RegionLoop; } + const llvm::MachineFunction &getFunction() const { return F; } + + /// \brief Whether \p BB is part of the region. + bool inRegion(const MachineBasicBlock &BB) const; + /// \brief Whether \p I is part of the region. + bool inRegion(const MachineInstr &I) const; + + /// \brief Mark \p UniVal as a value that is always uniform. + void addUniformOverride(const ValueTy UniVal); + void addUniformOverride(const MachineInstr &I); + + /// \brief Mark \p DivVal as a value that is always divergent. + void markDivergent(const ValueTy DivVal); + void markDivergent(const MachineInstr &I); + + /// \brief Propagate divergence to all instructions in the region. + /// Divergence is seeded by calls to \p markDivergent. + void compute(); + + /// \brief Whether any value was marked or analyzed to be divergent. + bool hasDetectedDivergence() const { return !DivergentValues.empty(); } + + /// \brief Whether \p Val will always return a uniform value regardless of its + /// operands + bool isAlwaysUniform(const ValueTy Val) const; + + /// \brief Whether \p Val is a divergent value + bool isDivergent(const ValueTy Val) const; + bool isDivergent(const MachineInstr &I) const; + + void print(llvm::raw_ostream &OS, const Module_ *) const; + +private: + bool isDivergent(const llvm::MachineOperand &MO) const; + bool updateTerminator(const MachineInstr &Term) const; + bool updatePHINode(const PHINode_ &Phi) const; + bool updateVCndMask(const MachineInstr &VCndMask) const; + bool isBitUniform(const MachineInstr &I, + llvm::DenseMap &Processed) const; + bool isBitUniform(const MachineInstr &I, const llvm::MachineOperand &UseMO, + llvm::DenseMap &Processed) const; + + /// \brief Computes whether \p Inst is divergent based on the + /// divergence of its operands. + /// + /// \returns Whether \p Inst is divergent. + /// + /// This should only be called for non-phi, non-terminator instructions. + bool updateNormalInstruction(const MachineInstr &Inst) const; + + /// \brief Mark users of live-out users as divergent. + /// + /// \param LoopHeader the header of the divergent loop. + /// + /// Marks all users of live-out values of the loop headed by \p LoopHeader + /// as divergent and puts them on the worklist. + void taintLoopLiveOuts(const MachineBasicBlock &LoopHeader); + + /// \brief Push all users of \p Val (in the region) to the worklist + void pushUsers(const ValueTy I); + void pushUsers(const MachineInstr &I); + + void pushInstruction(const MachineInstr &I); + /// \brief Push all phi nodes in @block to the worklist + void pushPHINodes(const MachineBasicBlock &Block); + + /// \brief Mark \p Block as join divergent + /// + /// A block is join divergent if two threads may reach it from different + /// incoming blocks at the same time. + void markBlockJoinDivergent(const MachineBasicBlock &Block) { + DivergentJoinBlocks.insert(&Block); + } + + /// \brief Whether \p Val is divergent when read in \p ObservingBlock. 
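+  /// (AMDGPU change) \p incomingBlock is the predecessor that supplies \p Val
+  /// to the observing phi; it is taken as the definition point for constants.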
+ bool isTemporalDivergent(const MachineBasicBlock &ObservingBlock, + const ValueTy Val, + const MachineBasicBlock &incomingBlock) const; // AMDGPU change + + /// \brief Whether \p Block is join divergent + /// + /// (see markBlockJoinDivergent). + bool isJoinDivergent(const MachineBasicBlock &Block) const { + return DivergentJoinBlocks.find(&Block) != DivergentJoinBlocks.end(); + } + + /// \brief Propagate control-induced divergence to users (phi nodes and + /// instructions). + // + // \param JoinBlock is a divergent loop exit or join point of two disjoint + // paths. + // \returns Whether \p JoinBlock is a divergent loop exit of \p TermLoop. + bool propagateJoinDivergence(const MachineBasicBlock &JoinBlock, + const MachineLoop *TermLoop); + + /// \brief Propagate induced value divergence due to control divergence in \p + /// Term. + void propagateBranchDivergence(const MachineInstr &Term); + + /// \brief Propagate induced value divergence due to exec update caused by \p + /// SaveExec. + void propagateExecControlFlowDivergence(const MachineInstr &SaveExec); + + /// \brief Propagate divergent caused by a divergent loop exit. + /// + /// \param ExitingLoop is a divergent loop. + void propagateLoopDivergence(const MachineLoop &ExitingLoop); + +private: + const llvm::MachineFunction &F; + const llvm::MachineRegisterInfo &MRI; + const llvm::SIRegisterInfo *SIRI; + const llvm::SIInstrInfo *SIII; + // If regionLoop != nullptr, analysis is only performed within \p RegionLoop. + // Otw, analyze the whole function + const MachineLoop *RegionLoop; + + const MachineDominatorTree &DT; + const MachinePostDominatorTree &PDT; + const MachineLoopInfo &LI; + + // Recognized divergent loops + llvm::DenseSet DivergentLoops; + + // AMDGPU change begin + // Save block pair which divergent disjoint. + // A + // | \ + // | \ + // B C + // | / + // D + // When A is divergent branch, B and C are divergent join at D. + // Then DivergentJoinMap[B].count(C) > 0 and + // DivergentJoinMap[C].count(B) > 0. + DivergentJoinMapTy &DivergentJoinMap; + // AMDGPU change end + + // The SDA links divergent branches to divergent control-flow joins. + SyncDependenceAnalysis &SDA; + + // Use simplified code path for LCSSA form. + bool IsLCSSAForm; + + // Set of known-uniform values. + llvm::DenseSet UniformOverrides; + llvm::DenseSet UniformOverridesInsts; + + // Blocks with joining divergent control from different predecessors. + llvm::DenseSet DivergentJoinBlocks; + + // Detected/marked divergent values. + llvm::DenseSet DivergentValues; + llvm::DenseSet DivergentInsts; + + // Mir change for EXEC control flow. + // Map from MBB to the exec region it belongs too. + // A exec region is begin with + // S_MOV_B64 sreg, exec + // end with + // S_MOV_B64 exec, sreg + // Inside the region, exec might be updated to make control flow with exec. + struct ExecRegion { + const llvm::MachineInstr *begin; + const llvm::MachineInstr *end; + std::vector blocks; + bool bPropagated = false; + ExecRegion(const llvm::MachineInstr *b, + const llvm::MachineInstr *e) + : begin(b), end(e), bPropagated(false) {} + }; + llvm::DenseMap ExecRegionMap; + + // Internal worklist for divergence propagation. + std::vector Worklist; +}; + +/// \brief Divergence analysis frontend for GPU kernels. +class MirGPUDivergenceAnalysis { + // AMDGPU change begin + // Save block pair which divergent disjoint. + // A + // | \ + // | \ + // B C + // | / + // D + // When A is divergent branch, B and C are divergent join at D. 
+ // Then DivergentJoinMap[B].count(C) > 0 and + // DivergentJoinMap[C].count(B) > 0. + DivergentJoinMapTy DivergentJoinMap; + // AMDGPU change end + SyncDependenceAnalysis SDA; + DivergenceAnalysis DA; + +public: + /// Runs the divergence analysis on @F, a GPU kernel + MirGPUDivergenceAnalysis(llvm::MachineFunction &F, const MachineDominatorTree &DT, + const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI); + + /// Whether any divergence was detected. + bool hasDivergence() const { return DA.hasDetectedDivergence(); } + + /// The GPU kernel this analysis result is for + const llvm::MachineFunction &getFunction() const { return DA.getFunction(); } + + /// Whether \p I is divergent. + bool isDivergent(const MachineInstr *I) const; + + /// Whether \p I is uniform/non-divergent + bool isUniform(const MachineInstr *I) const { return !isDivergent(I); } + + /// Print all divergent values in the kernel. + void print(llvm::raw_ostream &OS, const Module_ *) const; +}; + +} // namespace llvm + diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp new file mode 100644 index 0000000000000..7213f7b4b11b4 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp @@ -0,0 +1,511 @@ +//===- MirSyncDependenceAnalysis.cpp - Mir Divergent Branch Dependence Calculation +//--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is based on Analysis/MirSyncDependenceAnalysis.cpp, just change +// MachineBasicBlock to MachineBasicBlock. +// This file implements an algorithm that returns for a divergent branch +// the set of basic blocks whose phi nodes become divergent due to divergent +// control. These are the blocks that are reachable by two disjoint paths from +// the branch or loop exits that have a reaching path that is disjoint from a +// path to the loop latch. +// +// The SyncDependenceAnalysis is used in the DivergenceAnalysis to model +// control-induced divergence in phi nodes. +// +// -- Summary -- +// The SyncDependenceAnalysis lazily computes sync dependences [3]. +// The analysis evaluates the disjoint path criterion [2] by a reduction +// to SSA construction. The SSA construction algorithm is implemented as +// a simple data-flow analysis [1]. +// +// [1] "A Simple, Fast Dominance Algorithm", SPI '01, Cooper, Harvey and Kennedy +// [2] "Efficiently Computing Static Single Assignment Form +// and the Control Dependence Graph", TOPLAS '91, +// Cytron, Ferrante, Rosen, Wegman and Zadeck +// [3] "Improving Performance of OpenCL on CPUs", CC '12, Karrenberg and Hack +// [4] "Divergence Analysis", TOPLAS '13, Sampaio, Souza, Collange and Pereira +// +// -- Sync dependence -- +// Sync dependence [4] characterizes the control flow aspect of the +// propagation of branch divergence. For example, +// +// %cond = icmp slt i32 %tid, 10 +// br i1 %cond, label %then, label %else +// then: +// br label %merge +// else: +// br label %merge +// merge: +// %a = phi i32 [ 0, %then ], [ 1, %else ] +// +// Suppose %tid holds the thread ID. 
Although %a is not data dependent on %tid +// because %tid is not on its use-def chains, %a is sync dependent on %tid +// because the branch "br i1 %cond" depends on %tid and affects which value %a +// is assigned to. +// +// -- Reduction to SSA construction -- +// There are two disjoint paths from A to X, if a certain variant of SSA +// construction places a phi node in X under the following set-up scheme [2]. +// +// This variant of SSA construction ignores incoming undef values. +// That is paths from the entry without a definition do not result in +// phi nodes. +// +// entry +// / \ +// A \ +// / \ Y +// B C / +// \ / \ / +// D E +// \ / +// F +// Assume that A contains a divergent branch. We are interested +// in the set of all blocks where each block is reachable from A +// via two disjoint paths. This would be the set {D, F} in this +// case. +// To generally reduce this query to SSA construction we introduce +// a virtual variable x and assign to x different values in each +// successor block of A. +// entry +// / \ +// A \ +// / \ Y +// x = 0 x = 1 / +// \ / \ / +// D E +// \ / +// F +// Our flavor of SSA construction for x will construct the following +// entry +// / \ +// A \ +// / \ Y +// x0 = 0 x1 = 1 / +// \ / \ / +// x2=phi E +// \ / +// x3=phi +// The blocks D and F contain phi nodes and are thus each reachable +// by two disjoins paths from A. +// +// -- Remarks -- +// In case of loop exits we need to check the disjoint path criterion for loops +// [2]. To this end, we check whether the definition of x differs between the +// loop exit and the loop header (_after_ SSA construction). +// +//===----------------------------------------------------------------------===// +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "AMDGPUMirSyncDependenceAnalysis.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineLoopInfo.h" + +#include +#include + +#define DEBUG_TYPE "sync-dependence" + +namespace llvm { + +ConstBlockSet SyncDependenceAnalysis::EmptyBlockSet; + +SyncDependenceAnalysis::SyncDependenceAnalysis(const MachineDominatorTree &DT, + const MachinePostDominatorTree &PDT, + const MachineLoopInfo &LI, + // AMDGPU change begin. + DivergentJoinMapTy &JoinMap + // AMDGPU change end. + ) + : FuncRPOT(DT.getRoot()->getParent()), DT(DT), PDT(PDT), LI(LI), + // AMDGPU change begin. + DivergentJoinMap(JoinMap) + // AMDGPU change end. +{ +} + +SyncDependenceAnalysis::~SyncDependenceAnalysis() {} + +using FunctionRPOT = ReversePostOrderTraversal; + +// divergence propagator for reducible CFGs +struct DivergencePropagator { + const FunctionRPOT &FuncRPOT; + const MachineDominatorTree &DT; + const MachinePostDominatorTree &PDT; + const MachineLoopInfo &LI; + + // identified join points + std::unique_ptr JoinBlocks; + + // reached loop exits (by a path disjoint to a path to the loop header) + SmallPtrSet ReachedLoopExits; + + // if DefMap[B] == C then C is the dominating definition at block B + // if DefMap[B] ~ undef then we haven't seen B yet + // if DefMap[B] == B then B is a join point of disjoint paths from X or B is + // an immediate successor of X (initial value). 
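+  // In the example from the file header, propagation ends with
+  // DefMap[D] == D and DefMap[F] == F (the joins), while B, C and E keep the
+  // branch-successor definition that first reached them.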
+ using DefiningBlockMap = std::map; + DefiningBlockMap DefMap; + + // all blocks with pending visits + std::unordered_set PendingUpdates; + + DivergencePropagator(const FunctionRPOT &FuncRPOT, const MachineDominatorTree &DT, + const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI) + : FuncRPOT(FuncRPOT), DT(DT), PDT(PDT), LI(LI), + JoinBlocks(new ConstBlockSet) {} + + // set the definition at @block and mark @block as pending for a visit + void addPending(const MachineBasicBlock &Block, const MachineBasicBlock &DefBlock) { + bool WasAdded = DefMap.emplace(&Block, &DefBlock).second; + if (WasAdded) + PendingUpdates.insert(&Block); + } + + void printDefs(raw_ostream &Out) { + Out << "Propagator::DefMap {\n"; + for (const auto *Block : FuncRPOT) { + auto It = DefMap.find(Block); + Out << Block->getName() << " : "; + if (It == DefMap.end()) { + Out << "\n"; + } else { + const auto *DefBlock = It->second; + Out << (DefBlock ? DefBlock->getName() : "") << "\n"; + } + } + Out << "}\n"; + } + + // process @succBlock with reaching definition @defBlock + // the original divergent branch was in @parentLoop (if any) + void visitSuccessor(const MachineBasicBlock &SuccBlock, const MachineLoop *ParentLoop, + const MachineBasicBlock &DefBlock) { + + // @succBlock is a loop exit + if (ParentLoop && !ParentLoop->contains(&SuccBlock)) { + DefMap.emplace(&SuccBlock, &DefBlock); + ReachedLoopExits.insert(&SuccBlock); + return; + } + + // first reaching def? + auto ItLastDef = DefMap.find(&SuccBlock); + if (ItLastDef == DefMap.end()) { + addPending(SuccBlock, DefBlock); + return; + } + + // a join of at least two definitions + if (ItLastDef->second != &DefBlock) { + // do we know this join already? + if (!JoinBlocks->insert(&SuccBlock).second) + return; + + // update the definition + addPending(SuccBlock, SuccBlock); + } + } + + // find all blocks reachable by two disjoint paths from @rootTerm. + // This method works for both divergent terminators and loops with + // divergent exits. + // @rootBlock is either the block containing the branch or the header of the + // divergent loop. + // @nodeSuccessors is the set of successors of the node (MachineLoop or Terminator) + // headed by @rootBlock. + // @parentLoop is the parent loop of the MachineLoop or the loop that contains the + // Terminator. + template + std::unique_ptr + computeJoinPoints(const MachineBasicBlock &RootBlock, + SuccessorIterable NodeSuccessors, const MachineLoop *ParentLoop, const MachineBasicBlock * PdBoundBlock) { + assert(JoinBlocks); + + // bootstrap with branch targets + for (const auto *SuccBlock : NodeSuccessors) { + DefMap.emplace(SuccBlock, SuccBlock); + + if (ParentLoop && !ParentLoop->contains(SuccBlock)) { + // immediate loop exit from node. 
+ ReachedLoopExits.insert(SuccBlock); + continue; + } else { + // regular successor + PendingUpdates.insert(SuccBlock); + } + } + + auto ItBeginRPO = FuncRPOT.begin(); + + // skip until term (TODO RPOT won't let us start at @term directly) + for (; *ItBeginRPO != &RootBlock; ++ItBeginRPO) {} + + auto ItEndRPO = FuncRPOT.end(); + assert(ItBeginRPO != ItEndRPO); + + // propagate definitions at the immediate successors of the node in RPO + auto ItBlockRPO = ItBeginRPO; + while (++ItBlockRPO != ItEndRPO && *ItBlockRPO != PdBoundBlock) { + const auto *Block = *ItBlockRPO; + + // skip @block if not pending update + auto ItPending = PendingUpdates.find(Block); + if (ItPending == PendingUpdates.end()) + continue; + PendingUpdates.erase(ItPending); + + // propagate definition at @block to its successors + auto ItDef = DefMap.find(Block); + const auto *DefBlock = ItDef->second; + assert(DefBlock); + + auto *BlockLoop = LI.getLoopFor(Block); + if (ParentLoop && + (ParentLoop != BlockLoop && ParentLoop->contains(BlockLoop))) { + // if the successor is the header of a nested loop pretend its a + // single node with the loop's exits as successors + SmallVector BlockLoopExits; + BlockLoop->getExitBlocks(BlockLoopExits); + for (const auto *BlockLoopExit : BlockLoopExits) { + visitSuccessor(*BlockLoopExit, ParentLoop, *DefBlock); + } + + } else { + // the successors are either on the same loop level or loop exits + for (const auto *SuccBlock : Block->successors()) { + visitSuccessor(*SuccBlock, ParentLoop, *DefBlock); + } + } + } + + // We need to know the definition at the parent loop header to decide + // whether the definition at the header is different from the definition at + // the loop exits, which would indicate a divergent loop exits. + // + // A // loop header + // | + // B // nested loop header + // | + // C -> X (exit from B loop) -..-> (A latch) + // | + // D -> back to B (B latch) + // | + // proper exit from both loops + // + // D post-dominates B as it is the only proper exit from the "A loop". + // If C has a divergent branch, propagation will therefore stop at D. + // That implies that B will never receive a definition. + // But that definition can only be the same as at D (D itself in thise case) + // because all paths to anywhere have to pass through D. + // + const MachineBasicBlock *ParentLoopHeader = + ParentLoop ? ParentLoop->getHeader() : nullptr; + if (ParentLoop && ParentLoop->contains(PdBoundBlock)) { + DefMap[ParentLoopHeader] = DefMap[PdBoundBlock]; + } + + // analyze reached loop exits + if (!ReachedLoopExits.empty()) { + assert(ParentLoop); + const auto *HeaderDefBlock = DefMap[ParentLoopHeader]; + LLVM_DEBUG(printDefs(dbgs())); + + // AMDGPU CHANGE: Allow null HeaderDefBlock + // Because of the way they walk the blocks (a reverse post order traversal + // stopping at the immediate post dominator) it is possible that + // they will reach a loop exit, but not the loop header. + // + // We conservatively mark the exit blocks as divergent join points + // in this case. + // + // Problem CFG is below: + // + // +--> A + // | / \ + // | B C + // | | / | + // +--L P + // + // In this cfg, C is the RootBlock and P is C's post-dominator. + // It will only visit L and P and then stop because it hits the + // post dominator. Most loops do not hit this case because the + // loop exiting block (C) will branch directly back to the loop + // header. 
+ // + if (HeaderDefBlock) + { + for (const auto *ExitBlock : ReachedLoopExits) { + auto ItExitDef = DefMap.find(ExitBlock); + assert((ItExitDef != DefMap.end()) && + "no reaching def at reachable loop exit"); + if (ItExitDef->second != HeaderDefBlock) { + JoinBlocks->insert(ExitBlock); + } + } + } + else + { + for (const auto *ExitBlock : ReachedLoopExits) + { + JoinBlocks->insert(ExitBlock); + } + } + } + + return std::move(JoinBlocks); + } +}; + +// AMDGPU change begin. +// For all join blocks caused by divergent RootBlock, the prevs of a join block +// which are in DefMap or the RootBlock are divergent join each other on the join block because +// of divergent RootBlock. +static void updateJoinMap( + const MachineBasicBlock *RootBlock, + DenseMap> &JoinMap, + DivergencePropagator::DefiningBlockMap &DefMap, ConstBlockSet &JoinBlocks) { + for (const MachineBasicBlock *JoinBB : JoinBlocks) { + // makr divergent join for all pred pair which in DefMap. + for (auto predIt = JoinBB->pred_begin(); predIt != JoinBB->pred_end(); + predIt++) { + auto predIt2 = predIt; + const MachineBasicBlock *pred = *predIt; + if (DefMap.count(pred) == 0 && pred != RootBlock) + continue; + + for (predIt2++; predIt2 != JoinBB->pred_end(); predIt2++) { + const MachineBasicBlock *pred2 = *predIt2; + if (DefMap.count(pred2) == 0 && pred2 != RootBlock) + continue; + + JoinMap[pred].insert(pred2); + JoinMap[pred2].insert(pred); + LLVM_DEBUG(dbgs() << "joint_bb0: " << pred->getName() + << " joint_bb1: " << pred2->getName() << "\n";); + } + } + } +} +// AMDGPU change end. + +const ConstBlockSet &SyncDependenceAnalysis::join_blocks(const MachineLoop &MachineLoop) { + using LoopExitVec = SmallVector; + LoopExitVec LoopExits; + MachineLoop.getExitBlocks(LoopExits); + if (LoopExits.size() < 1) { + return EmptyBlockSet; + } + + // already available in cache? + auto ItCached = CachedLoopExitJoins.find(&MachineLoop); + if (ItCached != CachedLoopExitJoins.end()) { + return *ItCached->second; + } + + // dont propagte beyond the immediate post dom of the loop + const auto *PdNode = PDT.getNode(const_cast(MachineLoop.getHeader())); + const auto *IpdNode = PdNode->getIDom(); + const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; + while (PdBoundBlock && MachineLoop.contains(PdBoundBlock)) { + IpdNode = IpdNode->getIDom(); + PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; + } + + // compute all join points + DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI}; + auto JoinBlocks = Propagator.computeJoinPoints( + *MachineLoop.getHeader(), LoopExits, MachineLoop.getParentLoop(), PdBoundBlock); + + // AMDGPU change begin. + // Save divergent join pairs. + updateJoinMap(MachineLoop.getHeader(), DivergentJoinMap, Propagator.DefMap, + *JoinBlocks.get()); + // AMDGPU change end. + + auto ItInserted = CachedLoopExitJoins.emplace(&MachineLoop, std::move(JoinBlocks)); + assert(ItInserted.second); + return *ItInserted.first->second; +} + +const ConstBlockSet & +SyncDependenceAnalysis::join_blocks(const MachineInstr &Term) { + // trivial case + if (Term.getParent()->succ_size() < 1) { + return EmptyBlockSet; + } + + // already available in cache? + auto ItCached = CachedBranchJoins.find(&Term); + if (ItCached != CachedBranchJoins.end()) + return *ItCached->second; + + // dont propagate beyond the immediate post dominator of the branch + const auto *PdNode = PDT.getNode(const_cast(Term.getParent())); + const auto *IpdNode = PdNode->getIDom(); + const auto *PdBoundBlock = IpdNode ? 
IpdNode->getBlock() : nullptr; + + + // compute all join points + DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI}; + const auto &TermBlock = *Term.getParent(); + + // AMDGPU CHANGE + // Make sure the post-dominator is outside the loop for the loop header. + // Otherwise, we may not find all the join blocks in the loop + // because the search stops too early. Some join points can be reached + // after the post-dominator! + // + // Problem CFG is below: + // + // +--> A + // | / \ + // | B P + // | | / | + // +--L X + // + // In this cfg, A is the loop header and P is A's post-dominator. + // The algorithm to mark join points does an Reverse Post Order walk + // from A and stops when it reaches the post dominator. It would not + // mark the phi node in L as divergent even when A had a divergent branch. + // The fix we made was to make the join point search continue all the way + // to the loops post dominator (which is X in this example). + // + // NOTE: They already made this change for the loop case above, but for + // a different bug apparently. See SyncDependenceAnalysis::join_blocks(MachineLoop&) + // + const MachineLoop *MachineLoop = LI.getLoopFor(&TermBlock); + if (MachineLoop && (MachineLoop->getHeader() == &TermBlock)) + { + while (PdBoundBlock && MachineLoop->contains(PdBoundBlock)) { + IpdNode = IpdNode->getIDom(); + PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; + } + } + + auto JoinBlocks = Propagator.computeJoinPoints( + TermBlock, Term.getParent()->successors(), MachineLoop, PdBoundBlock); + + // AMDGPU change begin. + // Save divergent join pairs. + updateJoinMap(&TermBlock, DivergentJoinMap, Propagator.DefMap, + *JoinBlocks.get()); + // AMDGPU change end. + + auto ItInserted = CachedBranchJoins.emplace(&Term, std::move(JoinBlocks)); + assert(ItInserted.second); + return *ItInserted.first->second; +} + +} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h new file mode 100644 index 0000000000000..a52bcc7bc9e7c --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h @@ -0,0 +1,98 @@ +//===- MirSyncDependenceAnalysis.h - MirDivergent Branch Dependence -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// \file +// This file defines the SyncDependenceAnalysis class, which computes for +// every divergent branch the set of phi nodes that the branch will make +// divergent. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include +#include + +namespace llvm { +class MachineBasicBlock; +class MachineDominatorTree; +class MachineLoop; +class MachinePostDominatorTree; +class MachineLoopInfo; +class MachineFunction; +class MachineInstr; + +using DivergentJoinMapTy = + llvm::DenseMap>; + +using ConstBlockSet = llvm::SmallPtrSet; + +/// \brief Relates points of divergent control to join points in +/// reducible CFGs. +/// +/// This analysis relates points of divergent control to points of converging +/// divergent control. 
The analysis requires all loops to be reducible. +class SyncDependenceAnalysis { + void visitSuccessor(const MachineBasicBlock &succBlock, const MachineLoop *termLoop, + const MachineBasicBlock *defBlock); + +public: + bool inRegion(const MachineBasicBlock &BB) const; + + ~SyncDependenceAnalysis(); + SyncDependenceAnalysis(const MachineDominatorTree &DT, const MachinePostDominatorTree &PDT, + const MachineLoopInfo &LI, + // AMDGPU change begin + DivergentJoinMapTy &JoinMap + // AMDGPU change end + ); + + /// \brief Computes divergent join points and loop exits caused by branch + /// divergence in \p Term. + /// + /// The set of blocks which are reachable by disjoint paths from \p Term. + /// The set also contains loop exits if there two disjoint paths: + /// one from \p Term to the loop exit and another from \p Term to the loop + /// header. Those exit blocks are added to the returned set. + /// If L is the parent loop of \p Term and an exit of L is in the returned + /// set then L is a divergent loop. + const ConstBlockSet &join_blocks(const MachineInstr &Term); + + /// \brief Computes divergent join points and loop exits (in the surrounding + /// loop) caused by the divergent loop exits of\p MachineLoop. + /// + /// The set of blocks which are reachable by disjoint paths from the + /// loop exits of \p MachineLoop. + /// This treats the loop as a single node in \p MachineLoop's parent loop. + /// The returned set has the same properties as for join_blocks(TermInst&). + const ConstBlockSet &join_blocks(const MachineLoop &MachineLoop); + +private: + static ConstBlockSet EmptyBlockSet; + + llvm::ReversePostOrderTraversal FuncRPOT; + const MachineDominatorTree &DT; + const MachinePostDominatorTree &PDT; + const MachineLoopInfo &LI; + // AMDGPU change begin. + DivergentJoinMapTy &DivergentJoinMap; + // AMDGPU change end. + std::map> CachedLoopExitJoins; + std::map> + CachedBranchJoins; +}; + +} // namespace llvm + + diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp new file mode 100644 index 0000000000000..648df7f724617 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp @@ -0,0 +1,188 @@ +//===-- AMDGPUOccupancyAndLatencyHelper - Helper functions for occupancy and latency --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===--------------------------------------------------------------------------------===// +// +/// \file +/// \brief Helper functions for occupancy and latency. +// +//===--------------------------------------------------------------------------------===// + +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "AMDGPUSubtarget.h" +#include "GCNSubtarget.h" +#include "AMDGPUOccupancyAndLatencyHelper.h" + +#include "llvm/CodeGen/MachineLoopInfo.h" + +namespace llvm { + +// Other info which can help compare schedule result. +float SchedScore::computeScore() const { + // Occupancy 1 cannot mix alu. + unsigned MixHidenAlu = Alu - MixAlu; + if (Occupancy == 1) + MixHidenAlu = 0; + return ((float)MemLatency - (float)MixHidenAlu) / (float)Occupancy - + LatencyHide; +} +float SchedScore::computeScore2() const { + float cycles = 0; + cycles = (MixAlu * Occupancy + MemLatency); + cycles /= Occupancy; + return cycles; +} + +void SchedScore::sum(const SchedScore &s, unsigned loopDepth) { + unsigned loopCount = loopDepth > 0 ? 
std::pow(3, loopDepth) : 1; + LatencyHide += loopCount * s.LatencyHide; + MemLatency += loopCount * s.MemLatency; + MixAlu += loopCount * s.MixAlu; + Alu += loopCount * s.Alu; + Lds += loopCount * s.Lds; + SgprSpill |= s.SgprSpill; +} +bool SchedScore::isBetter(const SchedScore &s) const { + float score = computeScore(); + float newScore = s.computeScore(); + bool spillBetter = !SgprSpill && s.SgprSpill; + return spillBetter ? true : newScore >= score; +} +// Does more occupancy give more perf. +bool SchedScore::isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc) const { + unsigned gain = latencyGain(TargetOccupancy, ExtraOcc); + // 10% is good enough. + if ((10*gain) >= Alu) + return true; + else + return false; +} + +unsigned SchedScore::latencyGain(unsigned TgtOcc, unsigned ExtraOcc) const { + unsigned latency = MemLatency; + return (latency / (TgtOcc))- (latency / (TgtOcc + ExtraOcc)); +} + +// AMDGPULatencyTracker +AMDGPULatencyTracker::AMDGPULatencyTracker(const GCNSubtarget &ST) + : SIII(ST.getInstrInfo()), ItinerayData(ST.getInstrItineraryData()) {} + +void AMDGPULatencyTracker::scan(const MachineInstr &MI) { + if (MI.isDebugInstr()) return; + int latency = SIII->getInstrLatency(ItinerayData, MI); + // If inside latency hide. + if (!LatencyMIs.empty()) { + bool bWaitCnt = false; + for (auto &MO : MI.operands()) { + if (MO.isReg()) { + unsigned reg = MO.getReg(); + auto it = LatencyMIs.find(reg); + if (it != LatencyMIs.end()) { + bWaitCnt = true; + // If MI use mem result, update latency to mem latency. + int cycle = it->second; + if (cycle > latency) + latency = cycle; + } + } + } + // Update latency for each mem latency inst. + for (auto it = LatencyMIs.begin(); it != LatencyMIs.end();) { + auto prev = it; + auto l = (it++); + int cycle = l->second; + if (cycle <= latency) { + // Only left cycles. + // Remove the reg. + LatencyMIs.erase(prev); + if (bWaitCnt && cycle == latency) { + score.MemLatency += cycle; + // Only count memLatency once, the rest is hide. + bWaitCnt = false; + } else { + // Hide cycle or count mem latency? + score.LatencyHide += cycle; + } + } else { + l->second -= latency; + // Hide latency. + score.LatencyHide += latency; + } + } + + } else { + // TODO: check branch/lds? + // TODO: check prevVAlu? + auto getAluStatus = [](const MachineInstr &MI, + const llvm::SIInstrInfo *SIII) { + AluStatus status = AluStatus::Nothing; + if (SIII->isVALU(MI.getOpcode())) { + status = AluStatus::Vector; + } else if (SIII->isSALU(MI.getOpcode())) { + status = AluStatus::Scalar; + } + return status; + }; + AluStatus status = getAluStatus(MI, SIII); + + switch (prevStatus) { + case AluStatus::Nothing: { + score.Alu += latency; + score.MixAlu += latency; + prevStatus = status; + } break; + case AluStatus::Vector: + case AluStatus::Scalar: { + score.Alu += latency; + // Ignore mix alu. + if (prevStatus != status) { + prevStatus = AluStatus::Nothing; + } else { + score.MixAlu += latency; + } + } break; + } + } + // Update latency inst. + if (SIII->isHighLatencyInstruction(MI) && MI.mayLoad()) { + unsigned reg = MI.getOperand(0).getReg(); + // TODO: get correct latency. + // SIII->getInstrLatency(ItinerayData, MI); + constexpr unsigned kHighLetency = 180; + LatencyMIs[reg] = kHighLetency; + } else if (SIII->isLowLatencyInstruction(MI) && MI.mayLoad()) { + unsigned reg = MI.getOperand(0).getReg(); + // TODO: get correct latency. 
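+    // kLowLetency below is a coarse placeholder, matching kHighLetency above;
+    // ideally this would be queried from the scheduling model per instruction.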
+ // SIII->getInstrLatency(ItinerayData, MI); + constexpr unsigned kLowLetency = 35; + LatencyMIs[reg] = kLowLetency; + } +} + +SchedScore CollectLatency(MachineFunction &MF, const llvm::GCNSubtarget &ST, + const llvm::MachineLoopInfo *MLI) { + SchedScore totalScore; + for (auto &MFI : MF) { + MachineBasicBlock &MBB = MFI; + MachineBasicBlock::iterator Next; + AMDGPULatencyTracker latencyTracker(ST); + for (auto &MI : MBB) { + latencyTracker.scan(MI); + } + unsigned loopDepth = 0; + if (MLI) { + loopDepth = MLI->getLoopDepth(&MBB); + } + totalScore.sum(latencyTracker.score, loopDepth); + } + return totalScore; +} + +} // namespace llvm + + diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h new file mode 100644 index 0000000000000..f108bab24bd39 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h @@ -0,0 +1,74 @@ +//===-- AMDGPUOccupancyAndLatencyHelper - Helper functions for occupancy and latency --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===--------------------------------------------------------------------------------===// +// +/// \file +/// \brief Helper functions for occupancy and latency. +// +//===--------------------------------------------------------------------------------===// + +#include "llvm/ADT/DenseMap.h" + +namespace llvm { + +class MachineFunction; +class GCNSubtarget; +class MachineInstr; +class SIInstrInfo; +class MachineLoopInfo; + +struct SchedScore { + // Score for this Sched result. + unsigned Occupancy = 0; + bool SgprSpill = false; + unsigned LatencyHide = 0; // Only latency hide will split 2 load into 2 pass? + unsigned MemLatency = 0; // Only save mem latency. + // We want mem latency small and hide big. Compare + // memLatency - hide * Occ, smaller is better. + unsigned MixAlu = 0; // VAlu and SAlu can running parallel if Occ > 1. + unsigned Alu = 0; // avoid sequence of s_alu inst count less then occupancy. + unsigned Lds = 0; // Todo: count lds. + SchedScore() {} + + // Other info which can help compare schedule result. + float computeScore() const; + float computeScore2() const; + + void sum(const SchedScore &s, unsigned loopDepth=0); + bool isBetter(const SchedScore &s) const; + bool isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc=1) const; + // More latency can be hiden with ExtraOcc. + unsigned latencyGain(unsigned TargetOccupancy, unsigned ExtraOcc) const; +}; + +struct AMDGPULatencyTracker { + AMDGPULatencyTracker(const llvm::GCNSubtarget &ST); + const llvm::SIInstrInfo *SIII; + const llvm::InstrItineraryData *ItinerayData; + // Latency MI dst reg to cycle map. + llvm::DenseMap LatencyMIs; + SchedScore score; + // Low latency MI not wait. + unsigned hideLatency = 0; + unsigned memLatency = 0; + // For simple, only consider mixture as one valu one salu. + // Not group now. 
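+
+  // Typical use (mirrors CollectLatency below): feed every instruction of a
+  // block to scan() in program order, then read the accumulated score:
+  //   AMDGPULatencyTracker Tracker(ST);
+  //   for (MachineInstr &MI : MBB)
+  //     Tracker.scan(MI);
+  //   SchedScore Score = Tracker.score;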
+ unsigned prevSAlu = 0; + unsigned prevVAlu = 0; + enum class AluStatus { + Nothing, + Vector, + Scalar, + } prevStatus = AluStatus::Nothing; + void scan(const llvm::MachineInstr &MI); +}; + +SchedScore CollectLatency(llvm::MachineFunction &MF, + const llvm::GCNSubtarget &ST, + const llvm::MachineLoopInfo *MLI = nullptr); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp new file mode 100644 index 0000000000000..a0f2a5d4dc121 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp @@ -0,0 +1,1790 @@ +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" + +//#include "dxc/DXIL/DxilMetadataHelper.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/raw_ostream.h" + +#include "llvm/ADT/IntEqClasses.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Support/GraphWriter.h" + +#include "llvm/Support/Debug.h" + +#include "GCNRegPressure.h" +#include "AMDGPUMIRUtils.h" +#include "AMDGPUSubExpDag.h" +#include + +#define DEBUG_TYPE "xb-sub-exp-dag" +using namespace llvm; + +namespace llvm { + +// Expression Dag. + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void SubExp::dump(const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) const { + dbgs() << "\nSubExp:\n"; + dbgs() << "input regs:\n"; + for (auto &input : inputLive) { + pressure::print_reg(input.first, MRI, SIRI, llvm::dbgs()); + dbgs() << "\n"; + } + dbgs() << "output regs:\n"; + for (auto &output : outputLive) { + pressure::print_reg(output.first, MRI, SIRI, llvm::dbgs()); + dbgs() << "\n"; + } + + for (MachineInstr *MI : SUnits) { + MI->dump(); + } + dbgs() << "End of SubExp\n"; +} +#endif + +bool SubExp::modifiesRegister(unsigned Reg, const SIRegisterInfo* SIRI) const +{ + for (const MachineInstr *MI : SUnits) + { + if (MI->modifiesRegister(Reg, SIRI)) + { + return true; + } + } + + return false; +} + +void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI) { + sMaxSize = std::max(sInputSize, sOutputSize); + vMaxSize = std::max(vInputSize, vOutputSize); + + DenseMap LiveRegs; + GCNRegPressure CurPressure; + + // Add output to pressure. + for (MachineInstr *MI : BottomRoots) { + for (MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + if (!MO.isDef()) + continue; + Register Reg = MO.getReg(); + if (!Reg.isVirtual()) + continue; + LaneBitmask mask = getRegMask(MO, MRI); + auto it = LiveRegs.find(Reg); + if (it != LiveRegs.end()) { + LiveRegs[Reg] = mask | it->second; + } else { + LiveRegs[Reg] = mask; + } + } + } + + for (auto it : LiveRegs) { + LaneBitmask emptyMask; + CurPressure.inc(it.first, emptyMask, it.second, MRI); + } + + for (auto it = SUnits.rbegin(); it != SUnits.rend(); it++) { + MachineInstr *MI = *it; + auto *ST = &MI->getMF()->getSubtarget(); // TODO: Better way to get this. 
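+    // Walk the operands and update the upward live set: defs clear their
+    // lanes, uses add them, and the pressure maximum is re-sampled after
+    // this operand loop.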
+ for (MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + Register Reg = MO.getReg(); + if (!Reg.isVirtual()) { + if (Reg == AMDGPU::SCC) + bTouchSCC = true; + continue; + } + + LaneBitmask LiveMask = getRegMask(MO, MRI); + LaneBitmask PrevMask; + auto liveIt = LiveRegs.find(Reg); + if (liveIt != LiveRegs.end()) { + PrevMask = liveIt->second; + } + + if (MO.isDef()) { + LiveMask = PrevMask & (~(LiveMask)); + } else { + LiveMask = PrevMask | LiveMask; + } + + CurPressure.inc(Reg, PrevMask, LiveMask, MRI); + LiveRegs[Reg] = LiveMask; + } + + unsigned sSize = CurPressure.getSGPRNum(); + unsigned vSize = CurPressure.getVGPRNum(ST->hasGFX90AInsts()); + if (sSize > sMaxSize) + sMaxSize = sSize; + if (vSize > vMaxSize) + vMaxSize = vSize; + } +} + +bool SubExp::isSafeToMove(const MachineRegisterInfo &MRI, bool bMoveUp) const { + if (bMultiDefOutput) + return false; + if (bHasTerminatorInst) + return false; + if (bUseIncomingReg) + return false; + + // Input should be single def. + for (unsigned Reg : TopRegs) { + if (!MRI.hasOneDef(Reg) && !llvm::IsSub0Sub1SingleDef(Reg, MRI)) + return false; + } + return true; +} + +ExpDag::ExpDag(const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, const bool bJoinInput) + : MRI(MRI), SIRI(SIRI), SIII(SIII), bJoinInputToSubExp(bJoinInput) {} + +template +void ExpDag::initNodes(const LiveSet &InputLiveReg, T &insts) { + unsigned NodeSize = InputLiveReg.size() + insts.size(); + SUnits.reserve(NodeSize); + + for (MachineInstr *MI : insts) { + if (MI->isDebugInstr()) + continue; + SUnits.emplace_back(MI, SUnits.size()); + SUnit *SU = &SUnits.back(); + SUnitMIMap[SU] = MI; + MISUnitMap[MI] = SU; + } + + for (auto it : InputLiveReg) { + unsigned Reg = it.first; + SUnits.emplace_back(); + SUnit *SU = &SUnits.back(); + SU->NodeNum = SUnits.size() - 1; + SUnitInputMap[SU] = Reg; + InputSUnitMap[Reg] = SU; + } +} + +template void ExpDag::initNodes>( + const LiveSet &InputLiveReg, DenseSet &instRange); + +template void ExpDag::initNodes>( + const LiveSet &InputLiveReg, std::vector &instRange); + +template +void ExpDag::build(const LiveSet &InputLiveReg, const LiveSet &OutputLiveReg, + T &insts) { + initNodes(InputLiveReg, insts); + addDataDep(SIRI); + addCtrlDep(); + buildSubExp(InputLiveReg, OutputLiveReg, SIRI, SIII); +} + +template void +ExpDag::build>(const LiveSet &InputLiveReg, + const LiveSet &OutputLiveReg, + DenseSet &instRange); +template void ExpDag::build>(const LiveSet &InputLiveReg, + const LiveSet &OutputLiveReg, + std::vector &instRange); + +void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { + IntEqClasses SubtreeClasses(SUnits.size()); + std::vector passThruInputs; + for (SUnit &SU : SUnits) { + if (SU.NumPredsLeft == 0 && SU.NumSuccsLeft == 0) { + passThruInputs.emplace_back(SU.NodeNum); + continue; + } + if (!bJoinInputToSubExp && !SU.isInstr()) + continue; + // Join prev. + for (SDep &PreDep : SU.Preds) { + SUnit *PreSU = PreDep.getSUnit(); + if (!bJoinInputToSubExp && !PreSU->isInstr()) + continue; + SubtreeClasses.join(SU.NodeNum, PreSU->NodeNum); + } + // Join succ. + for (SDep &SucDep : SU.Succs) { + SUnit *SucSU = SucDep.getSUnit(); + SubtreeClasses.join(SU.NodeNum, SucSU->NodeNum); + } + } + SubtreeClasses.compress(); + + unsigned NumSubExps = SubtreeClasses.getNumClasses(); + // Not count passThruInputs for subExps since they're exp with only 1 SU. 
+ // SubExpIndexMap is used to pack SubIdx within updated NumSubExps. + NumSubExps -= passThruInputs.size(); + SubExps.resize(NumSubExps); + DenseMap SubExpIndexMap; + + // Add SU to sub exp. + for (SUnit &SU : SUnits) { + if (SU.NumPredsLeft == 0 && SU.NumSuccsLeft == 0) { + continue; + } + unsigned SubIdx = SubtreeClasses[SU.NodeNum]; + unsigned OriginSubIdx = SubIdx; + // Pack subidx. + if (SubExpIndexMap.count(SubIdx) == 0) { + unsigned count = SubExpIndexMap.size(); + SubExpIndexMap.insert(std::make_pair(SubIdx, count)); + } + SubIdx = SubExpIndexMap[SubIdx]; + // Use NodeQueueId as SubIdx. We don't do schedule on ExpDag. + SU.NodeQueueId = SubIdx; + + SubExp &Exp = SubExps[SubIdx]; + auto it = SUnitInputMap.find(&SU); + if (it != SUnitInputMap.end()) { + // Input. + unsigned Reg = it->second; + Exp.TopRegs.insert(Reg); + } else { + MachineInstr *MI = SU.getInstr(); + MachineBasicBlock *MBB = MI->getParent(); + Exp.FromBB = MBB; + for (MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + if (!MO.isUse()) + continue; + unsigned Reg = MO.getReg(); + if (MRI.getLiveInPhysReg(Reg) || MRI.getLiveInVirtReg(Reg)) { + Exp.bUseIncomingReg = true; + } + } + + Exp.SUnits.emplace_back(MI); + if (SU.NumSuccsLeft == 0) { + Exp.BottomRoots.insert(MI); + if (MI->isTerminator()) + Exp.bHasTerminatorInst = true; + } + if (MI->isNotDuplicable()) + Exp.bNotSafeToCopy = true; + // Skip Scalar mem access since no scalar store. + if (MI->mayLoadOrStore() && !SIII->isSMRD(*MI)) { + Exp.bHasMemInst = true; + } + // Add bottom regs. + for (MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + if (!MO.isDef()) + continue; + Register Reg = MO.getReg(); + // physical reg is not in live reg. + if (!Reg.isVirtual()) + continue; + if (SU.NumSuccsLeft) { + // For SU which has used in current blk. + // Check if used in other blks or subExps. + bool bUsedInOtherBlk = false; + for (auto &UserMI : MRI.use_nodbg_instructions(Reg)) { + if (UserMI.getParent() != MBB) { + bUsedInOtherBlk = true; + break; + } + auto suIt = MISUnitMap.find(&UserMI); + // When UserMI is not in dag, treat it as other block. + if (suIt == MISUnitMap.end()) { + bUsedInOtherBlk = true; + break; + } + SUnit *UseSU = suIt->second; + // UserMI should always be in same subExp. + unsigned UseSubIdx = SubtreeClasses[UseSU->NodeNum]; + if (UseSubIdx != OriginSubIdx) { + // When reg has multiple def, it is possible for user def in different subExp. + if (MRI.getUniqueVRegDef(Reg)) + llvm::report_fatal_error("user and def in different subExp"); + break; + } + } + if (!bUsedInOtherBlk) + continue; + } + Exp.BottomRegs.insert(Reg); + if (!MRI.getUniqueVRegDef(Reg)) { + Exp.bMultiDefOutput = true; + } + } + } + } + // Calc reg for SubExp. + // Get block live in and live out. + // Only reg will miss live mask. + for (SubExp &Exp : SubExps) { + for (unsigned Reg : Exp.TopRegs) { + auto it = StartLiveReg.find(Reg); + assert(it != StartLiveReg.end() && + "cannot find input reg in block start live"); + Exp.inputLive[Reg] |= it->second; + } + + for (unsigned Reg : Exp.BottomRegs) { + auto it = EndLiveReg.find(Reg); + if (it == EndLiveReg.end()) { + //"cannot find output reg in block end live"); + // Bottom reg is killed inside current block, did not get out of the + // block. + // Or the bottom reg is not treat as output in this dag, not save to + // outputLive which will affect profit count. 
+ continue; + } + Exp.outputLive[Reg] |= it->second; + } + + CollectLiveSetPressure(Exp.inputLive, MRI, SIRI, Exp.vInputSize, + Exp.sInputSize); + CollectLiveSetPressure(Exp.outputLive, MRI, SIRI, Exp.vOutputSize, + Exp.sOutputSize); + } +} + +void ExpDag::addDataDep(const SIRegisterInfo *SIRI) { + DenseMap curDefMI; + + for (SUnit &SU : SUnits) { + if (!SU.isInstr()) + continue; + MachineInstr *MI = SU.getInstr(); + + // Link use to the def. + for (MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + + Register Reg = MO.getReg(); + SUnit *DefSU = nullptr; + + auto curDefIt = curDefMI.find(Reg); + // Check def inst first. + if (curDefIt != curDefMI.end()) { + MachineInstr *curDef = curDefIt->second; + DefSU = MISUnitMap[curDef]; + } else { + // physical reg is not in live reg. + if (!Reg.isVirtual()) + continue; + if (MO.isUndef()) + continue; + // Is it OK for degbug instr MO cannot find def? + if (MI->isDebugInstr()) + continue; + // Should be an input. + assert(InputSUnitMap.count(Reg) > 0 && "cannot find def"); + DefSU = InputSUnitMap[Reg]; + } + SU.addPred(SDep(DefSU, SDep::Data, Reg)); + } + + // Add def to curDefMI; + for (MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + if (!MO.isDef()) + continue; + unsigned Reg = MO.getReg(); + + // For case like: + // undef %808.sub0:sgpr_64 = COPY killed %795:sgpr_32 + // %808.sub1:sgpr_64 = S_MOV_B32 0 + // When partially write, link MI to previous def. + if (MO.getSubReg() != 0) { + SUnit *DefSU = nullptr; + auto curDefIt = curDefMI.find(Reg); + // Check def inst first. + if (curDefIt != curDefMI.end()) { + MachineInstr *curDef = curDefIt->second; + DefSU = MISUnitMap[curDef]; + // Add link between different defs. + SU.addPred(SDep(DefSU, SDep::Data, Reg)); + } + } + + curDefMI[Reg] = MI; + } + } +} + +void ExpDag::addCtrlDep() { + // TODO: add depend for memory, barrier. +} + +BlockExpDag::BlockExpDag(llvm::MachineBasicBlock *B, llvm::LiveIntervals *LIS, + const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI, + const llvm::SIInstrInfo *SIII) + : ExpDag(MRI, SIRI, SIII, /*bJoinInput*/ true), LIS(LIS), MBB(B) {} + +void BlockExpDag::build() { + auto *SlotIndexes = LIS->getSlotIndexes(); + const auto StartIdx = SlotIndexes->getMBBStartIdx(MBB); + const auto StartLiveReg = llvm::getLiveRegs(StartIdx, *LIS, MRI); + + const auto EndIdx = SlotIndexes->getMBBEndIdx(MBB); + const auto EndLiveReg = llvm::getLiveRegs(EndIdx, *LIS, MRI); + + std::vector insts; + for (MachineInstr &MI : *MBB) { + insts.emplace_back(&MI); + } + + ExpDag::build(StartLiveReg, EndLiveReg, insts); +} + +void BlockExpDag::buildWithPressure() { + auto *SlotIndexes = LIS->getSlotIndexes(); + const auto StartIdx = SlotIndexes->getMBBStartIdx(MBB); + const auto StartLiveReg = llvm::getLiveRegs(StartIdx, *LIS, MRI); + + const auto EndIdx = SlotIndexes->getMBBEndIdx(MBB); + const auto EndLiveReg = llvm::getLiveRegs(EndIdx, *LIS, MRI); + + std::vector insts; + for (MachineInstr &MI : *MBB) { + insts.emplace_back(&MI); + } + + ExpDag::build(StartLiveReg, EndLiveReg, insts); + // Build pressure. + buildPressure(StartLiveReg, EndLiveReg); +} + +void BlockExpDag::buildAvail( + const LiveSet &passThruSet, + DenseMap &DagAvailRegMap) { + DenseSet Processed; + + DenseSet WorkList; + MachineInstr &BeginMI = MBB->instr_front(); + + // Calc avaialbe for each node, live is avail & sum(input of success). + // If a reg is avaiable from the node, then success node can use it from this + // node. 
For dag live, pred output don't need to have all input a node needs. + // As long as all pred outputs can cover inputs, it is OK. + for (SUnit &SU : SUnits) { + if (SU.NumPredsLeft == 0) { + GCNDownwardRPTracker RP(*LIS); + RP.reset(BeginMI, &passThruSet); + MachineInstr *MI = SU.getInstr(); + if (MI) { + RP.reset(*MI, &passThruSet); + RP.advance(); + } + DagAvailRegMap[&SU] = RP.getLiveRegs(); + + // Add succ to work list. + for (auto &Succ : SU.Succs) { + SUnit *SuccSU = Succ.getSUnit(); + if (SuccSU->NumPredsLeft > 0) + SuccSU->NumPredsLeft--; + WorkList.insert(SuccSU); + } + } + } + while (!WorkList.empty()) { + bool bUpdated = false; + SmallVector ReadyNodes; + for (SUnit *SU : WorkList) { + if (SU->NumPredsLeft > 0) + continue; + ReadyNodes.emplace_back(SU); + // Ready, move it to Processed. + Processed.insert(SU); + bUpdated = true; + // Only update 1 node once. + // Order of schedle here should not affect pressure. + break; + } + + for (SUnit *SU : ReadyNodes) { + // Remove SU from worklist. + WorkList.erase(SU); + + MachineInstr *MI = SU->getInstr(); + // Calc pressure based on pred nodes. + GCNRPTracker::LiveRegSet dagLive; + for (auto &Pred : SU->Preds) { + SUnit *PredSU = Pred.getSUnit(); + GCNRPTracker::LiveRegSet PredLive = DagAvailRegMap[PredSU]; + + GCNDownwardRPTracker RP(*LIS); + RP.reset(BeginMI, &PredLive); + if (MI) { + RP.reset(*MI, &PredLive); + // Update PredLive based on MI. + RP.advance(); + } + llvm::mergeLiveRegSet(dagLive, RP.getLiveRegs()); + } + DagAvailRegMap[SU] = dagLive; + + // Add succ to work list. + for (auto &Succ : SU->Succs) { + SUnit *SuccSU = Succ.getSUnit(); + if (SuccSU->NumPredsLeft > 0) + SuccSU->NumPredsLeft--; + WorkList.insert(SuccSU); + } + } + + // Skip dead loop + if (ReadyNodes.empty()) { + printf("dead loop when build dag pressure"); + break; + } + } + + assert(WorkList.empty() && "schedule failed for available reg"); +} + +void BlockExpDag::buildPressure(const LiveSet &StartLiveReg, + const LiveSet &EndLiveReg) { + if (MBB->empty()) + return; + DenseMap DagAvailRegMap; + GCNRPTracker::LiveRegSet passThruSet; + for (auto Reg : StartLiveReg) { + unsigned reg = Reg.first; + auto EndReg = EndLiveReg.find(reg); + if (EndReg == EndLiveReg.end()) + continue; + + LaneBitmask mask = Reg.second; + LaneBitmask endMask = EndReg->second; + mask &= endMask; + if (mask.getAsInteger() == 0) + continue; + passThruSet[reg] = mask; + } + + // Build avial for each nodes. + buildAvail(passThruSet, DagAvailRegMap); + + // Calc avaialbe for each node, live is avail & sum(input of success). + // If a reg is avaiable from the node, then success node can use it from this + // node. For dag live, pred output don't need to have all input a node needs. + // As long as all pred outputs can cover inputs, it is OK. + DenseSet Processed; + + DenseSet WorkList; + MachineInstr &BeginMI = MBB->instr_front(); + + for (SUnit &SU : SUnits) { + if (SU.NumSuccsLeft == 0) { + // Calc pressure based on pass thru. + // Using pass thru as base because output of current SU should not + // affect other output SUs. + GCNUpwardRPTracker RP(*LIS); + RP.reset(BeginMI, &passThruSet, /*After*/true); + MachineInstr *MI = SU.getInstr(); + if (MI) { + RP.reset(*MI, &passThruSet, /*After*/true); + RP.recede(*MI); + } + DagPressureMap[&SU] = RP.getLiveRegs(); + // Add pred to work list. 
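+      // NumSuccsLeft acts as a countdown here: a pred only becomes ready for
+      // the worklist once all of its successors have been processed.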
+ for (auto &Pred : SU.Preds) { + SUnit *PredSU = Pred.getSUnit(); + PredSU->NumSuccsLeft--; + WorkList.insert(PredSU); + } + } + } + + while (!WorkList.empty()) { + bool bUpdated = false; + SmallVector ReadyNodes; + for (SUnit *SU : WorkList) { + if (SU->NumSuccsLeft > 0) + continue; + ReadyNodes.emplace_back(SU); + // Ready, move it to Processed. + Processed.insert(SU); + bUpdated = true; + // Only update 1 node once. + // Order of schedle here should not affect pressure. + break; + } + + for (SUnit *SU : ReadyNodes) { + // Remove SU from worklist. + WorkList.erase(SU); + + MachineInstr *MI = SU->getInstr(); + // Calc pressure based on succ nodes. + GCNRPTracker::LiveRegSet dagLive; + for (auto &Succ : SU->Succs) { + SUnit *SuccSU = Succ.getSUnit(); + GCNRPTracker::LiveRegSet SuccLive = DagPressureMap[SuccSU]; + + GCNUpwardRPTracker RP(*LIS); + RP.reset(BeginMI, &SuccLive, /*After*/true); + if (MI) { + RP.reset(*MI, &SuccLive, /*After*/true); + // Update SuccLive based on MI. + RP.recede(*MI); + } + llvm::mergeLiveRegSet(dagLive, RP.getLiveRegs()); + } + // Remove live which not avail in SU. + GCNRPTracker::LiveRegSet availLive = DagAvailRegMap[SU]; + llvm::andLiveRegSet(dagLive, availLive); + DagPressureMap[SU] = dagLive; + + // Add pred to work list. + for (auto &Pred : SU->Preds) { + SUnit *PredSU = Pred.getSUnit(); + PredSU->NumSuccsLeft--; + WorkList.insert(PredSU); + } + } + + // Skip dead loop + if (ReadyNodes.empty()) { + printf("dead loop when build dag pressure"); + break; + } + } +} + +// dump functions. + +std::string ExpDag::getGraphNodeLabel(const SUnit *SU) const { + std::string s; + raw_string_ostream oss(s); + auto it = SUnitInputMap.find(SU); + if (it != SUnitInputMap.end()) { + oss << "second) << ">"; + } else { + SU->getInstr()->print(oss, /*SkipOpers=*/true); + } + + return oss.str(); +} + +/// Return the label. +std::string ExpDag::getDAGName() const { + return "dag.exp"; +} + +/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG +/// rendered using 'dot'. +/// +void ExpDag::viewGraph(const Twine &Name, const Twine &Title) const { +#if 0 // TODO: Re-enable this + // This code is only for debugging! +#ifndef NDEBUG + ViewGraph(const_cast(this), Name, false, Title); +#else + errs() << "BlockExpDag::viewGraph is only available in debug builds on " + << "systems with Graphviz or gv!\n"; +#endif // NDEBUG +#endif +} + +void ExpDag::dump() { + viewGraph(getDAGName(), "Exp Dag Graph for " + getDAGName()); +} + +} + +// Expression Dag dump. +namespace llvm { + +static DenseSet ViewNodes; + +template <> +struct DOTGraphTraits : public DefaultDOTGraphTraits { + + DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} + + static std::string getGraphName(const llvm::ExpDag *G) { + return "ExpDag graph"; + } + + static bool renderGraphFromBottomUp() { return true; } + + static bool isNodeHidden(const SUnit *Node) { + if (ViewNodes.empty()) + return false; + + return ViewNodes.count(Node) == 0; + } + + static std::string getNodeIdentifierLabel(const SUnit *Node, + const llvm::ExpDag *Graph) { + std::string R; + raw_string_ostream OS(R); + OS << static_cast(Node); + return R; + } + + /// If you want to override the dot attributes printed for a particular + /// edge, override this method. 
+ static std::string getEdgeAttributes(const SUnit *Node, SUnitIterator EI, + const llvm::ExpDag *Graph) { + if (EI.isArtificialDep()) + return "color=cyan,style=dashed"; + if (EI.isCtrlDep()) + return "color=blue,style=dashed"; + return ""; + } + + static std::string getNodeLabel(const SUnit *SU, const llvm::ExpDag *Graph) { + std::string Str; + raw_string_ostream SS(Str); + SS << "SU:" << SU->NodeNum; + return SS.str(); + } + static std::string getNodeDescription(const SUnit *SU, const llvm::ExpDag *G) { + return G->getGraphNodeLabel(SU); + } + static std::string getNodeAttributes(const SUnit *N, + const llvm::ExpDag *Graph) { + std::string Str("shape=Mrecord"); + + Str += ",style=filled,fillcolor=\"#"; + // Use NodeQueueId as SubIdx for ExpDag. + Str += DOT::getColorString(N->NodeQueueId); + Str += '"'; + + return Str; + } + + static void addCustomGraphFeatures(llvm::ExpDag *G, + GraphWriter &GW) { + return G->addCustomGraphFeatures(GW); + } +}; + +template <> struct GraphTraits : public GraphTraits { + using nodes_iterator = pointer_iterator::iterator>; + static nodes_iterator nodes_begin(llvm::ExpDag *G) { + return nodes_iterator(G->SUnits.begin()); + } + static nodes_iterator nodes_end(llvm::ExpDag *G) { + return nodes_iterator(G->SUnits.end()); + } +}; + +} // namespace llvm + +namespace llvm { +void getRegBound(llvm::MachineBasicBlock *MBB, + const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI, const SIInstrInfo *SIII, + llvm::LiveIntervals *LIS, unsigned &MaxVGPR, + unsigned &MaxSGPR) { + // TODO: calc real reg bound. + MaxVGPR = AMDGPU::VGPR255 - AMDGPU::VGPR0; + MaxSGPR = AMDGPU::SGPR104 - AMDGPU::SGPR0; + + const auto &EndSlot = LIS->getMBBEndIdx(MBB); + const GCNRPTracker::LiveRegSet outputLive = + llvm::getLiveRegs(EndSlot, *LIS, MRI); + + auto* ST = &MBB->getParent()->getSubtarget(); // TODO: Better way to get this. + if (MBB->empty()) { + GCNRegPressure MaxPressure = getRegPressure(MRI, outputLive); + MaxSGPR = MaxPressure.getSGPRNum(); + MaxVGPR = MaxPressure.getVGPRNum(ST->hasGFX90AInsts()); + return; + } + + BlockExpDag dag(MBB, LIS, MRI, SIRI, SIII); + dag.build(); + + std::vector &SUnits = dag.SUnits; + // Remove input nodes. 
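+  // The input SUnits only stand for block live-ins; strip their edges (and
+  // later the nodes themselves) so the HRB scheduler below runs purely on
+  // instruction nodes.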
+ for (SUnit &SU : SUnits) { + if (!SU.isInstr()) + continue; + std::vector inputDeps; + for (SDep &Dep : SU.Preds) { + SUnit *Pred = Dep.getSUnit(); + if (Pred->isInstr()) + continue; + inputDeps.emplace_back(Dep); + } + for (SDep &Dep : inputDeps) { + SU.removePred(Dep); + } + } + + unsigned inputSize = dag.InputSUnitMap.size(); + unsigned instNodeSize = SUnits.size() - inputSize; + SUnits.erase(SUnits.begin() + instNodeSize, SUnits.end()); + + std::vector BotRoots; + for (SUnit &SU : SUnits) { + if (SU.NumSuccsLeft == 0) + BotRoots.emplace_back(&SU); + } + + auto SchedResult = hrbSched(SUnits, BotRoots, MRI, SIRI); + + GCNUpwardRPTracker RPTracker(*LIS); + RPTracker.reset(MBB->front(), &outputLive, /*After*/true); + for (auto it = SchedResult.rbegin(); it != SchedResult.rend(); it++) { + const SUnit *SU = *it; + if (!SU->isInstr()) + continue; + MachineInstr *MI = SU->getInstr(); + RPTracker.recede(*MI); + } + + GCNRegPressure MaxPressure = RPTracker.getMaxPressureAndReset(); + MaxSGPR = MaxPressure.getSGPRNum(); + MaxVGPR = MaxPressure.getVGPRNum(ST->hasGFX90AInsts()); +} +} // namespace llvm + +// HRB +namespace { + +std::vector buildWorkList(std::vector &SUnits) { + std::vector resultList; + resultList.reserve(SUnits.size()); + for (SUnit &SU : SUnits) { + resultList.emplace_back(&SU); + } + return resultList; +} + +void sortByHeight(std::vector &workList) { + std::sort(workList.begin(), workList.end(), + [](const SUnit *a, const SUnit *b) { + // Lowest height first. + if (a->getHeight() < b->getHeight()) + return true; + // If height the same, NodeNum big first. + if (a->getHeight() == b->getHeight()) + return a->NodeNum > b->NodeNum; + return false; + }); +} + +void sortByInChain(std::vector &workList, DenseSet &Chained) { + // In chain nodes at end. + std::sort(workList.begin(), workList.end(), + [&Chained](const SUnit *a, const SUnit *b) { + return Chained.count(a) < Chained.count(b); + }); +} + +const TargetRegisterClass *getRegClass(SUnit *SU, + const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI) { + if (!SU->isInstr()) + return nullptr; + MachineInstr *MI = SU->getInstr(); + if (MI->getNumDefs() == 0) + return nullptr; + + // For MI has more than one dst, always use first dst. + MachineOperand *MO = MI->defs().begin(); + if (!MO->isReg()) + return nullptr; + unsigned Reg = MO->getReg(); + return SIRI->getRegClassForReg(MRI, Reg); +} + +unsigned getVGPRSize(const TargetRegisterClass *RC, + const SIRegisterInfo *SIRI) { + if (!RC) + return 0; + if (SIRI->isSGPRClass(RC)) + return 0; + return RC->getLaneMask().getNumLanes(); +} +unsigned getSGPRSize(const TargetRegisterClass *RC, + const SIRegisterInfo *SIRI) { + if (!RC) + return 0; + if (!SIRI->isSGPRClass(RC)) + return 0; + return RC->getLaneMask().getNumLanes(); +} + +void collectSameHeightBackNodes(SUnit *SU, SmallDenseSet &backNodes, + unsigned NodeNum, + SmallDenseSet &visitedNodes) { + if (visitedNodes.count(SU)) + return; + visitedNodes.insert(SU); + + for (SDep &Dep : SU->Succs) { + if (Dep.isWeak()) + continue; + if (Dep.getLatency() > 0) + continue; + + SUnit *Succ = Dep.getSUnit(); /* + if (Succ->NodeNum >= NodeNum) + continue;*/ + + backNodes.insert(Succ); + collectSameHeightBackNodes(Succ, backNodes, NodeNum, visitedNodes); + } +} + +} // namespace + +namespace llvm { + +void HRB::Lineage::addNode(llvm::SUnit *SU) { Nodes.emplace_back(SU); } +unsigned HRB::Lineage::getSize() const { + return RC ? 
RC->getLaneMask().getNumLanes() : 0; +} +unsigned HRB::Lineage::length() const { return Nodes.size(); } + +SUnit *HRB::Lineage::getHead() const { return Nodes.front(); } +SUnit *HRB::Lineage::getTail() const { return Nodes.back(); } + +void HRB::buildLinear(std::vector &SUnits) { + // Working list from TopRoots. + std::vector workList = buildWorkList(SUnits); + IntEqClasses EqClasses(SUnits.size()); + + while (!workList.empty()) { + sortByHeight(workList); + // Highest SU. + SUnit *SU = workList.back(); + workList.pop_back(); + if (!SU->isInstr()) + continue; + if (ChainedNodes.count(SU) > 0) + continue; + bRecomputeHeight = false; + Lineage lineage = buildChain(SU, SUnits); + + // Remove chained nodes from worklist. + sortByInChain(workList, ChainedNodes); + while (!workList.empty()) { + SUnit *back = workList.back(); + if (ChainedNodes.count(back)) + workList.pop_back(); + else + break; + } + + Lineages.emplace_back(lineage); + + if (bRecomputeHeight) { + // Update height from tail. + SUnit *tail = lineage.Nodes.back(); + tail->setDepthDirty(); + tail->getHeight(); + } + } + + DenseSet tailSet; + for (Lineage &L : Lineages) { + if (L.Nodes.size() < 2) + continue; + auto it = L.Nodes.rbegin(); + it++; + SUnit *tail = L.Nodes.back(); + // If already as tail for other lineage, start from next. + if (tailSet.count(tail) > 0) { + tail = *it; + it++; + } else { + tailSet.insert(tail); + } + for (; it != L.Nodes.rend(); it++) { + SUnit *SU = *it; + if (tail->NodeNum == -1) + continue; + EqClasses.join(SU->NodeNum, tail->NodeNum); + } + } + + EqClasses.compress(); + // TODO: assign sub class to node. + for (Lineage &L : Lineages) { + for (SUnit *SU : L.Nodes) { + if (SU->NodeNum == -1) + continue; + unsigned SubIdx = EqClasses[SU->NodeNum]; + //// Pack subidx. + // if (EqClasses.count(SubIdx) == 0) + // EqClasses[SubIdx] = EqClasses.size(); + SubIdx = EqClasses[SubIdx]; + // Use NodeQueueId as SubIdx. We don't do schedule on ExpDag. + SU->NodeQueueId = SubIdx; + } + } + + LLVM_DEBUG( + dbgs() << "Chained Nodes:"; for (SUnit *SU + : ChainedNodes) { + dbgs() << " " << SU->NodeNum << "\n"; + } for (int i = 0; i < Lineages.size(); i++) { + dbgs() << "Lineage" << i << ":"; + Lineage &L = Lineages[i]; + for (SUnit *SU : L.Nodes) { + dbgs() << " " << SU->NodeNum; + } + dbgs() << "\n"; + }); +} + +SUnit *HRB::findHeir(SUnit *SU, std::vector &SUnits) { + std::vector Candidates; + for (SDep &Dep : SU->Succs) { + // Only check data dep. + if (Dep.getKind() != SDep::Data) + continue; + + SUnit *Succ = Dep.getSUnit(); + Candidates.emplace_back(Succ); + } + + if (Candidates.empty()) + return nullptr; + + if (Candidates.size() == 1) + return Candidates.front(); + + sortByHeight(Candidates); + // Lowest height. + SUnit *Heir = Candidates.front(); + SmallVector SameHeightCandidate; + for (SUnit *SU : Candidates) { + if (Heir->getHeight() != SU->getHeight()) + break; + SameHeightCandidate.emplace_back(SU); + } + // Make sure choose lowest dependence between SameHeightCandidate. + if (SameHeightCandidate.size() > 1) { + for (int i = 1; i < SameHeightCandidate.size(); i++) { + SUnit *SU = SameHeightCandidate[i]; + // If Heir is pred of SU, use SU. + if (canReach(SU, Heir)) + Heir = SU; + } + } + + unsigned HeriHeight = Heir->getHeight(); + + // if lowest node is in ChainedNodes, try to find same height nodes? + + for (SDep &Dep : SU->Succs) { + // Only check data dep. + if (Dep.getKind() != SDep::Data) + continue; + SUnit *Succ = Dep.getSUnit(); + if (Succ == Heir) + continue; + // Avoid cycle in DAG. 
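+    // If Heir already reaches Succ, the artificial Succ -> Heir edge added
+    // below would close a cycle, so give up on this heir.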
+ if (canReach(Heir, Succ)) + return nullptr; + // Make sure Succ is before Heir. + Heir->addPred(SDep(Succ, SDep::Artificial)); + updateReachForEdge(Succ, Heir, SUnits); + LLVM_DEBUG(dbgs() << "add edge from " << Succ->NodeNum << "(" + << Succ->getHeight() << ") to " << Heir->NodeNum << "(" + << HeriHeight << ")\n"); + // Update height if need. + unsigned Height = Succ->getHeight(); + if (Height <= HeriHeight) { + bRecomputeHeight = true; + } + } + return Heir; +} + +HRB::Lineage HRB::buildChain(SUnit *Node, + std::vector &SUnits) { + HRB::Lineage chain; + chain.addNode(Node); + ChainedNodes.insert(Node); + LLVM_DEBUG(dbgs() << "start chain " << Node->NodeNum << "(" + << Node->getHeight() << ")\n"); + while (Node->NumSuccsLeft > 0) { + SUnit *Heir = findHeir(Node, SUnits); + if (!Heir) + break; + chain.addNode(Heir); + + LLVM_DEBUG(dbgs() << "add node to chain " << Heir->NodeNum << "\n"); + if (ChainedNodes.count(Heir) > 0) + break; + ChainedNodes.insert(Heir); + + Node = Heir; + } + // Find biggest vgpr RC for the chain. + // TODO: Build conflict and allocate on each edge of the chain. + const TargetRegisterClass *RC = nullptr; + unsigned maxRCSize = 0; + for (SUnit *SU : chain.Nodes) { + const TargetRegisterClass *SuRC = getRegClass(SU, MRI, SIRI); + unsigned RCSize = getVGPRSize(SuRC, SIRI); + if (RCSize > maxRCSize) { + maxRCSize = RCSize; + RC = SuRC; + } + } + if (!RC) { + // TODO: Find biggest sgpr RC. + unsigned maxRCSize = 0; + for (SUnit *SU : chain.Nodes) { + const TargetRegisterClass *SuRC = getRegClass(SU, MRI, SIRI); + unsigned RCSize = getSGPRSize(SuRC, SIRI); + if (RCSize > maxRCSize) { + maxRCSize = RCSize; + RC = SuRC; + } + } + } + chain.RC = RC; + return chain; +} + +void HRB::buildConflict() { + + for (unsigned i = 0; i < Lineages.size(); i++) { + Lineage &a = Lineages[i]; + for (unsigned j = i + 1; j < Lineages.size(); j++) { + Lineage &b = Lineages[j]; + if (isConflict(a, b)) { + Color.Conflicts[i].insert(j); + Color.Conflicts[j].insert(i); + LLVM_DEBUG(dbgs() << i << " conflict" << j << "\n"); + } + } + // SelfConflict. + Color.Conflicts[i].insert(i); + } +} + +bool HRB::canReach(llvm::SUnit *a, llvm::SUnit *b) { + auto it = ReachMap.find(a); + // If no reach info, treat as reach. + if (it == ReachMap.end()) + return true; + DenseSet &CurReach = it->second; + return CurReach.find(b) != CurReach.end(); +} + +void HRB::updateReachForEdge(llvm::SUnit *a, llvm::SUnit *b, + std::vector &SUnits) { + DenseSet &ReachA = ReachMap[a]; + ReachA.insert(b); + DenseSet &ReachB = ReachMap[b]; + ReachA.insert(ReachB.begin(), ReachB.end()); + + for (SUnit &SU : SUnits) { + if (!canReach(&SU, a)) + continue; + + DenseSet &CurReach = ReachMap[&SU]; + CurReach.insert(ReachA.begin(), ReachA.end()); + } +} + +void HRB::buildReachRelation(ArrayRef BotRoots) { + // Add fake entry to do PostOrder traversal. + // SUnit using Pred to traversal, so need to Revrese post order. + SUnit FakeEntry; + SmallVector FakeDeps; + for (SUnit *Root : BotRoots) { + SDep Dep = SDep(Root, SDep::Artificial); + FakeEntry.addPred(Dep); + FakeDeps.emplace_back(Dep); + } + + ReversePostOrderTraversal RPOT(&FakeEntry); + for (SUnit *SU : RPOT) { + // Create Reach Set first. + ReachMap[SU].clear(); + } + for (SUnit *SU : RPOT) { + DenseSet &CurReach = ReachMap[SU]; + // All Preds can reach SU and SU's reach. + for (SDep &Dep : SU->Preds) { + // Igonre week dep. 
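+      // Weak edges are only scheduling hints and must not contribute to
+      // reachability.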
+ if (Dep.isWeak()) + continue; + DenseSet &PrevReach = ReachMap[Dep.getSUnit()]; + PrevReach.insert(SU); + PrevReach.insert(CurReach.begin(), CurReach.end()); + } + assert(CurReach.count(SU) == 0 && "dead loop"); + } + // Remove fake entry. + for (SDep &Dep : FakeDeps) { + FakeEntry.removePred(Dep); + } + ReachMap.erase(&FakeEntry); + + LLVM_DEBUG(for (Lineage &L + : Lineages) { + for (SUnit *SU : L.Nodes) { + DenseSet &CurReach = ReachMap[SU]; + dbgs() << SU->NodeNum << " reach: "; + for (SUnit *R : CurReach) { + dbgs() << R->NodeNum << " "; + } + dbgs() << "\n"; + } + }); +} + +bool HRB::isConflict(const Lineage &a, const Lineage &b) { + // Make conflict between sgpr and vgpr to help group lineages when share + // colors. Keep the conflict will group lineages in avoid mix use color in + // different sub exp. + SUnit *head0 = a.getHead(); + SUnit *tail0 = a.getTail(); + SUnit *head1 = b.getHead(); + SUnit *tail1 = b.getTail(); + DenseSet &Reach0 = ReachMap[head0]; + DenseSet &Reach1 = ReachMap[head1]; + bool r01 = Reach0.count(tail1) != 0; + bool r10 = Reach1.count(tail0) != 0; + return r01 && r10; +} +bool HRB::canFuse(const Lineage &a, const Lineage &b) { + if (a.RC != b.RC) { + // no RC will not conflict with other nodes. + if (!a.RC) + return false; + if (!b.RC) + return false; + // SGRP and VGPR not conflict. + if (SIRI->isSGPRClass(a.RC) != SIRI->isSGPRClass(b.RC)) + return false; + } + // Can Fuse if a.head reach b.tail but b.head not reach a.tail and vice versa. + SUnit *head0 = a.getHead(); + SUnit *tail0 = a.getTail(); + SUnit *head1 = b.getHead(); + SUnit *tail1 = b.getTail(); + DenseSet &Reach0 = ReachMap[head0]; + DenseSet &Reach1 = ReachMap[head1]; + bool r01 = Reach0.count(tail1) != 0; + bool r10 = Reach1.count(tail0) != 0; + return r01 != r10; +} + +bool HRB::tryFuse(Lineage &a, Lineage &b, std::vector &SUnits) { + + // Can Fuse if a.head reach b.tail but b.head not reach a.tail and vice versa. + SUnit *head0 = a.getHead(); + SUnit *tail0 = a.getTail(); + SUnit *head1 = b.getHead(); + SUnit *tail1 = b.getTail(); + DenseSet &Reach0 = ReachMap[head0]; + DenseSet &Reach1 = ReachMap[head1]; + bool r01 = Reach0.count(tail1) != 0; + bool r10 = Reach1.count(tail0) != 0; + if (r01 == r10) + return false; + Lineage *newHead = &a; + Lineage *newTail = &b; + if (r01) { + // a reach b, b cannot reach a. + // link a.tail->b.head. + newHead = &a; + newTail = &b; + } else { + // b reach a, a cannot reach b. + // link b.tail->a.head. + newHead = &b; + newTail = &a; + } + + // Merge reg class. + const TargetRegisterClass *RC0 = newHead->RC; + const TargetRegisterClass *RC1 = newTail->RC; + unsigned RC0Size = getVGPRSize(RC0, SIRI); + unsigned RC1Size = getVGPRSize(RC1, SIRI); + if (RC1Size > RC0Size) + newHead->RC = RC1; + // Merge chain. + SUnit *fuseTail = newHead->getTail(); + SUnit *fuseHead = newTail->getHead(); + assert(ReachMap[fuseHead].count(fuseTail) == 0 && ""); + fuseHead->addPred(SDep(fuseTail, SDep::Artificial)); + LLVM_DEBUG(dbgs() << "fuse " << fuseTail->NodeNum << "->" << fuseHead->NodeNum + << "\n"); + // Update reach map. + updateReachForEdge(fuseTail, fuseHead, SUnits); + // Merge Nodes. + newHead->Nodes.append(newTail->Nodes.begin(), newTail->Nodes.end()); + // Clear newTail. 
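+  // The fused chain now lives entirely in newHead; empty lineages are dropped
+  // later in fusionLineages().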
+ newTail->Nodes.clear(); + newTail->RC = nullptr; + return true; +} + +void HRB::fusionLineages(std::vector &SUnits) { + if (Lineages.empty()) + return; + bool bUpdated = true; + while (bUpdated) { + bUpdated = false; + int size = Lineages.size(); + for (int i = 0; i < size; i++) { + Lineage &a = Lineages[i]; + if (a.length() == 0) + continue; + + for (int j = i + 1; j < size; j++) { + Lineage &b = Lineages[j]; + if (b.length() == 0) + continue; + if (tryFuse(a, b, SUnits)) { + bUpdated = true; + if (a.length() == 0) + break; + } + } + } + // Remove empty lineages. + std::sort(Lineages.begin(), Lineages.end(), + [](const Lineage &a, const Lineage &b) { + return a.length() > b.length(); + }); + while (Lineages.back().length() == 0) { + Lineages.pop_back(); + } + } + // Set ID after fusion. + unsigned ID = 0; + for (Lineage &L : Lineages) { + L.ID = ID++; + } +} + +unsigned HRB::colorLineages(std::vector &lineages, + DenseMap &AllocMap, + const unsigned Limit) { + // allocate long Lineage first. How about size of RC? + std::sort(lineages.begin(), lineages.end(), + [](const Lineage *a, const Lineage *b) { + // Make sure root allocate first. + return a->length() > b->length(); + }); + + unsigned maxColor = 0; + const unsigned VGPR_LIMIT = 256 * 4; + + for (Lineage *L : lineages) { + unsigned ID = L->ID; + auto &Conflict = Color.Conflicts[ID]; + std::bitset colors; + for (unsigned j : Conflict) { + Lineage *C = &Lineages[j]; + if (AllocMap.count(C) == 0) + continue; + unsigned c = AllocMap[C]; + unsigned s = C->getSize(); + for (unsigned i = 0; i < s; i++) { + unsigned pos = c + i; + colors.set(pos); + } + } + + unsigned color = Limit; + unsigned size = L->getSize(); + for (unsigned i = 0; i < Limit - size;) { + unsigned oldI = i; + for (unsigned j = 0; j < size; j++) { + unsigned pos = i + size - 1 - j; + if (colors.test(pos)) { + i = pos + 1; + break; + } + } + + if (i != oldI) + continue; + color = i; + break; + } + + AllocMap[L] = color; + color += size; + if (color > maxColor) + maxColor = color; + } + return maxColor; +} + +void HRB::ColorResult::colorSU(SUnit *SU, unsigned color) { + ColorMap[SU] = color; +} + +unsigned HRB::ColorResult::getLineage(SUnit *SU) const { + return LineageMap.find(SU)->second; +} + +bool HRB::ColorResult::isConflict(const SUnit *SU0, unsigned Lineage) const { + const unsigned L = LineageMap.find(SU0)->second; + const auto &Conflict = Conflicts.find(L)->second; + return Conflict.count(Lineage) > 0; +} + +bool HRB::ColorResult::isHead(SUnit *SU) const { return HeadSet.count(SU); } +bool HRB::ColorResult::isTail(SUnit *SU) const { return TailSet.count(SU); } + +const SUnit *HRB::ColorResult::getTail(SUnit *SU) const { + if (!isHead(SU)) + return nullptr; + auto it = HeadTailMap.find(SU); + return it->second; +} + +unsigned HRB::ColorResult::getColor(const llvm::SUnit *SU) const { + auto it = ColorMap.find(SU); + return it->second; +} + +unsigned HRB::ColorResult::getSize(const llvm::SUnit *SU) const { + auto it = SizeMap.find(SU); + return it->second; +} + +HRB::ColorResult &HRB::coloring() { + // Collect VGPR lineages. + std::vector vgprLineages; + for (Lineage &L : Lineages) { + auto RC = L.RC; + if (!RC) + continue; + if (SIRI->isSGPRClass(RC)) + continue; + vgprLineages.emplace_back(&L); + } + + const unsigned VGPR_LIMIT = 256 * 4; + DenseMap VAllocMap; + const unsigned maxVGPR = colorLineages(vgprLineages, VAllocMap, VGPR_LIMIT); + + // Collect SGPR lineages. 
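+  // SGPR lineages get their own color range; the final color is offset by
+  // sgprBase (maxVGPR + 1) below so SGPR and VGPR colors never collide.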
+ std::vector sgprLineages; + for (Lineage &L : Lineages) { + auto RC = L.RC; + if (!RC) + continue; + if (!SIRI->isSGPRClass(RC)) + continue; + sgprLineages.emplace_back(&L); + } + + const unsigned SGPR_LIMIT = 104; + DenseMap SAllocMap; + const unsigned maxSGPR = colorLineages(sgprLineages, SAllocMap, SGPR_LIMIT); + // +1 for each type of lineages(SGPR, VGPR, no reg). + const unsigned maxReg = maxSGPR + 1 + maxVGPR + 1 + 1; + const unsigned sgprBase = maxVGPR + 1; + + for (Lineage &L : Lineages) { + // Collect HeadSet. + Color.HeadSet.insert(L.getHead()); + Color.TailSet.insert(L.getTail()); + Color.HeadTailMap[L.getHead()] = L.getTail(); + // Save color. + auto RC = L.RC; + // All no reg lineage goes to maxReg. + unsigned color = maxReg; + if (!RC) { + } else if (SIRI->isSGPRClass(RC)) { + color = SAllocMap[&L] + sgprBase; + } else { + color = VAllocMap[&L]; + } + unsigned size = L.getSize(); + for (SUnit *SU : L.Nodes) { + Color.colorSU(SU, color); + Color.SizeMap[SU] = size; + Color.LineageMap[SU] = L.ID; + } + } + Color.maxReg = maxReg; + Color.maxSGPR = maxSGPR; + Color.maxVGPR = maxVGPR; + + for (unsigned i = 0; i < Lineages.size(); i++) { + Lineage &a = Lineages[i]; + SUnit *headA = a.getHead(); + unsigned colorA = Color.getColor(headA); + unsigned sizeA = Color.getSize(headA); + for (unsigned j = i + 1; j < Lineages.size(); j++) { + Lineage &b = Lineages[j]; + + SUnit *headB = b.getHead(); + unsigned colorB = Color.getColor(headB); + unsigned sizeB = Color.getSize(headB); + + if (colorB >= (colorA + sizeA)) + continue; + if (colorA >= (colorB + sizeB)) + continue; + Color.ShareColorLineages.insert(i); + Color.ShareColorLineages.insert(j); + } + } + + return Color; +} + +void HRB::dump() { + for (int i = 0; i < Lineages.size(); i++) { + dbgs() << "Lineage" << i << ":"; + Lineage &L = Lineages[i]; + for (SUnit *SU : L.Nodes) { + dbgs() << " " << SU->NodeNum; + } + dbgs() << "\n"; + if (!Color.ColorMap.empty()) { + dbgs() << "color:" << Color.getColor(L.getHead()) + << " size: " << Color.getSize(L.getHead()) << "\n"; + } + if (!ReachMap.empty()) { + dbgs() << "conflict:"; + for (int j = 0; j < Lineages.size(); j++) { + if (i == j) + continue; + if (isConflict(L, Lineages[j])) { + dbgs() << " " << j; + } + } + dbgs() << "\n"; + } + } +} + +void HRB::dumpReachMap() { + if (!ReachMap.empty()) { + dbgs() << "reachMap:"; + for (auto it : ReachMap) { + SUnit *SU = it.first; + auto &Reach = it.second; + if (SU->isInstr()) { + MachineInstr *MI = SU->getInstr(); + MI->print(dbgs()); + } + dbgs() << SU->NodeNum << "can reach :\n"; + for (SUnit *R : Reach) { + dbgs() << R->NodeNum << " "; + } + dbgs() << "\n"; + } + dbgs() << "\n"; + } +} + +// schedule base on HRB lineages and color result. + +std::vector hrbSched(std::vector &SUnits, + std::vector &BRoots, + const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI) { + HRB hrb(MRI, SIRI); + // build reach info to avoid dead loop when build linear. + hrb.buildReachRelation(BRoots); + hrb.buildLinear(SUnits); + + std::sort(BRoots.begin(), BRoots.end(), [](const SUnit *a, const SUnit *b) { + return a->NumSuccsLeft < b->NumSuccsLeft; + }); + while (!BRoots.empty() && BRoots.back()->NumSuccsLeft > 0) { + BRoots.pop_back(); + } + + hrb.buildReachRelation(BRoots); + hrb.fusionLineages(SUnits); + hrb.buildConflict(); + const HRB::ColorResult &Color = hrb.coloring(); + + LLVM_DEBUG(hrb.dump()); + + // All lineage head which don't has Pred is TopRoots. + // Put top roots in worklist. + // while worklist not empty. 
+ // if not head or color avail + // is candidate. + // choose best candidate by height. + // update worklist. + std::vector ReadyList; + for (SUnit &SU : SUnits) { + if (SU.NumPredsLeft == 0) + ReadyList.emplace_back(&SU); //.insert(&SU); + } + // When there're more than one sub exp in the DAG, make sure not mix different + // sub exp or it will dead loop for color goes different subexp. + + std::bitset<512 * 2> colors; + auto isColorAvail = [&colors](unsigned color, unsigned size) -> bool { + for (unsigned i = 0; i < size; i++) { + unsigned pos = color + i; + if (colors.test(pos)) + return false; + } + return true; + }; + auto allocColor = [&colors](unsigned color, unsigned size) { + for (unsigned i = 0; i < size; i++) { + unsigned pos = color + i; + assert(!colors.test(pos) && "color already allocated"); + LLVM_DEBUG(dbgs() << pos << "is allocated\n"); + colors.set(pos); + } + }; + + auto freeColor = [&colors](unsigned color, unsigned size) { + for (unsigned i = 0; i < size; i++) { + unsigned pos = color + i; + assert(colors.test(pos) && "color has not been allocated"); + LLVM_DEBUG(dbgs() << pos << "is free\n"); + colors.reset(pos); + } + }; + + // Save color and size for tail to support case two lineage share tail. + // When finish a tail, free color for working lineage which end with tail. + DenseMap, 2>> + TailMap; + + // For lineages share same color, need to choose correct order. + // If l0 has color 0, l1 has color 1, l2 has color 0, l3 has color 1. + // l0 and l3 conflict, l1 and l2 conflict. + // l0 and l3 must sched together. + // If sched l0 and l1, it may dead lock for l0 wait something in l3 and l1 + // wait something in l2. + // ShareColorLineages will mark lineages which share color with other + // lineages. When sched, choose new lineages which has more conflict with + // ShareColorLineages. + const DenseSet &ShareColorLineages = Color.ShareColorLineages; + + std::vector Schedule; + DenseSet UnfinishedLineages; + while (!ReadyList.empty()) { + // Make sure node conflict with predLineage first. + std::sort(ReadyList.begin(), ReadyList.end(), + [&UnfinishedLineages, &Color](const SUnit *a, const SUnit *b) { + unsigned confA = 0; + for (unsigned L : UnfinishedLineages) { + if (Color.isConflict(a, L)) + confA++; + } + unsigned confB = 0; + for (unsigned L : UnfinishedLineages) { + if (Color.isConflict(b, L)) + confB++; + } + return confA > confB; + }); + + LLVM_DEBUG(dbgs() << "ReadyList:\n"; for (SUnit *SU + : ReadyList) { + dbgs() << " " << SU->NodeNum; + } dbgs() << "\n";); + SUnit *Candidate = nullptr; + for (auto it = ReadyList.begin(); it != ReadyList.end(); it++) { + SUnit *SU = *it; + unsigned color = Color.getColor(SU); + unsigned size = Color.getSize(SU); + // If SU is not head or color is available, SU is the candidate. + if (Color.isHead(SU)) { + if (!isColorAvail(color, size)) + continue; + // alloc color. + allocColor(color, size); + // save tail color. + const SUnit *Tail = Color.getTail(SU); + unsigned ID = Color.getLineage(SU); + SmallVector, 2> &tailColors = + TailMap[Tail]; + tailColors.emplace_back(std::make_tuple(color, size, ID)); + if (ShareColorLineages.count(ID)) + UnfinishedLineages.insert(ID); + } + + // free color for working lineage which end with SU. 
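+      // A tail may end more than one lineage, so every (color, size, ID)
+      // tuple recorded for it in TailMap is released here.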
+ if (Color.isTail(SU)) { + auto &tailColors = TailMap[SU]; + for (auto &tailTuple : tailColors) { + unsigned lineageColor, lineageSize, ID; + std::tie(lineageColor, lineageSize, ID) = tailTuple; + freeColor(lineageColor, lineageSize); + if (ShareColorLineages.count(ID)) + UnfinishedLineages.insert(ID); + } + // Clear the tail. + TailMap.erase(SU); + } + + Candidate = SU; + // Remove Candidate from ReadyList. + ReadyList.erase(it); + break; + } + + if (!Candidate) { + // In case failed to find candidate, start a lineage if there is one. + for (auto it = ReadyList.begin(); it != ReadyList.end(); it++) { + SUnit *SU = *it; + + if (!Color.isHead(SU)) { + continue; + } + Candidate = SU; + // Remove Candidate from ReadyList. + ReadyList.erase(it); + break; + } + } + assert(Candidate && "fail to find a Candidate"); + LLVM_DEBUG(dbgs() << "Sched " << Candidate->NodeNum << "\n"); + + // Add all Candidate succ which is Ready. + for (SDep &Dep : Candidate->Succs) { + if (Dep.isWeak()) + continue; + SUnit *Succ = Dep.getSUnit(); + + if (Succ->NumPredsLeft > 0) + Succ->NumPredsLeft--; + LLVM_DEBUG(dbgs() << "Succ " << Succ->NodeNum << " has " + << Succ->NumPredsLeft << " preds\n"); + if (Succ->NumPredsLeft == 0) + ReadyList.emplace_back(Succ); + } + + // Sched Candidate. + assert(Candidate->isInstr() && "Candidate must be instr Node"); + Schedule.emplace_back(Candidate); + } + assert(Schedule.size() == SUnits.size() && "SUnit size should match"); + return Schedule; +} + +} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h new file mode 100644 index 0000000000000..c234f32370793 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h @@ -0,0 +1,197 @@ +#pragma once + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/MC/LaneBitmask.h" + +#include "llvm/CodeGen/ScheduleDAG.h" // For SUnit. + +namespace llvm { +class MachineFunction; +class LiveIntervals; +class MachineRegisterInfo; +class SIRegisterInfo; +class SIInstrInfo; +class MachineInstr; +class MachineBasicBlock; +template +class GraphWriter; +class SUnit; +class IntEqClasses; +class Twine; + +using LiveSet = llvm::DenseMap; + +// SubExp and BlockExpDag. +struct SubExp { + // Keep original order for sunits. + std::vector SUnits; + llvm::DenseSet TopRegs; + llvm::DenseSet BottomRoots; + llvm::DenseSet BottomRegs; + bool bMultiDefOutput = false; + bool bHasTerminatorInst = false; + bool bUseIncomingReg = false; + bool bMoveIntoLoop = false; + bool bNotSafeToCopy = false; + bool bHasMemInst = false; + bool bHoist = false; + // If temp/out reg is used by inst not in the subExp, cannot move since not + // all users will be move. But OK to clone. 
+ bool bCloneOnly = false; + bool bTouchSCC = false; + llvm::MachineBasicBlock *FromBB; + llvm::MachineBasicBlock *ToBB; + unsigned sInputSize; + unsigned vInputSize; + unsigned sOutputSize; + unsigned vOutputSize; + unsigned sMaxSize; + unsigned vMaxSize; + LiveSet inputLive; + LiveSet outputLive; + bool isSafeToMove(const llvm::MachineRegisterInfo &MRI, bool bMoveUp) const; + void calcMaxPressure(const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI); + void dump(const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI) const; + bool modifiesRegister(unsigned Reg, const llvm::SIRegisterInfo* SIRI) const; +}; + +struct ExpDag { + ExpDag(const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI, + const llvm::SIInstrInfo *SIII, + const bool bJoinInput); + const llvm::MachineRegisterInfo &MRI; + const llvm::SIRegisterInfo *SIRI; + const llvm::SIInstrInfo *SIII; + const bool bJoinInputToSubExp; + + std::vector SUnits; ///< The scheduling units. + llvm::DenseMap MISUnitMap; + llvm::DenseMap SUnitMIMap; + llvm::DenseMap InputSUnitMap; + llvm::DenseMap SUnitInputMap; + std::vector SubExps; + template + void build(const LiveSet &InputLiveReg, const LiveSet &OutputLiveReg, + T &insts); + void dump(); + void viewGraph(const llvm::Twine &Name, const llvm::Twine &Title) const; + /// Returns a label for an SUnit node in a visualization of the ScheduleDAG. + std::string getGraphNodeLabel(const llvm::SUnit *SU) const; + std::string getDAGName() const; + /// Adds custom features for a visualization of the ScheduleDAG. + void addCustomGraphFeatures(llvm::GraphWriter &) const {} +private: + template + void initNodes(const LiveSet &InputLiveReg, T &insts); + void addDataDep(const llvm::SIRegisterInfo *SIRI); + void addCtrlDep(); + void buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, + const llvm::SIRegisterInfo *SIRI, const llvm::SIInstrInfo *SIII); +}; + +struct BlockExpDag : public ExpDag { + BlockExpDag(llvm::MachineBasicBlock *B, llvm::LiveIntervals *LIS, + const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI, const llvm::SIInstrInfo *SIII); + llvm::LiveIntervals *LIS; + llvm::MachineBasicBlock *MBB; + llvm::DenseMap DagPressureMap; + std::vector> SUnitsInSameDepth; + std::vector SubExps; + void build(); + void buildWithPressure(); +private: + void buildAvail(const LiveSet &passThruSet, + llvm::DenseMap &DagAvailRegMap); + void buildPressure(const LiveSet &StartLiveReg, + const LiveSet &EndLiveReg); +}; + +void getRegBound(llvm::MachineBasicBlock *MBB, + const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI, + const llvm::SIInstrInfo *SIII, llvm::LiveIntervals *LIS, + unsigned &MaxVGPR, unsigned &MaxSGRP); + +// Currently mix sgpr and vgpr when build lineage to avoid cycle. +// This maybe waste registers. +// Based on "Minimum Register Instruction Sequencing to Reduce Register Spills +// in Out-of-Order Issue Superscalar Architectures". 
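+// Rough flow (see hrbSched below): buildReachRelation -> buildLinear ->
+// fusionLineages -> buildConflict -> coloring, then nodes are list-scheduled
+// while lineage colors act as register slots that must be free before a new
+// lineage may start.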
+class HRB { +public: + struct Lineage { + unsigned ID = 0; + const llvm::TargetRegisterClass *RC = nullptr; + llvm::SmallVector Nodes; + llvm::SUnit *getHead() const; + llvm::SUnit *getTail() const; + void addNode(llvm::SUnit *); + unsigned getSize() const; + unsigned length() const; + }; + struct ColorResult { + llvm::DenseMap ColorMap; + llvm::DenseMap SizeMap; + llvm::DenseMap LineageMap; + llvm::DenseMap> Conflicts; + llvm::DenseSet ShareColorLineages; + llvm::DenseSet HeadSet; + llvm::DenseSet TailSet; + llvm::DenseMap HeadTailMap; + unsigned maxReg = 0; + unsigned maxVGPR = 0; + unsigned maxSGPR = 0; + void colorSU(llvm::SUnit *SU, unsigned color); + unsigned getLineage(llvm::SUnit *SU) const; + bool isConflict(const llvm::SUnit *SU0, unsigned Lineage) const; + bool isHead(llvm::SUnit *SU) const; + bool isTail(llvm::SUnit *SU) const; + const llvm::SUnit *getTail(llvm::SUnit *SU) const; + unsigned getColor(const llvm::SUnit *SU) const; + unsigned getSize(const llvm::SUnit *SU) const; + }; + HRB(const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI) + : MRI(MRI), SIRI(SIRI) {} + + void buildLinear(std::vector &SUnits); + void buildConflict(); + void buildReachRelation(llvm::ArrayRef BotRoots); + llvm::DenseMap> &getReachMap() { + return ReachMap; + } + bool canReach(llvm::SUnit *a, llvm::SUnit *b); + void updateReachForEdge(llvm::SUnit *a, llvm::SUnit *b, + std::vector &SUnits); + void fusionLineages(std::vector &SUnits); + ColorResult &coloring(); + void dump(); + void dumpReachMap(); + +private: + Lineage buildChain(llvm::SUnit *Node, std::vector &SUnits); + llvm::SUnit *findHeir(llvm::SUnit *SU, std::vector &SUnits); + bool isConflict(const Lineage &a, const Lineage &b); + bool canFuse(const Lineage &a, const Lineage &b); + bool tryFuse(Lineage &a, Lineage &b, std::vector &SUnits); + unsigned colorLineages(std::vector &lineages, + llvm::DenseMap &AllocMap, + const unsigned Limit); + + llvm::DenseSet ChainedNodes; + llvm::DenseMap> ReachMap; + bool bRecomputeHeight = false; + std::vector Lineages; + ColorResult Color; + const llvm::MachineRegisterInfo &MRI; + const llvm::SIRegisterInfo *SIRI; +}; + +std::vector hrbSched(std::vector &SUnits, + std::vector &BRoots, + const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI); + +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index d0454cce15756..564c92239acdf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -517,6 +517,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUAtomicOptimizerPass(*PR); initializeAMDGPULowerKernelArgumentsPass(*PR); initializeAMDGPUPromoteKernelArgumentsPass(*PR); + initializeAMDGPUHotBlockRematerializePass(*PR); initializeAMDGPULowerKernelAttributesPass(*PR); initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(*PR); initializeAMDGPUPostLegalizerCombinerPass(*PR); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h new file mode 100644 index 0000000000000..c9172bae2cb4a --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h @@ -0,0 +1,106 @@ +//===-- AMDGPUVMemDegreeDAG.h - Build degree about VMem on DAG --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Build a degree (depth/height) for VMem instructions to help balance
+/// latency and pressure inside a block.
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+#include <vector>
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/ScheduleDAG.h" // For SUnit.
+
+namespace llvm {
+class MachineBasicBlock;
+class SUnit;
+class SIInstrInfo;
+class MachineInstr;
+
+class SimpleDAG {
+public:
+  SimpleDAG(llvm::MachineBasicBlock &MBB, const llvm::SIInstrInfo *TII)
+      : SIII(TII), MBB(MBB) {}
+  std::vector<llvm::SUnit> SUnits;
+  // InstrInfo.
+  const llvm::SIInstrInfo *SIII;
+  llvm::DenseMap<llvm::MachineInstr *, llvm::SUnit *> MISUnitMap;
+  llvm::DenseMap<llvm::SUnit *, llvm::MachineInstr *> SUnitMIMap;
+  llvm::MachineBasicBlock &MBB;
+  void build();
+
+private:
+  void initNodes();
+  void addDependence();
+  void addCtrlDep();
+};
+
+
+// Collect a height/depth for each node with respect to high-latency memory
+// loads: the height/depth only changes when a high-latency load is crossed.
+// We call this height/depth the VMem degree. The rule is that a sample and
+// its user should get different degrees.
+// For example
+//  a = sample   // a has depth 0, height 3
+//  b = sample a // b has depth 1, height 2
+//  c = sample b // c has depth 2, height 1
+//  user of c    // user of c has depth 2, height 0
+//
+// For in-block reorder/remat, nothing is moved or cloned across the block
+// boundary. So this could run after cross-block remat, or in the middle of
+// cross-block remat to help reach the target when moving values across blocks
+// alone cannot reach it. Reordering at the very beginning is also an option,
+// but no pressure information exists at that point; once pressure is known,
+// the max pressure might need to be updated.
+
+class VMemDegreeDAG {
+public:
+  VMemDegreeDAG(std::vector<llvm::SUnit> &Units,
+                const llvm::SIInstrInfo *TII)
+      : SUnits(Units), SIII(TII) {}
+  std::vector<llvm::SUnit> &SUnits;
+  // InstrInfo.
+  const llvm::SIInstrInfo *SIII;
+  void build();
+
+
+  bool isHighLatency(const llvm::SUnit *SU) const;
+  bool isHighLatency(const llvm::MachineInstr *MI) const;
+  // Height/depth based on long-latency instructions.
+  std::vector<unsigned> VMemDataHeight;
+  std::vector<unsigned> VMemDataDepth;
+  // Full height/depth count non-data dependences too.
+  std::vector<unsigned> VMemFullHeight;
+  std::vector<unsigned> VMemFullDepth;
+  llvm::SmallVector<llvm::SUnit *> VMemSUs;
+  llvm::SmallVector<llvm::SmallVector<llvm::SUnit *>, 16> GroupedVMemSUs;
+  llvm::SmallVector<llvm::SmallVector<llvm::SUnit *>, 16> GroupedVMemSUsByDepth;
+
+
+  void dump();
+
+private:
+  static constexpr unsigned kNoReg = -1;
+
+
+  std::pair<unsigned, unsigned>
+  buildVMemDepthHeight(std::vector<unsigned> &VMemHeight,
+                       std::vector<unsigned> &VMemDepth, bool bDataOnly);
+  // Compute vmem height/depth.
+  void buildVMemDepthHeight();
+  void buildVMemDataDepthHeight();
+  void groupVmemSUnits();
+
+};
+
+
+
+// Split block based on vmem depth.
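One way to realize the height/depth rule documented in the VMem degree comment above is the recurrence below: a node's depth grows by one exactly when the node itself is a high-latency def, and height mirrors that from the bottom of the block. This is a self-contained sketch with a hypothetical ToyInst type that reproduces the a/b/c/user example; it is not the pass's buildVMemDepthHeight implementation.

#include <algorithm>
#include <cstdio>
#include <vector>

// Hypothetical stand-in for an instruction node; illustration only.
struct ToyInst {
  bool IsHighLatency = false;
  std::vector<unsigned> Preds; // indexes of defining instructions
};

// Insts must be in program (topological) order: defs come before uses.
static void computeVMemDegree(const std::vector<ToyInst> &Insts,
                              std::vector<unsigned> &Depth,
                              std::vector<unsigned> &Height) {
  unsigned N = Insts.size();
  Depth.assign(N, 0);
  Height.assign(N, 0);
  // Depth: high-latency defs on the longest path ending at the node,
  // counting the node itself when it is high latency.
  for (unsigned I = 0; I < N; ++I)
    for (unsigned P : Insts[I].Preds)
      Depth[I] = std::max(Depth[I], Depth[P] + (Insts[I].IsHighLatency ? 1 : 0));
  // Height: the same idea measured upwards from the bottom of the block.
  for (unsigned I = N; I-- > 0;)
    for (unsigned P : Insts[I].Preds)
      Height[P] = std::max(Height[P], Height[I] + (Insts[P].IsHighLatency ? 1 : 0));
}

int main() {
  // a = sample; b = sample a; c = sample b; u = user of c (not high latency).
  std::vector<ToyInst> Insts(4);
  Insts[0].IsHighLatency = true;
  Insts[1] = {true, {0}};
  Insts[2] = {true, {1}};
  Insts[3] = {false, {2}};
  std::vector<unsigned> Depth, Height;
  computeVMemDegree(Insts, Depth, Height);
  // Prints depth 0,1,2,2 and height 3,2,1,0, matching the comment above.
  for (unsigned I = 0; I < Insts.size(); ++I)
    std::printf("depth %u height %u\n", Depth[I], Height[I]);
  return 0;
}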
+void buildVMemDepth(llvm::MachineBasicBlock &MBB, llvm::VMemDegreeDAG &dag); + +} + diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 09a3096602fc3..f089b210c8849 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -59,6 +59,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUFrameLowering.cpp AMDGPUGlobalISelDivergenceLowering.cpp AMDGPUGlobalISelUtils.cpp + AMDGPUHotBlockRematerialize.cpp AMDGPUHSAMetadataStreamer.cpp AMDGPUInsertDelayAlu.cpp AMDGPUInstCombineIntrinsic.cpp @@ -81,10 +82,14 @@ add_llvm_target(AMDGPUCodeGen AMDGPUMacroFusion.cpp AMDGPUMCInstLower.cpp AMDGPUMemoryUtils.cpp + AMDGPUMIRUtils.cpp + AMDGPUMirDivergenceAnalysis.cpp + AMDGPUMirSyncDependenceAnalysis.cpp AMDGPUIGroupLP.cpp AMDGPUMCResourceInfo.cpp AMDGPUMarkLastScratchLoad.cpp AMDGPUMIRFormatter.cpp + AMDGPUOccupancyAndLatencyHelper.cpp AMDGPUPerfHintAnalysis.cpp AMDGPUPostLegalizerCombiner.cpp AMDGPUPreLegalizerCombiner.cpp @@ -106,6 +111,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUSelectionDAGInfo.cpp AMDGPUSetWavePriority.cpp AMDGPUSplitModule.cpp + AMDGPUSubExpDag.cpp AMDGPUSubtarget.cpp AMDGPUTargetMachine.cpp AMDGPUTargetObjectFile.cpp diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 7554b9f578fcb..aa4b3f948b726 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -47,6 +47,10 @@ struct GCNRegPressure { void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); } + unsigned getMaxSGPR() const { + return std::max(getSGPRNum(), getSGPRTuplesWeight()); + } + /// \returns the SGPR32 pressure unsigned getSGPRNum() const { return Value[SGPR32]; } /// \returns the aggregated ArchVGPR32, AccVGPR32 pressure dependent upon \p diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 79ef1432d512a..3c467c098a65e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1332,6 +1332,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { bool isLowLatencyInstruction(const MachineInstr &MI) const; bool isHighLatencyDef(int Opc) const override; + bool isHighLatencyInstruction(const MachineInstr& MI) const { + return isHighLatencyDef(MI.getOpcode()); + } /// Return the descriptor of the target-specific machine instruction /// that corresponds to the specified pseudo or native opcode. 
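Before the MIR tests, one more sketch ties the scheduler and the coloring together: in the hrbSched loop earlier in this patch, a lineage's color is returned to the pool once its tail is scheduled (the freeColor/TailMap handling), which amounts to reusing a bounded set of colors over lineage live ranges. The toy below uses hypothetical names and the simplifying assumption that lineages are given as [head, tail] positions in the final schedule order, sorted by head; it illustrates that reuse and is not the actual HRB::coloring heuristic.

#include <cassert>
#include <cstdio>
#include <functional>
#include <queue>
#include <utility>
#include <vector>

static std::vector<unsigned>
colorToyLineages(const std::vector<std::pair<unsigned, unsigned>> &Lineages,
                 unsigned Limit) {
  std::vector<unsigned> Color(Lineages.size(), ~0u);
  // Min-heap of (tail position, color) for lineages whose color is in use.
  std::priority_queue<std::pair<unsigned, unsigned>,
                      std::vector<std::pair<unsigned, unsigned>>,
                      std::greater<>>
      Active;
  std::vector<unsigned> FreeColors;
  for (unsigned C = 0; C < Limit; ++C)
    FreeColors.push_back(C);

  // Lineages are assumed to be sorted by head position.
  for (unsigned I = 0; I < Lineages.size(); ++I) {
    unsigned Head = Lineages[I].first;
    unsigned Tail = Lineages[I].second;
    // A lineage whose tail is already behind us gives its color back.
    while (!Active.empty() && Active.top().first < Head) {
      FreeColors.push_back(Active.top().second);
      Active.pop();
    }
    assert(!FreeColors.empty() && "out of colors; a real allocator would spill");
    Color[I] = FreeColors.back();
    FreeColors.pop_back();
    Active.push({Tail, Color[I]});
  }
  return Color;
}

int main() {
  // Three lineages over a six-slot schedule; with Limit = 2 the color of the
  // first lineage is reused after its tail at position 2 has been passed.
  std::vector<std::pair<unsigned, unsigned>> L = {{0, 2}, {1, 5}, {3, 4}};
  for (unsigned C : colorToyLineages(L, 2))
    std::printf("color %u\n", C);
  return 0;
}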
diff --git a/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir b/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir new file mode 100644 index 0000000000000..e8a66b47ac732 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir @@ -0,0 +1,405 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-sub-exp-remat-aggressive -amdgpu-remat-enable-late-float-vtos -amdgpu-remat-enable-hot-block-remat-aggressive -amdgpu-remat-enable-sub-exp-remat-aggressive -amdgpu-remat-enable-sub-exp-remat | FileCheck %s + +# DEFS +# CHECK: %[[#div00:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni00:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div00]], implicit $exec +# CHECK: %[[#div01:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni01:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div01]], implicit $exec +# CHECK: %[[#div02:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni02:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div02]], implicit $exec +# CHECK: %[[#div03:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni03:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div03]], implicit $exec +# CHECK: %[[#div04:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni04:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div04]], implicit $exec +# CHECK: %[[#div05:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni05:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div05]], implicit $exec +# CHECK: %[[#div06:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni06:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div06]], implicit $exec +# CHECK: %[[#div07:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni07:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div07]], implicit $exec +# CHECK: %[[#div08:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni08:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div08]], implicit $exec +# CHECK: %[[#div09:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni09:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div09]], implicit $exec +# CHECK: %[[#div10:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni10:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div10]], implicit $exec +# CHECK: %[[#div11:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni11:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div11]], implicit $exec +# CHECK: %[[#div12:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni12:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div12]], implicit $exec +# CHECK: %[[#div13:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni13:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div13]], implicit $exec +# CHECK: %[[#div14:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni14:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div14]], implicit $exec +# CHECK: %[[#div15:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni15:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div15]], implicit $exec +# CHECK: %[[#div16:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni16:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div16]], implicit $exec +# CHECK: %[[#div17:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni17:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div17]], implicit $exec +# CHECK: %[[#div18:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni18:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div18]], implicit $exec +# CHECK: %[[#div19:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni19:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div19]], implicit $exec +# 
CHECK: %[[#div20:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni20:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div20]], implicit $exec +# CHECK: %[[#div21:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni21:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div21]], implicit $exec +# CHECK: %[[#div22:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni22:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div22]], implicit $exec +# CHECK: %[[#div23:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni23:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div23]], implicit $exec +# CHECK: %[[#div24:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni24:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div24]], implicit $exec +# CHECK: %[[#div25:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni25:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div25]], implicit $exec +# CHECK: %[[#div26:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni26:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div26]], implicit $exec +# CHECK: %[[#div27:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni27:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div27]], implicit $exec +# CHECK: %[[#div28:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni28:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div28]], implicit $exec +# CHECK: %[[#div29:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni29:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div29]], implicit $exec +# CHECK: %[[#div30:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni30:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div30]], implicit $exec +# CHECK: %[[#div31:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni31:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div31]], implicit $exec +# CHECK: %[[#div32:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni32:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div32]], implicit $exec +# CHECK: %[[#div33:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni33:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div33]], implicit $exec +# CHECK: %[[#div34:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni34:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div34]], implicit $exec +# CHECK: %[[#div35:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni35:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div35]], implicit $exec +# CHECK: %[[#div36:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni36:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div36]], implicit $exec +# CHECK: %[[#div37:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni37:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div37]], implicit $exec +# CHECK: %[[#div38:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni38:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div38]], implicit $exec +# CHECK: %[[#div39:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni39:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div39]], implicit $exec +# CHECK: %[[#div40:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni40:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div40]], implicit $exec +# CHECK: %[[#div41:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni41:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div41]], implicit $exec +# CHECK: %[[#div42:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni42:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div42]], implicit $exec +# CHECK: %[[#div43:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni43:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div43]], implicit $exec +# CHECK: %[[#div44:]]:vgpr_32 = V_MOV_B32_e32 0, 
implicit $exec +# CHECK: %[[#uni44:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div44]], implicit $exec +# CHECK: %[[#div45:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni45:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div45]], implicit $exec +# CHECK: %[[#div46:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni46:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div46]], implicit $exec +# CHECK: %[[#div47:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni47:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div47]], implicit $exec +# CHECK: %[[#div48:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni48:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div48]], implicit $exec +# CHECK: %[[#div49:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni49:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div49]], implicit $exec +# CHECK: %[[#div50:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni50:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div50]], implicit $exec +# CHECK: %[[#div51:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni51:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div51]], implicit $exec +# CHECK: %[[#div52:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni52:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div52]], implicit $exec +# CHECK: %[[#div53:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni53:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div53]], implicit $exec +# CHECK: %[[#div54:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni54:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div54]], implicit $exec +# CHECK: %[[#div55:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni55:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div55]], implicit $exec +# CHECK: %[[#div56:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni56:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div56]], implicit $exec +# CHECK: %[[#div57:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni57:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div57]], implicit $exec +# CHECK: %[[#div58:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni58:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div58]], implicit $exec +# CHECK: %[[#div59:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni59:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div59]], implicit $exec + + +# USERS: +# CHECK: %[[#div_00:]]:vgpr_32 = COPY %[[#uni00]] +#CHECK: EXP 0, %[[#div_00]], +# CHECK: %[[#div_01:]]:vgpr_32 = COPY %[[#uni01]] +#CHECK: EXP 0, %[[#div_01]], +# CHECK: %[[#div_02:]]:vgpr_32 = COPY %[[#uni02]] +#CHECK: EXP 0, %[[#div_02]], +# CHECK: %[[#div_03:]]:vgpr_32 = COPY %[[#uni03]] +#CHECK: EXP 0, %[[#div_03]], +# CHECK: %[[#div_04:]]:vgpr_32 = COPY %[[#uni04]] +#CHECK: EXP 0, %[[#div_04]], +# CHECK: %[[#div_05:]]:vgpr_32 = COPY %[[#uni05]] +#CHECK: EXP 0, %[[#div_05]], +# CHECK: %[[#div_06:]]:vgpr_32 = COPY %[[#uni06]] +#CHECK: EXP 0, %[[#div_06]], +# CHECK: %[[#div_07:]]:vgpr_32 = COPY %[[#uni07]] +#CHECK: EXP 0, %[[#div_07]], +# CHECK: %[[#div_08:]]:vgpr_32 = COPY %[[#uni08]] +#CHECK: EXP 0, %[[#div_08]], +# CHECK: %[[#div_09:]]:vgpr_32 = COPY %[[#uni09]] +#CHECK: EXP 0, %[[#div_09]], +# CHECK: %[[#div_10:]]:vgpr_32 = COPY %[[#uni10]] +#CHECK: EXP 0, %[[#div_10]], +# CHECK: %[[#div_11:]]:vgpr_32 = COPY %[[#uni11]] +#CHECK: EXP 0, %[[#div_11]], +# CHECK: %[[#div_12:]]:vgpr_32 = COPY %[[#uni12]] +#CHECK: EXP 0, %[[#div_12]], +# CHECK: %[[#div_13:]]:vgpr_32 = COPY %[[#uni13]] +#CHECK: EXP 0, %[[#div_13]], +# CHECK: %[[#div_14:]]:vgpr_32 = COPY %[[#uni14]] +#CHECK: EXP 0, %[[#div_14]], +# CHECK: %[[#div_15:]]:vgpr_32 = COPY 
%[[#uni15]] +#CHECK: EXP 0, %[[#div_15]], +# CHECK: %[[#div_16:]]:vgpr_32 = COPY %[[#uni16]] +#CHECK: EXP 0, %[[#div_16]], +# CHECK: %[[#div_17:]]:vgpr_32 = COPY %[[#uni17]] +#CHECK: EXP 0, %[[#div_17]], +# CHECK: %[[#div_18:]]:vgpr_32 = COPY %[[#uni18]] +#CHECK: EXP 0, %[[#div_18]], +# CHECK: %[[#div_19:]]:vgpr_32 = COPY %[[#uni19]] +#CHECK: EXP 0, %[[#div_19]], +# CHECK: %[[#div_20:]]:vgpr_32 = COPY %[[#uni20]] +#CHECK: EXP 0, %[[#div_20]], +# CHECK: %[[#div_21:]]:vgpr_32 = COPY %[[#uni21]] +#CHECK: EXP 0, %[[#div_21]], +# CHECK: %[[#div_22:]]:vgpr_32 = COPY %[[#uni22]] +#CHECK: EXP 0, %[[#div_22]], +# CHECK: %[[#div_23:]]:vgpr_32 = COPY %[[#uni23]] +#CHECK: EXP 0, %[[#div_23]], +# CHECK: %[[#div_24:]]:vgpr_32 = COPY %[[#uni24]] +#CHECK: EXP 0, %[[#div_24]], +# CHECK: %[[#div_25:]]:vgpr_32 = COPY %[[#uni25]] +#CHECK: EXP 0, %[[#div_25]], +# CHECK: %[[#div_26:]]:vgpr_32 = COPY %[[#uni26]] +#CHECK: EXP 0, %[[#div_26]], +# CHECK: %[[#div_27:]]:vgpr_32 = COPY %[[#uni27]] +#CHECK: EXP 0, %[[#div_27]], +# CHECK: %[[#div_28:]]:vgpr_32 = COPY %[[#uni28]] +#CHECK: EXP 0, %[[#div_28]], +# CHECK: %[[#div_29:]]:vgpr_32 = COPY %[[#uni29]] +#CHECK: EXP 0, %[[#div_29]], +# CHECK: %[[#div_30:]]:vgpr_32 = COPY %[[#uni30]] +#CHECK: EXP 0, %[[#div_30]], +# CHECK: %[[#div_31:]]:vgpr_32 = COPY %[[#uni31]] +#CHECK: EXP 0, %[[#div_31]], +# CHECK: %[[#div_32:]]:vgpr_32 = COPY %[[#uni32]] +#CHECK: EXP 0, %[[#div_32]], +# CHECK: %[[#div_33:]]:vgpr_32 = COPY %[[#uni33]] +#CHECK: EXP 0, %[[#div_33]], +# CHECK: %[[#div_34:]]:vgpr_32 = COPY %[[#uni34]] +#CHECK: EXP 0, %[[#div_34]], +# CHECK: %[[#div_35:]]:vgpr_32 = COPY %[[#uni35]] +#CHECK: EXP 0, %[[#div_35]], +# CHECK: %[[#div_36:]]:vgpr_32 = COPY %[[#uni36]] +#CHECK: EXP 0, %[[#div_36]], +# CHECK: %[[#div_37:]]:vgpr_32 = COPY %[[#uni37]] +#CHECK: EXP 0, %[[#div_37]], +# CHECK: %[[#div_38:]]:vgpr_32 = COPY %[[#uni38]] +#CHECK: EXP 0, %[[#div_38]], +# CHECK: %[[#div_39:]]:vgpr_32 = COPY %[[#uni39]] +#CHECK: EXP 0, %[[#div_39]], +# CHECK: %[[#div_40:]]:vgpr_32 = COPY %[[#uni40]] +#CHECK: EXP 0, %[[#div_40]], +# CHECK: %[[#div_41:]]:vgpr_32 = COPY %[[#uni41]] +#CHECK: EXP 0, %[[#div_41]], +# CHECK: %[[#div_42:]]:vgpr_32 = COPY %[[#uni42]] +#CHECK: EXP 0, %[[#div_42]], +# CHECK: %[[#div_43:]]:vgpr_32 = COPY %[[#uni43]] +#CHECK: EXP 0, %[[#div_43]], +# CHECK: %[[#div_44:]]:vgpr_32 = COPY %[[#uni44]] +#CHECK: EXP 0, %[[#div_44]], +# CHECK: %[[#div_45:]]:vgpr_32 = COPY %[[#uni45]] +#CHECK: EXP 0, %[[#div_45]], +# CHECK: %[[#div_46:]]:vgpr_32 = COPY %[[#uni46]] +#CHECK: EXP 0, %[[#div_46]], +# CHECK: %[[#div_47:]]:vgpr_32 = COPY %[[#uni47]] +#CHECK: EXP 0, %[[#div_47]], +# CHECK: %[[#div_48:]]:vgpr_32 = COPY %[[#uni48]] +#CHECK: EXP 0, %[[#div_48]], +# CHECK: %[[#div_49:]]:vgpr_32 = COPY %[[#uni49]] +#CHECK: EXP 0, %[[#div_49]], +# CHECK: %[[#div_50:]]:vgpr_32 = COPY %[[#uni50]] +#CHECK: EXP 0, %[[#div_50]], +# CHECK: %[[#div_51:]]:vgpr_32 = COPY %[[#uni51]] +#CHECK: EXP 0, %[[#div_51]], +# CHECK: %[[#div_52:]]:vgpr_32 = COPY %[[#uni52]] +#CHECK: EXP 0, %[[#div_52]], +# CHECK: %[[#div_53:]]:vgpr_32 = COPY %[[#uni53]] +#CHECK: EXP 0, %[[#div_53]], +# CHECK: %[[#div_54:]]:vgpr_32 = COPY %[[#uni54]] +#CHECK: EXP 0, %[[#div_54]], +# CHECK: %[[#div_55:]]:vgpr_32 = COPY %[[#uni55]] +#CHECK: EXP 0, %[[#div_55]], +# CHECK: %[[#div_56:]]:vgpr_32 = COPY %[[#uni56]] +#CHECK: EXP 0, %[[#div_56]], +# CHECK: %[[#div_57:]]:vgpr_32 = COPY %[[#uni57]] +#CHECK: EXP 0, %[[#div_57]], +# CHECK: %[[#div_58:]]:vgpr_32 = COPY %[[#uni58]] +#CHECK: EXP 0, %[[#div_58]], +# CHECK: 
%[[#div_59:]]:vgpr_32 = COPY %[[#uni59]] +#CHECK: EXP 0, %[[#div_59]], + + +--- | + source_filename = ".\main.ll" + define amdgpu_ps void @main() #1 { + ret void + } + attributes #1 = { "target-cpu"="gfx1010" } + !llvm.ident = !{!0} + !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"} +... +--- +name: main +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } + - { reg: '$sgpr1' } + - { reg: '$sgpr8' } + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: $sgpr0, $sgpr1, $sgpr8, $vgpr0, $vgpr1 + + %1000:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1001:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1002:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1003:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1004:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1005:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1006:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1007:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1008:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1009:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1010:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1011:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1012:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1013:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1014:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1015:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1016:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1017:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1018:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1019:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1020:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1021:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1022:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1023:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1024:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1025:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1026:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1027:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1028:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1029:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1030:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1031:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1032:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1033:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1034:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1035:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1036:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1037:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1038:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1039:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1040:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1041:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1042:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1043:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1044:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1045:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1046:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1047:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1048:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1049:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1050:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1051:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1052:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1053:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1054:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1055:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1056:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1057:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1058:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1059:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + 
%116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %1059, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %116:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + %99:vgpr_32 = COPY %1058 + S_BRANCH %bb.2 + + bb.2: + %1:vgpr_32 = IMPLICIT_DEF + EXP 0, killed %1000, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1001, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1002, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1003, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1004, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1005, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1006, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1007, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1008, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1009, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1010, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1011, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1012, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1013, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1014, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1015, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1016, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1017, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1018, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1019, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1020, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1021, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1022, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1023, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1024, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1025, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1026, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1027, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1028, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1029, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1030, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1031, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1032, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1033, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1034, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1035, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1036, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1037, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1038, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1039, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1040, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1041, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1042, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1043, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1044, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1045, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1046, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1047, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1048, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1049, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1050, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1051, %1, %1, %1, -1, -1, 15, 
implicit $exec + EXP 0, killed %1052, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1053, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1054, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1055, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1056, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1057, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1058, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1059, %1, %1, %1, -1, -1, 15, implicit $exec + S_ENDPGM 0 +... From 77398423b044438b3f1a1306c140908b815e244b Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Thu, 6 Feb 2025 13:52:02 -0800 Subject: [PATCH 02/25] Fixed build, and added simple tests that exercise major code paths --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 8 +- .../test/CodeGen/AMDGPU/remat/group_remat.mir | 507 ++++++++++++++ .../AMDGPU/remat/group_remat_with_uses.mir | 641 ++++++++++++++++++ .../test/CodeGen/AMDGPU/remat/simple_sgpr.mir | 450 ++++++++++++ 4 files changed, 1603 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/remat/group_remat.mir create mode 100644 llvm/test/CodeGen/AMDGPU/remat/group_remat_with_uses.mir create mode 100644 llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index 44ebaa2d51bec..8647185bf5d51 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -291,7 +291,7 @@ unsigned CollectFnPressure( MachineFunction &MF, LiveIntervals *LIS, const MachineRegisterInfo &MRI, const GCNSubtarget *ST, unsigned &maxVPressure, unsigned &maxSPressure, RematStatus &status) { - unsigned TgtOcc = ST->getOccupancyWithLocalMemSize(MF); + unsigned TgtOcc = ST->getOccupancyWithWorkGroupSizes(MF).second; // If only have one block, input/ouput virtual live set are empty. if (MF.size() > 1) { // Build input output live reg first. @@ -1351,7 +1351,7 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, bool bForceRematSgpr = bSGPRSpill | status.bNotBalance; // If bound by lds, skip. - if (status.TargetOcc > ST->getOccupancyWithLocalMemSize(MF) && + if (status.TargetOcc > ST->getOccupancyWithWorkGroupSizes(MF).second && !bForceRematSgpr) return false; @@ -1663,6 +1663,8 @@ bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI, Register OpReg = Op.getReg(); if (IsImplicitUseOfReg(Op, AMDGPU::EXEC) || IsImplicitUseOfReg(Op, AMDGPU::EXEC_LO)) continue; + if (IsImplicitUseOfReg(Op, AMDGPU::MODE)) + continue; if (IsImplicitUseOfReg(Op, AMDGPU::M0) && isPhyRegUniqueDef(OpReg, MRI)) continue; // Alow unused scc define. @@ -4454,7 +4456,7 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveInt } // If bound by lds, skip. 
- if ((status.TargetOcc + 1) > ST->getOccupancyWithLocalMemSize(MF) && + if ((status.TargetOcc + 1) > ST->getOccupancyWithWorkGroupSizes(MF).second && !bSGPRSpill) return false; diff --git a/llvm/test/CodeGen/AMDGPU/remat/group_remat.mir b/llvm/test/CodeGen/AMDGPU/remat/group_remat.mir new file mode 100644 index 0000000000000..7f3483c66a5d9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/group_remat.mir @@ -0,0 +1,507 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-sub-exp-remat | FileCheck %s + +# Check that the whole expression gets moved to uses in bb.2. +# CHECK: bb.0: +# CHECK: %[[#r500:]]:vgpr_32 = V_MOV_B32_e32 $vgpr0 +# CHECK: %[[#r501:]]:vgpr_32 = V_MOV_B32_e32 $vgpr1 +# CHECK: bb.1: +# CHECK: bb.2: +# CHECK: %[[#r502:]]:vgpr_32 = V_MUL_F32_e32 %[[#r500]], %[[#r500]] +# CHECK: %[[#r503:]]:vgpr_32 = V_MUL_F32_e32 %[[#r500]], %[[#r501]] +# CHECK: %[[#r504:]]:vgpr_32 = V_MUL_F32_e32 %[[#r501]], %[[#r501]] +# CHECK: %[[#r505:]]:vgpr_32 = V_MUL_F32_e32 %[[#r502]], %[[#r502]] +# CHECK: %[[#r506:]]:vgpr_32 = V_MUL_F32_e32 %[[#r502]], %[[#r503]] +# CHECK: %[[#r507:]]:vgpr_32 = V_MUL_F32_e32 %[[#r503]], %[[#r503]] +# CHECK: %[[#r508:]]:vgpr_32 = V_MUL_F32_e32 %[[#r503]], %[[#r504]] +# CHECK: %[[#r509:]]:vgpr_32 = V_MUL_F32_e32 %[[#r504]], %[[#r504]] +# CHECK: %[[#r5010:]]:vgpr_32 = V_MUL_F32_e32 %[[#r505]], %[[#r505]] +# CHECK: %[[#r5011:]]:vgpr_32 = V_MUL_F32_e32 %[[#r505]], %[[#r506]] +# CHECK: %[[#r5012:]]:vgpr_32 = V_MUL_F32_e32 %[[#r506]], %[[#r506]] +# CHECK: %[[#r5013:]]:vgpr_32 = V_MUL_F32_e32 %[[#r506]], %[[#r507]] +# CHECK: %[[#r5014:]]:vgpr_32 = V_MUL_F32_e32 %[[#r507]], %[[#r507]] +# CHECK: %[[#r5015:]]:vgpr_32 = V_MUL_F32_e32 %[[#r507]], %[[#r508]] +# CHECK: %[[#r5016:]]:vgpr_32 = V_MUL_F32_e32 %[[#r508]], %[[#r508]] +# CHECK: %[[#r5017:]]:vgpr_32 = V_MUL_F32_e32 %[[#r508]], %[[#r509]] +# CHECK: %[[#r5018:]]:vgpr_32 = V_MUL_F32_e32 %[[#r509]], %[[#r509]] +# CHECK: %[[#r5019:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5010]], %[[#r5010]] +# CHECK: %[[#r5020:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5010]], %[[#r5011]] +# CHECK: %[[#r5021:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5011]], %[[#r5011]] +# CHECK: %[[#r5022:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5011]], %[[#r5012]] +# CHECK: %[[#r5023:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5012]], %[[#r5012]] +# CHECK: %[[#r5024:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5012]], %[[#r5013]] +# CHECK: %[[#r5025:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5013]], %[[#r5013]] +# CHECK: %[[#r5026:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5013]], %[[#r5014]] +# CHECK: %[[#r5027:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5014]], %[[#r5014]] +# CHECK: %[[#r5028:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5014]], %[[#r5015]] +# CHECK: %[[#r5029:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5015]], %[[#r5015]] +# CHECK: %[[#r5030:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5015]], %[[#r5016]] +# CHECK: %[[#r5031:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5016]], %[[#r5016]] +# CHECK: %[[#r5032:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5016]], %[[#r5017]] +# CHECK: %[[#r5033:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5017]], %[[#r5017]] +# CHECK: %[[#r5034:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5017]], %[[#r5018]] +# CHECK: %[[#r5035:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5018]], %[[#r5018]] +# CHECK: %[[#r5036:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5019]], %[[#r5019]] +# CHECK: %[[#r5037:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5019]], %[[#r5020]] +# CHECK: %[[#r5038:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5020]], %[[#r5020]] +# CHECK: %[[#r5039:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5020]], %[[#r5021]] +# CHECK: %[[#r5040:]]:vgpr_32 = 
V_MUL_F32_e32 %[[#r5021]], %[[#r5021]] +# CHECK: %[[#r5041:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5021]], %[[#r5022]] +# CHECK: %[[#r5042:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5022]], %[[#r5022]] +# CHECK: %[[#r5043:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5022]], %[[#r5023]] +# CHECK: %[[#r5044:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5023]], %[[#r5023]] +# CHECK: %[[#r5045:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5023]], %[[#r5024]] +# CHECK: %[[#r5046:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5024]], %[[#r5024]] +# CHECK: %[[#r5047:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5024]], %[[#r5025]] +# CHECK: %[[#r5048:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5025]], %[[#r5025]] +# CHECK: %[[#r5049:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5025]], %[[#r5026]] +# CHECK: %[[#r5050:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5026]], %[[#r5026]] +# CHECK: %[[#r5051:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5026]], %[[#r5027]] +# CHECK: %[[#r5052:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5027]], %[[#r5027]] +# CHECK: %[[#r5053:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5027]], %[[#r5028]] +# CHECK: %[[#r5054:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5028]], %[[#r5028]] +# CHECK: %[[#r5055:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5028]], %[[#r5029]] +# CHECK: %[[#r5056:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5029]], %[[#r5029]] +# CHECK: %[[#r5057:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5029]], %[[#r5030]] +# CHECK: %[[#r5058:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5030]], %[[#r5030]] +# CHECK: %[[#r5059:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5030]], %[[#r5031]] +# CHECK: %[[#r5060:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5031]], %[[#r5031]] +# CHECK: %[[#r5061:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5031]], %[[#r5032]] +# CHECK: %[[#r5062:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5032]], %[[#r5032]] +# CHECK: %[[#r5063:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5032]], %[[#r5033]] +# CHECK: %[[#r5064:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5033]], %[[#r5033]] +# CHECK: %[[#r5065:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5033]], %[[#r5034]] +# CHECK: %[[#r5066:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5034]], %[[#r5034]] +# CHECK: %[[#r5067:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5034]], %[[#r5035]] +# CHECK: %[[#r5068:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5035]], %[[#r5035]] +# CHECK: %[[#r5069:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5036]], %[[#r5036]] +# CHECK: %[[#r5070:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5036]], %[[#r5037]] +# CHECK: %[[#r5071:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5037]], %[[#r5037]] +# CHECK: %[[#r5072:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5037]], %[[#r5038]] +# CHECK: %[[#r5073:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5038]], %[[#r5038]] +# CHECK: %[[#r5074:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5038]], %[[#r5039]] +# CHECK: %[[#r5075:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5039]], %[[#r5039]] +# CHECK: %[[#r5076:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5039]], %[[#r5040]] +# CHECK: %[[#r5077:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5040]], %[[#r5040]] +# CHECK: %[[#r5078:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5040]], %[[#r5041]] +# CHECK: %[[#r5079:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5041]], %[[#r5041]] +# CHECK: %[[#r5080:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5041]], %[[#r5042]] +# CHECK: %[[#r5081:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5042]], %[[#r5042]] +# CHECK: %[[#r5082:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5042]], %[[#r5043]] +# CHECK: %[[#r5083:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5043]], %[[#r5043]] +# CHECK: %[[#r5084:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5043]], %[[#r5044]] +# CHECK: %[[#r5085:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5044]], %[[#r5044]] +# CHECK: %[[#r5086:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5044]], %[[#r5045]] +# CHECK: %[[#r5087:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5045]], %[[#r5045]] +# CHECK: %[[#r5088:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5045]], %[[#r5046]] +# CHECK: %[[#r5089:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5046]], 
%[[#r5046]] +# CHECK: %[[#r5090:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5046]], %[[#r5047]] +# CHECK: %[[#r5091:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5047]], %[[#r5047]] +# CHECK: %[[#r5092:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5047]], %[[#r5048]] +# CHECK: %[[#r5093:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5048]], %[[#r5048]] +# CHECK: %[[#r5094:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5048]], %[[#r5049]] +# CHECK: %[[#r5095:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5049]], %[[#r5049]] +# CHECK: %[[#r5096:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5049]], %[[#r5050]] +# CHECK: %[[#r5097:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5050]], %[[#r5050]] +# CHECK: %[[#r5098:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5050]], %[[#r5051]] +# CHECK: %[[#r5099:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5051]], %[[#r5051]] +# CHECK: %[[#r50100:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5051]], %[[#r5052]] +# CHECK: %[[#r50101:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5052]], %[[#r5052]] +# CHECK: %[[#r50102:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5052]], %[[#r5053]] +# CHECK: %[[#r50103:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5053]], %[[#r5053]] +# CHECK: %[[#r50104:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5053]], %[[#r5054]] +# CHECK: %[[#r50105:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5054]], %[[#r5054]] +# CHECK: %[[#r50106:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5054]], %[[#r5055]] +# CHECK: %[[#r50107:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5055]], %[[#r5055]] +# CHECK: %[[#r50108:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5055]], %[[#r5056]] +# CHECK: %[[#r50109:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5056]], %[[#r5056]] +# CHECK: %[[#r50110:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5056]], %[[#r5057]] +# CHECK: %[[#r50111:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5057]], %[[#r5057]] +# CHECK: %[[#r50112:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5057]], %[[#r5058]] +# CHECK: %[[#r50113:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5058]], %[[#r5058]] +# CHECK: %[[#r50114:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5058]], %[[#r5059]] +# CHECK: %[[#r50115:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5059]], %[[#r5059]] +# CHECK: %[[#r50116:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5059]], %[[#r5060]] +# CHECK: %[[#r50117:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5060]], %[[#r5060]] +# CHECK: %[[#r50118:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5060]], %[[#r5061]] +# CHECK: %[[#r50119:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5061]], %[[#r5061]] +# CHECK: %[[#r50120:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5061]], %[[#r5062]] +# CHECK: %[[#r50121:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5062]], %[[#r5062]] +# CHECK: %[[#r50122:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5062]], %[[#r5063]] +# CHECK: %[[#r50123:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5063]], %[[#r5063]] +# CHECK: %[[#r50124:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5063]], %[[#r5064]] +# CHECK: %[[#r50125:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5064]], %[[#r5064]] +# CHECK: %[[#r50126:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5064]], %[[#r5065]] +# CHECK: %[[#r50127:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5065]], %[[#r5065]] +# CHECK: %[[#r50128:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5065]], %[[#r5066]] +# CHECK: %[[#r50129:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5066]], %[[#r5066]] +# CHECK: %[[#r50130:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5066]], %[[#r5067]] +# CHECK: %[[#r50131:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5067]], %[[#r5067]] +# CHECK: %[[#r50132:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5067]], %[[#r5068]] +# CHECK: %[[#r50133:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5068]], %[[#r5068]] + + +--- | + source_filename = ".\main.ll" + define amdgpu_ps void @main() #1 { + ret void + } + attributes #1 = { "target-cpu"="gfx1010" } + !llvm.ident = !{!0} + !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"} +... 
+--- +name: main +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } + - { reg: '$sgpr1' } + - { reg: '$sgpr2' } + - { reg: '$sgpr3' } + - { reg: '$sgpr4' } + - { reg: '$sgpr5' } + - { reg: '$sgpr6' } + - { reg: '$sgpr7' } + - { reg: '$sgpr8' } + - { reg: '$sgpr8' } + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1 + + undef %0.sub0:sgpr_64 = COPY $sgpr0 + undef %0.sub1:sgpr_64 = COPY $sgpr1 + + undef %1.sub0:sgpr_128 = COPY $sgpr4 + undef %1.sub1:sgpr_128 = COPY $sgpr5 + undef %1.sub2:sgpr_128 = COPY $sgpr6 + undef %1.sub3:sgpr_128 = COPY $sgpr7 + + + %500:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %501:vgpr_32 = V_MOV_B32_e32 $vgpr1, implicit $exec + %502:vgpr_32 = V_MUL_F32_e32 %500, %500, implicit $exec, implicit $mode + %503:vgpr_32 = V_MUL_F32_e32 %500, %501, implicit $exec, implicit $mode + %504:vgpr_32 = V_MUL_F32_e32 %501, %501, implicit $exec, implicit $mode + %505:vgpr_32 = V_MUL_F32_e32 %502, %502, implicit $exec, implicit $mode + %506:vgpr_32 = V_MUL_F32_e32 %502, %503, implicit $exec, implicit $mode + %507:vgpr_32 = V_MUL_F32_e32 %503, %503, implicit $exec, implicit $mode + %508:vgpr_32 = V_MUL_F32_e32 %503, %504, implicit $exec, implicit $mode + %509:vgpr_32 = V_MUL_F32_e32 %504, %504, implicit $exec, implicit $mode + %5010:vgpr_32 = V_MUL_F32_e32 %505, %505, implicit $exec, implicit $mode + %5011:vgpr_32 = V_MUL_F32_e32 %505, %506, implicit $exec, implicit $mode + %5012:vgpr_32 = V_MUL_F32_e32 %506, %506, implicit $exec, implicit $mode + %5013:vgpr_32 = V_MUL_F32_e32 %506, %507, implicit $exec, implicit $mode + %5014:vgpr_32 = V_MUL_F32_e32 %507, %507, implicit $exec, implicit $mode + %5015:vgpr_32 = V_MUL_F32_e32 %507, %508, implicit $exec, implicit $mode + %5016:vgpr_32 = V_MUL_F32_e32 %508, %508, implicit $exec, implicit $mode + %5017:vgpr_32 = V_MUL_F32_e32 %508, %509, implicit $exec, implicit $mode + %5018:vgpr_32 = V_MUL_F32_e32 %509, %509, implicit $exec, implicit $mode + %5019:vgpr_32 = V_MUL_F32_e32 %5010, %5010, implicit $exec, implicit $mode + %5020:vgpr_32 = V_MUL_F32_e32 %5010, %5011, implicit $exec, implicit $mode + %5021:vgpr_32 = V_MUL_F32_e32 %5011, %5011, implicit $exec, implicit $mode + %5022:vgpr_32 = V_MUL_F32_e32 %5011, %5012, implicit $exec, implicit $mode + %5023:vgpr_32 = V_MUL_F32_e32 %5012, %5012, implicit $exec, implicit $mode + %5024:vgpr_32 = V_MUL_F32_e32 %5012, %5013, implicit $exec, implicit $mode + %5025:vgpr_32 = V_MUL_F32_e32 %5013, %5013, implicit $exec, implicit $mode + %5026:vgpr_32 = V_MUL_F32_e32 %5013, %5014, implicit $exec, implicit $mode + %5027:vgpr_32 = V_MUL_F32_e32 %5014, %5014, implicit $exec, implicit $mode + %5028:vgpr_32 = V_MUL_F32_e32 %5014, %5015, implicit $exec, implicit $mode + %5029:vgpr_32 = V_MUL_F32_e32 %5015, %5015, implicit $exec, implicit $mode + %5030:vgpr_32 = V_MUL_F32_e32 %5015, %5016, implicit $exec, implicit $mode + %5031:vgpr_32 = V_MUL_F32_e32 %5016, %5016, implicit $exec, implicit $mode + %5032:vgpr_32 = V_MUL_F32_e32 %5016, %5017, implicit $exec, implicit $mode + %5033:vgpr_32 = V_MUL_F32_e32 %5017, %5017, implicit $exec, implicit $mode + %5034:vgpr_32 = V_MUL_F32_e32 %5017, %5018, implicit $exec, implicit $mode + %5035:vgpr_32 = V_MUL_F32_e32 %5018, %5018, implicit $exec, implicit $mode + %5036:vgpr_32 = V_MUL_F32_e32 %5019, %5019, implicit $exec, implicit $mode + %5037:vgpr_32 = V_MUL_F32_e32 %5019, %5020, implicit $exec, implicit $mode + %5038:vgpr_32 = 
V_MUL_F32_e32 %5020, %5020, implicit $exec, implicit $mode + %5039:vgpr_32 = V_MUL_F32_e32 %5020, %5021, implicit $exec, implicit $mode + %5040:vgpr_32 = V_MUL_F32_e32 %5021, %5021, implicit $exec, implicit $mode + %5041:vgpr_32 = V_MUL_F32_e32 %5021, %5022, implicit $exec, implicit $mode + %5042:vgpr_32 = V_MUL_F32_e32 %5022, %5022, implicit $exec, implicit $mode + %5043:vgpr_32 = V_MUL_F32_e32 %5022, %5023, implicit $exec, implicit $mode + %5044:vgpr_32 = V_MUL_F32_e32 %5023, %5023, implicit $exec, implicit $mode + %5045:vgpr_32 = V_MUL_F32_e32 %5023, %5024, implicit $exec, implicit $mode + %5046:vgpr_32 = V_MUL_F32_e32 %5024, %5024, implicit $exec, implicit $mode + %5047:vgpr_32 = V_MUL_F32_e32 %5024, %5025, implicit $exec, implicit $mode + %5048:vgpr_32 = V_MUL_F32_e32 %5025, %5025, implicit $exec, implicit $mode + %5049:vgpr_32 = V_MUL_F32_e32 %5025, %5026, implicit $exec, implicit $mode + %5050:vgpr_32 = V_MUL_F32_e32 %5026, %5026, implicit $exec, implicit $mode + %5051:vgpr_32 = V_MUL_F32_e32 %5026, %5027, implicit $exec, implicit $mode + %5052:vgpr_32 = V_MUL_F32_e32 %5027, %5027, implicit $exec, implicit $mode + %5053:vgpr_32 = V_MUL_F32_e32 %5027, %5028, implicit $exec, implicit $mode + %5054:vgpr_32 = V_MUL_F32_e32 %5028, %5028, implicit $exec, implicit $mode + %5055:vgpr_32 = V_MUL_F32_e32 %5028, %5029, implicit $exec, implicit $mode + %5056:vgpr_32 = V_MUL_F32_e32 %5029, %5029, implicit $exec, implicit $mode + %5057:vgpr_32 = V_MUL_F32_e32 %5029, %5030, implicit $exec, implicit $mode + %5058:vgpr_32 = V_MUL_F32_e32 %5030, %5030, implicit $exec, implicit $mode + %5059:vgpr_32 = V_MUL_F32_e32 %5030, %5031, implicit $exec, implicit $mode + %5060:vgpr_32 = V_MUL_F32_e32 %5031, %5031, implicit $exec, implicit $mode + %5061:vgpr_32 = V_MUL_F32_e32 %5031, %5032, implicit $exec, implicit $mode + %5062:vgpr_32 = V_MUL_F32_e32 %5032, %5032, implicit $exec, implicit $mode + %5063:vgpr_32 = V_MUL_F32_e32 %5032, %5033, implicit $exec, implicit $mode + %5064:vgpr_32 = V_MUL_F32_e32 %5033, %5033, implicit $exec, implicit $mode + %5065:vgpr_32 = V_MUL_F32_e32 %5033, %5034, implicit $exec, implicit $mode + %5066:vgpr_32 = V_MUL_F32_e32 %5034, %5034, implicit $exec, implicit $mode + %5067:vgpr_32 = V_MUL_F32_e32 %5034, %5035, implicit $exec, implicit $mode + %5068:vgpr_32 = V_MUL_F32_e32 %5035, %5035, implicit $exec, implicit $mode + %5069:vgpr_32 = V_MUL_F32_e32 %5036, %5036, implicit $exec, implicit $mode + %5070:vgpr_32 = V_MUL_F32_e32 %5036, %5037, implicit $exec, implicit $mode + %5071:vgpr_32 = V_MUL_F32_e32 %5037, %5037, implicit $exec, implicit $mode + %5072:vgpr_32 = V_MUL_F32_e32 %5037, %5038, implicit $exec, implicit $mode + %5073:vgpr_32 = V_MUL_F32_e32 %5038, %5038, implicit $exec, implicit $mode + %5074:vgpr_32 = V_MUL_F32_e32 %5038, %5039, implicit $exec, implicit $mode + %5075:vgpr_32 = V_MUL_F32_e32 %5039, %5039, implicit $exec, implicit $mode + %5076:vgpr_32 = V_MUL_F32_e32 %5039, %5040, implicit $exec, implicit $mode + %5077:vgpr_32 = V_MUL_F32_e32 %5040, %5040, implicit $exec, implicit $mode + %5078:vgpr_32 = V_MUL_F32_e32 %5040, %5041, implicit $exec, implicit $mode + %5079:vgpr_32 = V_MUL_F32_e32 %5041, %5041, implicit $exec, implicit $mode + %5080:vgpr_32 = V_MUL_F32_e32 %5041, %5042, implicit $exec, implicit $mode + %5081:vgpr_32 = V_MUL_F32_e32 %5042, %5042, implicit $exec, implicit $mode + %5082:vgpr_32 = V_MUL_F32_e32 %5042, %5043, implicit $exec, implicit $mode + %5083:vgpr_32 = V_MUL_F32_e32 %5043, %5043, implicit $exec, implicit $mode + %5084:vgpr_32 = 
V_MUL_F32_e32 %5043, %5044, implicit $exec, implicit $mode + %5085:vgpr_32 = V_MUL_F32_e32 %5044, %5044, implicit $exec, implicit $mode + %5086:vgpr_32 = V_MUL_F32_e32 %5044, %5045, implicit $exec, implicit $mode + %5087:vgpr_32 = V_MUL_F32_e32 %5045, %5045, implicit $exec, implicit $mode + %5088:vgpr_32 = V_MUL_F32_e32 %5045, %5046, implicit $exec, implicit $mode + %5089:vgpr_32 = V_MUL_F32_e32 %5046, %5046, implicit $exec, implicit $mode + %5090:vgpr_32 = V_MUL_F32_e32 %5046, %5047, implicit $exec, implicit $mode + %5091:vgpr_32 = V_MUL_F32_e32 %5047, %5047, implicit $exec, implicit $mode + %5092:vgpr_32 = V_MUL_F32_e32 %5047, %5048, implicit $exec, implicit $mode + %5093:vgpr_32 = V_MUL_F32_e32 %5048, %5048, implicit $exec, implicit $mode + %5094:vgpr_32 = V_MUL_F32_e32 %5048, %5049, implicit $exec, implicit $mode + %5095:vgpr_32 = V_MUL_F32_e32 %5049, %5049, implicit $exec, implicit $mode + %5096:vgpr_32 = V_MUL_F32_e32 %5049, %5050, implicit $exec, implicit $mode + %5097:vgpr_32 = V_MUL_F32_e32 %5050, %5050, implicit $exec, implicit $mode + %5098:vgpr_32 = V_MUL_F32_e32 %5050, %5051, implicit $exec, implicit $mode + %5099:vgpr_32 = V_MUL_F32_e32 %5051, %5051, implicit $exec, implicit $mode + %50100:vgpr_32 = V_MUL_F32_e32 %5051, %5052, implicit $exec, implicit $mode + %50101:vgpr_32 = V_MUL_F32_e32 %5052, %5052, implicit $exec, implicit $mode + %50102:vgpr_32 = V_MUL_F32_e32 %5052, %5053, implicit $exec, implicit $mode + %50103:vgpr_32 = V_MUL_F32_e32 %5053, %5053, implicit $exec, implicit $mode + %50104:vgpr_32 = V_MUL_F32_e32 %5053, %5054, implicit $exec, implicit $mode + %50105:vgpr_32 = V_MUL_F32_e32 %5054, %5054, implicit $exec, implicit $mode + %50106:vgpr_32 = V_MUL_F32_e32 %5054, %5055, implicit $exec, implicit $mode + %50107:vgpr_32 = V_MUL_F32_e32 %5055, %5055, implicit $exec, implicit $mode + %50108:vgpr_32 = V_MUL_F32_e32 %5055, %5056, implicit $exec, implicit $mode + %50109:vgpr_32 = V_MUL_F32_e32 %5056, %5056, implicit $exec, implicit $mode + %50110:vgpr_32 = V_MUL_F32_e32 %5056, %5057, implicit $exec, implicit $mode + %50111:vgpr_32 = V_MUL_F32_e32 %5057, %5057, implicit $exec, implicit $mode + %50112:vgpr_32 = V_MUL_F32_e32 %5057, %5058, implicit $exec, implicit $mode + %50113:vgpr_32 = V_MUL_F32_e32 %5058, %5058, implicit $exec, implicit $mode + %50114:vgpr_32 = V_MUL_F32_e32 %5058, %5059, implicit $exec, implicit $mode + %50115:vgpr_32 = V_MUL_F32_e32 %5059, %5059, implicit $exec, implicit $mode + %50116:vgpr_32 = V_MUL_F32_e32 %5059, %5060, implicit $exec, implicit $mode + %50117:vgpr_32 = V_MUL_F32_e32 %5060, %5060, implicit $exec, implicit $mode + %50118:vgpr_32 = V_MUL_F32_e32 %5060, %5061, implicit $exec, implicit $mode + %50119:vgpr_32 = V_MUL_F32_e32 %5061, %5061, implicit $exec, implicit $mode + %50120:vgpr_32 = V_MUL_F32_e32 %5061, %5062, implicit $exec, implicit $mode + %50121:vgpr_32 = V_MUL_F32_e32 %5062, %5062, implicit $exec, implicit $mode + %50122:vgpr_32 = V_MUL_F32_e32 %5062, %5063, implicit $exec, implicit $mode + %50123:vgpr_32 = V_MUL_F32_e32 %5063, %5063, implicit $exec, implicit $mode + %50124:vgpr_32 = V_MUL_F32_e32 %5063, %5064, implicit $exec, implicit $mode + %50125:vgpr_32 = V_MUL_F32_e32 %5064, %5064, implicit $exec, implicit $mode + %50126:vgpr_32 = V_MUL_F32_e32 %5064, %5065, implicit $exec, implicit $mode + %50127:vgpr_32 = V_MUL_F32_e32 %5065, %5065, implicit $exec, implicit $mode + %50128:vgpr_32 = V_MUL_F32_e32 %5065, %5066, implicit $exec, implicit $mode + %50129:vgpr_32 = V_MUL_F32_e32 %5066, %5066, implicit $exec, implicit $mode 
+ %50130:vgpr_32 = V_MUL_F32_e32 %5066, %5067, implicit $exec, implicit $mode + %50131:vgpr_32 = V_MUL_F32_e32 %5067, %5067, implicit $exec, implicit $mode + %50132:vgpr_32 = V_MUL_F32_e32 %5067, %5068, implicit $exec, implicit $mode + %50133:vgpr_32 = V_MUL_F32_e32 %5068, %5068, implicit $exec, implicit $mode + + + %8000:vgpr_32 = IMPLICIT_DEF + %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %116:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + + %8001:vgpr_32 = COPY %8000 + %8002:vgpr_32 = COPY %8000 + %8003:vgpr_32 = COPY %8000 + %8004:vgpr_32 = COPY %8000 + %8005:vgpr_32 = COPY %8000 + %8006:vgpr_32 = COPY %8000 + %8007:vgpr_32 = COPY %8000 + %8008:vgpr_32 = COPY %8000 + %8009:vgpr_32 = COPY %8000 + %8010:vgpr_32 = COPY %8000 + %8011:vgpr_32 = COPY %8000 + %8012:vgpr_32 = COPY %8000 + %8013:vgpr_32 = COPY %8000 + %8014:vgpr_32 = COPY %8000 + %8015:vgpr_32 = COPY %8000 + %8016:vgpr_32 = COPY %8000 + %8017:vgpr_32 = COPY %8000 + + %9001:vgpr_32 = COPY %8001 + %9002:vgpr_32 = COPY %8002 + %9003:vgpr_32 = COPY %8003 + %9004:vgpr_32 = COPY %8004 + %9005:vgpr_32 = COPY %8005 + %9006:vgpr_32 = COPY %8006 + %9007:vgpr_32 = COPY %8007 + %9008:vgpr_32 = COPY %8008 + %9009:vgpr_32 = COPY %8009 + %9010:vgpr_32 = COPY %8010 + %9011:vgpr_32 = COPY %8011 + %9012:vgpr_32 = COPY %8012 + %9013:vgpr_32 = COPY %8013 + %9014:vgpr_32 = COPY %8014 + %9015:vgpr_32 = COPY %8015 + %9016:vgpr_32 = COPY %8016 + %9017:vgpr_32 = COPY %8017 + + S_BRANCH %bb.2 + + bb.2: + + %3:vgpr_32 = IMPLICIT_DEF + + EXP 0, killed %500, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %501, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %502, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %503, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %504, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %505, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %506, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %507, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %508, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %509, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5010, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5011, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5012, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5013, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5014, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5015, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5016, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5017, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5018, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5019, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5020, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5021, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5022, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5023, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5024, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5025, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5026, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5027, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5028, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5029, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5030, %3, %3, %3, -1, -1, 15, 
implicit $exec + EXP 0, killed %5031, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5032, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5033, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5034, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5035, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5036, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5037, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5038, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5039, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5040, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5041, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5042, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5043, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5044, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5045, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5046, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5047, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5048, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5049, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5050, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5051, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5052, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5053, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5054, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5055, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5056, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5057, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5058, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5059, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5060, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5061, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5062, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5063, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5064, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5065, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5066, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5067, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5068, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5069, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5070, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5071, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5072, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5073, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5074, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5075, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5076, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5077, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5078, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5079, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5080, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5081, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5082, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5083, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5084, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5085, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5086, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5087, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 
0, killed %5088, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5089, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5090, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5091, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5092, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5093, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5094, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5095, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5096, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5097, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5098, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5099, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50100, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50101, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50102, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50103, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50104, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50105, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50106, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50107, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50108, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50109, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50110, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50111, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50112, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50113, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50114, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50115, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50116, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50117, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50118, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50119, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50120, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50121, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50122, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50123, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50124, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50125, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50126, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50127, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50128, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50129, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50130, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50131, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50132, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50133, %3, %3, %3, -1, -1, 15, implicit $exec + + + S_ENDPGM 0 +... + \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/remat/group_remat_with_uses.mir b/llvm/test/CodeGen/AMDGPU/remat/group_remat_with_uses.mir new file mode 100644 index 0000000000000..637a683bdd041 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/group_remat_with_uses.mir @@ -0,0 +1,641 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-sub-exp-remat -amdgpu-remat-enable-sub-exp-remat-aggressive | FileCheck %s + +# Check that the whole expression gets CLONED to uses in bb.2. 
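+#
+# Note: the V_MUL_F32 chain feeds EXP uses in both bb.0 and bb.2, so simply
+# sinking the defs is not possible; with the aggressive sub-expression mode
+# the expectation is that the whole chain is cloned into bb.2 next to its
+# uses there, while the two V_MOV_B32 roots stay in bb.0. Hence the multiply
+# results (%[[#r502]] onwards) are checked again under the bb.2 label below.
+#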
+# CHECK: bb.0: +# CHECK: %[[#r500:]]:vgpr_32 = V_MOV_B32_e32 $vgpr0 +# CHECK: %[[#r501:]]:vgpr_32 = V_MOV_B32_e32 $vgpr1 +# CHECK: bb.1: +# CHECK: bb.2: +# CHECK: %[[#r502:]]:vgpr_32 = V_MUL_F32_e32 %[[#r500]], %[[#r500]] +# CHECK: %[[#r503:]]:vgpr_32 = V_MUL_F32_e32 %[[#r500]], %[[#r501]] +# CHECK: %[[#r504:]]:vgpr_32 = V_MUL_F32_e32 %[[#r501]], %[[#r501]] +# CHECK: %[[#r505:]]:vgpr_32 = V_MUL_F32_e32 %[[#r502]], %[[#r502]] +# CHECK: %[[#r506:]]:vgpr_32 = V_MUL_F32_e32 %[[#r502]], %[[#r503]] +# CHECK: %[[#r507:]]:vgpr_32 = V_MUL_F32_e32 %[[#r503]], %[[#r503]] +# CHECK: %[[#r508:]]:vgpr_32 = V_MUL_F32_e32 %[[#r503]], %[[#r504]] +# CHECK: %[[#r509:]]:vgpr_32 = V_MUL_F32_e32 %[[#r504]], %[[#r504]] +# CHECK: %[[#r5010:]]:vgpr_32 = V_MUL_F32_e32 %[[#r505]], %[[#r505]] +# CHECK: %[[#r5011:]]:vgpr_32 = V_MUL_F32_e32 %[[#r505]], %[[#r506]] +# CHECK: %[[#r5012:]]:vgpr_32 = V_MUL_F32_e32 %[[#r506]], %[[#r506]] +# CHECK: %[[#r5013:]]:vgpr_32 = V_MUL_F32_e32 %[[#r506]], %[[#r507]] +# CHECK: %[[#r5014:]]:vgpr_32 = V_MUL_F32_e32 %[[#r507]], %[[#r507]] +# CHECK: %[[#r5015:]]:vgpr_32 = V_MUL_F32_e32 %[[#r507]], %[[#r508]] +# CHECK: %[[#r5016:]]:vgpr_32 = V_MUL_F32_e32 %[[#r508]], %[[#r508]] +# CHECK: %[[#r5017:]]:vgpr_32 = V_MUL_F32_e32 %[[#r508]], %[[#r509]] +# CHECK: %[[#r5018:]]:vgpr_32 = V_MUL_F32_e32 %[[#r509]], %[[#r509]] +# CHECK: %[[#r5019:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5010]], %[[#r5010]] +# CHECK: %[[#r5020:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5010]], %[[#r5011]] +# CHECK: %[[#r5021:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5011]], %[[#r5011]] +# CHECK: %[[#r5022:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5011]], %[[#r5012]] +# CHECK: %[[#r5023:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5012]], %[[#r5012]] +# CHECK: %[[#r5024:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5012]], %[[#r5013]] +# CHECK: %[[#r5025:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5013]], %[[#r5013]] +# CHECK: %[[#r5026:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5013]], %[[#r5014]] +# CHECK: %[[#r5027:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5014]], %[[#r5014]] +# CHECK: %[[#r5028:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5014]], %[[#r5015]] +# CHECK: %[[#r5029:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5015]], %[[#r5015]] +# CHECK: %[[#r5030:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5015]], %[[#r5016]] +# CHECK: %[[#r5031:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5016]], %[[#r5016]] +# CHECK: %[[#r5032:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5016]], %[[#r5017]] +# CHECK: %[[#r5033:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5017]], %[[#r5017]] +# CHECK: %[[#r5034:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5017]], %[[#r5018]] +# CHECK: %[[#r5035:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5018]], %[[#r5018]] +# CHECK: %[[#r5036:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5019]], %[[#r5019]] +# CHECK: %[[#r5037:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5019]], %[[#r5020]] +# CHECK: %[[#r5038:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5020]], %[[#r5020]] +# CHECK: %[[#r5039:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5020]], %[[#r5021]] +# CHECK: %[[#r5040:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5021]], %[[#r5021]] +# CHECK: %[[#r5041:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5021]], %[[#r5022]] +# CHECK: %[[#r5042:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5022]], %[[#r5022]] +# CHECK: %[[#r5043:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5022]], %[[#r5023]] +# CHECK: %[[#r5044:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5023]], %[[#r5023]] +# CHECK: %[[#r5045:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5023]], %[[#r5024]] +# CHECK: %[[#r5046:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5024]], %[[#r5024]] +# CHECK: %[[#r5047:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5024]], %[[#r5025]] +# CHECK: %[[#r5048:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5025]], %[[#r5025]] +# CHECK: %[[#r5049:]]:vgpr_32 = V_MUL_F32_e32 
%[[#r5025]], %[[#r5026]] +# CHECK: %[[#r5050:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5026]], %[[#r5026]] +# CHECK: %[[#r5051:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5026]], %[[#r5027]] +# CHECK: %[[#r5052:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5027]], %[[#r5027]] +# CHECK: %[[#r5053:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5027]], %[[#r5028]] +# CHECK: %[[#r5054:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5028]], %[[#r5028]] +# CHECK: %[[#r5055:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5028]], %[[#r5029]] +# CHECK: %[[#r5056:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5029]], %[[#r5029]] +# CHECK: %[[#r5057:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5029]], %[[#r5030]] +# CHECK: %[[#r5058:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5030]], %[[#r5030]] +# CHECK: %[[#r5059:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5030]], %[[#r5031]] +# CHECK: %[[#r5060:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5031]], %[[#r5031]] +# CHECK: %[[#r5061:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5031]], %[[#r5032]] +# CHECK: %[[#r5062:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5032]], %[[#r5032]] +# CHECK: %[[#r5063:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5032]], %[[#r5033]] +# CHECK: %[[#r5064:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5033]], %[[#r5033]] +# CHECK: %[[#r5065:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5033]], %[[#r5034]] +# CHECK: %[[#r5066:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5034]], %[[#r5034]] +# CHECK: %[[#r5067:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5034]], %[[#r5035]] +# CHECK: %[[#r5068:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5035]], %[[#r5035]] +# CHECK: %[[#r5069:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5036]], %[[#r5036]] +# CHECK: %[[#r5070:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5036]], %[[#r5037]] +# CHECK: %[[#r5071:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5037]], %[[#r5037]] +# CHECK: %[[#r5072:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5037]], %[[#r5038]] +# CHECK: %[[#r5073:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5038]], %[[#r5038]] +# CHECK: %[[#r5074:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5038]], %[[#r5039]] +# CHECK: %[[#r5075:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5039]], %[[#r5039]] +# CHECK: %[[#r5076:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5039]], %[[#r5040]] +# CHECK: %[[#r5077:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5040]], %[[#r5040]] +# CHECK: %[[#r5078:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5040]], %[[#r5041]] +# CHECK: %[[#r5079:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5041]], %[[#r5041]] +# CHECK: %[[#r5080:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5041]], %[[#r5042]] +# CHECK: %[[#r5081:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5042]], %[[#r5042]] +# CHECK: %[[#r5082:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5042]], %[[#r5043]] +# CHECK: %[[#r5083:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5043]], %[[#r5043]] +# CHECK: %[[#r5084:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5043]], %[[#r5044]] +# CHECK: %[[#r5085:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5044]], %[[#r5044]] +# CHECK: %[[#r5086:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5044]], %[[#r5045]] +# CHECK: %[[#r5087:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5045]], %[[#r5045]] +# CHECK: %[[#r5088:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5045]], %[[#r5046]] +# CHECK: %[[#r5089:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5046]], %[[#r5046]] +# CHECK: %[[#r5090:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5046]], %[[#r5047]] +# CHECK: %[[#r5091:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5047]], %[[#r5047]] +# CHECK: %[[#r5092:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5047]], %[[#r5048]] +# CHECK: %[[#r5093:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5048]], %[[#r5048]] +# CHECK: %[[#r5094:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5048]], %[[#r5049]] +# CHECK: %[[#r5095:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5049]], %[[#r5049]] +# CHECK: %[[#r5096:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5049]], %[[#r5050]] +# CHECK: %[[#r5097:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5050]], %[[#r5050]] +# CHECK: %[[#r5098:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5050]], %[[#r5051]] 
+# CHECK: %[[#r5099:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5051]], %[[#r5051]] +# CHECK: %[[#r50100:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5051]], %[[#r5052]] +# CHECK: %[[#r50101:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5052]], %[[#r5052]] +# CHECK: %[[#r50102:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5052]], %[[#r5053]] +# CHECK: %[[#r50103:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5053]], %[[#r5053]] +# CHECK: %[[#r50104:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5053]], %[[#r5054]] +# CHECK: %[[#r50105:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5054]], %[[#r5054]] +# CHECK: %[[#r50106:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5054]], %[[#r5055]] +# CHECK: %[[#r50107:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5055]], %[[#r5055]] +# CHECK: %[[#r50108:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5055]], %[[#r5056]] +# CHECK: %[[#r50109:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5056]], %[[#r5056]] +# CHECK: %[[#r50110:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5056]], %[[#r5057]] +# CHECK: %[[#r50111:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5057]], %[[#r5057]] +# CHECK: %[[#r50112:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5057]], %[[#r5058]] +# CHECK: %[[#r50113:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5058]], %[[#r5058]] +# CHECK: %[[#r50114:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5058]], %[[#r5059]] +# CHECK: %[[#r50115:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5059]], %[[#r5059]] +# CHECK: %[[#r50116:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5059]], %[[#r5060]] +# CHECK: %[[#r50117:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5060]], %[[#r5060]] +# CHECK: %[[#r50118:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5060]], %[[#r5061]] +# CHECK: %[[#r50119:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5061]], %[[#r5061]] +# CHECK: %[[#r50120:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5061]], %[[#r5062]] +# CHECK: %[[#r50121:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5062]], %[[#r5062]] +# CHECK: %[[#r50122:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5062]], %[[#r5063]] +# CHECK: %[[#r50123:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5063]], %[[#r5063]] +# CHECK: %[[#r50124:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5063]], %[[#r5064]] +# CHECK: %[[#r50125:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5064]], %[[#r5064]] +# CHECK: %[[#r50126:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5064]], %[[#r5065]] +# CHECK: %[[#r50127:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5065]], %[[#r5065]] +# CHECK: %[[#r50128:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5065]], %[[#r5066]] +# CHECK: %[[#r50129:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5066]], %[[#r5066]] +# CHECK: %[[#r50130:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5066]], %[[#r5067]] +# CHECK: %[[#r50131:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5067]], %[[#r5067]] +# CHECK: %[[#r50132:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5067]], %[[#r5068]] +# CHECK: %[[#r50133:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5068]], %[[#r5068]] + + +--- | + source_filename = ".\main.ll" + define amdgpu_ps void @main() #1 { + ret void + } + attributes #1 = { "target-cpu"="gfx1010" } + !llvm.ident = !{!0} + !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"} +... 
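+# Note on the function below: bb.0 builds the full V_MUL_F32 chain from
+# $vgpr0/$vgpr1 and exports every value, bb.1 only shuffles copies of an
+# IMPLICIT_DEF, and bb.2 exports every chain value again with killed uses,
+# so without rematerialization the whole chain stays live across the branch.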
+--- +name: main +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } + - { reg: '$sgpr1' } + - { reg: '$sgpr2' } + - { reg: '$sgpr3' } + - { reg: '$sgpr4' } + - { reg: '$sgpr5' } + - { reg: '$sgpr6' } + - { reg: '$sgpr7' } + - { reg: '$sgpr8' } + - { reg: '$sgpr8' } + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1 + + undef %0.sub0:sgpr_64 = COPY $sgpr0 + undef %0.sub1:sgpr_64 = COPY $sgpr1 + + undef %1.sub0:sgpr_128 = COPY $sgpr4 + undef %1.sub1:sgpr_128 = COPY $sgpr5 + undef %1.sub2:sgpr_128 = COPY $sgpr6 + undef %1.sub3:sgpr_128 = COPY $sgpr7 + + + %500:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %501:vgpr_32 = V_MOV_B32_e32 $vgpr1, implicit $exec + %502:vgpr_32 = V_MUL_F32_e32 %500, %500, implicit $exec, implicit $mode + %503:vgpr_32 = V_MUL_F32_e32 %500, %501, implicit $exec, implicit $mode + %504:vgpr_32 = V_MUL_F32_e32 %501, %501, implicit $exec, implicit $mode + %505:vgpr_32 = V_MUL_F32_e32 %502, %502, implicit $exec, implicit $mode + %506:vgpr_32 = V_MUL_F32_e32 %502, %503, implicit $exec, implicit $mode + %507:vgpr_32 = V_MUL_F32_e32 %503, %503, implicit $exec, implicit $mode + %508:vgpr_32 = V_MUL_F32_e32 %503, %504, implicit $exec, implicit $mode + %509:vgpr_32 = V_MUL_F32_e32 %504, %504, implicit $exec, implicit $mode + %5010:vgpr_32 = V_MUL_F32_e32 %505, %505, implicit $exec, implicit $mode + %5011:vgpr_32 = V_MUL_F32_e32 %505, %506, implicit $exec, implicit $mode + %5012:vgpr_32 = V_MUL_F32_e32 %506, %506, implicit $exec, implicit $mode + %5013:vgpr_32 = V_MUL_F32_e32 %506, %507, implicit $exec, implicit $mode + %5014:vgpr_32 = V_MUL_F32_e32 %507, %507, implicit $exec, implicit $mode + %5015:vgpr_32 = V_MUL_F32_e32 %507, %508, implicit $exec, implicit $mode + %5016:vgpr_32 = V_MUL_F32_e32 %508, %508, implicit $exec, implicit $mode + %5017:vgpr_32 = V_MUL_F32_e32 %508, %509, implicit $exec, implicit $mode + %5018:vgpr_32 = V_MUL_F32_e32 %509, %509, implicit $exec, implicit $mode + %5019:vgpr_32 = V_MUL_F32_e32 %5010, %5010, implicit $exec, implicit $mode + %5020:vgpr_32 = V_MUL_F32_e32 %5010, %5011, implicit $exec, implicit $mode + %5021:vgpr_32 = V_MUL_F32_e32 %5011, %5011, implicit $exec, implicit $mode + %5022:vgpr_32 = V_MUL_F32_e32 %5011, %5012, implicit $exec, implicit $mode + %5023:vgpr_32 = V_MUL_F32_e32 %5012, %5012, implicit $exec, implicit $mode + %5024:vgpr_32 = V_MUL_F32_e32 %5012, %5013, implicit $exec, implicit $mode + %5025:vgpr_32 = V_MUL_F32_e32 %5013, %5013, implicit $exec, implicit $mode + %5026:vgpr_32 = V_MUL_F32_e32 %5013, %5014, implicit $exec, implicit $mode + %5027:vgpr_32 = V_MUL_F32_e32 %5014, %5014, implicit $exec, implicit $mode + %5028:vgpr_32 = V_MUL_F32_e32 %5014, %5015, implicit $exec, implicit $mode + %5029:vgpr_32 = V_MUL_F32_e32 %5015, %5015, implicit $exec, implicit $mode + %5030:vgpr_32 = V_MUL_F32_e32 %5015, %5016, implicit $exec, implicit $mode + %5031:vgpr_32 = V_MUL_F32_e32 %5016, %5016, implicit $exec, implicit $mode + %5032:vgpr_32 = V_MUL_F32_e32 %5016, %5017, implicit $exec, implicit $mode + %5033:vgpr_32 = V_MUL_F32_e32 %5017, %5017, implicit $exec, implicit $mode + %5034:vgpr_32 = V_MUL_F32_e32 %5017, %5018, implicit $exec, implicit $mode + %5035:vgpr_32 = V_MUL_F32_e32 %5018, %5018, implicit $exec, implicit $mode + %5036:vgpr_32 = V_MUL_F32_e32 %5019, %5019, implicit $exec, implicit $mode + %5037:vgpr_32 = V_MUL_F32_e32 %5019, %5020, implicit $exec, implicit $mode + %5038:vgpr_32 = 
V_MUL_F32_e32 %5020, %5020, implicit $exec, implicit $mode + %5039:vgpr_32 = V_MUL_F32_e32 %5020, %5021, implicit $exec, implicit $mode + %5040:vgpr_32 = V_MUL_F32_e32 %5021, %5021, implicit $exec, implicit $mode + %5041:vgpr_32 = V_MUL_F32_e32 %5021, %5022, implicit $exec, implicit $mode + %5042:vgpr_32 = V_MUL_F32_e32 %5022, %5022, implicit $exec, implicit $mode + %5043:vgpr_32 = V_MUL_F32_e32 %5022, %5023, implicit $exec, implicit $mode + %5044:vgpr_32 = V_MUL_F32_e32 %5023, %5023, implicit $exec, implicit $mode + %5045:vgpr_32 = V_MUL_F32_e32 %5023, %5024, implicit $exec, implicit $mode + %5046:vgpr_32 = V_MUL_F32_e32 %5024, %5024, implicit $exec, implicit $mode + %5047:vgpr_32 = V_MUL_F32_e32 %5024, %5025, implicit $exec, implicit $mode + %5048:vgpr_32 = V_MUL_F32_e32 %5025, %5025, implicit $exec, implicit $mode + %5049:vgpr_32 = V_MUL_F32_e32 %5025, %5026, implicit $exec, implicit $mode + %5050:vgpr_32 = V_MUL_F32_e32 %5026, %5026, implicit $exec, implicit $mode + %5051:vgpr_32 = V_MUL_F32_e32 %5026, %5027, implicit $exec, implicit $mode + %5052:vgpr_32 = V_MUL_F32_e32 %5027, %5027, implicit $exec, implicit $mode + %5053:vgpr_32 = V_MUL_F32_e32 %5027, %5028, implicit $exec, implicit $mode + %5054:vgpr_32 = V_MUL_F32_e32 %5028, %5028, implicit $exec, implicit $mode + %5055:vgpr_32 = V_MUL_F32_e32 %5028, %5029, implicit $exec, implicit $mode + %5056:vgpr_32 = V_MUL_F32_e32 %5029, %5029, implicit $exec, implicit $mode + %5057:vgpr_32 = V_MUL_F32_e32 %5029, %5030, implicit $exec, implicit $mode + %5058:vgpr_32 = V_MUL_F32_e32 %5030, %5030, implicit $exec, implicit $mode + %5059:vgpr_32 = V_MUL_F32_e32 %5030, %5031, implicit $exec, implicit $mode + %5060:vgpr_32 = V_MUL_F32_e32 %5031, %5031, implicit $exec, implicit $mode + %5061:vgpr_32 = V_MUL_F32_e32 %5031, %5032, implicit $exec, implicit $mode + %5062:vgpr_32 = V_MUL_F32_e32 %5032, %5032, implicit $exec, implicit $mode + %5063:vgpr_32 = V_MUL_F32_e32 %5032, %5033, implicit $exec, implicit $mode + %5064:vgpr_32 = V_MUL_F32_e32 %5033, %5033, implicit $exec, implicit $mode + %5065:vgpr_32 = V_MUL_F32_e32 %5033, %5034, implicit $exec, implicit $mode + %5066:vgpr_32 = V_MUL_F32_e32 %5034, %5034, implicit $exec, implicit $mode + %5067:vgpr_32 = V_MUL_F32_e32 %5034, %5035, implicit $exec, implicit $mode + %5068:vgpr_32 = V_MUL_F32_e32 %5035, %5035, implicit $exec, implicit $mode + %5069:vgpr_32 = V_MUL_F32_e32 %5036, %5036, implicit $exec, implicit $mode + %5070:vgpr_32 = V_MUL_F32_e32 %5036, %5037, implicit $exec, implicit $mode + %5071:vgpr_32 = V_MUL_F32_e32 %5037, %5037, implicit $exec, implicit $mode + %5072:vgpr_32 = V_MUL_F32_e32 %5037, %5038, implicit $exec, implicit $mode + %5073:vgpr_32 = V_MUL_F32_e32 %5038, %5038, implicit $exec, implicit $mode + %5074:vgpr_32 = V_MUL_F32_e32 %5038, %5039, implicit $exec, implicit $mode + %5075:vgpr_32 = V_MUL_F32_e32 %5039, %5039, implicit $exec, implicit $mode + %5076:vgpr_32 = V_MUL_F32_e32 %5039, %5040, implicit $exec, implicit $mode + %5077:vgpr_32 = V_MUL_F32_e32 %5040, %5040, implicit $exec, implicit $mode + %5078:vgpr_32 = V_MUL_F32_e32 %5040, %5041, implicit $exec, implicit $mode + %5079:vgpr_32 = V_MUL_F32_e32 %5041, %5041, implicit $exec, implicit $mode + %5080:vgpr_32 = V_MUL_F32_e32 %5041, %5042, implicit $exec, implicit $mode + %5081:vgpr_32 = V_MUL_F32_e32 %5042, %5042, implicit $exec, implicit $mode + %5082:vgpr_32 = V_MUL_F32_e32 %5042, %5043, implicit $exec, implicit $mode + %5083:vgpr_32 = V_MUL_F32_e32 %5043, %5043, implicit $exec, implicit $mode + %5084:vgpr_32 = 
V_MUL_F32_e32 %5043, %5044, implicit $exec, implicit $mode + %5085:vgpr_32 = V_MUL_F32_e32 %5044, %5044, implicit $exec, implicit $mode + %5086:vgpr_32 = V_MUL_F32_e32 %5044, %5045, implicit $exec, implicit $mode + %5087:vgpr_32 = V_MUL_F32_e32 %5045, %5045, implicit $exec, implicit $mode + %5088:vgpr_32 = V_MUL_F32_e32 %5045, %5046, implicit $exec, implicit $mode + %5089:vgpr_32 = V_MUL_F32_e32 %5046, %5046, implicit $exec, implicit $mode + %5090:vgpr_32 = V_MUL_F32_e32 %5046, %5047, implicit $exec, implicit $mode + %5091:vgpr_32 = V_MUL_F32_e32 %5047, %5047, implicit $exec, implicit $mode + %5092:vgpr_32 = V_MUL_F32_e32 %5047, %5048, implicit $exec, implicit $mode + %5093:vgpr_32 = V_MUL_F32_e32 %5048, %5048, implicit $exec, implicit $mode + %5094:vgpr_32 = V_MUL_F32_e32 %5048, %5049, implicit $exec, implicit $mode + %5095:vgpr_32 = V_MUL_F32_e32 %5049, %5049, implicit $exec, implicit $mode + %5096:vgpr_32 = V_MUL_F32_e32 %5049, %5050, implicit $exec, implicit $mode + %5097:vgpr_32 = V_MUL_F32_e32 %5050, %5050, implicit $exec, implicit $mode + %5098:vgpr_32 = V_MUL_F32_e32 %5050, %5051, implicit $exec, implicit $mode + %5099:vgpr_32 = V_MUL_F32_e32 %5051, %5051, implicit $exec, implicit $mode + %50100:vgpr_32 = V_MUL_F32_e32 %5051, %5052, implicit $exec, implicit $mode + %50101:vgpr_32 = V_MUL_F32_e32 %5052, %5052, implicit $exec, implicit $mode + %50102:vgpr_32 = V_MUL_F32_e32 %5052, %5053, implicit $exec, implicit $mode + %50103:vgpr_32 = V_MUL_F32_e32 %5053, %5053, implicit $exec, implicit $mode + %50104:vgpr_32 = V_MUL_F32_e32 %5053, %5054, implicit $exec, implicit $mode + %50105:vgpr_32 = V_MUL_F32_e32 %5054, %5054, implicit $exec, implicit $mode + %50106:vgpr_32 = V_MUL_F32_e32 %5054, %5055, implicit $exec, implicit $mode + %50107:vgpr_32 = V_MUL_F32_e32 %5055, %5055, implicit $exec, implicit $mode + %50108:vgpr_32 = V_MUL_F32_e32 %5055, %5056, implicit $exec, implicit $mode + %50109:vgpr_32 = V_MUL_F32_e32 %5056, %5056, implicit $exec, implicit $mode + %50110:vgpr_32 = V_MUL_F32_e32 %5056, %5057, implicit $exec, implicit $mode + %50111:vgpr_32 = V_MUL_F32_e32 %5057, %5057, implicit $exec, implicit $mode + %50112:vgpr_32 = V_MUL_F32_e32 %5057, %5058, implicit $exec, implicit $mode + %50113:vgpr_32 = V_MUL_F32_e32 %5058, %5058, implicit $exec, implicit $mode + %50114:vgpr_32 = V_MUL_F32_e32 %5058, %5059, implicit $exec, implicit $mode + %50115:vgpr_32 = V_MUL_F32_e32 %5059, %5059, implicit $exec, implicit $mode + %50116:vgpr_32 = V_MUL_F32_e32 %5059, %5060, implicit $exec, implicit $mode + %50117:vgpr_32 = V_MUL_F32_e32 %5060, %5060, implicit $exec, implicit $mode + %50118:vgpr_32 = V_MUL_F32_e32 %5060, %5061, implicit $exec, implicit $mode + %50119:vgpr_32 = V_MUL_F32_e32 %5061, %5061, implicit $exec, implicit $mode + %50120:vgpr_32 = V_MUL_F32_e32 %5061, %5062, implicit $exec, implicit $mode + %50121:vgpr_32 = V_MUL_F32_e32 %5062, %5062, implicit $exec, implicit $mode + %50122:vgpr_32 = V_MUL_F32_e32 %5062, %5063, implicit $exec, implicit $mode + %50123:vgpr_32 = V_MUL_F32_e32 %5063, %5063, implicit $exec, implicit $mode + %50124:vgpr_32 = V_MUL_F32_e32 %5063, %5064, implicit $exec, implicit $mode + %50125:vgpr_32 = V_MUL_F32_e32 %5064, %5064, implicit $exec, implicit $mode + %50126:vgpr_32 = V_MUL_F32_e32 %5064, %5065, implicit $exec, implicit $mode + %50127:vgpr_32 = V_MUL_F32_e32 %5065, %5065, implicit $exec, implicit $mode + %50128:vgpr_32 = V_MUL_F32_e32 %5065, %5066, implicit $exec, implicit $mode + %50129:vgpr_32 = V_MUL_F32_e32 %5066, %5066, implicit $exec, implicit $mode 
+ %50130:vgpr_32 = V_MUL_F32_e32 %5066, %5067, implicit $exec, implicit $mode + %50131:vgpr_32 = V_MUL_F32_e32 %5067, %5067, implicit $exec, implicit $mode + %50132:vgpr_32 = V_MUL_F32_e32 %5067, %5068, implicit $exec, implicit $mode + %50133:vgpr_32 = V_MUL_F32_e32 %5068, %5068, implicit $exec, implicit $mode + EXP 0, %500, %500, %500, %500, -1, -1, 15, implicit $exec + EXP 0, %501, %501, %501, %501, -1, -1, 15, implicit $exec + EXP 0, %502, %502, %502, %502, -1, -1, 15, implicit $exec + EXP 0, %503, %503, %503, %503, -1, -1, 15, implicit $exec + EXP 0, %504, %504, %504, %504, -1, -1, 15, implicit $exec + EXP 0, %505, %505, %505, %505, -1, -1, 15, implicit $exec + EXP 0, %506, %506, %506, %506, -1, -1, 15, implicit $exec + EXP 0, %507, %507, %507, %507, -1, -1, 15, implicit $exec + EXP 0, %508, %508, %508, %508, -1, -1, 15, implicit $exec + EXP 0, %509, %509, %509, %509, -1, -1, 15, implicit $exec + EXP 0, %5010, %5010, %5010, %5010, -1, -1, 15, implicit $exec + EXP 0, %5011, %5011, %5011, %5011, -1, -1, 15, implicit $exec + EXP 0, %5012, %5012, %5012, %5012, -1, -1, 15, implicit $exec + EXP 0, %5013, %5013, %5013, %5013, -1, -1, 15, implicit $exec + EXP 0, %5014, %5014, %5014, %5014, -1, -1, 15, implicit $exec + EXP 0, %5015, %5015, %5015, %5015, -1, -1, 15, implicit $exec + EXP 0, %5016, %5016, %5016, %5016, -1, -1, 15, implicit $exec + EXP 0, %5017, %5017, %5017, %5017, -1, -1, 15, implicit $exec + EXP 0, %5018, %5018, %5018, %5018, -1, -1, 15, implicit $exec + EXP 0, %5019, %5019, %5019, %5019, -1, -1, 15, implicit $exec + EXP 0, %5020, %5020, %5020, %5020, -1, -1, 15, implicit $exec + EXP 0, %5021, %5021, %5021, %5021, -1, -1, 15, implicit $exec + EXP 0, %5022, %5022, %5022, %5022, -1, -1, 15, implicit $exec + EXP 0, %5023, %5023, %5023, %5023, -1, -1, 15, implicit $exec + EXP 0, %5024, %5024, %5024, %5024, -1, -1, 15, implicit $exec + EXP 0, %5025, %5025, %5025, %5025, -1, -1, 15, implicit $exec + EXP 0, %5026, %5026, %5026, %5026, -1, -1, 15, implicit $exec + EXP 0, %5027, %5027, %5027, %5027, -1, -1, 15, implicit $exec + EXP 0, %5028, %5028, %5028, %5028, -1, -1, 15, implicit $exec + EXP 0, %5029, %5029, %5029, %5029, -1, -1, 15, implicit $exec + EXP 0, %5030, %5030, %5030, %5030, -1, -1, 15, implicit $exec + EXP 0, %5031, %5031, %5031, %5031, -1, -1, 15, implicit $exec + EXP 0, %5032, %5032, %5032, %5032, -1, -1, 15, implicit $exec + EXP 0, %5033, %5033, %5033, %5033, -1, -1, 15, implicit $exec + EXP 0, %5034, %5034, %5034, %5034, -1, -1, 15, implicit $exec + EXP 0, %5035, %5035, %5035, %5035, -1, -1, 15, implicit $exec + EXP 0, %5036, %5036, %5036, %5036, -1, -1, 15, implicit $exec + EXP 0, %5037, %5037, %5037, %5037, -1, -1, 15, implicit $exec + EXP 0, %5038, %5038, %5038, %5038, -1, -1, 15, implicit $exec + EXP 0, %5039, %5039, %5039, %5039, -1, -1, 15, implicit $exec + EXP 0, %5040, %5040, %5040, %5040, -1, -1, 15, implicit $exec + EXP 0, %5041, %5041, %5041, %5041, -1, -1, 15, implicit $exec + EXP 0, %5042, %5042, %5042, %5042, -1, -1, 15, implicit $exec + EXP 0, %5043, %5043, %5043, %5043, -1, -1, 15, implicit $exec + EXP 0, %5044, %5044, %5044, %5044, -1, -1, 15, implicit $exec + EXP 0, %5045, %5045, %5045, %5045, -1, -1, 15, implicit $exec + EXP 0, %5046, %5046, %5046, %5046, -1, -1, 15, implicit $exec + EXP 0, %5047, %5047, %5047, %5047, -1, -1, 15, implicit $exec + EXP 0, %5048, %5048, %5048, %5048, -1, -1, 15, implicit $exec + EXP 0, %5049, %5049, %5049, %5049, -1, -1, 15, implicit $exec + EXP 0, %5050, %5050, %5050, %5050, -1, -1, 15, implicit $exec + EXP 0, %5051, 
%5051, %5051, %5051, -1, -1, 15, implicit $exec + EXP 0, %5052, %5052, %5052, %5052, -1, -1, 15, implicit $exec + EXP 0, %5053, %5053, %5053, %5053, -1, -1, 15, implicit $exec + EXP 0, %5054, %5054, %5054, %5054, -1, -1, 15, implicit $exec + EXP 0, %5055, %5055, %5055, %5055, -1, -1, 15, implicit $exec + EXP 0, %5056, %5056, %5056, %5056, -1, -1, 15, implicit $exec + EXP 0, %5057, %5057, %5057, %5057, -1, -1, 15, implicit $exec + EXP 0, %5058, %5058, %5058, %5058, -1, -1, 15, implicit $exec + EXP 0, %5059, %5059, %5059, %5059, -1, -1, 15, implicit $exec + EXP 0, %5060, %5060, %5060, %5060, -1, -1, 15, implicit $exec + EXP 0, %5061, %5061, %5061, %5061, -1, -1, 15, implicit $exec + EXP 0, %5062, %5062, %5062, %5062, -1, -1, 15, implicit $exec + EXP 0, %5063, %5063, %5063, %5063, -1, -1, 15, implicit $exec + EXP 0, %5064, %5064, %5064, %5064, -1, -1, 15, implicit $exec + EXP 0, %5065, %5065, %5065, %5065, -1, -1, 15, implicit $exec + EXP 0, %5066, %5066, %5066, %5066, -1, -1, 15, implicit $exec + EXP 0, %5067, %5067, %5067, %5067, -1, -1, 15, implicit $exec + EXP 0, %5068, %5068, %5068, %5068, -1, -1, 15, implicit $exec + EXP 0, %5069, %5069, %5069, %5069, -1, -1, 15, implicit $exec + EXP 0, %5070, %5070, %5070, %5070, -1, -1, 15, implicit $exec + EXP 0, %5071, %5071, %5071, %5071, -1, -1, 15, implicit $exec + EXP 0, %5072, %5072, %5072, %5072, -1, -1, 15, implicit $exec + EXP 0, %5073, %5073, %5073, %5073, -1, -1, 15, implicit $exec + EXP 0, %5074, %5074, %5074, %5074, -1, -1, 15, implicit $exec + EXP 0, %5075, %5075, %5075, %5075, -1, -1, 15, implicit $exec + EXP 0, %5076, %5076, %5076, %5076, -1, -1, 15, implicit $exec + EXP 0, %5077, %5077, %5077, %5077, -1, -1, 15, implicit $exec + EXP 0, %5078, %5078, %5078, %5078, -1, -1, 15, implicit $exec + EXP 0, %5079, %5079, %5079, %5079, -1, -1, 15, implicit $exec + EXP 0, %5080, %5080, %5080, %5080, -1, -1, 15, implicit $exec + EXP 0, %5081, %5081, %5081, %5081, -1, -1, 15, implicit $exec + EXP 0, %5082, %5082, %5082, %5082, -1, -1, 15, implicit $exec + EXP 0, %5083, %5083, %5083, %5083, -1, -1, 15, implicit $exec + EXP 0, %5084, %5084, %5084, %5084, -1, -1, 15, implicit $exec + EXP 0, %5085, %5085, %5085, %5085, -1, -1, 15, implicit $exec + EXP 0, %5086, %5086, %5086, %5086, -1, -1, 15, implicit $exec + EXP 0, %5087, %5087, %5087, %5087, -1, -1, 15, implicit $exec + EXP 0, %5088, %5088, %5088, %5088, -1, -1, 15, implicit $exec + EXP 0, %5089, %5089, %5089, %5089, -1, -1, 15, implicit $exec + EXP 0, %5090, %5090, %5090, %5090, -1, -1, 15, implicit $exec + EXP 0, %5091, %5091, %5091, %5091, -1, -1, 15, implicit $exec + EXP 0, %5092, %5092, %5092, %5092, -1, -1, 15, implicit $exec + EXP 0, %5093, %5093, %5093, %5093, -1, -1, 15, implicit $exec + EXP 0, %5094, %5094, %5094, %5094, -1, -1, 15, implicit $exec + EXP 0, %5095, %5095, %5095, %5095, -1, -1, 15, implicit $exec + EXP 0, %5096, %5096, %5096, %5096, -1, -1, 15, implicit $exec + EXP 0, %5097, %5097, %5097, %5097, -1, -1, 15, implicit $exec + EXP 0, %5098, %5098, %5098, %5098, -1, -1, 15, implicit $exec + EXP 0, %5099, %5099, %5099, %5099, -1, -1, 15, implicit $exec + EXP 0, %50100, %50100, %50100, %50100, -1, -1, 15, implicit $exec + EXP 0, %50101, %50101, %50101, %50101, -1, -1, 15, implicit $exec + EXP 0, %50102, %50102, %50102, %50102, -1, -1, 15, implicit $exec + EXP 0, %50103, %50103, %50103, %50103, -1, -1, 15, implicit $exec + EXP 0, %50104, %50104, %50104, %50104, -1, -1, 15, implicit $exec + EXP 0, %50105, %50105, %50105, %50105, -1, -1, 15, implicit $exec + EXP 0, %50106, %50106, 
%50106, %50106, -1, -1, 15, implicit $exec + EXP 0, %50107, %50107, %50107, %50107, -1, -1, 15, implicit $exec + EXP 0, %50108, %50108, %50108, %50108, -1, -1, 15, implicit $exec + EXP 0, %50109, %50109, %50109, %50109, -1, -1, 15, implicit $exec + EXP 0, %50110, %50110, %50110, %50110, -1, -1, 15, implicit $exec + EXP 0, %50111, %50111, %50111, %50111, -1, -1, 15, implicit $exec + EXP 0, %50112, %50112, %50112, %50112, -1, -1, 15, implicit $exec + EXP 0, %50113, %50113, %50113, %50113, -1, -1, 15, implicit $exec + EXP 0, %50114, %50114, %50114, %50114, -1, -1, 15, implicit $exec + EXP 0, %50115, %50115, %50115, %50115, -1, -1, 15, implicit $exec + EXP 0, %50116, %50116, %50116, %50116, -1, -1, 15, implicit $exec + EXP 0, %50117, %50117, %50117, %50117, -1, -1, 15, implicit $exec + EXP 0, %50118, %50118, %50118, %50118, -1, -1, 15, implicit $exec + EXP 0, %50119, %50119, %50119, %50119, -1, -1, 15, implicit $exec + EXP 0, %50120, %50120, %50120, %50120, -1, -1, 15, implicit $exec + EXP 0, %50121, %50121, %50121, %50121, -1, -1, 15, implicit $exec + EXP 0, %50122, %50122, %50122, %50122, -1, -1, 15, implicit $exec + EXP 0, %50123, %50123, %50123, %50123, -1, -1, 15, implicit $exec + EXP 0, %50124, %50124, %50124, %50124, -1, -1, 15, implicit $exec + EXP 0, %50125, %50125, %50125, %50125, -1, -1, 15, implicit $exec + EXP 0, %50126, %50126, %50126, %50126, -1, -1, 15, implicit $exec + EXP 0, %50127, %50127, %50127, %50127, -1, -1, 15, implicit $exec + EXP 0, %50128, %50128, %50128, %50128, -1, -1, 15, implicit $exec + EXP 0, %50129, %50129, %50129, %50129, -1, -1, 15, implicit $exec + EXP 0, %50130, %50130, %50130, %50130, -1, -1, 15, implicit $exec + EXP 0, %50131, %50131, %50131, %50131, -1, -1, 15, implicit $exec + EXP 0, %50132, %50132, %50132, %50132, -1, -1, 15, implicit $exec + EXP 0, %50133, %50133, %50133, %50133, -1, -1, 15, implicit $exec + + + %8000:vgpr_32 = IMPLICIT_DEF + %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %116:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + + %8001:vgpr_32 = COPY %8000 + %8002:vgpr_32 = COPY %8000 + %8003:vgpr_32 = COPY %8000 + %8004:vgpr_32 = COPY %8000 + %8005:vgpr_32 = COPY %8000 + %8006:vgpr_32 = COPY %8000 + %8007:vgpr_32 = COPY %8000 + %8008:vgpr_32 = COPY %8000 + %8009:vgpr_32 = COPY %8000 + %8010:vgpr_32 = COPY %8000 + %8011:vgpr_32 = COPY %8000 + %8012:vgpr_32 = COPY %8000 + %8013:vgpr_32 = COPY %8000 + %8014:vgpr_32 = COPY %8000 + %8015:vgpr_32 = COPY %8000 + %8016:vgpr_32 = COPY %8000 + %8017:vgpr_32 = COPY %8000 + + %9001:vgpr_32 = COPY %8001 + %9002:vgpr_32 = COPY %8002 + %9003:vgpr_32 = COPY %8003 + %9004:vgpr_32 = COPY %8004 + %9005:vgpr_32 = COPY %8005 + %9006:vgpr_32 = COPY %8006 + %9007:vgpr_32 = COPY %8007 + %9008:vgpr_32 = COPY %8008 + %9009:vgpr_32 = COPY %8009 + %9010:vgpr_32 = COPY %8010 + %9011:vgpr_32 = COPY %8011 + %9012:vgpr_32 = COPY %8012 + %9013:vgpr_32 = COPY %8013 + %9014:vgpr_32 = COPY %8014 + %9015:vgpr_32 = COPY %8015 + %9016:vgpr_32 = COPY %8016 + %9017:vgpr_32 = COPY %8017 + + S_BRANCH %bb.2 + + bb.2: + + %3:vgpr_32 = IMPLICIT_DEF + + EXP 0, killed %500, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %501, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %502, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %503, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %504, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %505, 
%3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %506, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %507, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %508, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %509, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5010, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5011, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5012, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5013, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5014, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5015, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5016, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5017, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5018, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5019, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5020, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5021, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5022, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5023, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5024, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5025, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5026, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5027, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5028, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5029, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5030, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5031, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5032, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5033, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5034, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5035, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5036, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5037, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5038, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5039, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5040, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5041, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5042, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5043, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5044, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5045, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5046, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5047, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5048, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5049, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5050, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5051, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5052, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5053, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5054, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5055, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5056, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5057, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5058, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5059, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5060, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5061, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5062, %3, %3, %3, -1, -1, 15, 
implicit $exec + EXP 0, killed %5063, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5064, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5065, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5066, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5067, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5068, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5069, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5070, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5071, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5072, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5073, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5074, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5075, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5076, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5077, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5078, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5079, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5080, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5081, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5082, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5083, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5084, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5085, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5086, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5087, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5088, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5089, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5090, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5091, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5092, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5093, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5094, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5095, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5096, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5097, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5098, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5099, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50100, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50101, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50102, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50103, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50104, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50105, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50106, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50107, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50108, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50109, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50110, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50111, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50112, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50113, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50114, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50115, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50116, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50117, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50118, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50119, %3, %3, %3, -1, -1, 15, 
implicit $exec + EXP 0, killed %50120, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50121, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50122, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50123, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50124, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50125, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50126, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50127, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50128, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50129, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50130, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50131, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50132, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50133, %3, %3, %3, -1, -1, 15, implicit $exec + + + S_ENDPGM 0 +... + \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir new file mode 100644 index 0000000000000..bc2c97f91f46c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir @@ -0,0 +1,450 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat | FileCheck %s + +# Check that the loads have been moved to the use +# CHECK: bb.2: +# CHECK: %[[#reg0:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 0, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg0]], %{{.+}}, 0, 0 +# CHECK: %[[#reg1:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 16, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg1]], %{{.+}}, 16, 0 +# CHECK: %[[#reg2:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 32, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg2]], %{{.+}}, 32, 0 +# CHECK: %[[#reg3:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 48, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg3]], %{{.+}}, 48, 0 +# CHECK: %[[#reg4:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 64, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg4]], %{{.+}}, 64, 0 +# CHECK: %[[#reg5:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 80, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg5]], %{{.+}}, 80, 0 +# CHECK: %[[#reg6:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 96, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg6]], %{{.+}}, 96, 0 +# CHECK: %[[#reg7:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 112, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg7]], %{{.+}}, 112, 0 +# CHECK: %[[#reg8:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 128, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg8]], %{{.+}}, 128, 0 +# CHECK: %[[#reg9:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 144, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg9]], %{{.+}}, 144, 0 +# CHECK: %[[#reg10:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 160, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg10]], %{{.+}}, 160, 0 +# CHECK: %[[#reg11:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 176, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg11]], %{{.+}}, 176, 0 +# CHECK: %[[#reg12:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 192, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg12]], %{{.+}}, 192, 0 +# CHECK: %[[#reg13:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 208, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg13]], %{{.+}}, 208, 0 +# CHECK: %[[#reg14:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 224, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg14]], %{{.+}}, 224, 0 +# CHECK: %[[#reg15:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 240, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg15]], %{{.+}}, 240, 0 +# CHECK: 
%[[#reg16:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 256, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg16]], %{{.+}}, 256, 0 +# CHECK: %[[#reg17:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 272, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg17]], %{{.+}}, 272, 0 +# CHECK: %[[#reg18:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 288, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg18]], %{{.+}}, 288, 0 +# CHECK: %[[#reg19:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 304, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg19]], %{{.+}}, 304, 0 +# CHECK: %[[#reg20:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 320, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg20]], %{{.+}}, 320, 0 +# CHECK: %[[#reg21:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 336, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg21]], %{{.+}}, 336, 0 +# CHECK: %[[#reg22:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 352, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg22]], %{{.+}}, 352, 0 +# CHECK: %[[#reg23:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 368, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg23]], %{{.+}}, 368, 0 +# CHECK: %[[#reg24:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 384, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg24]], %{{.+}}, 384, 0 +# CHECK: %[[#reg25:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 400, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg25]], %{{.+}}, 400, 0 +# CHECK: %[[#reg26:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 416, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg26]], %{{.+}}, 416, 0 +# CHECK: %[[#reg27:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 432, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg27]], %{{.+}}, 432, 0 +# CHECK: %[[#reg28:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 448, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg28]], %{{.+}}, 448, 0 +# CHECK: %[[#reg29:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 464, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg29]], %{{.+}}, 464, 0 +# CHECK: %[[#reg30:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 480, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg30]], %{{.+}}, 480, 0 +# CHECK: %[[#reg31:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 496, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg31]], %{{.+}}, 496, 0 +# CHECK: %[[#reg32:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 512, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg32]], %{{.+}}, 512, 0 +# CHECK: %[[#reg33:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 528, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg33]], %{{.+}}, 528, 0 +# CHECK: %[[#reg34:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 544, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg34]], %{{.+}}, 544, 0 +# CHECK: %[[#reg35:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 560, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg35]], %{{.+}}, 560, 0 +# CHECK: %[[#reg36:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 576, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg36]], %{{.+}}, 576, 0 +# CHECK: %[[#reg37:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 592, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg37]], %{{.+}}, 592, 0 +# CHECK: %[[#reg38:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 608, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg38]], %{{.+}}, 608, 0 +# CHECK: %[[#reg39:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 624, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg39]], %{{.+}}, 624, 0 +# CHECK: %[[#reg40:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 640, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg40]], %{{.+}}, 640, 0 +# CHECK: %[[#reg41:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 656, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg41]], %{{.+}}, 656, 0 +# CHECK: %[[#reg42:]]:sgpr_128 = S_LOAD_DWORDX4_IMM 
%{{.+}}, 672, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg42]], %{{.+}}, 672, 0 +# CHECK: %[[#reg43:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 688, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg43]], %{{.+}}, 688, 0 +# CHECK: %[[#reg44:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 704, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg44]], %{{.+}}, 704, 0 +# CHECK: %[[#reg45:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 720, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg45]], %{{.+}}, 720, 0 +# CHECK: %[[#reg46:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 736, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg46]], %{{.+}}, 736, 0 +# CHECK: %[[#reg47:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 752, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg47]], %{{.+}}, 752, 0 +# CHECK: %[[#reg48:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 768, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg48]], %{{.+}}, 768, 0 +# CHECK: %[[#reg49:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 784, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg49]], %{{.+}}, 784, 0 +# CHECK: %[[#reg50:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 800, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg50]], %{{.+}}, 800, 0 +# CHECK: %[[#reg51:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 816, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg51]], %{{.+}}, 816, 0 +# CHECK: %[[#reg52:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 832, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg52]], %{{.+}}, 832, 0 +# CHECK: %[[#reg53:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 848, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg53]], %{{.+}}, 848, 0 +# CHECK: %[[#reg54:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 864, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg54]], %{{.+}}, 864, 0 +# CHECK: %[[#reg55:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 880, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg55]], %{{.+}}, 880, 0 +# CHECK: %[[#reg56:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 896, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg56]], %{{.+}}, 896, 0 +# CHECK: %[[#reg57:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 912, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg57]], %{{.+}}, 912, 0 +# CHECK: %[[#reg58:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 928, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg58]], %{{.+}}, 928, 0 +# CHECK: %[[#reg59:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 944, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg59]], %{{.+}}, 944, 0 +# CHECK: %[[#reg60:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 960, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg60]], %{{.+}}, 960, 0 +# CHECK: %[[#reg61:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 976, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg61]], %{{.+}}, 976, 0 +# CHECK: %[[#reg62:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 992, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg62]], %{{.+}}, 992, 0 +# CHECK: %[[#reg63:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 1008, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg63]], %{{.+}}, 1008, 0 + + +--- | + source_filename = ".\main.ll" + define amdgpu_ps void @main() #1 { + ret void + } + attributes #1 = { "target-cpu"="gfx1010" } + !llvm.ident = !{!0} + !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"} +... 
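+# Note on the function below: bb.0 issues 64 S_LOAD_DWORDX4_IMM loads at
+# offsets 0..1008 together with a run of V_MOV_B32 defs, and bb.2 consumes
+# each loaded value with an S_BUFFER_STORE_DWORDX4_IMM at the matching
+# offset, so the pass is expected to move each load down next to its store,
+# as the checks above require.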
+--- +name: main +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } + - { reg: '$sgpr1' } + - { reg: '$sgpr2' } + - { reg: '$sgpr3' } + - { reg: '$sgpr4' } + - { reg: '$sgpr5' } + - { reg: '$sgpr6' } + - { reg: '$sgpr7' } + - { reg: '$sgpr8' } + - { reg: '$sgpr8' } + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1 + + undef %0.sub0:sgpr_64 = COPY $sgpr0 + undef %0.sub1:sgpr_64 = COPY $sgpr1 + + undef %1.sub0:sgpr_128 = COPY $sgpr4 + undef %1.sub1:sgpr_128 = COPY $sgpr5 + undef %1.sub2:sgpr_128 = COPY $sgpr6 + undef %1.sub3:sgpr_128 = COPY $sgpr7 + + %3000:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 0, 0 + %3001:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 16, 0 + %3002:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 32, 0 + %3003:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 48, 0 + %3004:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 64, 0 + %3005:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 80, 0 + %3006:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 96, 0 + %3007:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 112, 0 + %3008:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 128, 0 + %3009:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 144, 0 + %30010:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 160, 0 + %30011:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 176, 0 + %30012:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 192, 0 + %30013:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 208, 0 + %30014:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 224, 0 + %30015:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 240, 0 + %30016:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 256, 0 + %30017:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 272, 0 + %30018:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 288, 0 + %30019:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 304, 0 + %30020:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 320, 0 + %30021:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 336, 0 + %30022:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 352, 0 + %30023:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 368, 0 + %30024:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 384, 0 + %30025:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 400, 0 + %30026:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 416, 0 + %30027:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 432, 0 + %30028:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 448, 0 + %30029:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 464, 0 + %30030:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 480, 0 + %30031:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 496, 0 + %30032:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 512, 0 + %30033:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 528, 0 + %30034:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 544, 0 + %30035:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 560, 0 + %30036:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 576, 0 + %30037:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 592, 0 + %30038:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 608, 0 + %30039:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 624, 0 + %30040:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 640, 0 + %30041:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 656, 0 + %30042:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 672, 0 + %30043:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 688, 0 + %30044:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 704, 0 + %30045:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 720, 0 + %30046:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 736, 0 + %30047:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 752, 0 + %30048:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 768, 0 + %30049:sgpr_128 = 
S_LOAD_DWORDX4_IMM %0:sgpr_64, 784, 0 + %30050:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 800, 0 + %30051:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 816, 0 + %30052:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 832, 0 + %30053:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 848, 0 + %30054:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 864, 0 + %30055:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 880, 0 + %30056:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 896, 0 + %30057:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 912, 0 + %30058:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 928, 0 + %30059:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 944, 0 + %30060:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 960, 0 + %30061:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 976, 0 + %30062:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 992, 0 + %30063:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 1008, 0 + + %100:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %101:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %102:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %103:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %104:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %105:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %106:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %107:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %108:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %109:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1010:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1011:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1012:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1013:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1014:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1015:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1016:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1017:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1018:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1019:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1020:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1021:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1022:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1023:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1024:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1025:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1026:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1027:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1028:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1029:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1030:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1031:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1032:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1033:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1034:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1035:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1036:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1037:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1038:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1039:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1040:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1041:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1042:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1043:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1044:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1045:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1046:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1047:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1048:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1049:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1050:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1051:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1052:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1053:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + 
%1054:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1055:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1056:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1057:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1058:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1059:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1060:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1061:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1062:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1063:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + + + %8000:vgpr_32 = IMPLICIT_DEF + %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %116:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + %8001:vgpr_32 = COPY %8000 + S_BRANCH %bb.2 + + bb.2: + + %3:vgpr_32 = IMPLICIT_DEF + S_BUFFER_STORE_DWORDX4_IMM killed %3000:sgpr_128, %1:sgpr_128, 0, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3001:sgpr_128, %1:sgpr_128, 16, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3002:sgpr_128, %1:sgpr_128, 32, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3003:sgpr_128, %1:sgpr_128, 48, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3004:sgpr_128, %1:sgpr_128, 64, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3005:sgpr_128, %1:sgpr_128, 80, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3006:sgpr_128, %1:sgpr_128, 96, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3007:sgpr_128, %1:sgpr_128, 112, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3008:sgpr_128, %1:sgpr_128, 128, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3009:sgpr_128, %1:sgpr_128, 144, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30010:sgpr_128, %1:sgpr_128, 160, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30011:sgpr_128, %1:sgpr_128, 176, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30012:sgpr_128, %1:sgpr_128, 192, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30013:sgpr_128, %1:sgpr_128, 208, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30014:sgpr_128, %1:sgpr_128, 224, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30015:sgpr_128, %1:sgpr_128, 240, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30016:sgpr_128, %1:sgpr_128, 256, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30017:sgpr_128, %1:sgpr_128, 272, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30018:sgpr_128, %1:sgpr_128, 288, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30019:sgpr_128, %1:sgpr_128, 304, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30020:sgpr_128, %1:sgpr_128, 320, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30021:sgpr_128, %1:sgpr_128, 336, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30022:sgpr_128, %1:sgpr_128, 352, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30023:sgpr_128, %1:sgpr_128, 368, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30024:sgpr_128, %1:sgpr_128, 384, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30025:sgpr_128, %1:sgpr_128, 400, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30026:sgpr_128, %1:sgpr_128, 416, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30027:sgpr_128, %1:sgpr_128, 432, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30028:sgpr_128, %1:sgpr_128, 448, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30029:sgpr_128, %1:sgpr_128, 464, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30030:sgpr_128, %1:sgpr_128, 480, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30031:sgpr_128, %1:sgpr_128, 496, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30032:sgpr_128, %1:sgpr_128, 512, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30033:sgpr_128, %1:sgpr_128, 528, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30034:sgpr_128, %1:sgpr_128, 544, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30035:sgpr_128, %1:sgpr_128, 560, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30036:sgpr_128, %1:sgpr_128, 
576, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30037:sgpr_128, %1:sgpr_128, 592, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30038:sgpr_128, %1:sgpr_128, 608, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30039:sgpr_128, %1:sgpr_128, 624, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30040:sgpr_128, %1:sgpr_128, 640, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30041:sgpr_128, %1:sgpr_128, 656, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30042:sgpr_128, %1:sgpr_128, 672, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30043:sgpr_128, %1:sgpr_128, 688, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30044:sgpr_128, %1:sgpr_128, 704, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30045:sgpr_128, %1:sgpr_128, 720, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30046:sgpr_128, %1:sgpr_128, 736, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30047:sgpr_128, %1:sgpr_128, 752, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30048:sgpr_128, %1:sgpr_128, 768, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30049:sgpr_128, %1:sgpr_128, 784, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30050:sgpr_128, %1:sgpr_128, 800, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30051:sgpr_128, %1:sgpr_128, 816, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30052:sgpr_128, %1:sgpr_128, 832, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30053:sgpr_128, %1:sgpr_128, 848, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30054:sgpr_128, %1:sgpr_128, 864, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30055:sgpr_128, %1:sgpr_128, 880, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30056:sgpr_128, %1:sgpr_128, 896, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30057:sgpr_128, %1:sgpr_128, 912, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30058:sgpr_128, %1:sgpr_128, 928, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30059:sgpr_128, %1:sgpr_128, 944, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30060:sgpr_128, %1:sgpr_128, 960, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30061:sgpr_128, %1:sgpr_128, 976, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30062:sgpr_128, %1:sgpr_128, 992, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30063:sgpr_128, %1:sgpr_128, 1008, 0 + + EXP 0, killed %100, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %101, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %102, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %103, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %104, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %105, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %106, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %107, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %108, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %109, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1010, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1011, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1012, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1013, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1014, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1015, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1016, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1017, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1018, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1019, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1020, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1021, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1022, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1023, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1024, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1025, %3, %3, 
%3, -1, -1, 15, implicit $exec + EXP 0, killed %1026, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1027, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1028, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1029, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1030, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1031, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1032, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1033, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1034, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1035, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1036, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1037, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1038, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1039, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1040, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1041, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1042, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1043, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1044, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1045, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1046, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1047, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1048, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1049, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1050, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1051, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1052, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1053, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1054, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1055, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1056, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1057, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1058, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1059, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1060, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1061, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1062, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1063, %3, %3, %3, -1, -1, 15, implicit $exec + + + S_ENDPGM 0 +... 
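[Reviewer note, not part of the patch] The test above exercises the hot-block rematerialize pass on a long run of scalar loads whose only users are buffer stores much later in the function; the CHECK lines verify that each S_LOAD_DWORDX4_IMM ends up immediately before its matching S_BUFFER_STORE_DWORDX4_IMM, so the sixty-four sgpr_128 values no longer stay live across the branch. For readers following along, the sketch below shows the general shape of a "sink a single-use, trivially rematerializable def to its user" transform using stock CodeGen APIs. It is a minimal illustration under simplifying assumptions: the helper name sinkSingleUseRemats and the single-use-only handling are hypothetical and are not the logic in AMDGPUHotBlockRematerialize.cpp, which additionally models occupancy, register pressure, and divergence.

  // Illustrative sketch only; assumes SSA virtual registers and ignores
  // pressure/occupancy heuristics that the real pass uses to pick candidates.
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/CodeGen/MachineRegisterInfo.h"
  #include "llvm/CodeGen/TargetInstrInfo.h"
  #include "llvm/CodeGen/TargetSubtargetInfo.h"

  using namespace llvm;

  static bool sinkSingleUseRemats(MachineFunction &MF) {
    const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
    MachineRegisterInfo &MRI = MF.getRegInfo();
    SmallVector<MachineInstr *, 16> Candidates;

    // First pass: collect defs that are safe to re-execute and have exactly
    // one (non-debug) user, e.g. the S_LOAD_DWORDX4_IMMs in the test above.
    for (MachineBasicBlock &MBB : MF)
      for (MachineInstr &MI : MBB) {
        if (MI.getNumExplicitDefs() != 1 || !MI.getOperand(0).isReg())
          continue;
        Register Def = MI.getOperand(0).getReg();
        if (!Def.isVirtual() || !MRI.hasOneNonDBGUse(Def))
          continue;
        if (!TII->isTriviallyReMaterializable(MI))
          continue;
        Candidates.push_back(&MI);
      }

    // Second pass: move each candidate right in front of its only user, which
    // shortens the live range instead of keeping the value live across blocks.
    bool Changed = false;
    for (MachineInstr *MI : Candidates) {
      Register Def = MI->getOperand(0).getReg();
      MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(Def);
      if (UseMI.isPHI())
        continue; // A def cannot be placed immediately before a PHI.
      UseMI.getParent()->splice(UseMI.getIterator(), MI->getParent(),
                                MI->getIterator());
      Changed = true;
    }
    return Changed;
  }

A pass built around this shape would still have to update SlotIndexes/LiveIntervals after the splice and decide between moving and cloning the def; the tests that follow (group_remat_clone.mir vs. group_remat_move.mir) distinguish exactly those two outcomes.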
From 3539ab3386d1cfe3798c7b2294bfa163dddc5745 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Thu, 6 Feb 2025 14:09:32 -0800 Subject: [PATCH 03/25] Test renames, only keeping the required flags for the tests --- .../remat/{group_remat_with_uses.mir => group_remat_clone.mir} | 2 +- .../AMDGPU/remat/{group_remat.mir => group_remat_move.mir} | 0 llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir | 2 +- llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename llvm/test/CodeGen/AMDGPU/remat/{group_remat_with_uses.mir => group_remat_clone.mir} (99%) rename llvm/test/CodeGen/AMDGPU/remat/{group_remat.mir => group_remat_move.mir} (100%) diff --git a/llvm/test/CodeGen/AMDGPU/remat/group_remat_with_uses.mir b/llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir similarity index 99% rename from llvm/test/CodeGen/AMDGPU/remat/group_remat_with_uses.mir rename to llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir index 637a683bdd041..c99a1835454fd 100644 --- a/llvm/test/CodeGen/AMDGPU/remat/group_remat_with_uses.mir +++ b/llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir @@ -638,4 +638,4 @@ body: | S_ENDPGM 0 ... - \ No newline at end of file + diff --git a/llvm/test/CodeGen/AMDGPU/remat/group_remat.mir b/llvm/test/CodeGen/AMDGPU/remat/group_remat_move.mir similarity index 100% rename from llvm/test/CodeGen/AMDGPU/remat/group_remat.mir rename to llvm/test/CodeGen/AMDGPU/remat/group_remat_move.mir diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir index bc2c97f91f46c..528515d235c8b 100644 --- a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir +++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir @@ -1,6 +1,6 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat | FileCheck %s -# Check that the loads have been moved to the use +# Check that the scalar loads have been moved to the use # CHECK: bb.2: # CHECK: %[[#reg0:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 0, 0 # CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg0]], %{{.+}}, 0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir b/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir index e8a66b47ac732..53f59cc3f8b0b 100644 --- a/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir +++ b/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-sub-exp-remat-aggressive -amdgpu-remat-enable-late-float-vtos -amdgpu-remat-enable-hot-block-remat-aggressive -amdgpu-remat-enable-sub-exp-remat-aggressive -amdgpu-remat-enable-sub-exp-remat | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-late-float-vtos -amdgpu-remat-enable-sub-exp-remat | FileCheck %s # DEFS # CHECK: %[[#div00:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec From a33f944e94d4e1036085a84370a8fc94b05ff1ff Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Mon, 10 Feb 2025 13:22:19 -0800 Subject: [PATCH 04/25] Using the mir uniformity analysis instead, which DOES require SSA; but I don't see any reason why we can't just require SSA --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 13 +++++++++++-- .../CodeGen/AMDGPU/remat/group_remat_clone.mir | 15 ++++++++------- .../CodeGen/AMDGPU/remat/group_remat_move.mir | 17 +++++++++-------- 
llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir | 14 ++++++++------ 4 files changed, 36 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index 8647185bf5d51..f0a2dcdb5cc11 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -34,6 +34,9 @@ #include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/CodeGen/MachineCycleAnalysis.h" +#include "llvm/CodeGen/MachineUniformityAnalysis.h" + #include #define DEBUG_TYPE "amdgpu-hot-block-remat" @@ -4619,10 +4622,16 @@ bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) { AliasAnalysis *AA = &getAnalysis().getAAResults(); { - llvm::MirGPUDivergenceAnalysis DA(MF, *DT, *PDT, *MLI); + MachineCycleInfo CI; + CI.compute(MF); + auto TTI = MF.getTarget().getTargetTransformInfo(MF.getFunction()); + MachineUniformityInfo MachineUniformity = + llvm::computeMachineUniformityInfo(MF, CI, *DT, /*HasBranchDivergence*/true); + + //llvm::MirGPUDivergenceAnalysis DA(MF, *DT, *PDT, *MLI); for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { - if (DA.isUniform(&MI)) { + if (MachineUniformity.isUniform(&MI)) { TotalUniformInsts.insert(&MI); } } diff --git a/llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir b/llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir index c99a1835454fd..06ea907aca44d 100644 --- a/llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir +++ b/llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir @@ -170,14 +170,15 @@ body: | successors: %bb.1, %bb.2 liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1 - undef %0.sub0:sgpr_64 = COPY $sgpr0 - undef %0.sub1:sgpr_64 = COPY $sgpr1 - - undef %1.sub0:sgpr_128 = COPY $sgpr4 - undef %1.sub1:sgpr_128 = COPY $sgpr5 - undef %1.sub2:sgpr_128 = COPY $sgpr6 - undef %1.sub3:sgpr_128 = COPY $sgpr7 + %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1 + ; undef %0.sub0:sgpr_64 = COPY $sgpr0 + ; undef %0.sub1:sgpr_64 = COPY $sgpr1 + %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3 + ; undef %1.sub0:sgpr_128 = COPY $sgpr4 + ; undef %1.sub1:sgpr_128 = COPY $sgpr5 + ; undef %1.sub2:sgpr_128 = COPY $sgpr6 + ; undef %1.sub3:sgpr_128 = COPY $sgpr7 %500:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec %501:vgpr_32 = V_MOV_B32_e32 $vgpr1, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/remat/group_remat_move.mir b/llvm/test/CodeGen/AMDGPU/remat/group_remat_move.mir index 7f3483c66a5d9..ebd89451154ae 100644 --- a/llvm/test/CodeGen/AMDGPU/remat/group_remat_move.mir +++ b/llvm/test/CodeGen/AMDGPU/remat/group_remat_move.mir @@ -170,14 +170,15 @@ body: | successors: %bb.1, %bb.2 liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1 - undef %0.sub0:sgpr_64 = COPY $sgpr0 - undef %0.sub1:sgpr_64 = COPY $sgpr1 - - undef %1.sub0:sgpr_128 = COPY $sgpr4 - undef %1.sub1:sgpr_128 = COPY $sgpr5 - undef %1.sub2:sgpr_128 = COPY $sgpr6 - undef %1.sub3:sgpr_128 = COPY $sgpr7 + %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1 + ; undef %0.sub0:sgpr_64 = COPY $sgpr0 + ; undef %0.sub1:sgpr_64 = COPY $sgpr1 + %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3 + ; undef %1.sub0:sgpr_128 = COPY $sgpr4 + ; undef %1.sub1:sgpr_128 = COPY $sgpr5 
+ ; undef %1.sub2:sgpr_128 = COPY $sgpr6 + ; undef %1.sub3:sgpr_128 = COPY $sgpr7 %500:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec %501:vgpr_32 = V_MOV_B32_e32 $vgpr1, implicit $exec @@ -504,4 +505,4 @@ body: | S_ENDPGM 0 ... - \ No newline at end of file + diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir index 528515d235c8b..a702f7fc8011e 100644 --- a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir +++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir @@ -162,13 +162,15 @@ body: | successors: %bb.1, %bb.2 liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1 - undef %0.sub0:sgpr_64 = COPY $sgpr0 - undef %0.sub1:sgpr_64 = COPY $sgpr1 + %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1 + ; undef %0.sub0:sgpr_64 = COPY $sgpr0 + ; undef %0.sub1:sgpr_64 = COPY $sgpr1 - undef %1.sub0:sgpr_128 = COPY $sgpr4 - undef %1.sub1:sgpr_128 = COPY $sgpr5 - undef %1.sub2:sgpr_128 = COPY $sgpr6 - undef %1.sub3:sgpr_128 = COPY $sgpr7 + %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3 + ; undef %1.sub0:sgpr_128 = COPY $sgpr4 + ; undef %1.sub1:sgpr_128 = COPY $sgpr5 + ; undef %1.sub2:sgpr_128 = COPY $sgpr6 + ; undef %1.sub3:sgpr_128 = COPY $sgpr7 %3000:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 0, 0 %3001:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 16, 0 From 2215b797765b0fe5d04db70cfb0bb015918f5f50 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Tue, 11 Feb 2025 16:40:00 -0800 Subject: [PATCH 05/25] In block remat AND making v to s slightly more robust --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 6 +- .../AMDGPU/remat/group_remat_clone.mir | 3 +- llvm/test/CodeGen/AMDGPU/remat/in_blk.mir | 760 ++++++++++++++++++ .../test/CodeGen/AMDGPU/remat/simple_sgpr.mir | 2 +- .../CodeGen/AMDGPU/remat/vector_to_scalar.mir | 240 +++--- 5 files changed, 887 insertions(+), 124 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/remat/in_blk.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index f0a2dcdb5cc11..a3a20765c2df6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -2676,7 +2676,7 @@ bool collectPacifist(MachineInstr &MI, continue; Register Reg = MO.getReg(); - if (MO.isImplicit() && (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)) + if (MO.isImplicit() && (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO || Reg == AMDGPU::MODE)) continue; if (Reg.isPhysical()) return false; @@ -2794,7 +2794,9 @@ bool tryHoldPacifist(MachineBasicBlock &MBB, LiveIntervals *LIS, bool bUpdated = false; // Move pacifist to its first user. 
- for (MachineInstr *MI : pacifistList) { + //for (MachineInstr *MI : pacifistList) { + for (auto it = pacifistList.rbegin(); it != pacifistList.rend(); it++) { + MachineInstr *MI = *it; MachineInstr *firstUser = findPacifistInsertPoint(*MI, MBB, MRI, AA, slotIndexes); if (firstUser == MI) continue; diff --git a/llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir b/llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir index 06ea907aca44d..bfb8e85c8aef6 100644 --- a/llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir +++ b/llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir @@ -180,6 +180,7 @@ body: | ; undef %1.sub2:sgpr_128 = COPY $sgpr6 ; undef %1.sub3:sgpr_128 = COPY $sgpr7 + %500:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec %501:vgpr_32 = V_MOV_B32_e32 $vgpr1, implicit $exec %502:vgpr_32 = V_MUL_F32_e32 %500, %500, implicit $exec, implicit $mode @@ -639,4 +640,4 @@ body: | S_ENDPGM 0 ... - + \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/remat/in_blk.mir b/llvm/test/CodeGen/AMDGPU/remat/in_blk.mir new file mode 100644 index 0000000000000..6db673b849ef2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/in_blk.mir @@ -0,0 +1,760 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-sub-exp-remat -amdgpu-remat-enable-in-blk-remat | FileCheck %s + +# Check that pacifist insts are moved to their users within the block. +# CHECK: bb.0: +# CHECK: %[[#r500:]]:vgpr_32 = V_MOV_B32_e32 $vgpr0 +# CHECK: %[[#r501:]]:vgpr_32 = V_MOV_B32_e32 $vgpr1 +# CHECK: %[[#r502:]]:vgpr_32 = V_MUL_F32_e32 %[[#r500]], %[[#r500]] +# CHECK: EXP 0, %[[#r502]] +# CHECK: %[[#r503:]]:vgpr_32 = V_MUL_F32_e32 %[[#r500]], %[[#r501]] +# CHECK: EXP 0, %[[#r503]] +# CHECK: %[[#r504:]]:vgpr_32 = V_MUL_F32_e32 %[[#r501]], %[[#r501]] +# CHECK: EXP 0, %[[#r504]] +# CHECK: %[[#r505:]]:vgpr_32 = V_MUL_F32_e32 %[[#r502]], %[[#r502]] +# CHECK: EXP 0, %[[#r505]] +# CHECK: %[[#r506:]]:vgpr_32 = V_MUL_F32_e32 %[[#r502]], %[[#r503]] +# CHECK: EXP 0, %[[#r506]] +# CHECK: %[[#r507:]]:vgpr_32 = V_MUL_F32_e32 %[[#r503]], %[[#r503]] +# CHECK: EXP 0, %[[#r507]] +# CHECK: %[[#r508:]]:vgpr_32 = V_MUL_F32_e32 %[[#r503]], %[[#r504]] +# CHECK: EXP 0, %[[#r508]] +# CHECK: %[[#r509:]]:vgpr_32 = V_MUL_F32_e32 %[[#r504]], %[[#r504]] +# CHECK: EXP 0, %[[#r509]] +# CHECK: %[[#r5010:]]:vgpr_32 = V_MUL_F32_e32 %[[#r505]], %[[#r505]] +# CHECK: EXP 0, %[[#r5010]] +# CHECK: %[[#r5011:]]:vgpr_32 = V_MUL_F32_e32 %[[#r505]], %[[#r506]] +# CHECK: EXP 0, %[[#r5011]] +# CHECK: %[[#r5012:]]:vgpr_32 = V_MUL_F32_e32 %[[#r506]], %[[#r506]] +# CHECK: EXP 0, %[[#r5012]] +# CHECK: %[[#r5013:]]:vgpr_32 = V_MUL_F32_e32 %[[#r506]], %[[#r507]] +# CHECK: EXP 0, %[[#r5013]] +# CHECK: %[[#r5014:]]:vgpr_32 = V_MUL_F32_e32 %[[#r507]], %[[#r507]] +# CHECK: EXP 0, %[[#r5014]] +# CHECK: %[[#r5015:]]:vgpr_32 = V_MUL_F32_e32 %[[#r507]], %[[#r508]] +# CHECK: EXP 0, %[[#r5015]] +# CHECK: %[[#r5016:]]:vgpr_32 = V_MUL_F32_e32 %[[#r508]], %[[#r508]] +# CHECK: EXP 0, %[[#r5016]] +# CHECK: %[[#r5017:]]:vgpr_32 = V_MUL_F32_e32 %[[#r508]], %[[#r509]] +# CHECK: EXP 0, %[[#r5017]] +# CHECK: %[[#r5018:]]:vgpr_32 = V_MUL_F32_e32 %[[#r509]], %[[#r509]] +# CHECK: EXP 0, %[[#r5018]] +# CHECK: %[[#r5019:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5010]], %[[#r5010]] +# CHECK: EXP 0, %[[#r5019]] +# CHECK: %[[#r5020:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5010]], %[[#r5011]] +# CHECK: EXP 0, %[[#r5020]] +# CHECK: %[[#r5021:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5011]], %[[#r5011]] +# CHECK: EXP 0, %[[#r5021]] 
+# CHECK: %[[#r5022:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5011]], %[[#r5012]] +# CHECK: EXP 0, %[[#r5022]] +# CHECK: %[[#r5023:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5012]], %[[#r5012]] +# CHECK: EXP 0, %[[#r5023]] +# CHECK: %[[#r5024:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5012]], %[[#r5013]] +# CHECK: EXP 0, %[[#r5024]] +# CHECK: %[[#r5025:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5013]], %[[#r5013]] +# CHECK: EXP 0, %[[#r5025]] +# CHECK: %[[#r5026:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5013]], %[[#r5014]] +# CHECK: EXP 0, %[[#r5026]] +# CHECK: %[[#r5027:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5014]], %[[#r5014]] +# CHECK: EXP 0, %[[#r5027]] +# CHECK: %[[#r5028:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5014]], %[[#r5015]] +# CHECK: EXP 0, %[[#r5028]] +# CHECK: %[[#r5029:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5015]], %[[#r5015]] +# CHECK: EXP 0, %[[#r5029]] +# CHECK: %[[#r5030:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5015]], %[[#r5016]] +# CHECK: EXP 0, %[[#r5030]] +# CHECK: %[[#r5031:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5016]], %[[#r5016]] +# CHECK: EXP 0, %[[#r5031]] +# CHECK: %[[#r5032:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5016]], %[[#r5017]] +# CHECK: EXP 0, %[[#r5032]] +# CHECK: %[[#r5033:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5017]], %[[#r5017]] +# CHECK: EXP 0, %[[#r5033]] +# CHECK: %[[#r5034:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5017]], %[[#r5018]] +# CHECK: EXP 0, %[[#r5034]] +# CHECK: %[[#r5035:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5018]], %[[#r5018]] +# CHECK: EXP 0, %[[#r5035]] +# CHECK: %[[#r5036:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5019]], %[[#r5019]] +# CHECK: EXP 0, %[[#r5036]] +# CHECK: %[[#r5037:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5019]], %[[#r5020]] +# CHECK: EXP 0, %[[#r5037]] +# CHECK: %[[#r5038:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5020]], %[[#r5020]] +# CHECK: EXP 0, %[[#r5038]] +# CHECK: %[[#r5039:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5020]], %[[#r5021]] +# CHECK: EXP 0, %[[#r5039]] +# CHECK: %[[#r5040:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5021]], %[[#r5021]] +# CHECK: EXP 0, %[[#r5040]] +# CHECK: %[[#r5041:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5021]], %[[#r5022]] +# CHECK: EXP 0, %[[#r5041]] +# CHECK: %[[#r5042:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5022]], %[[#r5022]] +# CHECK: EXP 0, %[[#r5042]] +# CHECK: %[[#r5043:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5022]], %[[#r5023]] +# CHECK: EXP 0, %[[#r5043]] +# CHECK: %[[#r5044:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5023]], %[[#r5023]] +# CHECK: EXP 0, %[[#r5044]] +# CHECK: %[[#r5045:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5023]], %[[#r5024]] +# CHECK: EXP 0, %[[#r5045]] +# CHECK: %[[#r5046:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5024]], %[[#r5024]] +# CHECK: EXP 0, %[[#r5046]] +# CHECK: %[[#r5047:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5024]], %[[#r5025]] +# CHECK: EXP 0, %[[#r5047]] +# CHECK: %[[#r5048:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5025]], %[[#r5025]] +# CHECK: EXP 0, %[[#r5048]] +# CHECK: %[[#r5049:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5025]], %[[#r5026]] +# CHECK: EXP 0, %[[#r5049]] +# CHECK: %[[#r5050:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5026]], %[[#r5026]] +# CHECK: EXP 0, %[[#r5050]] +# CHECK: %[[#r5051:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5026]], %[[#r5027]] +# CHECK: EXP 0, %[[#r5051]] +# CHECK: %[[#r5052:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5027]], %[[#r5027]] +# CHECK: EXP 0, %[[#r5052]] +# CHECK: %[[#r5053:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5027]], %[[#r5028]] +# CHECK: EXP 0, %[[#r5053]] +# CHECK: %[[#r5054:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5028]], %[[#r5028]] +# CHECK: EXP 0, %[[#r5054]] +# CHECK: %[[#r5055:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5028]], %[[#r5029]] +# CHECK: EXP 0, %[[#r5055]] +# CHECK: %[[#r5056:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5029]], %[[#r5029]] +# CHECK: EXP 0, %[[#r5056]] +# CHECK: 
%[[#r5057:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5029]], %[[#r5030]] +# CHECK: EXP 0, %[[#r5057]] +# CHECK: %[[#r5058:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5030]], %[[#r5030]] +# CHECK: EXP 0, %[[#r5058]] +# CHECK: %[[#r5059:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5030]], %[[#r5031]] +# CHECK: EXP 0, %[[#r5059]] +# CHECK: %[[#r5060:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5031]], %[[#r5031]] +# CHECK: EXP 0, %[[#r5060]] +# CHECK: %[[#r5061:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5031]], %[[#r5032]] +# CHECK: EXP 0, %[[#r5061]] +# CHECK: %[[#r5062:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5032]], %[[#r5032]] +# CHECK: EXP 0, %[[#r5062]] +# CHECK: %[[#r5063:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5032]], %[[#r5033]] +# CHECK: EXP 0, %[[#r5063]] +# CHECK: %[[#r5064:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5033]], %[[#r5033]] +# CHECK: EXP 0, %[[#r5064]] +# CHECK: %[[#r5065:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5033]], %[[#r5034]] +# CHECK: EXP 0, %[[#r5065]] +# CHECK: %[[#r5066:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5034]], %[[#r5034]] +# CHECK: EXP 0, %[[#r5066]] +# CHECK: %[[#r5067:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5034]], %[[#r5035]] +# CHECK: EXP 0, %[[#r5067]] +# CHECK: %[[#r5068:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5035]], %[[#r5035]] +# CHECK: EXP 0, %[[#r5068]] +# CHECK: %[[#r5069:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5036]], %[[#r5036]] +# CHECK: EXP 0, %[[#r5069]] +# CHECK: %[[#r5070:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5036]], %[[#r5037]] +# CHECK: EXP 0, %[[#r5070]] +# CHECK: %[[#r5071:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5037]], %[[#r5037]] +# CHECK: EXP 0, %[[#r5071]] +# CHECK: %[[#r5072:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5037]], %[[#r5038]] +# CHECK: EXP 0, %[[#r5072]] +# CHECK: %[[#r5073:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5038]], %[[#r5038]] +# CHECK: EXP 0, %[[#r5073]] +# CHECK: %[[#r5074:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5038]], %[[#r5039]] +# CHECK: EXP 0, %[[#r5074]] +# CHECK: %[[#r5075:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5039]], %[[#r5039]] +# CHECK: EXP 0, %[[#r5075]] +# CHECK: %[[#r5076:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5039]], %[[#r5040]] +# CHECK: EXP 0, %[[#r5076]] +# CHECK: %[[#r5077:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5040]], %[[#r5040]] +# CHECK: EXP 0, %[[#r5077]] +# CHECK: %[[#r5078:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5040]], %[[#r5041]] +# CHECK: EXP 0, %[[#r5078]] +# CHECK: %[[#r5079:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5041]], %[[#r5041]] +# CHECK: EXP 0, %[[#r5079]] +# CHECK: %[[#r5080:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5041]], %[[#r5042]] +# CHECK: EXP 0, %[[#r5080]] +# CHECK: %[[#r5081:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5042]], %[[#r5042]] +# CHECK: EXP 0, %[[#r5081]] +# CHECK: %[[#r5082:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5042]], %[[#r5043]] +# CHECK: EXP 0, %[[#r5082]] +# CHECK: %[[#r5083:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5043]], %[[#r5043]] +# CHECK: EXP 0, %[[#r5083]] +# CHECK: %[[#r5084:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5043]], %[[#r5044]] +# CHECK: EXP 0, %[[#r5084]] +# CHECK: %[[#r5085:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5044]], %[[#r5044]] +# CHECK: EXP 0, %[[#r5085]] +# CHECK: %[[#r5086:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5044]], %[[#r5045]] +# CHECK: EXP 0, %[[#r5086]] +# CHECK: %[[#r5087:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5045]], %[[#r5045]] +# CHECK: EXP 0, %[[#r5087]] +# CHECK: %[[#r5088:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5045]], %[[#r5046]] +# CHECK: EXP 0, %[[#r5088]] +# CHECK: %[[#r5089:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5046]], %[[#r5046]] +# CHECK: EXP 0, %[[#r5089]] +# CHECK: %[[#r5090:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5046]], %[[#r5047]] +# CHECK: EXP 0, %[[#r5090]] +# CHECK: %[[#r5091:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5047]], %[[#r5047]] +# CHECK: EXP 0, %[[#r5091]] +# CHECK: 
%[[#r5092:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5047]], %[[#r5048]] +# CHECK: EXP 0, %[[#r5092]] +# CHECK: %[[#r5093:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5048]], %[[#r5048]] +# CHECK: EXP 0, %[[#r5093]] +# CHECK: %[[#r5094:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5048]], %[[#r5049]] +# CHECK: EXP 0, %[[#r5094]] +# CHECK: %[[#r5095:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5049]], %[[#r5049]] +# CHECK: EXP 0, %[[#r5095]] +# CHECK: %[[#r5096:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5049]], %[[#r5050]] +# CHECK: EXP 0, %[[#r5096]] +# CHECK: %[[#r5097:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5050]], %[[#r5050]] +# CHECK: EXP 0, %[[#r5097]] +# CHECK: %[[#r5098:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5050]], %[[#r5051]] +# CHECK: EXP 0, %[[#r5098]] +# CHECK: %[[#r5099:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5051]], %[[#r5051]] +# CHECK: EXP 0, %[[#r5099]] +# CHECK: %[[#r50100:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5051]], %[[#r5052]] +# CHECK: EXP 0, %[[#r50100]] +# CHECK: %[[#r50101:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5052]], %[[#r5052]] +# CHECK: EXP 0, %[[#r50101]] +# CHECK: %[[#r50102:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5052]], %[[#r5053]] +# CHECK: EXP 0, %[[#r50102]] +# CHECK: %[[#r50103:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5053]], %[[#r5053]] +# CHECK: EXP 0, %[[#r50103]] +# CHECK: %[[#r50104:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5053]], %[[#r5054]] +# CHECK: EXP 0, %[[#r50104]] +# CHECK: %[[#r50105:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5054]], %[[#r5054]] +# CHECK: EXP 0, %[[#r50105]] +# CHECK: %[[#r50106:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5054]], %[[#r5055]] +# CHECK: EXP 0, %[[#r50106]] +# CHECK: %[[#r50107:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5055]], %[[#r5055]] +# CHECK: EXP 0, %[[#r50107]] +# CHECK: %[[#r50108:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5055]], %[[#r5056]] +# CHECK: EXP 0, %[[#r50108]] +# CHECK: %[[#r50109:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5056]], %[[#r5056]] +# CHECK: EXP 0, %[[#r50109]] +# CHECK: %[[#r50110:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5056]], %[[#r5057]] +# CHECK: EXP 0, %[[#r50110]] +# CHECK: %[[#r50111:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5057]], %[[#r5057]] +# CHECK: EXP 0, %[[#r50111]] +# CHECK: %[[#r50112:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5057]], %[[#r5058]] +# CHECK: EXP 0, %[[#r50112]] +# CHECK: %[[#r50113:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5058]], %[[#r5058]] +# CHECK: EXP 0, %[[#r50113]] +# CHECK: %[[#r50114:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5058]], %[[#r5059]] +# CHECK: EXP 0, %[[#r50114]] +# CHECK: %[[#r50115:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5059]], %[[#r5059]] +# CHECK: EXP 0, %[[#r50115]] +# CHECK: %[[#r50116:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5059]], %[[#r5060]] +# CHECK: EXP 0, %[[#r50116]] +# CHECK: %[[#r50117:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5060]], %[[#r5060]] +# CHECK: EXP 0, %[[#r50117]] +# CHECK: %[[#r50118:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5060]], %[[#r5061]] +# CHECK: EXP 0, %[[#r50118]] +# CHECK: %[[#r50119:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5061]], %[[#r5061]] +# CHECK: EXP 0, %[[#r50119]] +# CHECK: %[[#r50120:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5061]], %[[#r5062]] +# CHECK: EXP 0, %[[#r50120]] +# CHECK: %[[#r50121:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5062]], %[[#r5062]] +# CHECK: EXP 0, %[[#r50121]] +# CHECK: %[[#r50122:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5062]], %[[#r5063]] +# CHECK: EXP 0, %[[#r50122]] +# CHECK: %[[#r50123:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5063]], %[[#r5063]] +# CHECK: EXP 0, %[[#r50123]] +# CHECK: %[[#r50124:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5063]], %[[#r5064]] +# CHECK: EXP 0, %[[#r50124]] +# CHECK: %[[#r50125:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5064]], %[[#r5064]] +# CHECK: EXP 0, %[[#r50125]] +# CHECK: %[[#r50126:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5064]], %[[#r5065]] +# 
CHECK: EXP 0, %[[#r50126]] +# CHECK: %[[#r50127:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5065]], %[[#r5065]] +# CHECK: EXP 0, %[[#r50127]] +# CHECK: %[[#r50128:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5065]], %[[#r5066]] +# CHECK: EXP 0, %[[#r50128]] +# CHECK: %[[#r50129:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5066]], %[[#r5066]] +# CHECK: EXP 0, %[[#r50129]] +# CHECK: %[[#r50130:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5066]], %[[#r5067]] +# CHECK: EXP 0, %[[#r50130]] +# CHECK: %[[#r50131:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5067]], %[[#r5067]] +# CHECK: EXP 0, %[[#r50131]] +# CHECK: %[[#r50132:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5067]], %[[#r5068]] +# CHECK: EXP 0, %[[#r50132]] +# CHECK: %[[#r50133:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5068]], %[[#r5068]] +# CHECK: EXP 0, %[[#r50133]] + + +--- | + source_filename = ".\main.ll" + define amdgpu_ps void @main() #1 { + ret void + } + attributes #1 = { "target-cpu"="gfx1010" } + !llvm.ident = !{!0} + !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"} +... +--- +name: main +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } + - { reg: '$sgpr1' } + - { reg: '$sgpr2' } + - { reg: '$sgpr3' } + - { reg: '$sgpr4' } + - { reg: '$sgpr5' } + - { reg: '$sgpr6' } + - { reg: '$sgpr7' } + - { reg: '$sgpr8' } + - { reg: '$sgpr8' } + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +body: | + bb.0: + successors: %bb.1 + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1 + + %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1 + %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3 + + %3:vgpr_32 = IMPLICIT_DEF + + + %500:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %501:vgpr_32 = V_MOV_B32_e32 $vgpr1, implicit $exec + %502:vgpr_32 = V_MUL_F32_e32 %500, %500, implicit $exec, implicit $mode + %503:vgpr_32 = V_MUL_F32_e32 %500, %501, implicit $exec, implicit $mode + %504:vgpr_32 = V_MUL_F32_e32 %501, %501, implicit $exec, implicit $mode + %505:vgpr_32 = V_MUL_F32_e32 %502, %502, implicit $exec, implicit $mode + %506:vgpr_32 = V_MUL_F32_e32 %502, %503, implicit $exec, implicit $mode + %507:vgpr_32 = V_MUL_F32_e32 %503, %503, implicit $exec, implicit $mode + %508:vgpr_32 = V_MUL_F32_e32 %503, %504, implicit $exec, implicit $mode + %509:vgpr_32 = V_MUL_F32_e32 %504, %504, implicit $exec, implicit $mode + %5010:vgpr_32 = V_MUL_F32_e32 %505, %505, implicit $exec, implicit $mode + %5011:vgpr_32 = V_MUL_F32_e32 %505, %506, implicit $exec, implicit $mode + %5012:vgpr_32 = V_MUL_F32_e32 %506, %506, implicit $exec, implicit $mode + %5013:vgpr_32 = V_MUL_F32_e32 %506, %507, implicit $exec, implicit $mode + %5014:vgpr_32 = V_MUL_F32_e32 %507, %507, implicit $exec, implicit $mode + %5015:vgpr_32 = V_MUL_F32_e32 %507, %508, implicit $exec, implicit $mode + %5016:vgpr_32 = V_MUL_F32_e32 %508, %508, implicit $exec, implicit $mode + %5017:vgpr_32 = V_MUL_F32_e32 %508, %509, implicit $exec, implicit $mode + %5018:vgpr_32 = V_MUL_F32_e32 %509, %509, implicit $exec, implicit $mode + %5019:vgpr_32 = V_MUL_F32_e32 %5010, %5010, implicit $exec, implicit $mode + %5020:vgpr_32 = V_MUL_F32_e32 %5010, %5011, implicit $exec, implicit $mode + %5021:vgpr_32 = V_MUL_F32_e32 %5011, %5011, implicit $exec, implicit $mode + %5022:vgpr_32 = V_MUL_F32_e32 %5011, %5012, implicit $exec, implicit $mode + %5023:vgpr_32 = V_MUL_F32_e32 %5012, %5012, implicit $exec, implicit $mode + %5024:vgpr_32 = V_MUL_F32_e32 %5012, %5013, implicit $exec, implicit $mode + %5025:vgpr_32 = V_MUL_F32_e32 %5013, %5013, implicit $exec, implicit 
$mode + %5026:vgpr_32 = V_MUL_F32_e32 %5013, %5014, implicit $exec, implicit $mode + %5027:vgpr_32 = V_MUL_F32_e32 %5014, %5014, implicit $exec, implicit $mode + %5028:vgpr_32 = V_MUL_F32_e32 %5014, %5015, implicit $exec, implicit $mode + %5029:vgpr_32 = V_MUL_F32_e32 %5015, %5015, implicit $exec, implicit $mode + %5030:vgpr_32 = V_MUL_F32_e32 %5015, %5016, implicit $exec, implicit $mode + %5031:vgpr_32 = V_MUL_F32_e32 %5016, %5016, implicit $exec, implicit $mode + %5032:vgpr_32 = V_MUL_F32_e32 %5016, %5017, implicit $exec, implicit $mode + %5033:vgpr_32 = V_MUL_F32_e32 %5017, %5017, implicit $exec, implicit $mode + %5034:vgpr_32 = V_MUL_F32_e32 %5017, %5018, implicit $exec, implicit $mode + %5035:vgpr_32 = V_MUL_F32_e32 %5018, %5018, implicit $exec, implicit $mode + %5036:vgpr_32 = V_MUL_F32_e32 %5019, %5019, implicit $exec, implicit $mode + %5037:vgpr_32 = V_MUL_F32_e32 %5019, %5020, implicit $exec, implicit $mode + %5038:vgpr_32 = V_MUL_F32_e32 %5020, %5020, implicit $exec, implicit $mode + %5039:vgpr_32 = V_MUL_F32_e32 %5020, %5021, implicit $exec, implicit $mode + %5040:vgpr_32 = V_MUL_F32_e32 %5021, %5021, implicit $exec, implicit $mode + %5041:vgpr_32 = V_MUL_F32_e32 %5021, %5022, implicit $exec, implicit $mode + %5042:vgpr_32 = V_MUL_F32_e32 %5022, %5022, implicit $exec, implicit $mode + %5043:vgpr_32 = V_MUL_F32_e32 %5022, %5023, implicit $exec, implicit $mode + %5044:vgpr_32 = V_MUL_F32_e32 %5023, %5023, implicit $exec, implicit $mode + %5045:vgpr_32 = V_MUL_F32_e32 %5023, %5024, implicit $exec, implicit $mode + %5046:vgpr_32 = V_MUL_F32_e32 %5024, %5024, implicit $exec, implicit $mode + %5047:vgpr_32 = V_MUL_F32_e32 %5024, %5025, implicit $exec, implicit $mode + %5048:vgpr_32 = V_MUL_F32_e32 %5025, %5025, implicit $exec, implicit $mode + %5049:vgpr_32 = V_MUL_F32_e32 %5025, %5026, implicit $exec, implicit $mode + %5050:vgpr_32 = V_MUL_F32_e32 %5026, %5026, implicit $exec, implicit $mode + %5051:vgpr_32 = V_MUL_F32_e32 %5026, %5027, implicit $exec, implicit $mode + %5052:vgpr_32 = V_MUL_F32_e32 %5027, %5027, implicit $exec, implicit $mode + %5053:vgpr_32 = V_MUL_F32_e32 %5027, %5028, implicit $exec, implicit $mode + %5054:vgpr_32 = V_MUL_F32_e32 %5028, %5028, implicit $exec, implicit $mode + %5055:vgpr_32 = V_MUL_F32_e32 %5028, %5029, implicit $exec, implicit $mode + %5056:vgpr_32 = V_MUL_F32_e32 %5029, %5029, implicit $exec, implicit $mode + %5057:vgpr_32 = V_MUL_F32_e32 %5029, %5030, implicit $exec, implicit $mode + %5058:vgpr_32 = V_MUL_F32_e32 %5030, %5030, implicit $exec, implicit $mode + %5059:vgpr_32 = V_MUL_F32_e32 %5030, %5031, implicit $exec, implicit $mode + %5060:vgpr_32 = V_MUL_F32_e32 %5031, %5031, implicit $exec, implicit $mode + %5061:vgpr_32 = V_MUL_F32_e32 %5031, %5032, implicit $exec, implicit $mode + %5062:vgpr_32 = V_MUL_F32_e32 %5032, %5032, implicit $exec, implicit $mode + %5063:vgpr_32 = V_MUL_F32_e32 %5032, %5033, implicit $exec, implicit $mode + %5064:vgpr_32 = V_MUL_F32_e32 %5033, %5033, implicit $exec, implicit $mode + %5065:vgpr_32 = V_MUL_F32_e32 %5033, %5034, implicit $exec, implicit $mode + %5066:vgpr_32 = V_MUL_F32_e32 %5034, %5034, implicit $exec, implicit $mode + %5067:vgpr_32 = V_MUL_F32_e32 %5034, %5035, implicit $exec, implicit $mode + %5068:vgpr_32 = V_MUL_F32_e32 %5035, %5035, implicit $exec, implicit $mode + %5069:vgpr_32 = V_MUL_F32_e32 %5036, %5036, implicit $exec, implicit $mode + %5070:vgpr_32 = V_MUL_F32_e32 %5036, %5037, implicit $exec, implicit $mode + %5071:vgpr_32 = V_MUL_F32_e32 %5037, %5037, implicit $exec, implicit $mode + 
%5072:vgpr_32 = V_MUL_F32_e32 %5037, %5038, implicit $exec, implicit $mode + %5073:vgpr_32 = V_MUL_F32_e32 %5038, %5038, implicit $exec, implicit $mode + %5074:vgpr_32 = V_MUL_F32_e32 %5038, %5039, implicit $exec, implicit $mode + %5075:vgpr_32 = V_MUL_F32_e32 %5039, %5039, implicit $exec, implicit $mode + %5076:vgpr_32 = V_MUL_F32_e32 %5039, %5040, implicit $exec, implicit $mode + %5077:vgpr_32 = V_MUL_F32_e32 %5040, %5040, implicit $exec, implicit $mode + %5078:vgpr_32 = V_MUL_F32_e32 %5040, %5041, implicit $exec, implicit $mode + %5079:vgpr_32 = V_MUL_F32_e32 %5041, %5041, implicit $exec, implicit $mode + %5080:vgpr_32 = V_MUL_F32_e32 %5041, %5042, implicit $exec, implicit $mode + %5081:vgpr_32 = V_MUL_F32_e32 %5042, %5042, implicit $exec, implicit $mode + %5082:vgpr_32 = V_MUL_F32_e32 %5042, %5043, implicit $exec, implicit $mode + %5083:vgpr_32 = V_MUL_F32_e32 %5043, %5043, implicit $exec, implicit $mode + %5084:vgpr_32 = V_MUL_F32_e32 %5043, %5044, implicit $exec, implicit $mode + %5085:vgpr_32 = V_MUL_F32_e32 %5044, %5044, implicit $exec, implicit $mode + %5086:vgpr_32 = V_MUL_F32_e32 %5044, %5045, implicit $exec, implicit $mode + %5087:vgpr_32 = V_MUL_F32_e32 %5045, %5045, implicit $exec, implicit $mode + %5088:vgpr_32 = V_MUL_F32_e32 %5045, %5046, implicit $exec, implicit $mode + %5089:vgpr_32 = V_MUL_F32_e32 %5046, %5046, implicit $exec, implicit $mode + %5090:vgpr_32 = V_MUL_F32_e32 %5046, %5047, implicit $exec, implicit $mode + %5091:vgpr_32 = V_MUL_F32_e32 %5047, %5047, implicit $exec, implicit $mode + %5092:vgpr_32 = V_MUL_F32_e32 %5047, %5048, implicit $exec, implicit $mode + %5093:vgpr_32 = V_MUL_F32_e32 %5048, %5048, implicit $exec, implicit $mode + %5094:vgpr_32 = V_MUL_F32_e32 %5048, %5049, implicit $exec, implicit $mode + %5095:vgpr_32 = V_MUL_F32_e32 %5049, %5049, implicit $exec, implicit $mode + %5096:vgpr_32 = V_MUL_F32_e32 %5049, %5050, implicit $exec, implicit $mode + %5097:vgpr_32 = V_MUL_F32_e32 %5050, %5050, implicit $exec, implicit $mode + %5098:vgpr_32 = V_MUL_F32_e32 %5050, %5051, implicit $exec, implicit $mode + %5099:vgpr_32 = V_MUL_F32_e32 %5051, %5051, implicit $exec, implicit $mode + %50100:vgpr_32 = V_MUL_F32_e32 %5051, %5052, implicit $exec, implicit $mode + %50101:vgpr_32 = V_MUL_F32_e32 %5052, %5052, implicit $exec, implicit $mode + %50102:vgpr_32 = V_MUL_F32_e32 %5052, %5053, implicit $exec, implicit $mode + %50103:vgpr_32 = V_MUL_F32_e32 %5053, %5053, implicit $exec, implicit $mode + %50104:vgpr_32 = V_MUL_F32_e32 %5053, %5054, implicit $exec, implicit $mode + %50105:vgpr_32 = V_MUL_F32_e32 %5054, %5054, implicit $exec, implicit $mode + %50106:vgpr_32 = V_MUL_F32_e32 %5054, %5055, implicit $exec, implicit $mode + %50107:vgpr_32 = V_MUL_F32_e32 %5055, %5055, implicit $exec, implicit $mode + %50108:vgpr_32 = V_MUL_F32_e32 %5055, %5056, implicit $exec, implicit $mode + %50109:vgpr_32 = V_MUL_F32_e32 %5056, %5056, implicit $exec, implicit $mode + %50110:vgpr_32 = V_MUL_F32_e32 %5056, %5057, implicit $exec, implicit $mode + %50111:vgpr_32 = V_MUL_F32_e32 %5057, %5057, implicit $exec, implicit $mode + %50112:vgpr_32 = V_MUL_F32_e32 %5057, %5058, implicit $exec, implicit $mode + %50113:vgpr_32 = V_MUL_F32_e32 %5058, %5058, implicit $exec, implicit $mode + %50114:vgpr_32 = V_MUL_F32_e32 %5058, %5059, implicit $exec, implicit $mode + %50115:vgpr_32 = V_MUL_F32_e32 %5059, %5059, implicit $exec, implicit $mode + %50116:vgpr_32 = V_MUL_F32_e32 %5059, %5060, implicit $exec, implicit $mode + %50117:vgpr_32 = V_MUL_F32_e32 %5060, %5060, implicit $exec, implicit 
$mode + %50118:vgpr_32 = V_MUL_F32_e32 %5060, %5061, implicit $exec, implicit $mode + %50119:vgpr_32 = V_MUL_F32_e32 %5061, %5061, implicit $exec, implicit $mode + %50120:vgpr_32 = V_MUL_F32_e32 %5061, %5062, implicit $exec, implicit $mode + %50121:vgpr_32 = V_MUL_F32_e32 %5062, %5062, implicit $exec, implicit $mode + %50122:vgpr_32 = V_MUL_F32_e32 %5062, %5063, implicit $exec, implicit $mode + %50123:vgpr_32 = V_MUL_F32_e32 %5063, %5063, implicit $exec, implicit $mode + %50124:vgpr_32 = V_MUL_F32_e32 %5063, %5064, implicit $exec, implicit $mode + %50125:vgpr_32 = V_MUL_F32_e32 %5064, %5064, implicit $exec, implicit $mode + %50126:vgpr_32 = V_MUL_F32_e32 %5064, %5065, implicit $exec, implicit $mode + %50127:vgpr_32 = V_MUL_F32_e32 %5065, %5065, implicit $exec, implicit $mode + %50128:vgpr_32 = V_MUL_F32_e32 %5065, %5066, implicit $exec, implicit $mode + %50129:vgpr_32 = V_MUL_F32_e32 %5066, %5066, implicit $exec, implicit $mode + %50130:vgpr_32 = V_MUL_F32_e32 %5066, %5067, implicit $exec, implicit $mode + %50131:vgpr_32 = V_MUL_F32_e32 %5067, %5067, implicit $exec, implicit $mode + %50132:vgpr_32 = V_MUL_F32_e32 %5067, %5068, implicit $exec, implicit $mode + %50133:vgpr_32 = V_MUL_F32_e32 %5068, %5068, implicit $exec, implicit $mode + + + + EXP 0, %500, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %501, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %502, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %503, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %504, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %505, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %506, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %507, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %508, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %509, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5010, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5011, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5012, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5013, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5014, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5015, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5016, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5017, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5018, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5019, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5020, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5021, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5022, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5023, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5024, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5025, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5026, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5027, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5028, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5029, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5030, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5031, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5032, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5033, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5034, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5035, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5036, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5037, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5038, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5039, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5040, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5041, %3, %3, %3, -1, -1, 15, implicit 
$exec + EXP 0, %5042, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5043, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5044, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5045, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5046, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5047, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5048, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5049, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5050, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5051, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5052, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5053, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5054, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5055, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5056, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5057, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5058, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5059, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5060, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5061, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5062, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5063, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5064, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5065, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5066, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5067, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5068, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5069, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5070, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5071, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5072, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5073, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5074, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5075, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5076, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5077, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5078, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5079, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5080, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5081, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5082, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5083, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5084, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5085, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5086, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5087, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5088, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5089, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5090, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5091, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5092, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5093, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5094, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5095, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5096, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5097, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5098, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5099, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50100, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50101, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50102, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50103, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50104, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50105, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50106, %3, 
%3, %3, -1, -1, 15, implicit $exec + EXP 0, %50107, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50108, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50109, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50110, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50111, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50112, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50113, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50114, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50115, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50116, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50117, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50118, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50119, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50120, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50121, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50122, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50123, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50124, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50125, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50126, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50127, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50128, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50129, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50130, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50131, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50132, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50133, %3, %3, %3, -1, -1, 15, implicit $exec + + + S_BRANCH %bb.1 + + ; %8001:vgpr_32 = COPY %8000 + ; %8002:vgpr_32 = COPY %8000 + ; %8003:vgpr_32 = COPY %8000 + ; %8004:vgpr_32 = COPY %8000 + ; %8005:vgpr_32 = COPY %8000 + ; %8006:vgpr_32 = COPY %8000 + ; %8007:vgpr_32 = COPY %8000 + ; %8008:vgpr_32 = COPY %8000 + ; %8009:vgpr_32 = COPY %8000 + ; %8010:vgpr_32 = COPY %8000 + ; %8011:vgpr_32 = COPY %8000 + ; %8012:vgpr_32 = COPY %8000 + ; %8013:vgpr_32 = COPY %8000 + ; %8014:vgpr_32 = COPY %8000 + ; %8015:vgpr_32 = COPY %8000 + ; %8016:vgpr_32 = COPY %8000 + ; %8017:vgpr_32 = COPY %8000 + + ; %9001:vgpr_32 = COPY %8001 + ; %9002:vgpr_32 = COPY %8002 + ; %9003:vgpr_32 = COPY %8003 + ; %9004:vgpr_32 = COPY %8004 + ; %9005:vgpr_32 = COPY %8005 + ; %9006:vgpr_32 = COPY %8006 + ; %9007:vgpr_32 = COPY %8007 + ; %9008:vgpr_32 = COPY %8008 + ; %9009:vgpr_32 = COPY %8009 + ; %9010:vgpr_32 = COPY %8010 + ; %9011:vgpr_32 = COPY %8011 + ; %9012:vgpr_32 = COPY %8012 + ; %9013:vgpr_32 = COPY %8013 + ; %9014:vgpr_32 = COPY %8014 + ; %9015:vgpr_32 = COPY %8015 + ; %9016:vgpr_32 = COPY %8016 + ; %9017:vgpr_32 = COPY %8017 + + bb.1: + + EXP 0, %500, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %501, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %502, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %503, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %504, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %505, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %506, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %507, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %508, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %509, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5010, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5011, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5012, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5013, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5014, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5015, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5016, %3, %3, %3, -1, -1, 15, 
implicit $exec + EXP 0, %5017, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5018, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5019, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5020, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5021, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5022, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5023, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5024, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5025, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5026, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5027, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5028, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5029, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5030, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5031, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5032, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5033, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5034, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5035, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5036, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5037, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5038, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5039, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5040, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5041, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5042, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5043, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5044, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5045, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5046, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5047, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5048, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5049, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5050, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5051, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5052, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5053, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5054, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5055, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5056, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5057, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5058, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5059, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5060, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5061, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5062, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5063, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5064, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5065, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5066, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5067, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5068, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5069, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5070, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5071, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5072, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5073, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5074, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5075, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5076, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5077, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5078, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5079, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5080, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5081, %3, 
%3, %3, -1, -1, 15, implicit $exec + EXP 0, %5082, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5083, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5084, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5085, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5086, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5087, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5088, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5089, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5090, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5091, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5092, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5093, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5094, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5095, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5096, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5097, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5098, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5099, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50100, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50101, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50102, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50103, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50104, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50105, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50106, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50107, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50108, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50109, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50110, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50111, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50112, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50113, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50114, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50115, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50116, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50117, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50118, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50119, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50120, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50121, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50122, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50123, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50124, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50125, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50126, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50127, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50128, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50129, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50130, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50131, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50132, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50133, %3, %3, %3, -1, -1, 15, implicit $exec + + + S_ENDPGM 0 +... 
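The repeated exports above keep %500 through %50133 alive across the branch into bb.1; test bodies of this shape are normally generated rather than written by hand. A throwaway generator along these lines (a sketch, not part of the patch; the count and register naming are simply read off the test above) reproduces the EXP stream:

#include <cstdio>

int main() {
  // Emit one export per live value, matching the %500 .. %50133 pattern above.
  const int NumExports = 134;
  for (int I = 0; I < NumExports; ++I)
    std::printf("    EXP 0, %%50%d, %%3, %%3, %%3, -1, -1, 15, implicit $exec\n", I);
  return 0;
}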
+ \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir index a702f7fc8011e..69875261b74e9 100644 --- a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir +++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir @@ -1,6 +1,6 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat | FileCheck %s -# Check that the scalar loads have been moved to the use +# Check that the loads have been moved to the use # CHECK: bb.2: # CHECK: %[[#reg0:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 0, 0 # CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg0]], %{{.+}}, 0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir b/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir index 53f59cc3f8b0b..3a2d61555c0b4 100644 --- a/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir +++ b/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir @@ -1,125 +1,125 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-late-float-vtos -amdgpu-remat-enable-sub-exp-remat | FileCheck %s # DEFS -# CHECK: %[[#div00:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div00:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni00:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div00]], implicit $exec -# CHECK: %[[#div01:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div01:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni01:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div01]], implicit $exec -# CHECK: %[[#div02:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div02:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni02:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div02]], implicit $exec -# CHECK: %[[#div03:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div03:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni03:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div03]], implicit $exec -# CHECK: %[[#div04:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div04:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni04:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div04]], implicit $exec -# CHECK: %[[#div05:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div05:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni05:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div05]], implicit $exec -# CHECK: %[[#div06:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div06:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni06:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div06]], implicit $exec -# CHECK: %[[#div07:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div07:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni07:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div07]], implicit $exec -# CHECK: %[[#div08:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div08:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni08:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div08]], implicit $exec -# CHECK: %[[#div09:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div09:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni09:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div09]], implicit $exec -# CHECK: %[[#div10:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div10:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni10:]]:sgpr_32 = V_READFIRSTLANE_B32 
%[[#div10]], implicit $exec -# CHECK: %[[#div11:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div11:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni11:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div11]], implicit $exec -# CHECK: %[[#div12:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div12:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni12:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div12]], implicit $exec -# CHECK: %[[#div13:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div13:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni13:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div13]], implicit $exec -# CHECK: %[[#div14:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div14:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni14:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div14]], implicit $exec -# CHECK: %[[#div15:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div15:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni15:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div15]], implicit $exec -# CHECK: %[[#div16:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div16:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni16:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div16]], implicit $exec -# CHECK: %[[#div17:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div17:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni17:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div17]], implicit $exec -# CHECK: %[[#div18:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div18:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni18:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div18]], implicit $exec -# CHECK: %[[#div19:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div19:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni19:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div19]], implicit $exec -# CHECK: %[[#div20:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div20:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni20:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div20]], implicit $exec -# CHECK: %[[#div21:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div21:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni21:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div21]], implicit $exec -# CHECK: %[[#div22:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div22:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni22:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div22]], implicit $exec -# CHECK: %[[#div23:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div23:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni23:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div23]], implicit $exec -# CHECK: %[[#div24:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div24:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni24:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div24]], implicit $exec -# CHECK: %[[#div25:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div25:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni25:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div25]], implicit $exec -# CHECK: %[[#div26:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div26:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni26:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div26]], implicit $exec -# CHECK: %[[#div27:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: 
%[[#div27:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni27:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div27]], implicit $exec -# CHECK: %[[#div28:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div28:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni28:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div28]], implicit $exec -# CHECK: %[[#div29:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div29:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni29:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div29]], implicit $exec -# CHECK: %[[#div30:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div30:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni30:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div30]], implicit $exec -# CHECK: %[[#div31:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div31:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni31:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div31]], implicit $exec -# CHECK: %[[#div32:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div32:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni32:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div32]], implicit $exec -# CHECK: %[[#div33:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div33:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni33:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div33]], implicit $exec -# CHECK: %[[#div34:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div34:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni34:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div34]], implicit $exec -# CHECK: %[[#div35:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div35:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni35:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div35]], implicit $exec -# CHECK: %[[#div36:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div36:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni36:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div36]], implicit $exec -# CHECK: %[[#div37:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div37:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni37:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div37]], implicit $exec -# CHECK: %[[#div38:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div38:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni38:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div38]], implicit $exec -# CHECK: %[[#div39:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div39:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni39:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div39]], implicit $exec -# CHECK: %[[#div40:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div40:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni40:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div40]], implicit $exec -# CHECK: %[[#div41:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div41:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni41:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div41]], implicit $exec -# CHECK: %[[#div42:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div42:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni42:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div42]], implicit $exec -# CHECK: %[[#div43:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div43:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni43:]]:sgpr_32 = V_READFIRSTLANE_B32 
%[[#div43]], implicit $exec -# CHECK: %[[#div44:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div44:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni44:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div44]], implicit $exec -# CHECK: %[[#div45:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div45:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni45:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div45]], implicit $exec -# CHECK: %[[#div46:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div46:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni46:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div46]], implicit $exec -# CHECK: %[[#div47:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div47:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni47:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div47]], implicit $exec -# CHECK: %[[#div48:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div48:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni48:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div48]], implicit $exec -# CHECK: %[[#div49:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div49:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni49:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div49]], implicit $exec -# CHECK: %[[#div50:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div50:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni50:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div50]], implicit $exec -# CHECK: %[[#div51:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div51:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni51:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div51]], implicit $exec -# CHECK: %[[#div52:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div52:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni52:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div52]], implicit $exec -# CHECK: %[[#div53:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div53:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni53:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div53]], implicit $exec -# CHECK: %[[#div54:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div54:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni54:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div54]], implicit $exec -# CHECK: %[[#div55:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div55:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni55:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div55]], implicit $exec -# CHECK: %[[#div56:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div56:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni56:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div56]], implicit $exec -# CHECK: %[[#div57:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div57:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni57:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div57]], implicit $exec -# CHECK: %[[#div58:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div58:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni58:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div58]], implicit $exec -# CHECK: %[[#div59:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div59:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni59:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div59]], implicit $exec @@ -269,66 +269,66 @@ body: | successors: %bb.1, %bb.2 liveins: $sgpr0, $sgpr1, 
$sgpr8, $vgpr0, $vgpr1 - %1000:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1001:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1002:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1003:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1004:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1005:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1006:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1007:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1008:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1009:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1010:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1011:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1012:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1013:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1014:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1015:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1016:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1017:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1018:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1019:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1020:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1021:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1022:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1023:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1024:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1025:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1026:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1027:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1028:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1029:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1030:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1031:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1032:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1033:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1034:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1035:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1036:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1037:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1038:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1039:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1040:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1041:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1042:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1043:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1044:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1045:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1046:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1047:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1048:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1049:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1050:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1051:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1052:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1053:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1054:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1055:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1056:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1057:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1058:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1059:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1000:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1001:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1002:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1003:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1004:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1005:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1006:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1007:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1008:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1009:vgpr_32 = V_MOV_B32_e32 
$sgpr0, implicit $exec + %1010:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1011:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1012:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1013:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1014:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1015:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1016:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1017:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1018:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1019:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1020:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1021:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1022:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1023:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1024:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1025:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1026:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1027:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1028:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1029:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1030:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1031:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1032:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1033:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1034:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1035:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1036:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1037:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1038:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1039:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1040:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1041:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1042:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1043:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1044:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1045:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1046:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1047:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1048:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1049:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1050:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1051:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1052:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1053:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1054:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1055:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1056:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1057:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1058:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1059:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %1059, 0, implicit $exec, implicit $mode $exec_lo = S_MOV_B32_term %116:sreg_32_xm0 S_CBRANCH_EXECZ %bb.2, implicit $exec From d36a4ae0143602d8e858a72745e735e92b2f7f30 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Wed, 12 Feb 2025 09:29:11 -0800 Subject: [PATCH 06/25] clang-format --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 582 +++++++++--------- llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp | 372 +++++------ llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h | 107 ++-- .../AMDGPU/AMDGPUMirDivergenceAnalysis.cpp | 215 +++---- .../AMDGPU/AMDGPUMirDivergenceAnalysis.h | 48 +- .../AMDGPUMirSyncDependenceAnalysis.cpp | 152 ++--- .../AMDGPU/AMDGPUMirSyncDependenceAnalysis.h | 17 +- 
.../AMDGPUOccupancyAndLatencyHelper.cpp | 21 +- .../AMDGPU/AMDGPUOccupancyAndLatencyHelper.h | 15 +- llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp | 106 ++-- llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h | 23 +- llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h | 27 +- 12 files changed, 820 insertions(+), 865 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index a3a20765c2df6..4656e28499a0d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -1,4 +1,4 @@ -//===-- AMDGPUHotBlockRematerialize.cpp - AMDGPU Hot Block Rematerialize-------===// +//===- AMDGPUHotBlockRematerialize.cpp - AMDGPU Hot BlockRematerialize ----===// // // The LLVM Compiler Infrastructure // @@ -13,24 +13,24 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" -#include "AMDGPUSubtarget.h" +#include "AMDGPUMIRUtils.h" #include "AMDGPUMirDivergenceAnalysis.h" +#include "AMDGPUOccupancyAndLatencyHelper.h" #include "AMDGPUSubExpDag.h" +#include "AMDGPUSubtarget.h" #include "AMDGPUVMemDegreeDAG.h" -#include "AMDGPUOccupancyAndLatencyHelper.h" #include "GCNRegPressure.h" #include "SIInstrInfo.h" -#include "SIRegisterInfo.h" #include "SIMachineFunctionInfo.h" -#include "AMDGPUMIRUtils.h" +#include "SIRegisterInfo.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/SlotIndexes.h" @@ -43,20 +43,24 @@ using namespace llvm; static cl::opt TargetOccupancy("amdgpu-remat-target-occupancy"); -static cl::opt EnableAggressive("amdgpu-remat-enable-hot-block-remat-aggressive"); -static cl::opt EnableSubExpAggressive("amdgpu-remat-enable-sub-exp-remat-aggressive"); -static cl::opt EnableSubExpClone("amdgpu-remat-enable-sub-exp-remat-clone"); +static cl::opt + EnableAggressive("amdgpu-remat-enable-hot-block-remat-aggressive"); +static cl::opt + EnableSubExpAggressive("amdgpu-remat-enable-sub-exp-remat-aggressive"); +static cl::opt + EnableSubExpClone("amdgpu-remat-enable-sub-exp-remat-clone"); static cl::opt EnableVmemDegree("amdgpu-remat-enable-vmem-degree"); static cl::opt EnableInBlockRemat("amdgpu-remat-enable-in-blk-remat"); static cl::opt EnableSubExp("amdgpu-remat-enable-sub-exp-remat"); -static cl::opt EnableUniformVectorToScalar("amdgpu-remat-enable-late-float-vtos"); -static cl::opt EnableSubExpMinReg("amdgpu-remat-enable-sub-exp-remat-min-reg"); +static cl::opt + EnableUniformVectorToScalar("amdgpu-remat-enable-late-float-vtos"); +static cl::opt + EnableSubExpMinReg("amdgpu-remat-enable-sub-exp-remat-min-reg"); namespace { typedef DenseSet InstSet; typedef DenseSet BlockSet; -template -using BlockMap = MapVector; +template using BlockMap = MapVector; // Rematerialize in a single pass instead of doing in register allcation. // If in register allocation, fail to rematerialize will cause spill. 
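The comment above states the motivation for the pass: do rematerialization in its own pass, because when it is left to register allocation a failed rematerialization turns into a spill. As a standalone illustration of the basic move (plain C++; ToyInst and rematSingleUseDefs are invented names, and this is a sketch of the idea, not the MachineIR implementation in this patch), re-emitting an input-free definition next to its only user shrinks its live range:

#include <cstdio>
#include <string>
#include <vector>

struct ToyInst {
  std::string Def;                 // register defined ("" if none)
  std::vector<std::string> Uses;   // registers read
  bool NoRegInputs() const { return Uses.empty(); }
};

// Move each single-use, input-free def directly in front of its one user.
static void rematSingleUseDefs(std::vector<ToyInst> &Block) {
  for (size_t I = 0; I < Block.size(); ++I) {
    if (Block[I].Def.empty() || !Block[I].NoRegInputs())
      continue;
    // Find the single user, if any.
    int User = -1, NumUsers = 0;
    for (size_t J = I + 1; J < Block.size(); ++J)
      for (const std::string &U : Block[J].Uses)
        if (U == Block[I].Def) { User = (int)J; ++NumUsers; }
    if (NumUsers != 1 || User == (int)I + 1)
      continue;
    // "Rematerialize": re-emit the def right before its user, so the value
    // is no longer live through the instructions in between.
    ToyInst Def = Block[I];
    Block.erase(Block.begin() + I);
    Block.insert(Block.begin() + (User - 1), Def);
  }
}

int main() {
  // %a is defined early but only used at the end: high pressure in between.
  std::vector<ToyInst> Block = {
      {"a", {}}, {"b", {}}, {"c", {"b"}}, {"d", {"c"}}, {"e", {"a", "d"}}};
  rematSingleUseDefs(Block);
  for (const ToyInst &I : Block)
    std::printf("%s <- %zu inputs\n", I.Def.c_str(), I.Uses.size());
  return 0;
}

In the pass itself a candidate must additionally be safe to move (no SCC or exec hazards) and profitable against the target occupancy, which is what the surrounding hunks implement.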
@@ -65,9 +69,9 @@ class AMDGPUHotBlockRematerialize : public MachineFunctionPass { public: static char ID; - DenseSet TotalUniformInsts; - DenseSet SafeToRemoveInsts; - DenseSet DivergentInsts; + DenseSet TotalUniformInsts; + DenseSet SafeToRemoveInsts; + DenseSet DivergentInsts; void RemoveInst(const MachineInstr *MI) { TotalUniformInsts.erase(MI); SafeToRemoveInsts.erase(MI); @@ -99,9 +103,8 @@ typedef AMDGPUHotBlockRematerialize Remat; // Util functions. namespace { -MachineBasicBlock * -nearest_common_dominator(MachineDominatorTree *DT, - BlockSet &Blocks) { +MachineBasicBlock *nearest_common_dominator(MachineDominatorTree *DT, + BlockSet &Blocks) { auto I = Blocks.begin(), E = Blocks.end(); MachineBasicBlock *DomB = cast(*(I++)); @@ -217,10 +220,10 @@ bool IsSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) { return true; } - // SGPR has alignment requirment, cannot get accurate reg number. const unsigned NearTargetRegLimit = 10; -bool nearSgprSpill(unsigned maxSPressure, const GCNSubtarget *ST, MachineFunction &MF) { +bool nearSgprSpill(unsigned maxSPressure, const GCNSubtarget *ST, + MachineFunction &MF) { unsigned maxSGPR = ST->getAddressableNumSGPRs(); const SIMachineFunctionInfo *MFI = MF.getInfo(); unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); @@ -254,10 +257,10 @@ struct RematStatus { DenseSet MemWriteMBBSet; }; -unsigned CollectMBBPressure( - MachineBasicBlock &MBB, LiveIntervals *LIS, const MachineRegisterInfo &MRI, - const GCNSubtarget *ST, unsigned &maxVPressure, unsigned &maxSPressure, - RematStatus &status) { +unsigned CollectMBBPressure(MachineBasicBlock &MBB, LiveIntervals *LIS, + const MachineRegisterInfo &MRI, + const GCNSubtarget *ST, unsigned &maxVPressure, + unsigned &maxSPressure, RematStatus &status) { // Skip processing current block if it has only debug instructions if (MBB.getFirstNonDebugInstr() == MBB.end()) return ST->getOccupancyWithNumVGPRs(0); @@ -290,10 +293,10 @@ unsigned CollectMBBPressure( return RP.getOccupancy(*ST); } -unsigned CollectFnPressure( - MachineFunction &MF, LiveIntervals *LIS, const MachineRegisterInfo &MRI, - const GCNSubtarget *ST, unsigned &maxVPressure, unsigned &maxSPressure, - RematStatus &status) { +unsigned CollectFnPressure(MachineFunction &MF, LiveIntervals *LIS, + const MachineRegisterInfo &MRI, + const GCNSubtarget *ST, unsigned &maxVPressure, + unsigned &maxSPressure, RematStatus &status) { unsigned TgtOcc = ST->getOccupancyWithWorkGroupSizes(MF).second; // If only have one block, input/ouput virtual live set are empty. 
if (MF.size() > 1) { @@ -376,14 +379,14 @@ unsigned CollectFnPressure( } return TgtOcc; } -RematStatus -GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS, - const MachineRegisterInfo &MRI, const GCNSubtarget *ST) { +RematStatus GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, + LiveIntervals *LIS, const MachineRegisterInfo &MRI, + const GCNSubtarget *ST) { unsigned maxSPressure = 0; unsigned maxVPressure = 0; RematStatus status; - unsigned TgtOcc = CollectFnPressure(MF, LIS, MRI, ST, maxVPressure, - maxSPressure, status); + unsigned TgtOcc = + CollectFnPressure(MF, LIS, MRI, ST, maxVPressure, maxSPressure, status); const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second; if (TgtOcc >= MaxOcc) { status.TargetOcc = TgtOcc; @@ -418,7 +421,7 @@ GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS, TgtOcc = bigOcc; bNotBalance = true; if (TgtOcc >= MaxOccupancy) - TgtOcc = MaxOccupancy-1; + TgtOcc = MaxOccupancy - 1; } } @@ -436,7 +439,7 @@ GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS, vInputPressure += RegSize; } else { unsigned RegIndex = SIRI->getHWRegIndex(Reg); - uint64_t mask = ((1 << RegSize) - 1 ) << RegIndex; + uint64_t mask = ((1 << RegSize) - 1) << RegIndex; sInputMask |= mask; } } @@ -451,7 +454,6 @@ GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS, mask = mask << 4; } - // If balanced, try next occupancy. TgtOcc = bNotBalance ? TgtOcc : (TgtOcc + 1); @@ -614,8 +616,7 @@ int GetSharedReducedSize(InstSet &ReducedInsts, bool bVGPR, } int GetReducedSize(MapVector &RematMap, bool bVGPR, - GCNRPTracker::LiveRegSet &CanidateSet, - InstSet &ReducedInsts, + GCNRPTracker::LiveRegSet &CanidateSet, InstSet &ReducedInsts, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, BlockLiveInfo &LiveInfo, DenseMap &RPOTIndexMap) { @@ -791,9 +792,11 @@ void BuildRematCandiates(std::vector &Candidates, } // For case like -// %477:sreg_32_xm0 = S_AND_B32 %472.sub0:sreg_64_xexec, %304:sreg_32_xm0, implicit-def dead $scc; xb.uniform -// S_CMP_EQ_U32 %302:sreg_32_xm0, %475:sreg_32_xm0, implicit-def $scc; xb.uniform -// %2489:sreg_32_xm0 = S_CSELECT_B32 %477:sreg_32_xm0, 16, implicit killed $scc; xb.uniform +// %477:sreg_32_xm0 = S_AND_B32 %472.sub0:sreg_64_xexec, %304:sreg_32_xm0, +// implicit-def dead $scc; xb.uniform +// S_CMP_EQ_U32 %302:sreg_32_xm0, %475:sreg_32_xm0, implicit-def $scc; +// xb.uniform %2489:sreg_32_xm0 = S_CSELECT_B32 %477:sreg_32_xm0, 16, implicit +// killed $scc; xb.uniform // Sink S_AND right before S_CSELECT will overwrite SCC. // To avoid it, skip case when DefMI and UseMI has implicit define use. bool isImplicitDefUse(MachineInstr *DefMI, MachineInstr *UseMI) { @@ -973,7 +976,7 @@ int FilterRematCandiates(std::vector &Candidates, } void updateUsers(unsigned Reg, unsigned NewReg, bool bSubRegDef, - SmallVector &userMIs) { + SmallVector &userMIs) { for (MachineInstr *UseMI : userMIs) { for (MachineOperand &MO : UseMI->operands()) { if (!MO.isReg()) @@ -999,7 +1002,6 @@ DenseMap reduceClonedMBBs( } } - // For userBlocks which dominate all hotBlocks, don't need to clone because // the value not cross hotBlocks when later blocks are cloned. // For userBlocks which dominated by all hotBlocks, they could share clones @@ -1064,68 +1066,45 @@ DenseMap reduceClonedMBBs( // Look for an earlier insert point if the InstructionToMove // writes to scc and scc is live at the CurrentInsertPoint. 
static MachineBasicBlock::iterator AdjustInsertPointToAvoidSccSmash( - MachineInstr *InstructionToMove, - MachineBasicBlock *MBB, - MachineBasicBlock::iterator CurrentInsertPoint, - MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI, - const SIInstrInfo *SIII -) -{ - const bool WillSmashScc = InstructionToMove->modifiesRegister(AMDGPU::SCC, SIRI); - if (WillSmashScc) - { - CurrentInsertPoint = llvm::FindOrCreateInsertionPointForSccDef(MBB, - CurrentInsertPoint, - SIRI, - SIII, - &MRI - ); - } - - return CurrentInsertPoint; + MachineInstr *InstructionToMove, MachineBasicBlock *MBB, + MachineBasicBlock::iterator CurrentInsertPoint, MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { + const bool WillSmashScc = + InstructionToMove->modifiesRegister(AMDGPU::SCC, SIRI); + if (WillSmashScc) { + CurrentInsertPoint = llvm::FindOrCreateInsertionPointForSccDef( + MBB, CurrentInsertPoint, SIRI, SIII, &MRI); + } + + return CurrentInsertPoint; } // Look for an earlier insert point if the SubExp // writes to scc and scc is live at the CurrentInsertPoint. static MachineBasicBlock::iterator AdjustInsertPointForSubExpToAvoidSccSmash( - const SubExp &SubExpToMove, - MachineBasicBlock *MBB, - MachineBasicBlock::iterator CurrentInsertPoint, - MachineRegisterInfo& MRI, - const SIRegisterInfo* SIRI, - const SIInstrInfo* SIII -) -{ - const bool WillSmashScc = SubExpToMove.modifiesRegister(AMDGPU::SCC, SIRI); - if (WillSmashScc) - { - CurrentInsertPoint = llvm::FindOrCreateInsertionPointForSccDef(MBB, - CurrentInsertPoint, - SIRI, - SIII, - &MRI - ); - } - - return CurrentInsertPoint; + const SubExp &SubExpToMove, MachineBasicBlock *MBB, + MachineBasicBlock::iterator CurrentInsertPoint, MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { + const bool WillSmashScc = SubExpToMove.modifiesRegister(AMDGPU::SCC, SIRI); + if (WillSmashScc) { + CurrentInsertPoint = llvm::FindOrCreateInsertionPointForSccDef( + MBB, CurrentInsertPoint, SIRI, SIII, &MRI); + } + + return CurrentInsertPoint; } // Return trun if moving MI to Location will smash a live scc value. -static bool WillSmashSccAtLocation( - MachineInstr* MI, - MachineBasicBlock* MBB, - MachineBasicBlock::iterator Location -) -{ - // It is ok to pass nullptr to `modifiesRegister` for TRI here since - // SCC has no subreg/suprereg relationships. - return MI->modifiesRegister(AMDGPU::SCC, nullptr) - && llvm::IsSccLiveAt(MBB, Location); +static bool WillSmashSccAtLocation(MachineInstr *MI, MachineBasicBlock *MBB, + MachineBasicBlock::iterator Location) { + // It is ok to pass nullptr to `modifiesRegister` for TRI here since + // SCC has no subreg/suprereg relationships. 
+ return MI->modifiesRegister(AMDGPU::SCC, nullptr) && + llvm::IsSccLiveAt(MBB, Location); } -void ApplyCloneRemat(Remat *Remat, - RematNode &Node, std::vector &hotBlocks, +void ApplyCloneRemat(Remat *Remat, RematNode &Node, + std::vector &hotBlocks, MachineDominatorTree *pDT, MachineRegisterInfo &MRI, SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, MachineFunction &MF) { @@ -1185,10 +1164,9 @@ void ApplyCloneRemat(Remat *Remat, InsertPointMI = UseMI; } } - + MachineBasicBlock::iterator InsertPoint = AdjustInsertPointToAvoidSccSmash( - DefMI, InsertPointMI->getParent(), InsertPointMI, MRI, SIRI, SIII - ); + DefMI, InsertPointMI->getParent(), InsertPointMI, MRI, SIRI, SIII); for (MachineMemOperand *MO : DefMI->memoperands()) { NewDef->addMemOperand(MF, MO); @@ -1221,10 +1199,11 @@ void ApplyCloneRemat(Remat *Remat, void ApplyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI, SlotIndexes *slotIndexes, - const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { + const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII) { MachineInstr *DefMI = Node.DefMI; MachineInstr *InsertPointMI = Node.InsertPointMI; - MachineBasicBlock* MBB = nullptr; + MachineBasicBlock *MBB = nullptr; // Find a valid insert point. MachineBasicBlock::iterator InsertPoint; @@ -1236,10 +1215,9 @@ void ApplyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI, MBB = Node.InsertBlock; } - InsertPoint = AdjustInsertPointToAvoidSccSmash( - DefMI, MBB, InsertPoint, MRI, SIRI, SIII - ); - + InsertPoint = AdjustInsertPointToAvoidSccSmash(DefMI, MBB, InsertPoint, MRI, + SIRI, SIII); + // Move instruction to new location. DefMI->removeFromParent(); InsertPoint->getParent()->insert(InsertPoint, DefMI); @@ -1271,7 +1249,8 @@ void ApplyRemat(Remat *Remat, MapVector &RematMap, if (Node.Kind == RematNode::RematKind::OneDefOneUse) { ApplyOneDefOneUseRemat(Node, MRI, slotIndexes, SIRI, SIII); } else if (Node.Kind == RematNode::RematKind::Clone) { - ApplyCloneRemat(Remat, Node, hotBlocks, pDT, MRI, slotIndexes, SIRI, SIII, MF); + ApplyCloneRemat(Remat, Node, hotBlocks, pDT, MRI, slotIndexes, SIRI, SIII, + MF); } } } @@ -1505,7 +1484,8 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(Reg); if (UseMI.getParent() != MBB) continue; - int gain = RematGain(&MI, Reg, CandidateRegs, MRI, SIRI, /*bVGPR*/false); + int gain = RematGain(&MI, Reg, CandidateRegs, MRI, SIRI, + /*bVGPR*/ false); if (gain > 0) { // Skip case when DefMI has implicit define which used by UseMI. if (isImplicitDefUse(&MI, &UseMI)) { @@ -1539,8 +1519,7 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, bool bNeedVRemat = rematVCnt > 0; // If sgpr spill, always do remat. 
bool bSRematOK = - (newRematSCnt <= 0 && !SRematMap.empty()) || - bForceRematSgpr; + (newRematSCnt <= 0 && !SRematMap.empty()) || bForceRematSgpr; bool bVRematOK = (status.bNotBalance || newRematVCnt <= 0) && !VRematMap.empty(); if (bNeedSRemat && bNeedVRemat) { @@ -1575,7 +1554,8 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, if (!SRematMap.empty()) { bUpdated = true; - ApplyRemat(Remat, SRematMap, hotBlocks, pDT, SlotIndexes, MRI, SIRI, SIII, MF); + ApplyRemat(Remat, SRematMap, hotBlocks, pDT, SlotIndexes, MRI, SIRI, SIII, + MF); LLVM_DEBUG(llvm::dbgs() << "after hotremat"; MF.print(dbgs());); } @@ -1595,49 +1575,46 @@ bool isPhyRegUniqueDef(unsigned Reg, const MachineRegisterInfo &MRI) { return DefMIs.size() == 1; } -static bool IsImplicitUseOfReg(const MachineOperand &MO, unsigned Reg) -{ - if (!MO.isImplicit() || !MO.isUse() || !MO.isReg()) - { - return false; - } +static bool IsImplicitUseOfReg(const MachineOperand &MO, unsigned Reg) { + if (!MO.isImplicit() || !MO.isUse() || !MO.isReg()) { + return false; + } - return MO.getReg() == Reg; + return MO.getReg() == Reg; } -static bool IsImplicitDefOfReg(const MachineOperand &MO, unsigned Reg) -{ - if (!MO.isImplicit() || !MO.isDef() || !MO.isReg()) - { - return false; - } +static bool IsImplicitDefOfReg(const MachineOperand &MO, unsigned Reg) { + if (!MO.isImplicit() || !MO.isDef() || !MO.isReg()) { + return false; + } - return MO.getReg() == Reg; + return MO.getReg() == Reg; } -static bool IsSafeRematCandidateUser(const MachineInstr *UseMI, const SIInstrInfo *SIII) -{ - // Make sure UseMI is not wqm like sample. - if (SIII->isWQM(UseMI->getOpcode())) - return false; - if (UseMI->getOpcode() == AMDGPU::PHI) - return false; - - return true; +static bool IsSafeRematCandidateUser(const MachineInstr *UseMI, + const SIInstrInfo *SIII) { + // Make sure UseMI is not wqm like sample. + if (SIII->isWQM(UseMI->getOpcode())) + return false; + if (UseMI->getOpcode() == AMDGPU::PHI) + return false; + + return true; } static bool isConvergent(Remat *Remat, const MachineInstr &MI) { return MI.isConvergent() && - // This flag is set on readfirstlane's to indicate that they - // are redundant (the value being read is already uniform). - // Normally, readfirstlanes are convergent, because different exec - // will cause a different value to be read; a known uniform - // readfirstlane is safe to move or clone and not actually convergent. - !Remat->TotalUniformInsts.count(&MI); + // This flag is set on readfirstlane's to indicate that they + // are redundant (the value being read is already uniform). + // Normally, readfirstlanes are convergent, because different exec + // will cause a different value to be read; a known uniform + // readfirstlane is safe to move or clone and not actually convergent. 
+ !Remat->TotalUniformInsts.count(&MI); } bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, bool bSink) { + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, + bool bSink) { if (Reg.isPhysical()) return false; bool bVGPR = SIRI->isVGPR(MRI, Reg); @@ -1664,7 +1641,8 @@ bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI, if (!Op.isReg()) continue; Register OpReg = Op.getReg(); - if (IsImplicitUseOfReg(Op, AMDGPU::EXEC) || IsImplicitUseOfReg(Op, AMDGPU::EXEC_LO)) + if (IsImplicitUseOfReg(Op, AMDGPU::EXEC) || + IsImplicitUseOfReg(Op, AMDGPU::EXEC_LO)) continue; if (IsImplicitUseOfReg(Op, AMDGPU::MODE)) continue; @@ -1675,7 +1653,8 @@ bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI, continue; if (OpReg.isPhysical()) return false; - if (!MRI.getUniqueVRegDef(OpReg) && !llvm::IsSub0Sub1SingleDef(OpReg, MRI)) { + if (!MRI.getUniqueVRegDef(OpReg) && + !llvm::IsSub0Sub1SingleDef(OpReg, MRI)) { return false; } } @@ -1696,12 +1675,10 @@ bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI, } std::vector buildSubExpFromCandidates( - Remat *Remat, - GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB, + Remat *Remat, GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, const MachineRegisterInfo &MRI, SlotIndexes *slotIndexes, - GCNRPTracker::LiveRegSet &unUsedPassThrus, - bool bAllowPartialUseInSubExp) { + GCNRPTracker::LiveRegSet &unUsedPassThrus, bool bAllowPartialUseInSubExp) { InstSet CandidateDefs; DenseSet RemovedCandidates; std::vector CandidateRegs; @@ -1798,7 +1775,7 @@ std::vector buildSubExpFromCandidates( break; } - if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*bSink*/true)) + if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*bSink*/ true)) continue; // If all users of MI are in candidate defs, add MI into candidate defs. @@ -1877,10 +1854,8 @@ std::vector buildSubExpFromCandidates( return dag.SubExps; } - std::vector buildSubExpFromCandidatesTopBottom( - Remat* Remat, - GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB, + Remat *Remat, GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, const MachineRegisterInfo &MRI, SlotIndexes *slotIndexes) { InstSet CandidateDefs; @@ -2052,7 +2027,7 @@ std::vector buildSubExpFromCandidatesTopBottom( } dbgs() << "\nFinished Candidate Defs End\n";); LLVM_DEBUG(dbgs() << "\nLocalCandidates:\n"; for (auto it - : LocalCandidates) { + : LocalCandidates) { pressure::print_reg(it.first, MRI, SIRI, llvm::dbgs()); } dbgs() << "\nLocalCandidates End\n";); // Make sure all input reg are uniqueDef. @@ -2064,7 +2039,6 @@ std::vector buildSubExpFromCandidatesTopBottom( return dag.SubExps; } - void print_vreg(Register Reg, const MachineRegisterInfo &MRI) { if (Reg.isVirtual()) { StringRef Name = MRI.getVRegName(Reg); @@ -2102,8 +2076,7 @@ MachineBasicBlock *FindTargetBlock(unsigned Reg, MachineBasicBlock *FromBB, void ApplySubExpMoveNearUser(SubExp &Exp, const MachineRegisterInfo &MRI, MachineDominatorTree *pDT, - SlotIndexes *slotIndexes, - const SIInstrInfo *SIII, + SlotIndexes *slotIndexes, const SIInstrInfo *SIII, const SIRegisterInfo *SIRI) { // Move from bottom. MachineBasicBlock *FromBB = Exp.FromBB; @@ -2118,12 +2091,14 @@ void ApplySubExpMoveNearUser(SubExp &Exp, const MachineRegisterInfo &MRI, continue; // Do not overwrite a live scc. 
- MachineBasicBlock::iterator InsertPoint = ToBB->SkipPHIsAndLabels(ToBB->begin()); + MachineBasicBlock::iterator InsertPoint = + ToBB->SkipPHIsAndLabels(ToBB->begin()); if (WillSmashSccAtLocation(DefMI, ToBB, InsertPoint)) continue; DefMI->removeFromParent(); - assert(!llvm::isExecUpdateForControlFlow(*InsertPoint) && "invalid insert point"); + assert(!llvm::isExecUpdateForControlFlow(*InsertPoint) && + "invalid insert point"); ToBB->insert(InsertPoint, DefMI); // Debug insts don't need slot index. if (DefMI->isDebugInstr()) @@ -2134,12 +2109,11 @@ void ApplySubExpMoveNearUser(SubExp &Exp, const MachineRegisterInfo &MRI, } } - void ApplySubExpMoveNearDefine(SubExp &Exp, MachineRegisterInfo &MRI, - MachineDominatorTree *pDT, - SlotIndexes *slotIndexes, - const SIInstrInfo *SIII, - const SIRegisterInfo *SIRI) { + MachineDominatorTree *pDT, + SlotIndexes *slotIndexes, + const SIInstrInfo *SIII, + const SIRegisterInfo *SIRI) { // Move from top. // Find lowest input def. MachineBasicBlock *ToBB = Exp.ToBB; @@ -2155,9 +2129,8 @@ void ApplySubExpMoveNearDefine(SubExp &Exp, MachineRegisterInfo &MRI, Terminator = ToBB->end(); } - Terminator = AdjustInsertPointForSubExpToAvoidSccSmash( - Exp, ToBB, Terminator, MRI, SIRI, SIII - ); + Terminator = AdjustInsertPointForSubExpToAvoidSccSmash(Exp, ToBB, Terminator, + MRI, SIRI, SIII); for (auto it = Exp.SUnits.begin(); it != Exp.SUnits.end(); it++) { MachineInstr *DefMI = *it; @@ -2391,11 +2364,12 @@ void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, reduceClonedMBBs(Exp, userBlocks, userBlocksLiveRegs, hotBlocks, pDT); // Sort to make stable order. - std::sort(userBlocks.begin(), userBlocks.end(), - [](std::pair>& it0, - std::pair>& it1) { + std::sort( + userBlocks.begin(), userBlocks.end(), + [](std::pair> &it0, + std::pair> &it1) { return it0.first->getNumber() < it1.first->getNumber(); - }); + }); const bool bModifiesScc = Exp.modifiesRegister(AMDGPU::SCC, SIRI); @@ -2484,7 +2458,6 @@ void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, } } - void ApplySubExpCloneNearUserInBlock( SubExp &Exp, DenseMap &inBlockHotVInstMap, @@ -2623,7 +2596,7 @@ unsigned getPacifistLevel(unsigned Reg, } bool hasInBlockDef(unsigned Reg, MachineBasicBlock *MBB, - const MachineRegisterInfo &MRI) { + const MachineRegisterInfo &MRI) { for (MachineInstr &def : MRI.def_instructions(Reg)) { if (def.getParent() != MBB) continue; @@ -2658,8 +2631,8 @@ bool isPassThru(unsigned Reg, const GCNRPTracker::LiveRegSet &inputLive, return inputLive.count(Reg) && outputLive.count(Reg); } -// Instructions which only use imm/passThru reg/output only reg will not kill any -// live reg, so name them pacifist here. +// Instructions which only use imm/passThru reg/output only reg will not kill +// any live reg, so name them pacifist here. 
bool collectPacifist(MachineInstr &MI, const GCNRPTracker::LiveRegSet &inputLive, const GCNRPTracker::LiveRegSet &outputLive, @@ -2676,7 +2649,8 @@ bool collectPacifist(MachineInstr &MI, continue; Register Reg = MO.getReg(); - if (MO.isImplicit() && (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO || Reg == AMDGPU::MODE)) + if (MO.isImplicit() && + (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO || Reg == AMDGPU::MODE)) continue; if (Reg.isPhysical()) return false; @@ -2702,7 +2676,8 @@ bool collectPacifist(MachineInstr &MI, if (Reg.isPhysical()) return false; - if (nullptr == getInBlockUniqueDef(Reg, MI.getParent(), inputLive, outputLive, MRI)) + if (nullptr == + getInBlockUniqueDef(Reg, MI.getParent(), inputLive, outputLive, MRI)) return false; bHasDef = true; @@ -2711,30 +2686,27 @@ bool collectPacifist(MachineInstr &MI, return bHasDef; } -static MachineInstr* findFirstAliasingLoadOrStoreInMBB( - MachineInstr &MI, - MachineBasicBlock &MBB, - AliasAnalysis *AA -) -{ - if (MI.mayLoadOrStore()) - { - for (MachineBasicBlock::iterator I = MI.getIterator(), E = MBB.end(); I != E; ++I) - { - const bool UseTBAA = false; - if (MI.mayAlias(AA, *I, UseTBAA)) - { - return &*I; - } - } +static MachineInstr *findFirstAliasingLoadOrStoreInMBB(MachineInstr &MI, + MachineBasicBlock &MBB, + AliasAnalysis *AA) { + if (MI.mayLoadOrStore()) { + for (MachineBasicBlock::iterator I = MI.getIterator(), E = MBB.end(); + I != E; ++I) { + const bool UseTBAA = false; + if (MI.mayAlias(AA, *I, UseTBAA)) { + return &*I; + } } + } - return nullptr; + return nullptr; } -static MachineInstr *findPacifistInsertPoint(MachineInstr &MI, MachineBasicBlock &MBB, MachineRegisterInfo &MRI, - AliasAnalysis *AA, - SlotIndexes *slotIndexes) { +static MachineInstr *findPacifistInsertPoint(MachineInstr &MI, + MachineBasicBlock &MBB, + MachineRegisterInfo &MRI, + AliasAnalysis *AA, + SlotIndexes *slotIndexes) { SmallVector users; @@ -2742,14 +2714,13 @@ static MachineInstr *findPacifistInsertPoint(MachineInstr &MI, MachineBasicBlock // op with which it aliases. Find the first instruction // that aliases the pacifist MI (if any) and add it to the list // of users. The sort() below will select the earliest user instruction. - if (MachineInstr* AliasMI = findFirstAliasingLoadOrStoreInMBB(MI, MBB, AA)) { + if (MachineInstr *AliasMI = findFirstAliasingLoadOrStoreInMBB(MI, MBB, AA)) { users.push_back(AliasMI); } for (MachineOperand &MO : MI.defs()) { unsigned Reg = MO.getReg(); - for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) - { + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { if (&MBB != UseMI.getParent()) continue; users.emplace_back(&UseMI); @@ -2773,8 +2744,7 @@ static MachineInstr *findPacifistInsertPoint(MachineInstr &MI, MachineBasicBlock bool tryHoldPacifist(MachineBasicBlock &MBB, LiveIntervals *LIS, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, AliasAnalysis *AA, - RematStatus &status) -{ + RematStatus &status) { const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[&MBB]; const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[&MBB]; @@ -2794,10 +2764,11 @@ bool tryHoldPacifist(MachineBasicBlock &MBB, LiveIntervals *LIS, bool bUpdated = false; // Move pacifist to its first user. 
- //for (MachineInstr *MI : pacifistList) { + // for (MachineInstr *MI : pacifistList) { for (auto it = pacifistList.rbegin(); it != pacifistList.rend(); it++) { MachineInstr *MI = *it; - MachineInstr *firstUser = findPacifistInsertPoint(*MI, MBB, MRI, AA, slotIndexes); + MachineInstr *firstUser = + findPacifistInsertPoint(*MI, MBB, MRI, AA, slotIndexes); if (firstUser == MI) continue; if (firstUser == MI->getNextNode()) @@ -2814,14 +2785,15 @@ bool tryHoldPacifist(MachineBasicBlock &MBB, LiveIntervals *LIS, // BRANCH may have exec update before it. insertPoint--; - insertPoint = llvm::skipDebugInstructionsBackward(insertPoint, MBB.instr_begin()); + insertPoint = + llvm::skipDebugInstructionsBackward(insertPoint, MBB.instr_begin()); while ((insertPoint->definesRegister(AMDGPU::EXEC, SIRI) || insertPoint->definesRegister(AMDGPU::EXEC_LO, SIRI)) && - insertPoint != MI->getIterator()) - { + insertPoint != MI->getIterator()) { insertPoint--; - insertPoint = llvm::skipDebugInstructionsBackward(insertPoint, MBB.instr_begin()); + insertPoint = + llvm::skipDebugInstructionsBackward(insertPoint, MBB.instr_begin()); } if (insertPoint == MI->getIterator()) continue; @@ -2887,7 +2859,7 @@ bool collectVToSCrossHotSpot( const SIInstrInfo *SIII) { unsigned VLimit = status.TargetVLimit; unsigned SLimit = status.TargetSLimit; - auto& ST = MBB.getParent()->getSubtarget(); + auto &ST = MBB.getParent()->getSubtarget(); GCNDownwardRPTracker Tracker(*LIS); @@ -2926,24 +2898,23 @@ bool collectVToSCrossHotSpot( VExtra--; bUpdated = true; } - } return bUpdated; } // Return true if the user is outside of the def's loop. -static bool IsCrossLoopUse(MachineInstr *Def, MachineInstr *User, MachineLoopInfo *MLI) -{ - MachineLoop* L = MLI->getLoopFor(Def->getParent()); +static bool IsCrossLoopUse(MachineInstr *Def, MachineInstr *User, + MachineLoopInfo *MLI) { + MachineLoop *L = MLI->getLoopFor(Def->getParent()); return L && !L->contains(User->getParent()); } bool rematUniformVgprToSgpr( - Remat *Remat, - MachineFunction &MF, RematStatus &status, + Remat *Remat, MachineFunction &MF, RematStatus &status, DenseMap &MBBPressureMap, - std::vector &hotBlocks, LiveIntervals *LIS, MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, MachineLoopInfo *MLI) { + std::vector &hotBlocks, LiveIntervals *LIS, + MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, MachineLoopInfo *MLI) { DenseMap UniformVgprMap = collectUniformVgprs(Remat, MF, MRI, SIRI); @@ -2977,7 +2948,8 @@ bool rematUniformVgprToSgpr( // Do not replace v->s across loops. Even if the value is uniform // branch divergence can cause a uniform value in a loop to be // non-uniform when used outside a loop. 
- if (IsSafeRematCandidateUser(&userMI, SIII) && !IsCrossLoopUse(MI, &userMI, MLI)) + if (IsSafeRematCandidateUser(&userMI, SIII) && + !IsCrossLoopUse(MI, &userMI, MLI)) userMIs.emplace_back(&userMI); } @@ -2993,7 +2965,7 @@ bool rematUniformVgprToSgpr( for (MachineInstr *userMI : userMIs) { const auto &Desc = userMI->getDesc(); bool bIllegal = false; - for (unsigned i=0;igetNumOperands();i++) { + for (unsigned i = 0; i < userMI->getNumOperands(); i++) { MachineOperand &MO = userMI->getOperand(i); if (!MO.isReg()) continue; @@ -3026,7 +2998,8 @@ bool rematUniformVgprToSgpr( auto rit = userMI->getReverseIterator(); rit++; auto endIt = userMI->getParent()->rend(); - while (rit != endIt && !rit->isDebugInstr() && !slotIndexes->hasIndex(*rit)) + while (rit != endIt && !rit->isDebugInstr() && + !slotIndexes->hasIndex(*rit)) slotIndexes->insertMachineInstrInMaps(*(rit++)); } } @@ -3112,9 +3085,8 @@ bool tryRemat(MachineBasicBlock &MBB, MachineInstr *hotMI, DenseSet &hotSet, int vDistance, int sDistance, unsigned VLimit, unsigned SLimit, const DenseSet &MemWriteMBBSet, - LiveIntervals *LIS, - const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - const SIInstrInfo *SIII) { + LiveIntervals *LIS, const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { auto &ST = MBB.getParent()->getSubtarget(); const auto &SI = LIS->getInstructionIndex(*hotMI).getBaseIndex(); const auto LISLR = llvm::getLiveRegs(SI, *LIS, MRI); @@ -3139,7 +3111,8 @@ bool tryRemat(MachineBasicBlock &MBB, MachineInstr *hotMI, continue; // Igonre inst in hot range. - if (RP.getVGPRNum(ST.hasGFX90AInsts()) > VLimit || RP.getMaxSGPR() > SLimit) { + if (RP.getVGPRNum(ST.hasGFX90AInsts()) > VLimit || + RP.getMaxSGPR() > SLimit) { Tracker.advance(); continue; } @@ -3254,7 +3227,7 @@ bool tryRematInHotSpot( unsigned VLimit = status.TargetVLimit; unsigned SLimit = status.TargetSLimit; - auto& ST = MBB.getParent()->getSubtarget(); + auto &ST = MBB.getParent()->getSubtarget(); const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[&MBB]; const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[&MBB]; @@ -3305,9 +3278,8 @@ bool tryRematInHotSpot( // Use hotVMI when apply. inBlockHotSInstMap[&MBB] = nullptr; if (tryRemat(MBB, hotVMI, inBlockCloneSubExps, /*bVGPR*/ true, inputLive, - outputLive, hotSet, vDistance, sDistance, VLimit, SLimit, - status.MemWriteMBBSet, - LIS, MRI, SIRI, SIII)) + outputLive, hotSet, vDistance, sDistance, VLimit, SLimit, + status.MemWriteMBBSet, LIS, MRI, SIRI, SIII)) return true; } @@ -3317,8 +3289,7 @@ bool tryRematInHotSpot( inBlockHotVInstMap[&MBB] = nullptr; return tryRemat(MBB, hotSMI, inBlockCloneSubExps, /*bVGPR*/ false, inputLive, outputLive, hotSet, vDistance, sDistance, VLimit, - SLimit, status.MemWriteMBBSet, - LIS, MRI, SIRI, SIII); + SLimit, status.MemWriteMBBSet, LIS, MRI, SIRI, SIII); } return false; } @@ -3449,7 +3420,8 @@ void sortSubExpCandidates(std::vector &subExpCandidates) { } } -// Compare pressure, return ture if maxV0/maxS0 pressure is higher than maxV1/maxS1. +// Compare pressure, return ture if maxV0/maxS0 pressure is higher than +// maxV1/maxS1. bool pressureHigher(unsigned maxV0, unsigned maxS0, unsigned maxV1, unsigned maxS1, const GCNSubtarget *ST) { unsigned VTgtOcc0 = ST->getOccupancyWithNumVGPRs(maxV0); @@ -3472,10 +3444,11 @@ bool pressureHigher(unsigned maxV0, unsigned maxS0, unsigned maxV1, } // Return true if the subExp can help pressure for passThrus. 
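// Before the next helper, a note on the pressureHigher comparison above
// (illustration, not part of the patch): "higher pressure" is decided by the
// occupancy each register budget would allow, not by raw register counts. A
// plausible shape of that comparison, assuming the GCNSubtarget occupancy
// queries already used in this file; the real implementation may break ties
// differently.
static unsigned occupancyForPressure(unsigned MaxV, unsigned MaxS,
                                     const GCNSubtarget *ST) {
  // The achievable wave count is limited by whichever register file is the
  // tighter constraint.
  return std::min(ST->getOccupancyWithNumVGPRs(MaxV),
                  ST->getOccupancyWithNumSGPRs(MaxS));
}
static bool pressureHigherSketch(unsigned MaxV0, unsigned MaxS0,
                                 unsigned MaxV1, unsigned MaxS1,
                                 const GCNSubtarget *ST) {
  // Lower achievable occupancy means the first block is effectively hotter.
  return occupancyForPressure(MaxV0, MaxS0, ST) <
         occupancyForPressure(MaxV1, MaxS1, ST);
}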
-bool canHelpPressureWhenSink(SubExp &subExp, const GCNRPTracker::LiveRegSet &passThrus, - const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - const SIInstrInfo *SIII, const MachineLoopInfo *MLI, - MachineDominatorTree *pDT, bool bCanClone,bool bSgprBound) { +bool canHelpPressureWhenSink( + SubExp &subExp, const GCNRPTracker::LiveRegSet &passThrus, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, const MachineLoopInfo *MLI, + MachineDominatorTree *pDT, bool bCanClone, bool bSgprBound) { LLVM_DEBUG(subExp.dump(MRI, SIRI)); if (!subExp.isSafeToMove(MRI, /*bMoveUp*/ false)) return false; @@ -3591,8 +3564,7 @@ bool canHelpPressureWhenHoist(SubExp &subExp, const MachineRegisterInfo &MRI, } SmallVector> -groupPassThruByDefBlock(Remat *Remat, - const GCNRPTracker::LiveRegSet &passThrus, +groupPassThruByDefBlock(Remat *Remat, const GCNRPTracker::LiveRegSet &passThrus, GCNRPTracker::LiveRegSet &usedPassThrus, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { @@ -3618,8 +3590,9 @@ groupPassThruByDefBlock(Remat *Remat, GCNRPTracker::LiveRegSet &DefInMBB = Candidates[DefMI->getParent()]; DefInMBB[Reg] = it.second; } - - llvm::SmallVector> result = Candidates.takeVector(); + + llvm::SmallVector> + result = Candidates.takeVector(); LLVM_DEBUG(llvm::dbgs() << "Before sort candidates\n"; for (auto it : result) { @@ -3636,7 +3609,7 @@ groupPassThruByDefBlock(Remat *Remat, }); LLVM_DEBUG(llvm::dbgs() << "After sort candidates\n"; for (auto it - : result) { + : result) { MachineBasicBlock *MBB = it.first; auto &defInMBB = it.second; MBB->dump(); @@ -3693,7 +3666,8 @@ collectPassThrus(MachineBasicBlock *MBB, return passThrus; } // Try to build a free subExp which all input is passThrus. -SubExp buildFreeSubExp(Remat *Remat, SubExp &subExp, GCNRPTracker::LiveRegSet &passThrus, +SubExp buildFreeSubExp(Remat *Remat, SubExp &subExp, + GCNRPTracker::LiveRegSet &passThrus, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) { SubExp freeExp; // Try to split the subExp to find a help case. @@ -3818,9 +3792,9 @@ std::vector buildSubExpCandidates( // Try to remove out reg def sub exp from DefMBB. GCNRPTracker::LiveRegSet &DefInMBB = it.second; // Go up on the dag until reach share node. - auto subExps = - buildSubExpFromCandidates(Remat, DefInMBB, DefMBB, SIRI, SIII, MRI, - slotIndexes, unUsedPassThrus, bAllowPartialUseInSubExp); + auto subExps = buildSubExpFromCandidates( + Remat, DefInMBB, DefMBB, SIRI, SIII, MRI, slotIndexes, unUsedPassThrus, + bAllowPartialUseInSubExp); for (SubExp &subExp : subExps) { if (subExp.bHasMemInst) { // Skip when memory ld/st inst need to cross MBB which write memory. 
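// Illustration, not part of the patch: one way to implement the "crosses a
// memory-writing block" test described above is to ask, for every block in
// MemWriteMBBSet, whether it can sit on a path from the defining block to the
// sink destination. The sketch below reuses the reach_block helper this patch
// adds (assuming, as its parameter order suggests, that it answers "can From
// reach To"); the actual check in buildSubExpCandidates may be more precise.
static bool mayCrossMemWrite(MachineBasicBlock *DefMBB,
                             MachineBasicBlock *SinkMBB,
                             const DenseSet<MachineBasicBlock *> &MemWriteMBBSet,
                             MachineDominatorTree *DT,
                             MachinePostDominatorTree *PDT,
                             MachineLoopInfo *MLI) {
  for (MachineBasicBlock *WriteMBB : MemWriteMBBSet) {
    // A store block reachable after the def and able to reach the new
    // location could be reordered against the moved load/store.
    if (llvm::reach_block(DefMBB, DT, PDT, MLI, WriteMBB) &&
        llvm::reach_block(WriteMBB, DT, PDT, MLI, SinkMBB))
      return true;
  }
  return false;
}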
@@ -3847,11 +3821,13 @@ std::vector buildSubExpCandidates( } } if (!canHelpPressureWhenSink(subExp, passThrus, MRI, SIRI, SIII, MLI, pDT, - bCanClone, bSgprBound)) { - if (bAllowPartialUseInSubExp && subExp.isSafeToMove(MRI, /*bMoveUp*/ false)) { - SubExp freeSubExp = buildFreeSubExp(Remat, subExp, passThrus, MRI, SIRI); - if (canHelpPressureWhenSink(freeSubExp, passThrus, MRI, SIRI, SIII, MLI, pDT, - bCanClone, bSgprBound)) { + bCanClone, bSgprBound)) { + if (bAllowPartialUseInSubExp && + subExp.isSafeToMove(MRI, /*bMoveUp*/ false)) { + SubExp freeSubExp = + buildFreeSubExp(Remat, subExp, passThrus, MRI, SIRI); + if (canHelpPressureWhenSink(freeSubExp, passThrus, MRI, SIRI, SIII, + MLI, pDT, bCanClone, bSgprBound)) { subExpCandidates.emplace_back(freeSubExp); } } @@ -3936,8 +3912,8 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates, unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI); LLVM_DEBUG(std::string movStr = Exp.bHoist ? "output hoist:" : "output sink:"; - dbgs() << movStr << Register::virtReg2Index(Reg) - << " " << Size); + dbgs() + << movStr << Register::virtReg2Index(Reg) << " " << Size); // Exp out live at block input. // It will descrease live for MBB when sink and increase when hoist. if (SIRI->isVGPR(MRI, Reg)) { @@ -3974,10 +3950,9 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates, // It will increase live for MBB. unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI); - LLVM_DEBUG(std::string movStr = - Exp.bHoist ? "input hoist:" : "input sink:"; - dbgs() << movStr << Register::virtReg2Index(Reg) - << " " << Size); + LLVM_DEBUG( + std::string movStr = Exp.bHoist ? "input hoist:" : "input sink:"; + dbgs() << movStr << Register::virtReg2Index(Reg) << " " << Size); if (SIRI->isVGPR(MRI, Reg)) { LLVM_DEBUG(dbgs() << "v\n"); if (Exp.bHoist) @@ -4019,8 +3994,8 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates, LaneBitmask profitMask = outMask & MBBBeginMask; if (MBBBeginMask.any()) { unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI); - LLVM_DEBUG(dbgs() << "move:" << Register::virtReg2Index(Reg) - << " " << Size); + LLVM_DEBUG(dbgs() + << "move:" << Register::virtReg2Index(Reg) << " " << Size); // Exp out live at block input. // It will descrease live for MBB. if (SIRI->isVGPR(MRI, Reg)) { @@ -4048,8 +4023,8 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates, // It will increase live for MBB. 
unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI); - LLVM_DEBUG(dbgs() << "add:" << Register::virtReg2Index(Reg) - << " " << Size); + LLVM_DEBUG(dbgs() + << "add:" << Register::virtReg2Index(Reg) << " " << Size); if (SIRI->isVGPR(MRI, Reg)) { LLVM_DEBUG(dbgs() << "v\n"); vgprDiff += Size; @@ -4095,8 +4070,8 @@ void addExpCandidates(std::vector &subExpCandidates, } bool tryToAddSubExps( - Remat *Remat, - HotBlock &hotBB, RematStatus &status, std::vector &subExpCandidates, + Remat *Remat, HotBlock &hotBB, RematStatus &status, + std::vector &subExpCandidates, std::vector &inBlockCloneSubExps, DenseMap &inBlockHotVInstMap, DenseMap &inBlockHotSInstMap, @@ -4110,9 +4085,9 @@ bool tryToAddSubExps( SlotIndexes *slotIndexes, LiveIntervals *LIS, MachineDominatorTree *pDT, bool bCanClone, bool bVOutBound, bool bSOutBound, GCNRPTracker::LiveRegSet &unUsedPassThrus, bool bAllowPartialUseInSubExp) { - std::vector partialSubExps = buildSubExpCandidates(Remat, - Candidates, passThrus, MRI, SIRI, SIII, MLI, slotIndexes, pDT, bCanClone, - bSOutBound, unUsedPassThrus, status.MemWriteMBBSet, + std::vector partialSubExps = buildSubExpCandidates( + Remat, Candidates, passThrus, MRI, SIRI, SIII, MLI, slotIndexes, pDT, + bCanClone, bSOutBound, unUsedPassThrus, status.MemWriteMBBSet, bAllowPartialUseInSubExp); GCNRPTracker::LiveRegSet tmpSavingInputLive = savingInputLive; @@ -4182,8 +4157,8 @@ bool tryToAddSubExps( // Try to remove out reg def sub exp from DefMBB. GCNRPTracker::LiveRegSet &UseInMBB = it.second; // Go up on the dag until reach share node. - auto subExps = buildSubExpFromCandidatesTopBottom(Remat, UseInMBB, UseMBB, SIRI, - SIII, MRI, slotIndexes); + auto subExps = buildSubExpFromCandidatesTopBottom( + Remat, UseInMBB, UseMBB, SIRI, SIII, MRI, slotIndexes); for (SubExp &subExp : subExps) { if (!canHelpPressureWhenHoist(subExp, MRI, SIRI, SIII, MLI, bSOutBound)) continue; @@ -4216,8 +4191,7 @@ bool tryToAddSubExps( if (EnableVmemDegree && // Only expect vmem when last tryToAddSubExps. // If not, bAllowPartialUseInSubExp will no chance to be true. - (bAllowPartialUseInSubExp || - !EnableSubExpAggressive)) { + (bAllowPartialUseInSubExp || !EnableSubExpAggressive)) { // Assume vmemLdSize could be optimized by not parallel. if (((vgpr - hotBB.vmemLdInputSize) <= VLimit || (vgpr - hotBB.vmemLdOutputSize) <= VLimit) && @@ -4256,8 +4230,7 @@ bool tryToAddSubExps( // Reason to do it per block is to make sure passthru reuse is precise. // If try remat on all hot blocks together, the passthru might be on one block, // but the reuse in on another block which the reg is not passthru there. -bool perBlockPassthruRemat(Remat *Remat, - std::vector &hotBlocks, +bool perBlockPassthruRemat(Remat *Remat, std::vector &hotBlocks, RematStatus &status, GCNRPTracker::LiveRegSet &liveRegCandidates, const GCNSubtarget *ST, LiveIntervals *LIS, @@ -4266,8 +4239,7 @@ bool perBlockPassthruRemat(Remat *Remat, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { bool bUpdated = false; - bool bCanClone = EnableSubExpClone | - EnableSubExpAggressive; + bool bCanClone = EnableSubExpClone | EnableSubExpAggressive; SlotIndexes *slotIndexes = LIS->getSlotIndexes(); // Sort hot blocks by pressure first. @@ -4331,19 +4303,19 @@ bool perBlockPassthruRemat(Remat *Remat, // Group pass thru regs by def MBB. 
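// Illustration, not part of the patch: grouping the pass-through registers by
// their defining block is essentially one pass over the set, bucketing each
// register under the parent block of its unique def. Hypothetical stand-alone
// sketch (the real groupPassThruByDefBlock, called right below, does the same
// through a MapVector and takeVector() to keep the order deterministic):
static MapVector<MachineBasicBlock *, GCNRPTracker::LiveRegSet>
groupByDefBlock(const GCNRPTracker::LiveRegSet &PassThrus,
                const MachineRegisterInfo &MRI) {
  MapVector<MachineBasicBlock *, GCNRPTracker::LiveRegSet> Groups;
  for (const auto &It : PassThrus) {
    Register Reg(It.first);
    // Virtual registers here have a single defining instruction.
    if (MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg))
      Groups[DefMI->getParent()][Reg] = It.second; // keep the live lane mask
  }
  return Groups;
}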
SmallVector> - Candidates = - groupPassThruByDefBlock(Remat, passThrus, usedPassThrus, MRI, SIRI, SIII); + Candidates = groupPassThruByDefBlock(Remat, passThrus, usedPassThrus, + MRI, SIRI, SIII); // unUsedPassThrus used to collect passThru which is skipped when build // subExp. GCNRPTracker::LiveRegSet unusedPassThrus; // Build exp dag on define blocks. bool bAllowPartialUseInSubExp = false; - if (tryToAddSubExps(Remat, it, status, subExpCandidates, inBlockCloneSubExps, - inBlockHotVInstMap, inBlockHotSInstMap, Candidates, - vgpr, sgpr, savingInputLive, savingOutputLive, - passThrus, usedRegs, MRI, SIRI, SIII, MLI, slotIndexes, - LIS, pDT, bCanClone, bVOutBound, bSOutBound, - unusedPassThrus, bAllowPartialUseInSubExp)) { + if (tryToAddSubExps( + Remat, it, status, subExpCandidates, inBlockCloneSubExps, + inBlockHotVInstMap, inBlockHotSInstMap, Candidates, vgpr, sgpr, + savingInputLive, savingOutputLive, passThrus, usedRegs, MRI, SIRI, + SIII, MLI, slotIndexes, LIS, pDT, bCanClone, bVOutBound, bSOutBound, + unusedPassThrus, bAllowPartialUseInSubExp)) { // Remove unusedPassThrus from passThrus first. llvm::andNotLiveRegSet(passThrus, unusedPassThrus); llvm::mergeLiveRegSet(usedPassThrus, passThrus); @@ -4359,12 +4331,12 @@ bool perBlockPassthruRemat(Remat *Remat, return false; bAllowPartialUseInSubExp = true; - if (!tryToAddSubExps(Remat, it, status, subExpCandidates, inBlockCloneSubExps, - inBlockHotVInstMap, inBlockHotSInstMap, Candidates, - vgpr, sgpr, savingInputLive, savingOutputLive, - passThrus, usedRegs, MRI, SIRI, SIII, MLI, slotIndexes, - LIS, pDT, bCanClone, bVOutBound, bSOutBound, - unusedPassThrus, bAllowPartialUseInSubExp)) { + if (!tryToAddSubExps( + Remat, it, status, subExpCandidates, inBlockCloneSubExps, + inBlockHotVInstMap, inBlockHotSInstMap, Candidates, vgpr, sgpr, + savingInputLive, savingOutputLive, passThrus, usedRegs, MRI, SIRI, + SIII, MLI, slotIndexes, LIS, pDT, bCanClone, bVOutBound, bSOutBound, + unusedPassThrus, bAllowPartialUseInSubExp)) { return false; } // Just merge all passThrus after tryToAddSubExps allow partialUseInSubExp. @@ -4430,10 +4402,9 @@ int getVMemLdSize(MachineBasicBlock &MBB, const SIInstrInfo *SIII, } // namespace -bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS, - MachineDominatorTree *pDT, MachinePostDominatorTree *pPDT, - AliasAnalysis *AA) -{ +bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, + LiveIntervals *LIS, MachineDominatorTree *pDT, + MachinePostDominatorTree *pPDT, AliasAnalysis *AA) { if (MF.size() < 2) return false; const GCNSubtarget *ST = &MF.getSubtarget(); @@ -4495,7 +4466,6 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveInt maxLocalSPressure, status); maxLocalSPressure += RegForVCC; - } if (maxLocalVPressure <= VLimit && maxLocalSPressure <= SLimit) continue; @@ -4504,7 +4474,9 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveInt if (bBothOutLimit && maxLocalVPressure <= VLimit) continue; GCNRPTracker::LiveRegSet liveSet; - hotBlocks.push_back({ &MBB, liveSet,std::make_pair(maxLocalVPressure, maxLocalSPressure), 0, 0 }); + hotBlocks.push_back({&MBB, liveSet, + std::make_pair(maxLocalVPressure, maxLocalSPressure), + 0, 0}); } // Collect vmemLdInput/OutputSize. 
if (EnableVmemDegree) { @@ -4546,8 +4518,8 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveInt } if (EnableUniformVectorToScalar) { - if (rematUniformVgprToSgpr(Remat, MF, status, status.MBBPressureMap, hotBlocks, LIS, MRI, - SIRI, SIII, MLI)) { + if (rematUniformVgprToSgpr(Remat, MF, status, status.MBBPressureMap, + hotBlocks, LIS, MRI, SIRI, SIII, MLI)) { // Rebuild LIS. LIS->reanalyze(MF); status = GetRematStatus(MF, MLI, LIS, MRI, ST); @@ -4601,15 +4573,17 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveInt PressureUnderLimitSet.insert(MBB); } else { if (MaxLocalVGPR < it.maxPressures.first) - it.maxPressures = std::make_pair(MaxLocalVGPR, it.maxPressures.second); + it.maxPressures = + std::make_pair(MaxLocalVGPR, it.maxPressures.second); if (MaxLocalSGPR < it.maxPressures.second) it.maxPressures = std::make_pair(it.maxPressures.first, MaxLocalSGPR); } } } - bool bUpdated = perBlockPassthruRemat(Remat, hotBlocks, status, liveRegCandidates, - ST, LIS, MLI, pDT, MRI, SIRI, SIII); + bool bUpdated = + perBlockPassthruRemat(Remat, hotBlocks, status, liveRegCandidates, ST, + LIS, MLI, pDT, MRI, SIRI, SIII); return bUpdated; } @@ -4618,8 +4592,10 @@ bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) { if (MF.size() < 2) return false; LiveIntervals *LIS = &getAnalysis().getLIS(); - MachineDominatorTree *DT = &getAnalysis().getDomTree(); - MachinePostDominatorTree *PDT = &getAnalysis().getPostDomTree(); + MachineDominatorTree *DT = + &getAnalysis().getDomTree(); + MachinePostDominatorTree *PDT = + &getAnalysis().getPostDomTree(); MachineLoopInfo *MLI = &getAnalysis().getLI(); AliasAnalysis *AA = &getAnalysis().getAAResults(); @@ -4628,9 +4604,10 @@ bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) { CI.compute(MF); auto TTI = MF.getTarget().getTargetTransformInfo(MF.getFunction()); MachineUniformityInfo MachineUniformity = - llvm::computeMachineUniformityInfo(MF, CI, *DT, /*HasBranchDivergence*/true); + llvm::computeMachineUniformityInfo(MF, CI, *DT, + /*HasBranchDivergence*/ true); - //llvm::MirGPUDivergenceAnalysis DA(MF, *DT, *PDT, *MLI); + // llvm::MirGPUDivergenceAnalysis DA(MF, *DT, *PDT, *MLI); for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { if (MachineUniformity.isUniform(&MI)) { @@ -4640,8 +4617,8 @@ bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) { } } - //LLVM_DEBUG(pressure::write_pressure(MF, LIS, R"(D:\Temp\d.json)")); - // For non-cs/ps, set target occ as 4. + // LLVM_DEBUG(pressure::write_pressure(MF, LIS, R"(D:\Temp\d.json)")); + // For non-cs/ps, set target occ as 4. 
bool bNearTarget = false; bool bFinalUpdated = false; bool bUpdated = hotBlockRemat(this, MF, MLI, LIS, DT, PDT, bNearTarget); @@ -4666,8 +4643,8 @@ INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass) INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) -INITIALIZE_PASS_END(AMDGPUHotBlockRematerialize, DEBUG_TYPE, "AMDGPU rematerialize", - false, false) +INITIALIZE_PASS_END(AMDGPUHotBlockRematerialize, DEBUG_TYPE, + "AMDGPU rematerialize", false, false) char AMDGPUHotBlockRematerialize::ID = 0; char &llvm::AMDGPUHotBlockRematerializeID = AMDGPUHotBlockRematerialize::ID; @@ -4675,4 +4652,3 @@ char &llvm::AMDGPUHotBlockRematerializeID = AMDGPUHotBlockRematerialize::ID; FunctionPass *llvm::createAMDGPUHotBlockRematerializePass() { return new AMDGPUHotBlockRematerialize(); } - diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp index 6f44fec08239c..365fb058bf6b3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp @@ -1,22 +1,11 @@ -/////////////////////////////////////////////////////////////////////////////// -// // -// AMDGPUMIRUtils.cpp // -// Copyright (C) Microsoft Corporation. All rights reserved. // -// This file is distributed under the University of Illinois Open Source // -// License. See LICENSE.TXT for details. // -// // -// Util functions for llvm MIR Passes. // -// // -/////////////////////////////////////////////////////////////////////////////// - -#include "llvm/CodeGen/MachinePostDominators.h" -#include "llvm/CodeGen/SlotIndexes.h" -#include "llvm/CodeGen/MachineLoopInfo.h" #include "SIInstrInfo.h" -#include "SIRegisterInfo.h" #include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/SlotIndexes.h" -//#include "dxc/DXIL/DxilMetadataHelper.h" +// #include "dxc/DXIL/DxilMetadataHelper.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/raw_ostream.h" @@ -26,9 +15,9 @@ #include "llvm/Support/Debug.h" -#include "GCNRegPressure.h" #include "AMDGPUMIRUtils.h" #include "AMDGPUSubExpDag.h" +#include "GCNRegPressure.h" #include #define DEBUG_TYPE "xb-mir-util" @@ -48,7 +37,7 @@ class CFGWithPhi { phiInsts.insert(&I); unsigned Reg = I.getOperand(0).getReg(); // Add incoming values. - for (unsigned i=1;i &) const {} MachineFunction &F; - DenseMap> blockToPhiInstsMap; + DenseMap> + blockToPhiInstsMap; void dump(); }; @@ -110,7 +100,8 @@ template <> struct DOTGraphTraits : public DefaultDOTGraphTraits { return R; } - static std::string getNodeLabel(const MachineBasicBlock *BB, const CFGWithPhi *G) { + static std::string getNodeLabel(const MachineBasicBlock *BB, + const CFGWithPhi *G) { enum { MaxColumns = 8000 }; std::string Str; raw_string_ostream OS(Str); @@ -347,7 +338,7 @@ void andNotLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet) { } MachineBasicBlock *split(MachineInstr *Inst) { - + // Create the fall-through block. 
MachineBasicBlock *MBB = Inst->getParent(); MachineFunction *MF = MBB->getParent(); @@ -462,9 +453,8 @@ bool reduceChannel(unsigned offset, MachineInstr &MI, const MCInstrDesc &desc, .addImm(offset * LaneSize); MachineInstr *OffsetAddMI = OffsetAdd.getInstr(); MachineBasicBlock::iterator InsertPoint = - llvm::FindOrCreateInsertionPointForSccDef( - MI.getParent(), MI, SIRI, SIII, &MRI - ); + llvm::FindOrCreateInsertionPointForSccDef(MI.getParent(), MI, SIRI, + SIII, &MRI); MI.getParent()->insert(InsertPoint, OffsetAddMI); SIII->legalizeOperands(*OffsetAddMI); OffsetOp->setReg(NewOffsetReg); @@ -631,7 +621,7 @@ bool reach_blocks(MachineBasicBlock *BB, MachineDominatorTree *DT, return bCross; } -} +} // namespace llvm namespace llvm { void viewCFGWithPhi(llvm::MachineFunction &F) { @@ -1520,12 +1510,12 @@ void write_pressure(MachineFunction &MF, LiveIntervals *LIS, raw_ostream &os) { } } // namespace pressure -}// namespace llvm +} // namespace llvm namespace { class ContributionList { public: - ContributionList(MachineFunction &MF) : MF(MF){}; + ContributionList(MachineFunction &MF) : MF(MF) {}; void build(); bool propagateContribution(); MachineFunction &MF; @@ -1754,46 +1744,45 @@ void write_contribution_list(llvm::MachineFunction &MF, const char *Filename) { } } // namespace llvm -static bool IsPhysReg(const MachineOperand &Op) -{ - return Op.isReg() && Op.getReg().isPhysical(); +static bool IsPhysReg(const MachineOperand &Op) { + return Op.isReg() && Op.getReg().isPhysical(); } // Sometimes split bb uses physical registers defined in BB, have to add them to // live-in or the ir is malformed. -void llvm::UpdatePhysRegLiveInForBlock(MachineBasicBlock *NewBB, const MachineRegisterInfo *MRI) -{ - // Initialize with current set of liveins. For new blocks this will be empty. - SmallDenseSet DefSet; - for (const MachineBasicBlock::RegisterMaskPair &P : NewBB->liveins()) - { - DefSet.insert(P.PhysReg); - } +void llvm::UpdatePhysRegLiveInForBlock(MachineBasicBlock *NewBB, + const MachineRegisterInfo *MRI) { + // Initialize with current set of liveins. For new blocks this will be empty. + SmallDenseSet DefSet; + for (const MachineBasicBlock::RegisterMaskPair &P : NewBB->liveins()) { + DefSet.insert(P.PhysReg); + } - for (auto &MI : *NewBB) - { - // Add all undefined physical registers to the live in set. - for (MachineOperand &Use : MI.operands()) - { - // Only process physreg uses. - if (!IsPhysReg(Use) || !Use.isUse()) continue; + for (auto &MI : *NewBB) { + // Add all undefined physical registers to the live in set. + for (MachineOperand &Use : MI.operands()) { + // Only process physreg uses. + if (!IsPhysReg(Use) || !Use.isUse()) + continue; - // Reserved regs do not need to be tracked through live-in sets. - unsigned Reg = Use.getReg(); - if (Use.isImplicit() && MRI && MRI->isReserved(Reg)) continue; + // Reserved regs do not need to be tracked through live-in sets. + unsigned Reg = Use.getReg(); + if (Use.isImplicit() && MRI && MRI->isReserved(Reg)) + continue; - if (!DefSet.count(Reg)) - NewBB->addLiveIn(Reg); - } + if (!DefSet.count(Reg)) + NewBB->addLiveIn(Reg); + } - // Add all physical register defs (exlicit+implicit) to the def register set. - for (MachineOperand &Def : MI.operands()) - { - // Only process physreg defs. - if (!IsPhysReg(Def) || !Def.isDef()) continue; - DefSet.insert(Def.getReg()); - } + // Add all physical register defs (exlicit+implicit) to the def register + // set. + for (MachineOperand &Def : MI.operands()) { + // Only process physreg defs. 
+ if (!IsPhysReg(Def) || !Def.isDef()) + continue; + DefSet.insert(Def.getReg()); } + } } void llvm::BuildPhysRegLiveInForBlock(MachineBasicBlock *NewBB, @@ -1829,50 +1818,41 @@ void llvm::BuildPhysRegLiveInForBlock(MachineBasicBlock *NewBB, } } -MachineReg llvm::CreateVirtualRegForOperand( - MachineOpcode Opcode, - unsigned OpNum, - MachineFunction &MF -) -{ - const TargetSubtargetInfo &ST = MF.getSubtarget(); - const TargetRegisterInfo *TRI = ST.getRegisterInfo(); - const TargetInstrInfo *TII = ST.getInstrInfo(); - const MCInstrDesc &Desc = TII->get(Opcode); - const TargetRegisterClass *RC = TII->getRegClass(Desc, OpNum, TRI, MF); - if (!RC) - { - llvm::report_fatal_error("Unable to create virtual reg for instruction operand"); - } +MachineReg llvm::CreateVirtualRegForOperand(MachineOpcode Opcode, + unsigned OpNum, + MachineFunction &MF) { + const TargetSubtargetInfo &ST = MF.getSubtarget(); + const TargetRegisterInfo *TRI = ST.getRegisterInfo(); + const TargetInstrInfo *TII = ST.getInstrInfo(); + const MCInstrDesc &Desc = TII->get(Opcode); + const TargetRegisterClass *RC = TII->getRegClass(Desc, OpNum, TRI, MF); + if (!RC) { + llvm::report_fatal_error( + "Unable to create virtual reg for instruction operand"); + } - MachineRegisterInfo &MRI = MF.getRegInfo(); - return MRI.createVirtualRegister(RC); + MachineRegisterInfo &MRI = MF.getRegInfo(); + return MRI.createVirtualRegister(RC); } -MachineReg llvm::CreateVirtualDstReg( - MachineOpcode Opcode, - MachineFunction &MF -) -{ - return llvm::CreateVirtualRegForOperand(Opcode, 0, MF); +MachineReg llvm::CreateVirtualDstReg(MachineOpcode Opcode, + MachineFunction &MF) { + return llvm::CreateVirtualRegForOperand(Opcode, 0, MF); } // Return true if the MI is a copy of exec. // If true then sets pDst to the destination register. -bool llvm::IsExecCopy(const MachineInstr &MI, MachineReg Exec, MachineReg *pDst) -{ - enum {DST=0, SRC=1}; - bool FoundCopy = false; - if (MI.getOpcode() == AMDGPU::COPY - || MI.getOpcode() == AMDGPU::S_MOV_B32 - || MI.getOpcode() == AMDGPU::S_MOV_B64) - { - const MachineOperand &Src = MI.getOperand(SRC); - if (Src.isReg() && Src.getReg() == Exec) - { - FoundCopy = true; - } +bool llvm::IsExecCopy(const MachineInstr &MI, MachineReg Exec, + MachineReg *pDst) { + enum { DST = 0, SRC = 1 }; + bool FoundCopy = false; + if (MI.getOpcode() == AMDGPU::COPY || MI.getOpcode() == AMDGPU::S_MOV_B32 || + MI.getOpcode() == AMDGPU::S_MOV_B64) { + const MachineOperand &Src = MI.getOperand(SRC); + if (Src.isReg() && Src.getReg() == Exec) { + FoundCopy = true; } + } #if 0 // TODO: Delete this. 
else if (MI.getOpcode() == AMDGPU::AMDGPU_GET_ENTRY_ACTIVE_MASK_PSEUDO || MI.getOpcode() == AMDGPU::AMDGPU_GET_ENTRY_ACTIVE_MASK_PSEUDO_32) @@ -1880,29 +1860,26 @@ bool llvm::IsExecCopy(const MachineInstr &MI, MachineReg Exec, MachineReg *pDst) FoundCopy = true; } #endif - - if (FoundCopy) - { - *pDst = MI.getOperand(DST).getReg(); - } - return FoundCopy; + if (FoundCopy) { + *pDst = MI.getOperand(DST).getReg(); + } + + return FoundCopy; } -llvm::MachineRegWithSubReg llvm::GetWqmEntryActiveMask(MachineFunction &MF) -{ - llvm::MachineRegWithSubReg LiveLaneMask = {AMDGPU::NoRegister, AMDGPU::NoSubRegister}; - if (MachineInstr* MI = GetWqmEntryActiveMaskInst(MF)) - { - LiveLaneMask.Reg = MI->getOperand(0).getReg(); - LiveLaneMask.SubReg = MI->getOperand(0).getSubReg(); - } +llvm::MachineRegWithSubReg llvm::GetWqmEntryActiveMask(MachineFunction &MF) { + llvm::MachineRegWithSubReg LiveLaneMask = {AMDGPU::NoRegister, + AMDGPU::NoSubRegister}; + if (MachineInstr *MI = GetWqmEntryActiveMaskInst(MF)) { + LiveLaneMask.Reg = MI->getOperand(0).getReg(); + LiveLaneMask.SubReg = MI->getOperand(0).getSubReg(); + } - return LiveLaneMask; + return LiveLaneMask; } -MachineInstr* llvm::GetWqmEntryActiveMaskInst(MachineFunction &MF) -{ +MachineInstr *llvm::GetWqmEntryActiveMaskInst(MachineFunction &MF) { #if 0 // TODO: Get rid of this // Look forward in the entry block for the SET_LIVE_LANE_MASK instruction. // This instruction is added by the SIWholeQuadMode pass. @@ -1917,22 +1894,23 @@ MachineInstr* llvm::GetWqmEntryActiveMaskInst(MachineFunction &MF) } #endif - return nullptr; + return nullptr; } -bool llvm::IsFetchShaderCall(const MachineInstr *MI) -{ +bool llvm::IsFetchShaderCall(const MachineInstr *MI) { #if 0 // TODO: Get rid of this. return MI->getOpcode() == AMDGPU::AMDGPU_CALL_FETCH_SHADER || MI->getAMDGPUFlag(MachineInstr::AMDGPUMIFlag::FetchShaderCall); #else - return false; + return false; #endif } -bool llvm::IsSccLiveAt(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator MI) { - const TargetRegisterInfo* TRI = MBB->getParent()->getRegInfo().getTargetRegisterInfo(); +bool llvm::IsSccLiveAt(llvm::MachineBasicBlock *MBB, + llvm::MachineBasicBlock::iterator MI) { + const TargetRegisterInfo *TRI = + MBB->getParent()->getRegInfo().getTargetRegisterInfo(); for (auto it = MI; it != MBB->end(); ++it) { const MachineInstr &CurMI = *it; // Hit use of scc, it is live. @@ -1962,79 +1940,70 @@ bool llvm::IsSccLiveAt(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::it // as the new insert location. // MachineBasicBlock::iterator llvm::FindOrCreateInsertionPointForSccDef( - MachineBasicBlock *MBB, - MachineBasicBlock::iterator MI, - const TargetRegisterInfo* TRI, - const SIInstrInfo* TII, - MachineRegisterInfo* MRI, - SccDefInsertPointConstraintFlags Constraints -) -{ - // If SCC is dead at MI when we can use MI as the insert point. - if (!llvm::IsSccLiveAt(MBB, MI)) - { - return MI; - } + MachineBasicBlock *MBB, MachineBasicBlock::iterator MI, + const TargetRegisterInfo *TRI, const SIInstrInfo *TII, + MachineRegisterInfo *MRI, SccDefInsertPointConstraintFlags Constraints) { + // If SCC is dead at MI when we can use MI as the insert point. + if (!llvm::IsSccLiveAt(MBB, MI)) { + return MI; + } - const bool CheckForExecWrite = - Constraints & SccDefInsertPointConstraintFlags::NoExecWrite; + const bool CheckForExecWrite = + Constraints & SccDefInsertPointConstraintFlags::NoExecWrite; - // Get the starting reverse iterator taking care to handle the MBB->end() case. 
- MachineBasicBlock::reverse_iterator Start; - if (MI == MBB->end()) - { - Start = MBB->rbegin(); - } - else - { - Start = MI.getReverse(); - } - - // Otherwise, walk backwards through the block looking for a location where - // SCC is dead. - for (MachineBasicBlock::reverse_iterator It = Start, End = MBB->rend(); It != End; ++It) - { - // If the instruction modifies exec then we cannot use it as - // an insertion point (if that is a constraint from the caller). - // The check for EXEC works for both wave64 and wave32 because - // it will also catch writes to the subregisters (e.g. exec_lo). - if (CheckForExecWrite && It->modifiesRegister(AMDGPU::EXEC, TRI)) - { - break; - } + // Get the starting reverse iterator taking care to handle the MBB->end() + // case. + MachineBasicBlock::reverse_iterator Start; + if (MI == MBB->end()) { + Start = MBB->rbegin(); + } else { + Start = MI.getReverse(); + } - if (It->modifiesRegister(AMDGPU::SCC, TRI) - && !It->readsRegister(AMDGPU::SCC, TRI)) - { - return It->getIterator(); - } + // Otherwise, walk backwards through the block looking for a location where + // SCC is dead. + for (MachineBasicBlock::reverse_iterator It = Start, End = MBB->rend(); + It != End; ++It) { + // If the instruction modifies exec then we cannot use it as + // an insertion point (if that is a constraint from the caller). + // The check for EXEC works for both wave64 and wave32 because + // it will also catch writes to the subregisters (e.g. exec_lo). + if (CheckForExecWrite && It->modifiesRegister(AMDGPU::EXEC, TRI)) { + break; } - // If no safe location can be found in the block we can save and restore - // SCC around MI. There is no way to directly read or write SCC so we use - // s_cselect to read the current value of SCC and s_cmp to write the saved - // value back to SCC. - // - // The generated code will look like this; - // - // S_CSELECT_B32 %SavedSCC, -1, 0 # Save SCC - // <----- Newly created safe insert point. - // MI - // S_CMP_LG_U32 %SavedSCC, 0 # Restore SCC - // - unsigned int TmpScc = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - DebugLoc DL = MI->getDebugLoc(); - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), TmpScc) - .addImm(-1) - .addImm(0); - BuildMI(*MBB, std::next(MI->getIterator()), DL, TII->get(AMDGPU::S_CMP_LG_U32)) - .addReg(TmpScc, RegState::Kill) - .addImm(0); + if (It->modifiesRegister(AMDGPU::SCC, TRI) && + !It->readsRegister(AMDGPU::SCC, TRI)) { + return It->getIterator(); + } + } - return MI; + // If no safe location can be found in the block we can save and restore + // SCC around MI. There is no way to directly read or write SCC so we use + // s_cselect to read the current value of SCC and s_cmp to write the saved + // value back to SCC. + // + // The generated code will look like this; + // + // S_CSELECT_B32 %SavedSCC, -1, 0 # Save SCC + // <----- Newly created safe insert point. 
+ // MI + // S_CMP_LG_U32 %SavedSCC, 0 # Restore SCC + // + unsigned int TmpScc = + MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + DebugLoc DL = MI->getDebugLoc(); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), TmpScc) + .addImm(-1) + .addImm(0); + BuildMI(*MBB, std::next(MI->getIterator()), DL, + TII->get(AMDGPU::S_CMP_LG_U32)) + .addReg(TmpScc, RegState::Kill) + .addImm(0); + + return MI; } - namespace { bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes, SmallDenseSet &touchedMBBSet) { @@ -2099,9 +2068,7 @@ bool llvm::isLocalLiveInterval( return isLocalLiveRange(&LI, Indexes, touchedMBBSet); } - -bool llvm::isLocalLiveInterval( - const LiveInterval &LI, SlotIndexes *Indexes) { +bool llvm::isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes) { if (LI.hasSubRanges()) { for (const auto &S : LI.subranges()) { if (!isLocalLiveRange(&S, Indexes)) @@ -2117,8 +2084,8 @@ bool llvm::isLocalLiveInterval( void llvm::buildEndLiveMap( llvm::LiveIntervals *LIS, llvm::MachineFunction &MF, const llvm::MachineRegisterInfo &MRI, - llvm::DenseMap - &MBBLiveMap, bool After) { + llvm::DenseMap &MBBLiveMap, + bool After) { // When only have one block, end live reg must be empty. if (MF.size() == 1) return; @@ -2158,7 +2125,8 @@ void llvm::buildEndLiveMap( } } -unsigned llvm::GetCurrentVGPRCount(llvm::MachineFunction &MF, const SIRegisterInfo *SIRI) { +unsigned llvm::GetCurrentVGPRCount(llvm::MachineFunction &MF, + const SIRegisterInfo *SIRI) { auto &MRI = MF.getRegInfo(); for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) { if (MRI.isPhysRegUsed(Reg)) { @@ -2168,14 +2136,16 @@ unsigned llvm::GetCurrentVGPRCount(llvm::MachineFunction &MF, const SIRegisterIn return 0; } -unsigned llvm::GetCurrentSGPRCount(llvm::MachineFunction &MF, const SIRegisterInfo *SIRI) { +unsigned llvm::GetCurrentSGPRCount(llvm::MachineFunction &MF, + const SIRegisterInfo *SIRI) { const SIMachineFunctionInfo *MFI = MF.getInfo(); unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); MachineRegisterInfo &MRI = MF.getRegInfo(); unsigned MaxSGPR = 0; for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { if (MRI.isPhysRegUsed(Reg)) { - // Skip scratch reserved reg, which is a big register that don't really contribute to this stat. + // Skip scratch reserved reg, which is a big register that don't really + // contribute to this stat. 
if (ScratchRSrcReg != 0) { if (SIRI->isSubRegister(ScratchRSrcReg, Reg)) continue; @@ -2187,8 +2157,7 @@ unsigned llvm::GetCurrentSGPRCount(llvm::MachineFunction &MF, const SIRegisterIn return 1 + llvm::RegForVCC + MaxSGPR; } -void llvm::dumpLiveSet(const LiveSet &LiveSet, - const SIRegisterInfo *SIRI) { +void llvm::dumpLiveSet(const LiveSet &LiveSet, const SIRegisterInfo *SIRI) { dbgs() << "\n live set: \n"; for (auto it : LiveSet) { @@ -2227,15 +2196,16 @@ bool llvm::IsLdsSpillSupportedForHwStage(xmd::HwStage Stage) } #endif -MachineBasicBlock::succ_iterator llvm::FindSuccessor(llvm::MachineBasicBlock* MBB, llvm::MachineBasicBlock* Succ) -{ - for (MachineBasicBlock::succ_iterator It = MBB->succ_begin(), End = MBB->succ_end(); It != End; ++It) - { - if (*It == Succ) - { - return It; - } +MachineBasicBlock::succ_iterator +llvm::FindSuccessor(llvm::MachineBasicBlock *MBB, + llvm::MachineBasicBlock *Succ) { + for (MachineBasicBlock::succ_iterator It = MBB->succ_begin(), + End = MBB->succ_end(); + It != End; ++It) { + if (*It == Succ) { + return It; } + } - return MBB->succ_end(); + return MBB->succ_end(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h index 16b55c5c94583..1e9f0bad12d19 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h @@ -2,9 +2,9 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" -#include "llvm/MC/LaneBitmask.h" -#include "llvm/IR/CallingConv.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/MC/LaneBitmask.h" namespace llvm { @@ -37,10 +37,10 @@ using LiveSet = llvm::DenseMap; unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask, const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI); -void CollectLiveSetPressure( - const LiveSet &liveSet, - const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI, - unsigned &VPressure, unsigned &SPressure); +void CollectLiveSetPressure(const LiveSet &liveSet, + const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI, + unsigned &VPressure, unsigned &SPressure); bool isExecUpdateForControlFlow(llvm::MachineInstr &MI); @@ -60,60 +60,57 @@ bool removeUnusedLanes(llvm::MachineInstr &MI, llvm::MachineRegisterInfo &MRI, const llvm::SIInstrInfo *TII, llvm::SlotIndexes *SlotIndexes); -bool reach_block(llvm::MachineBasicBlock *FromBB, llvm::MachineDominatorTree *DT, +bool reach_block(llvm::MachineBasicBlock *FromBB, + llvm::MachineDominatorTree *DT, llvm::MachinePostDominatorTree *PDT, llvm::MachineLoopInfo *LI, llvm::MachineBasicBlock *ToBB); - void viewCFGWithPhi(llvm::MachineFunction &MF); void write_contribution_list(llvm::MachineFunction &MF, const char *Filename); -llvm::MachineBasicBlock *CreateNullExportBlock(llvm::MachineFunction &MF, const llvm::SIInstrInfo *TII); +llvm::MachineBasicBlock *CreateNullExportBlock(llvm::MachineFunction &MF, + const llvm::SIInstrInfo *TII); bool GetNonDebugMBBEnd(llvm::MachineBasicBlock::reverse_iterator &BBEnd, llvm::MachineBasicBlock &MBB); -void UpdatePhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB, const llvm::MachineRegisterInfo *MRI); +void UpdatePhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB, + const llvm::MachineRegisterInfo *MRI); void BuildPhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB, - llvm::SmallDenseSet &LiveOutSet, - const llvm::MachineRegisterInfo *MRI); + llvm::SmallDenseSet &LiveOutSet, + const llvm::MachineRegisterInfo *MRI); -MachineReg 
CreateVirtualRegForOperand( - MachineOpcode Opcode, - unsigned Operand, - llvm::MachineFunction &MF -); +MachineReg CreateVirtualRegForOperand(MachineOpcode Opcode, unsigned Operand, + llvm::MachineFunction &MF); -MachineReg CreateVirtualDstReg( - MachineOpcode Opcode, - llvm::MachineFunction &MF -); +MachineReg CreateVirtualDstReg(MachineOpcode Opcode, llvm::MachineFunction &MF); -bool IsExecCopy(const llvm::MachineInstr &MI, MachineReg Exec, MachineReg *pDst); +bool IsExecCopy(const llvm::MachineInstr &MI, MachineReg Exec, + MachineReg *pDst); struct MachineRegWithSubReg { - MachineReg Reg = AMDGPU::NoRegister; - unsigned SubReg = AMDGPU::NoSubRegister; + MachineReg Reg = /*NoRegister*/ 0; + unsigned SubReg = /*NoSubRegister*/ 0; }; MachineRegWithSubReg GetWqmEntryActiveMask(llvm::MachineFunction &MF); llvm::MachineInstr *GetWqmEntryActiveMaskInst(llvm::MachineFunction &MF); -// Return true if this machine instruction represents a call to the fetch shader. -// We curently have two mechanisims for calling fetch shader: +// Return true if this machine instruction represents a call to the fetch +// shader. We curently have two mechanisims for calling fetch shader: // 1. The AMDGPU_CALL_FETCH_SHADER pseudo-instruction // 2. A CALL instruction with the `FetchShaderCall` flag set to true. -bool IsFetchShaderCall(const llvm::MachineInstr* MI); - -bool IsSccLiveAt(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator MI); +bool IsFetchShaderCall(const llvm::MachineInstr *MI); +bool IsSccLiveAt(llvm::MachineBasicBlock *MBB, + llvm::MachineBasicBlock::iterator MI); // An enum used to pass additional constraints to // `FindOrCreateInsertionPointForSccDef()`. This will further // constrain the location where the scc def can be inserted. -enum SccDefInsertPointConstraintFlags -{ - None = 0, // No additional constraints. - NoExecWrite = 1, // Should be no modification of exec between BeforeInst and insert point. +enum SccDefInsertPointConstraintFlags { + None = 0, // No additional constraints. + NoExecWrite = 1, // Should be no modification of exec between BeforeInst and + // insert point. }; // Look for a safe place to insert an instruction that defines scc. @@ -130,55 +127,53 @@ enum SccDefInsertPointConstraintFlags // as the new insert location. // llvm::MachineBasicBlock::iterator FindOrCreateInsertionPointForSccDef( - llvm::MachineBasicBlock* MBB, - llvm::MachineBasicBlock::iterator BeforeInst, - const llvm::TargetRegisterInfo* TRI, - const llvm::SIInstrInfo* TII, - llvm::MachineRegisterInfo* MRI, - SccDefInsertPointConstraintFlags Constraints = SccDefInsertPointConstraintFlags::None -); + llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator BeforeInst, + const llvm::TargetRegisterInfo *TRI, const llvm::SIInstrInfo *TII, + llvm::MachineRegisterInfo *MRI, + SccDefInsertPointConstraintFlags Constraints = + SccDefInsertPointConstraintFlags::None); // Check if LI live cross basic blocks, save all touched basic block if is // local. bool isLocalLiveInterval( const llvm::LiveInterval &LI, llvm::SlotIndexes *Indexes, llvm::SmallDenseSet &touchedMBBSet); -bool isLocalLiveInterval( - const llvm::LiveInterval &LI, llvm::SlotIndexes *Indexes); +bool isLocalLiveInterval(const llvm::LiveInterval &LI, + llvm::SlotIndexes *Indexes); // build liveRegSet at end of each MBB. 
void buildEndLiveMap( llvm::LiveIntervals *LIS, llvm::MachineFunction &MF, const llvm::MachineRegisterInfo &MRI, - llvm::DenseMap - &MBBLiveMap, bool After); + llvm::DenseMap &MBBLiveMap, bool After); -void dumpLiveSet(const LiveSet &LiveSet, - const llvm::SIRegisterInfo *SIRI); +void dumpLiveSet(const LiveSet &LiveSet, const llvm::SIRegisterInfo *SIRI); -unsigned GetCurrentVGPRCount(llvm::MachineFunction &MF, const llvm::SIRegisterInfo *SIRI); -unsigned GetCurrentSGPRCount(llvm::MachineFunction &MF, const llvm::SIRegisterInfo *SIRI); +unsigned GetCurrentVGPRCount(llvm::MachineFunction &MF, + const llvm::SIRegisterInfo *SIRI); +unsigned GetCurrentSGPRCount(llvm::MachineFunction &MF, + const llvm::SIRegisterInfo *SIRI); bool isFastMathInst(llvm::MachineInstr &MI); namespace pressure { void print_reg(llvm::Register Reg, const llvm::MachineRegisterInfo &MRI, - const llvm::SIRegisterInfo *SIRI, - llvm::raw_ostream &os); + const llvm::SIRegisterInfo *SIRI, llvm::raw_ostream &os); void write_pressure(llvm::MachineFunction &MF, llvm::LiveIntervals *LIS, const char *Filename); void write_pressure(llvm::MachineFunction &MF, llvm::LiveIntervals *LIS, llvm::raw_ostream &os); -} +} // namespace pressure // bool IsLdsSpillSupportedForHwStage(xmd::HwStage Stage); // Look for the successor `Succ` of the given `MBB`. // Returns MBB->succ_end() if `Succ` is not a successor of MBB. -llvm::MachineBasicBlock::succ_iterator FindSuccessor(llvm::MachineBasicBlock* MBB, llvm::MachineBasicBlock* Succ); +llvm::MachineBasicBlock::succ_iterator +FindSuccessor(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock *Succ); // The enum and helper function for v_perm selection mask. // -// The input byte layout of v_perm is as below: +// The input byte layout of v_perm is as below: // // BYTE in[8] // in[0] = $src1_BYTE0; @@ -211,7 +206,7 @@ constexpr int buildVPermSelectMask(V_PERM_IN_BYTE_POS Sel_0, V_PERM_IN_BYTE_POS Sel_1, V_PERM_IN_BYTE_POS Sel_2, V_PERM_IN_BYTE_POS Sel_3) { - return (((int)Sel_3 << 24) | ((int)Sel_2 << 16) | - ((int)Sel_1 << 8) | (int)Sel_0); -} + return (((int)Sel_3 << 24) | ((int)Sel_2 << 16) | ((int)Sel_1 << 8) | + (int)Sel_0); } +} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp index ceb22b5ff9243..21aa5db0c6f27 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp @@ -69,7 +69,8 @@ // ... // // label: -// v3 = phi v0, v1 ; divergent! because of divergent branch. +// v3 = phi v0, v1 ; divergent! because of divergent +// branch. // // The boolean value is bit-divergent. When passed to the branch as an operand, // the branch becomes divergent, whose sync dependency will be computed as @@ -81,13 +82,14 @@ // control flow. 
// For case like // %163:sreg_64_xexec = S_MOV_B64 $exec -//bb.1: +// bb.1: //; predecessors: %bb.1, %bb.0 -// successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), %bb.2(50.00%) -// %162:vreg_512 = PHI %41:vreg_512, %bb.0, %40:vreg_512, %bb.1 +// successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), +// %bb.2(50.00%) %162:vreg_512 = PHI %41:vreg_512, %bb.0, %40:vreg_512, %bb.1 // %167:sgpr_32 = V_READFIRSTLANE_B32 %17:vgpr_32, implicit $exec // %168:sreg_64 = V_CMP_EQ_U32_e64 %167:sgpr_32, %17:vgpr_32, implicit $exec -// %166:sreg_64 = S_AND_SAVEEXEC_B64 %168:sreg_64, implicit-def $exec, implicit-def $scc, implicit $exec +// %166:sreg_64 = S_AND_SAVEEXEC_B64 %168:sreg_64, implicit-def $exec, +// implicit-def $scc, implicit $exec //... // $exec = S_XOR_B64_term $exec, %166:sreg_64, implicit-def $scc // S_CBRANCH_EXECNZ %bb.1, implicit $exec @@ -164,20 +166,20 @@ // //===----------------------------------------------------------------------===// -#include "AMDGPU.h" #include "AMDGPUMirDivergenceAnalysis.h" -#include "GCNSubtarget.h" +#include "AMDGPU.h" #include "AMDGPUSubtarget.h" +#include "GCNSubtarget.h" +#include "SIInstrInfo.h" +#include "TargetInfo/AMDGPUTargetInfo.h" #include "Utils/AMDGPUAsmUtils.h" #include "Utils/AMDGPUBaseInfo.h" -#include "TargetInfo/AMDGPUTargetInfo.h" -#include "SIInstrInfo.h" -//#include "llvm/Analysis/Passes.h" -#include "llvm/CodeGen/MachinePostDominators.h" +// #include "llvm/Analysis/Passes.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/Support/Debug.h" -//#include "newbe/cli/newbe_opts.h" // AMDGPU change. +// #include "newbe/cli/newbe_opts.h" // AMDGPU change. #include "llvm/Support/raw_ostream.h" #include @@ -1223,24 +1225,24 @@ bool llvm::isAMDGPUOpcodeDivergent(class MachineInstr *MI) { case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_nsa_gfx10: case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_si: case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_vi: - //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_gfx10: - //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_si: - //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_vi: + // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_gfx10: + // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_si: + // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_vi: case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10: case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_si: case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_vi: - //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_gfx10: - //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_nsa_gfx10: - //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_si: - //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_vi: + // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_gfx10: + // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_nsa_gfx10: + // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_si: + // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_vi: case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_gfx10: case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_nsa_gfx10: case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_si: case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_vi: - //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_gfx10: - //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_nsa_gfx10: - //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_si: - //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_vi: + // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_gfx10: + // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_nsa_gfx10: + // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_si: + // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_vi: case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_gfx10: case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_nsa_gfx10: case 
AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_si: @@ -1555,8 +1557,8 @@ bool writeBoolDst(const MachineInstr *MI, const SIRegisterInfo *SIRI, if (MO.isUse()) continue; unsigned Reg = MO.getReg(); - if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO || - Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO) + if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO || Reg == AMDGPU::VCC || + Reg == AMDGPU::VCC_LO) return true; // Check if the written register class overlaps the bool register class. @@ -1567,15 +1569,15 @@ bool writeBoolDst(const MachineInstr *MI, const SIRegisterInfo *SIRI, // // The underlying problem is that we have two notions of divergence // (bit divergence and wave divergence) but the algorithm only propagates - // wave divergence. The bit divergence is important for bools because it determines - // if a branch is uniform or not (and thus catches cases where a uniform value is - // used outside of a divergent control flow region). For bool values the - // algorithm will treat normally uniform values (i.e. scalar registers) as divergent - // in order to try and propagate bit divergence. + // wave divergence. The bit divergence is important for bools because it + // determines if a branch is uniform or not (and thus catches cases where a + // uniform value is used outside of a divergent control flow region). For + // bool values the algorithm will treat normally uniform values (i.e. scalar + // registers) as divergent in order to try and propagate bit divergence. // - // To fix all the possible bugs here I think we need to actually proagate bit - // divergence as well as wave divergences. That is a bigger fix and this check should - // cover most cases of treating a bool value as divergent. + // To fix all the possible bugs here I think we need to actually proagate + // bit divergence as well as wave divergences. That is a bigger fix and this + // check should cover most cases of treating a bool value as divergent. const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg); if (SIRI->getCommonSubClass(BoolRC, RC)) return true; @@ -1597,13 +1599,13 @@ bool isAlwaysUniformMI(const MachineInstr *MI, const SIInstrInfo *SIII, !MI->isTerminator()) return true; break; - //case AMDGPU::AMDGPU_MAKE_UNIFORM: - //case AMDGPU::AMDGPU_WAVE_READ_LANE_FIRST: + // case AMDGPU::AMDGPU_MAKE_UNIFORM: + // case AMDGPU::AMDGPU_WAVE_READ_LANE_FIRST: case AMDGPU::V_READFIRSTLANE_B32: case AMDGPU::V_READLANE_B32: - //case AMDGPU::AMDGPU_WAVE_ACTIVE_BALLOT_W32: - //case AMDGPU::AMDGPU_WAVE_ACTIVE_BALLOT_W64: - // bool readfirstlane should be 1 bit, which means bit uniform. + // case AMDGPU::AMDGPU_WAVE_ACTIVE_BALLOT_W32: + // case AMDGPU::AMDGPU_WAVE_ACTIVE_BALLOT_W64: + // bool readfirstlane should be 1 bit, which means bit uniform. return true; case AMDGPU::S_OR_B32: case AMDGPU::S_OR_B64: { @@ -1638,7 +1640,8 @@ bool isAlwaysUniformMI(const MachineInstr *MI, const SIInstrInfo *SIII, } bool isPhysicalReg(MachineRegisterInfo &MRI, Register reg) { - return reg.isPhysical();; + return reg.isPhysical(); + ; } bool isRegClass(MachineRegisterInfo &MRI, unsigned reg, unsigned regClassID) { @@ -1646,13 +1649,14 @@ bool isRegClass(MachineRegisterInfo &MRI, unsigned reg, unsigned regClassID) { } // For input reg of MF, vgpr will be divergent. 
-bool isDivergentInputReg(unsigned Reg, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) { +bool isDivergentInputReg(unsigned Reg, MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI) { if (isPhysicalReg(MRI, Reg)) { unsigned vir_reg = MRI.getLiveInVirtReg(Reg); if (SIRI->isVGPR(MRI, vir_reg)) return true; } else { - if (SIRI->isVGPR(MRI, Reg)) + if (SIRI->isVGPR(MRI, Reg)) return true; } return false; @@ -1660,8 +1664,8 @@ bool isDivergentInputReg(unsigned Reg, MachineRegisterInfo &MRI, const SIRegiste bool isSourceOfDivergence(MachineInstr *MI, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { - //if (MI->getAMDGPUFlag(MachineInstr::AMDGPUMIFlag::IsDivergent)) - // return true; + // if (MI->getAMDGPUFlag(MachineInstr::AMDGPUMIFlag::IsDivergent)) + // return true; if (isAMDGPUOpcodeDivergent(MI)) return true; @@ -1715,8 +1719,7 @@ bool isWriteExec(const MachineInstr *MI) { if (MO.isUse()) continue; unsigned Reg = MO.getReg(); - if (Reg == AMDGPU::EXEC || - Reg == AMDGPU::EXEC_LO) + if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO) return true; } return false; @@ -1735,7 +1738,6 @@ bool isVCndMask(unsigned Opcode) { } } - bool isExecRegionOp(unsigned Op) { switch (Op) { default: @@ -1812,17 +1814,18 @@ bool isInsideExecRegion(const MachineBasicBlock &MBB, return PDT.dominates(RegionEndMBB, &MBB); } -// Map from BB to nearest Exec Region. How to build? Add every MBB unless already has smaller region? -// Then when hit saveExec, propagate leaked users of define inside the exec region. +// Map from BB to nearest Exec Region. How to build? Add every MBB unless +// already has smaller region? Then when hit saveExec, propagate leaked users of +// define inside the exec region. } // namespace namespace llvm { // class DivergenceAnalysis DivergenceAnalysis::DivergenceAnalysis( - const MachineFunction &F, const MachineLoop *RegionLoop, const MachineDominatorTree &DT, - const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI, - SyncDependenceAnalysis &SDA, bool IsLCSSAForm, + const MachineFunction &F, const MachineLoop *RegionLoop, + const MachineDominatorTree &DT, const MachinePostDominatorTree &PDT, + const MachineLoopInfo &LI, SyncDependenceAnalysis &SDA, bool IsLCSSAForm, // AMDGPU change begin. DivergentJoinMapTy &JoinMap // AMDGPU change end. @@ -1841,7 +1844,7 @@ void DivergenceAnalysis::markDivergent(const ValueTy DivVal) { LLVM_DEBUG(const GCNSubtarget *ST = &F.getSubtarget(); const SIRegisterInfo *SIRI = ST->getRegisterInfo(); dbgs() << "\t MarkDivergent :"; printReg(DivVal, SIRI);); - //AMDGPU change end. + // AMDGPU change end. DivergentValues.insert(DivVal); } @@ -1948,7 +1951,7 @@ bool DivergenceAnalysis::updateTerminator(const MachineInstr &Term) const { // Check bit uniform here if not divergent. 
return !isBitUniform(Term, Processed); } - //case AMDGPU::AMDGPU_CALL_INDIRECT: + // case AMDGPU::AMDGPU_CALL_INDIRECT: case AMDGPU::SI_CALL: return true; } @@ -1965,13 +1968,10 @@ bool DivergenceAnalysis::updateNormalInstruction(const MachineInstr &I) const { continue; Register Reg = Op.getReg(); if (Reg.isPhysical()) { - if (Reg == AMDGPU::EXEC || - Reg == AMDGPU::EXEC_LO || - Reg == AMDGPU::SCC) + if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO || Reg == AMDGPU::SCC) continue; - else - if (const MachineInstr *DefMI = - findPhysicalDefineInSameMBB(Op.getParent(), Reg)) { + else if (const MachineInstr *DefMI = + findPhysicalDefineInSameMBB(Op.getParent(), Reg)) { if (isDivergent(*DefMI)) return true; } else { @@ -1986,15 +1986,17 @@ bool DivergenceAnalysis::updateNormalInstruction(const MachineInstr &I) const { return false; } -bool DivergenceAnalysis::isTemporalDivergent(const MachineBasicBlock &ObservingBlock, - const ValueTy Val, - const MachineBasicBlock &IncomingBlock) const { // AMDGPU change - const MachineBasicBlock *DefBlock = &IncomingBlock; // AMDGPU change: Take def point as incoming block for constants. +bool DivergenceAnalysis::isTemporalDivergent( + const MachineBasicBlock &ObservingBlock, const ValueTy Val, + const MachineBasicBlock &IncomingBlock) const { // AMDGPU change + const MachineBasicBlock *DefBlock = + &IncomingBlock; // AMDGPU change: Take def point as incoming block for + // constants. const auto *Inst = MRI.getUniqueVRegDef(Val); if (Inst == nullptr) return true; if (Inst) - DefBlock = Inst->getParent(); + DefBlock = Inst->getParent(); // check whether any divergent loop carrying Val terminates before control // proceeds to ObservingBlock @@ -2020,13 +2022,14 @@ static bool HasIncomingUndefValue(const PHINode_ *Phi) { // For case like // %163:sreg_64_xexec = S_MOV_B64 $exec -//bb.1: +// bb.1: //; predecessors: %bb.1, %bb.0 -// successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), %bb.2(50.00%) -// %162:vreg_512 = PHI %41:vreg_512, %bb.0, %40:vreg_512, %bb.1 +// successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), +// %bb.2(50.00%) %162:vreg_512 = PHI %41:vreg_512, %bb.0, %40:vreg_512, %bb.1 // %167:sgpr_32 = V_READFIRSTLANE_B32 %17:vgpr_32, implicit $exec // %168:sreg_64 = V_CMP_EQ_U32_e64 %167:sgpr_32, %17:vgpr_32, implicit $exec -// %166:sreg_64 = S_AND_SAVEEXEC_B64 %168:sreg_64, implicit-def $exec, implicit-def $scc, implicit $exec +// %166:sreg_64 = S_AND_SAVEEXEC_B64 %168:sreg_64, implicit-def $exec, +// implicit-def $scc, implicit $exec //... // $exec = S_XOR_B64_term $exec, %166:sreg_64, implicit-def $scc // S_CBRANCH_EXECNZ %bb.1, implicit $exec @@ -2091,8 +2094,8 @@ findSaveExec(const MachineInstr *MI, // It will only run on divergent branch, so (A, B) is not in // DivergentDisjointMap when A is uniform. static bool isJoinDivergentOnlyOnSameIncomingValue( - const PHINode_ &Phi, const DivergenceAnalysis *pDA, const MachineDominatorTree &DT, - DivergentJoinMapTy &DivergentJoinMap) { + const PHINode_ &Phi, const DivergenceAnalysis *pDA, + const MachineDominatorTree &DT, DivergentJoinMapTy &DivergentJoinMap) { // for phi which join divergent, if the incoming values from divergent // branch are the same, the phi is still uniform. 
// A @@ -2183,14 +2186,14 @@ bool DivergenceAnalysis::updatePHINode(const PHINode_ &Phi) const { // joining divergent disjoint path in Phi parent block if (isJoinDivergent(*Phi.getParent())) { // AMDGPU CHANGE BEGIN - if (true/*TODO: ENABLE_AGGRESSIVE_UNIFORM_ANALYSIS*/) { + if (true /*TODO: ENABLE_AGGRESSIVE_UNIFORM_ANALYSIS*/) { // Continue if the divergent join only on same incoming value. if (!isJoinDivergentOnlyOnSameIncomingValue(Phi, this, DT, DivergentJoinMap)) return true; } else - // AMDGPU CHANGE END - return true; + // AMDGPU CHANGE END + return true; } // An incoming value could be divergent by itself. @@ -2213,7 +2216,6 @@ bool DivergenceAnalysis::updatePHINode(const PHINode_ &Phi) const { if (isDivergent(Reg) || isTemporalDivergent(*Phi.getParent(), Reg, *BB.getMBB())) return true; - } return false; @@ -2259,7 +2261,8 @@ bool DivergenceAnalysis::inRegion(const MachineBasicBlock &BB) const { // marks all users of loop-carried values of the loop headed by LoopHeader as // divergent -void DivergenceAnalysis::taintLoopLiveOuts(const MachineBasicBlock &LoopHeader) { +void DivergenceAnalysis::taintLoopLiveOuts( + const MachineBasicBlock &LoopHeader) { auto *DivLoop = LI.getLoopFor(&LoopHeader); assert(DivLoop && "loopHeader is not actually part of a loop"); @@ -2324,7 +2327,7 @@ void DivergenceAnalysis::taintLoopLiveOuts(const MachineBasicBlock &LoopHeader) } } -void DivergenceAnalysis::pushInstruction(const MachineInstr &I) { +void DivergenceAnalysis::pushInstruction(const MachineInstr &I) { Worklist.push_back(&I); } void DivergenceAnalysis::pushPHINodes(const MachineBasicBlock &Block) { @@ -2355,8 +2358,8 @@ void DivergenceAnalysis::pushUsers(const MachineInstr &I) { } } -bool DivergenceAnalysis::propagateJoinDivergence(const MachineBasicBlock &JoinBlock, - const MachineLoop *BranchLoop) { +bool DivergenceAnalysis::propagateJoinDivergence( + const MachineBasicBlock &JoinBlock, const MachineLoop *BranchLoop) { LLVM_DEBUG(dbgs() << "\tpropJoinDiv " << JoinBlock.getName() << "\n"); // ignore divergence outside the region @@ -2403,8 +2406,10 @@ void DivergenceAnalysis::propagateBranchDivergence(const MachineInstr &Term) { } } -void DivergenceAnalysis::propagateLoopDivergence(const MachineLoop &ExitingLoop) { - LLVM_DEBUG(dbgs() << "propLoopDiv " << ExitingLoop.getHeader()->getNumber() << "\n"); +void DivergenceAnalysis::propagateLoopDivergence( + const MachineLoop &ExitingLoop) { + LLVM_DEBUG(dbgs() << "propLoopDiv " << ExitingLoop.getHeader()->getNumber() + << "\n"); // don't propagate beyond region if (!inRegion(*ExitingLoop.getHeader())) @@ -2444,20 +2449,21 @@ void DivergenceAnalysis::propagateLoopDivergence(const MachineLoop &ExitingLoop) // For case like // %149:sreg_64_xexec = S_MOV_B64 $exec // -//bb.3: +// bb.3: //; predecessors: %bb.3, %bb.2 -// successors: %bb.3(0x40000000), %bb.4(0x40000000); %bb.3(50.00%), %bb.4(50.00%) +// successors: %bb.3(0x40000000), %bb.4(0x40000000); %bb.3(50.00%), +// %bb.4(50.00%) // // %148:vreg_512 = PHI %56:vreg_512, %bb.2, %55:vreg_512, %bb.3 // %153:sgpr_32 = V_READFIRSTLANE_B32 %36:vgpr_32, implicit $exec // %154:sreg_64 = V_CMP_EQ_U32_e64 %153:sgpr_32, %36:vgpr_32, implicit $exec -// %152:sreg_64 = S_AND_SAVEEXEC_B64 %154:sreg_64, implicit-def $exec, implicit-def $scc, implicit $exec -// $m0 = S_MOV_B32 %153:sgpr_32 -// %55:vreg_512 = V_MOVRELD_B32_V16 %148:vreg_512(tied-def 0), -2, 0, implicit $m0, implicit $exec -// $exec = S_XOR_B64_term $exec, %152:sreg_64, implicit-def $scc +// %152:sreg_64 = S_AND_SAVEEXEC_B64 %154:sreg_64, 
implicit-def $exec, +// implicit-def $scc, implicit $exec $m0 = S_MOV_B32 %153:sgpr_32 %55:vreg_512 +// = V_MOVRELD_B32_V16 %148:vreg_512(tied-def 0), -2, 0, implicit $m0, implicit +// $exec $exec = S_XOR_B64_term $exec, %152:sreg_64, implicit-def $scc // S_CBRANCH_EXECNZ %bb.3, implicit $exec // -//bb.4: +// bb.4: //; predecessors: %bb.3 // successors: %bb.5(0x80000000); %bb.5(100.00%) // @@ -2596,7 +2602,7 @@ void DivergenceAnalysis::compute() { // propagate divergence while (!Worklist.empty()) { - const MachineInstr *I= Worklist.back(); + const MachineInstr *I = Worklist.back(); Worklist.pop_back(); // maintain uniformity of overrides @@ -2715,23 +2721,23 @@ bool DivergenceAnalysis::isDivergent(const MachineInstr &I) const { void DivergenceAnalysis::print(raw_ostream &OS, const Module_ *) const { // iterate instructions using instructions() to ensure a deterministic order. for (auto &MBB : F) - for (auto &I : MBB) { - if (isDivergent(I)) - OS << "DIVERGENT:" << I ; - // AMDGPU changes begin - else - OS << "UNIFORM:" << I ; - // AMDGPU changes end - } + for (auto &I : MBB) { + if (isDivergent(I)) + OS << "DIVERGENT:" << I; + // AMDGPU changes begin + else + OS << "UNIFORM:" << I; + // AMDGPU changes end + } } // class GPUDivergenceAnalysis -MirGPUDivergenceAnalysis::MirGPUDivergenceAnalysis(MachineFunction &F, - const MachineDominatorTree &DT, - const MachinePostDominatorTree &PDT, - const MachineLoopInfo &LI) - : SDA(DT, PDT, LI, /*AMDGPU change*/DivergentJoinMap), - DA(F, nullptr, DT, PDT, LI, SDA, false, /*AMDGPU change*/DivergentJoinMap) { +MirGPUDivergenceAnalysis::MirGPUDivergenceAnalysis( + MachineFunction &F, const MachineDominatorTree &DT, + const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI) + : SDA(DT, PDT, LI, /*AMDGPU change*/ DivergentJoinMap), + DA(F, nullptr, DT, PDT, LI, SDA, false, + /*AMDGPU change*/ DivergentJoinMap) { MachineRegisterInfo &MRI = F.getRegInfo(); const GCNSubtarget *ST = &F.getSubtarget(); const SIRegisterInfo *SIRI = ST->getRegisterInfo(); @@ -2758,10 +2764,11 @@ bool MirGPUDivergenceAnalysis::isDivergent(const MachineInstr *I) const { return DA.isDivergent(*I); } -void MirGPUDivergenceAnalysis::print(raw_ostream &OS, const Module_ *mod) const { +void MirGPUDivergenceAnalysis::print(raw_ostream &OS, + const Module_ *mod) const { OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n"; DA.print(OS, mod); OS << "}\n"; } -} +} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h index edcf96ec44a4d..e721ac323255e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h @@ -1,4 +1,4 @@ -//===- AMDGPUMirDivergenceAnalysis.h - Mir Divergence Analysis -*- C++ -*-===// +//===- AMDGPUMirDivergenceAnalysis.h - Mir Divergence Analysis -*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -14,11 +14,11 @@ #pragma once -#include "llvm/ADT/DenseSet.h" +#include "AMDGPUMirSyncDependenceAnalysis.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineFunction.h" -#include "AMDGPUMirSyncDependenceAnalysis.h" #include "llvm/Pass.h" #include @@ -50,8 +50,10 @@ class DivergenceAnalysis { /// Otherwise the whole function is analyzed. 
/// \param IsLCSSAForm whether the analysis may assume that the IR in the /// region in in LCSSA form. - DivergenceAnalysis(const llvm::MachineFunction &F, const MachineLoop *RegionLoop, - const MachineDominatorTree &DT, const MachinePostDominatorTree &PDT, + DivergenceAnalysis(const llvm::MachineFunction &F, + const MachineLoop *RegionLoop, + const MachineDominatorTree &DT, + const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI, SyncDependenceAnalysis &SDA, bool IsLCSSAForm, // AMDGPU change begin. @@ -98,10 +100,12 @@ class DivergenceAnalysis { bool updateTerminator(const MachineInstr &Term) const; bool updatePHINode(const PHINode_ &Phi) const; bool updateVCndMask(const MachineInstr &VCndMask) const; - bool isBitUniform(const MachineInstr &I, - llvm::DenseMap &Processed) const; - bool isBitUniform(const MachineInstr &I, const llvm::MachineOperand &UseMO, - llvm::DenseMap &Processed) const; + bool + isBitUniform(const MachineInstr &I, + llvm::DenseMap &Processed) const; + bool + isBitUniform(const MachineInstr &I, const llvm::MachineOperand &UseMO, + llvm::DenseMap &Processed) const; /// \brief Computes whether \p Inst is divergent based on the /// divergence of its operands. @@ -136,9 +140,9 @@ class DivergenceAnalysis { } /// \brief Whether \p Val is divergent when read in \p ObservingBlock. - bool isTemporalDivergent(const MachineBasicBlock &ObservingBlock, - const ValueTy Val, - const MachineBasicBlock &incomingBlock) const; // AMDGPU change + bool isTemporalDivergent( + const MachineBasicBlock &ObservingBlock, const ValueTy Val, + const MachineBasicBlock &incomingBlock) const; // AMDGPU change /// \brief Whether \p Block is join divergent /// @@ -207,14 +211,14 @@ class DivergenceAnalysis { // Set of known-uniform values. llvm::DenseSet UniformOverrides; - llvm::DenseSet UniformOverridesInsts; + llvm::DenseSet UniformOverridesInsts; // Blocks with joining divergent control from different predecessors. llvm::DenseSet DivergentJoinBlocks; // Detected/marked divergent values. llvm::DenseSet DivergentValues; - llvm::DenseSet DivergentInsts; + llvm::DenseSet DivergentInsts; // Mir change for EXEC control flow. // Map from MBB to the exec region it belongs too. @@ -226,16 +230,15 @@ class DivergenceAnalysis { struct ExecRegion { const llvm::MachineInstr *begin; const llvm::MachineInstr *end; - std::vector blocks; + std::vector blocks; bool bPropagated = false; - ExecRegion(const llvm::MachineInstr *b, - const llvm::MachineInstr *e) + ExecRegion(const llvm::MachineInstr *b, const llvm::MachineInstr *e) : begin(b), end(e), bPropagated(false) {} }; llvm::DenseMap ExecRegionMap; // Internal worklist for divergence propagation. - std::vector Worklist; + std::vector Worklist; }; /// \brief Divergence analysis frontend for GPU kernels. @@ -251,15 +254,17 @@ class MirGPUDivergenceAnalysis { // When A is divergent branch, B and C are divergent join at D. // Then DivergentJoinMap[B].count(C) > 0 and // DivergentJoinMap[C].count(B) > 0. - DivergentJoinMapTy DivergentJoinMap; + DivergentJoinMapTy DivergentJoinMap; // AMDGPU change end SyncDependenceAnalysis SDA; DivergenceAnalysis DA; public: /// Runs the divergence analysis on @F, a GPU kernel - MirGPUDivergenceAnalysis(llvm::MachineFunction &F, const MachineDominatorTree &DT, - const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI); + MirGPUDivergenceAnalysis(llvm::MachineFunction &F, + const MachineDominatorTree &DT, + const MachinePostDominatorTree &PDT, + const MachineLoopInfo &LI); /// Whether any divergence was detected. 
bool hasDivergence() const { return DA.hasDetectedDivergence(); } @@ -278,4 +283,3 @@ class MirGPUDivergenceAnalysis { }; } // namespace llvm - diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp index 7213f7b4b11b4..302939c76a4df 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp @@ -1,4 +1,5 @@ -//===- MirSyncDependenceAnalysis.cpp - Mir Divergent Branch Dependence Calculation +//===- MirSyncDependenceAnalysis.cpp - Mir Divergent Branch Dependence +//Calculation //--===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. @@ -101,15 +102,15 @@ // loop exit and the loop header (_after_ SSA construction). // //===----------------------------------------------------------------------===// +#include "AMDGPUMirSyncDependenceAnalysis.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachinePostDominators.h" -#include "AMDGPUMirSyncDependenceAnalysis.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include #include @@ -120,19 +121,18 @@ namespace llvm { ConstBlockSet SyncDependenceAnalysis::EmptyBlockSet; -SyncDependenceAnalysis::SyncDependenceAnalysis(const MachineDominatorTree &DT, - const MachinePostDominatorTree &PDT, - const MachineLoopInfo &LI, - // AMDGPU change begin. - DivergentJoinMapTy &JoinMap - // AMDGPU change end. +SyncDependenceAnalysis::SyncDependenceAnalysis( + const MachineDominatorTree &DT, const MachinePostDominatorTree &PDT, + const MachineLoopInfo &LI, + // AMDGPU change begin. + DivergentJoinMapTy &JoinMap + // AMDGPU change end. ) : FuncRPOT(DT.getRoot()->getParent()), DT(DT), PDT(PDT), LI(LI), - // AMDGPU change begin. + // AMDGPU change begin. DivergentJoinMap(JoinMap) - // AMDGPU change end. -{ -} +// AMDGPU change end. +{} SyncDependenceAnalysis::~SyncDependenceAnalysis() {} @@ -155,19 +155,23 @@ struct DivergencePropagator { // if DefMap[B] ~ undef then we haven't seen B yet // if DefMap[B] == B then B is a join point of disjoint paths from X or B is // an immediate successor of X (initial value). 
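  // (Editorial sketch, not part of the original patch; block names are made
  // up.) How this reaching-definition map detects a join on a simple diamond:
  // a divergent branch in X targets A and B, and both fall through to J.
  //   bootstrap:       DefMap[A] = A; DefMap[B] = B;   // immediate successors
  //   visit J via A:   DefMap[J] = A;
  //   visit J via B:   A != B, so DefMap[J] = J  -> J is recorded as a join.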
- using DefiningBlockMap = std::map; + using DefiningBlockMap = + std::map; DefiningBlockMap DefMap; // all blocks with pending visits std::unordered_set PendingUpdates; - DivergencePropagator(const FunctionRPOT &FuncRPOT, const MachineDominatorTree &DT, - const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI) + DivergencePropagator(const FunctionRPOT &FuncRPOT, + const MachineDominatorTree &DT, + const MachinePostDominatorTree &PDT, + const MachineLoopInfo &LI) : FuncRPOT(FuncRPOT), DT(DT), PDT(PDT), LI(LI), JoinBlocks(new ConstBlockSet) {} // set the definition at @block and mark @block as pending for a visit - void addPending(const MachineBasicBlock &Block, const MachineBasicBlock &DefBlock) { + void addPending(const MachineBasicBlock &Block, + const MachineBasicBlock &DefBlock) { bool WasAdded = DefMap.emplace(&Block, &DefBlock).second; if (WasAdded) PendingUpdates.insert(&Block); @@ -190,7 +194,8 @@ struct DivergencePropagator { // process @succBlock with reaching definition @defBlock // the original divergent branch was in @parentLoop (if any) - void visitSuccessor(const MachineBasicBlock &SuccBlock, const MachineLoop *ParentLoop, + void visitSuccessor(const MachineBasicBlock &SuccBlock, + const MachineLoop *ParentLoop, const MachineBasicBlock &DefBlock) { // @succBlock is a loop exit @@ -223,14 +228,14 @@ struct DivergencePropagator { // divergent exits. // @rootBlock is either the block containing the branch or the header of the // divergent loop. - // @nodeSuccessors is the set of successors of the node (MachineLoop or Terminator) - // headed by @rootBlock. - // @parentLoop is the parent loop of the MachineLoop or the loop that contains the - // Terminator. + // @nodeSuccessors is the set of successors of the node (MachineLoop or + // Terminator) headed by @rootBlock. + // @parentLoop is the parent loop of the MachineLoop or the loop that contains + // the Terminator. template - std::unique_ptr - computeJoinPoints(const MachineBasicBlock &RootBlock, - SuccessorIterable NodeSuccessors, const MachineLoop *ParentLoop, const MachineBasicBlock * PdBoundBlock) { + std::unique_ptr computeJoinPoints( + const MachineBasicBlock &RootBlock, SuccessorIterable NodeSuccessors, + const MachineLoop *ParentLoop, const MachineBasicBlock *PdBoundBlock) { assert(JoinBlocks); // bootstrap with branch targets @@ -250,7 +255,8 @@ struct DivergencePropagator { auto ItBeginRPO = FuncRPOT.begin(); // skip until term (TODO RPOT won't let us start at @term directly) - for (; *ItBeginRPO != &RootBlock; ++ItBeginRPO) {} + for (; *ItBeginRPO != &RootBlock; ++ItBeginRPO) { + } auto ItEndRPO = FuncRPOT.end(); assert(ItBeginRPO != ItEndRPO); @@ -337,30 +343,26 @@ struct DivergencePropagator { // | B C // | | / | // +--L P - // + // // In this cfg, C is the RootBlock and P is C's post-dominator. // It will only visit L and P and then stop because it hits the // post dominator. Most loops do not hit this case because the // loop exiting block (C) will branch directly back to the loop // header. 
- // - if (HeaderDefBlock) - { - for (const auto *ExitBlock : ReachedLoopExits) { - auto ItExitDef = DefMap.find(ExitBlock); - assert((ItExitDef != DefMap.end()) && - "no reaching def at reachable loop exit"); - if (ItExitDef->second != HeaderDefBlock) { - JoinBlocks->insert(ExitBlock); - } - } - } - else - { - for (const auto *ExitBlock : ReachedLoopExits) - { - JoinBlocks->insert(ExitBlock); + // + if (HeaderDefBlock) { + for (const auto *ExitBlock : ReachedLoopExits) { + auto ItExitDef = DefMap.find(ExitBlock); + assert((ItExitDef != DefMap.end()) && + "no reaching def at reachable loop exit"); + if (ItExitDef->second != HeaderDefBlock) { + JoinBlocks->insert(ExitBlock); } + } + } else { + for (const auto *ExitBlock : ReachedLoopExits) { + JoinBlocks->insert(ExitBlock); + } } } @@ -370,12 +372,14 @@ struct DivergencePropagator { // AMDGPU change begin. // For all join blocks caused by divergent RootBlock, the prevs of a join block -// which are in DefMap or the RootBlock are divergent join each other on the join block because -// of divergent RootBlock. -static void updateJoinMap( - const MachineBasicBlock *RootBlock, - DenseMap> &JoinMap, - DivergencePropagator::DefiningBlockMap &DefMap, ConstBlockSet &JoinBlocks) { +// which are in DefMap or the RootBlock are divergent join each other on the +// join block because of divergent RootBlock. +static void +updateJoinMap(const MachineBasicBlock *RootBlock, + DenseMap> &JoinMap, + DivergencePropagator::DefiningBlockMap &DefMap, + ConstBlockSet &JoinBlocks) { for (const MachineBasicBlock *JoinBB : JoinBlocks) { // makr divergent join for all pred pair which in DefMap. for (auto predIt = JoinBB->pred_begin(); predIt != JoinBB->pred_end(); @@ -400,7 +404,8 @@ static void updateJoinMap( } // AMDGPU change end. -const ConstBlockSet &SyncDependenceAnalysis::join_blocks(const MachineLoop &MachineLoop) { +const ConstBlockSet & +SyncDependenceAnalysis::join_blocks(const MachineLoop &MachineLoop) { using LoopExitVec = SmallVector; LoopExitVec LoopExits; MachineLoop.getExitBlocks(LoopExits); @@ -415,7 +420,8 @@ const ConstBlockSet &SyncDependenceAnalysis::join_blocks(const MachineLoop &Mach } // dont propagte beyond the immediate post dom of the loop - const auto *PdNode = PDT.getNode(const_cast(MachineLoop.getHeader())); + const auto *PdNode = + PDT.getNode(const_cast(MachineLoop.getHeader())); const auto *IpdNode = PdNode->getIDom(); const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; while (PdBoundBlock && MachineLoop.contains(PdBoundBlock)) { @@ -426,15 +432,17 @@ const ConstBlockSet &SyncDependenceAnalysis::join_blocks(const MachineLoop &Mach // compute all join points DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI}; auto JoinBlocks = Propagator.computeJoinPoints( - *MachineLoop.getHeader(), LoopExits, MachineLoop.getParentLoop(), PdBoundBlock); + *MachineLoop.getHeader(), LoopExits, MachineLoop.getParentLoop(), + PdBoundBlock); // AMDGPU change begin. // Save divergent join pairs. updateJoinMap(MachineLoop.getHeader(), DivergentJoinMap, Propagator.DefMap, - *JoinBlocks.get()); + *JoinBlocks.get()); // AMDGPU change end. 
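  // (Editorial note, not part of the original patch.) Tying this to the
  // example in the header: when a divergent branch in A has disjoint paths
  // through B and C that meet at D, updateJoinMap records B and C as a
  // divergent-join pair, so that DivergentJoinMap[B].count(C) > 0 and
  // DivergentJoinMap[C].count(B) > 0. isJoinDivergentOnlyOnSameIncomingValue
  // later uses these pairs to keep a phi in D uniform when B and C feed it
  // the same incoming value.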
- auto ItInserted = CachedLoopExitJoins.emplace(&MachineLoop, std::move(JoinBlocks)); + auto ItInserted = + CachedLoopExitJoins.emplace(&MachineLoop, std::move(JoinBlocks)); assert(ItInserted.second); return *ItInserted.first->second; } @@ -452,18 +460,18 @@ SyncDependenceAnalysis::join_blocks(const MachineInstr &Term) { return *ItCached->second; // dont propagate beyond the immediate post dominator of the branch - const auto *PdNode = PDT.getNode(const_cast(Term.getParent())); + const auto *PdNode = + PDT.getNode(const_cast(Term.getParent())); const auto *IpdNode = PdNode->getIDom(); const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; - // compute all join points DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI}; const auto &TermBlock = *Term.getParent(); - + // AMDGPU CHANGE // Make sure the post-dominator is outside the loop for the loop header. - // Otherwise, we may not find all the join blocks in the loop + // Otherwise, we may not find all the join blocks in the loop // because the search stops too early. Some join points can be reached // after the post-dominator! // @@ -477,30 +485,30 @@ SyncDependenceAnalysis::join_blocks(const MachineInstr &Term) { // // In this cfg, A is the loop header and P is A's post-dominator. // The algorithm to mark join points does an Reverse Post Order walk - // from A and stops when it reaches the post dominator. It would not + // from A and stops when it reaches the post dominator. It would not // mark the phi node in L as divergent even when A had a divergent branch. // The fix we made was to make the join point search continue all the way // to the loops post dominator (which is X in this example). // // NOTE: They already made this change for the loop case above, but for - // a different bug apparently. See SyncDependenceAnalysis::join_blocks(MachineLoop&) - // + // a different bug apparently. See + // SyncDependenceAnalysis::join_blocks(MachineLoop&) + // const MachineLoop *MachineLoop = LI.getLoopFor(&TermBlock); - if (MachineLoop && (MachineLoop->getHeader() == &TermBlock)) - { - while (PdBoundBlock && MachineLoop->contains(PdBoundBlock)) { - IpdNode = IpdNode->getIDom(); - PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; - } + if (MachineLoop && (MachineLoop->getHeader() == &TermBlock)) { + while (PdBoundBlock && MachineLoop->contains(PdBoundBlock)) { + IpdNode = IpdNode->getIDom(); + PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; + } } - + auto JoinBlocks = Propagator.computeJoinPoints( TermBlock, Term.getParent()->successors(), MachineLoop, PdBoundBlock); // AMDGPU change begin. // Save divergent join pairs. updateJoinMap(&TermBlock, DivergentJoinMap, Propagator.DefMap, - *JoinBlocks.get()); + *JoinBlocks.get()); // AMDGPU change end. auto ItInserted = CachedBranchJoins.emplace(&Term, std::move(JoinBlocks)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h index a52bcc7bc9e7c..92059d85b848a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h @@ -1,4 +1,5 @@ -//===- MirSyncDependenceAnalysis.h - MirDivergent Branch Dependence -*- C++ -*-===// +//===- MirSyncDependenceAnalysis.h - MirDivergent Branch Dependence -*- C++ +//-*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -20,8 +21,9 @@ #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/CodeGen/MachineBasicBlock.h" -#include +#include "llvm/CodeGen/MachineFunction.h" #include +#include namespace llvm { class MachineBasicBlock; @@ -44,14 +46,16 @@ using ConstBlockSet = llvm::SmallPtrSet; /// This analysis relates points of divergent control to points of converging /// divergent control. The analysis requires all loops to be reducible. class SyncDependenceAnalysis { - void visitSuccessor(const MachineBasicBlock &succBlock, const MachineLoop *termLoop, + void visitSuccessor(const MachineBasicBlock &succBlock, + const MachineLoop *termLoop, const MachineBasicBlock *defBlock); public: bool inRegion(const MachineBasicBlock &BB) const; ~SyncDependenceAnalysis(); - SyncDependenceAnalysis(const MachineDominatorTree &DT, const MachinePostDominatorTree &PDT, + SyncDependenceAnalysis(const MachineDominatorTree &DT, + const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI, // AMDGPU change begin DivergentJoinMapTy &JoinMap @@ -88,11 +92,10 @@ class SyncDependenceAnalysis { // AMDGPU change begin. DivergentJoinMapTy &DivergentJoinMap; // AMDGPU change end. - std::map> CachedLoopExitJoins; + std::map> + CachedLoopExitJoins; std::map> CachedBranchJoins; }; } // namespace llvm - - diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp index 648df7f724617..2e48ec44f979c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp @@ -1,22 +1,22 @@ -//===-- AMDGPUOccupancyAndLatencyHelper - Helper functions for occupancy and latency --===// +//==- AMDGPUOccupancyAndLatencyHelper.cpp - Helpers for occupancy + latency ==// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -//===--------------------------------------------------------------------------------===// +//==------------------------------------------------------------------------==// // /// \file /// \brief Helper functions for occupancy and latency. // -//===--------------------------------------------------------------------------------===// +//==------------------------------------------------------------------------==// -#include "SIInstrInfo.h" -#include "SIRegisterInfo.h" +#include "AMDGPUOccupancyAndLatencyHelper.h" #include "AMDGPUSubtarget.h" #include "GCNSubtarget.h" -#include "AMDGPUOccupancyAndLatencyHelper.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" #include "llvm/CodeGen/MachineLoopInfo.h" @@ -57,7 +57,7 @@ bool SchedScore::isBetter(const SchedScore &s) const { bool SchedScore::isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc) const { unsigned gain = latencyGain(TargetOccupancy, ExtraOcc); // 10% is good enough. 
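  // (Editorial example with made-up numbers, not part of the original patch.)
  // With MemLatency = 2000, TargetOccupancy = 4 and ExtraOcc = 1, latencyGain
  // returns 2000/4 - 2000/5 = 100, so the score counts as memory bound
  // whenever 10 * 100 >= Alu, i.e. the extra wave hides latency worth at
  // least 10% of the ALU work.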
- if ((10*gain) >= Alu) + if ((10 * gain) >= Alu) return true; else return false; @@ -65,7 +65,7 @@ bool SchedScore::isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc) const { unsigned SchedScore::latencyGain(unsigned TgtOcc, unsigned ExtraOcc) const { unsigned latency = MemLatency; - return (latency / (TgtOcc))- (latency / (TgtOcc + ExtraOcc)); + return (latency / (TgtOcc)) - (latency / (TgtOcc + ExtraOcc)); } // AMDGPULatencyTracker @@ -73,7 +73,8 @@ AMDGPULatencyTracker::AMDGPULatencyTracker(const GCNSubtarget &ST) : SIII(ST.getInstrInfo()), ItinerayData(ST.getInstrItineraryData()) {} void AMDGPULatencyTracker::scan(const MachineInstr &MI) { - if (MI.isDebugInstr()) return; + if (MI.isDebugInstr()) + return; int latency = SIII->getInstrLatency(ItinerayData, MI); // If inside latency hide. if (!LatencyMIs.empty()) { @@ -184,5 +185,3 @@ SchedScore CollectLatency(MachineFunction &MF, const llvm::GCNSubtarget &ST, } } // namespace llvm - - diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h index f108bab24bd39..a9a15f7538a58 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h @@ -1,18 +1,19 @@ -//===-- AMDGPUOccupancyAndLatencyHelper - Helper functions for occupancy and latency --===// +//==- AMDGPUOccupancyAndLatencyHelper.cpp - Helpers for occupancy + latency ==// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -//===--------------------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// // /// \file /// \brief Helper functions for occupancy and latency. // -//===--------------------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// #include "llvm/ADT/DenseMap.h" +#include "llvm/MC/MCInstrItineraries.h" namespace llvm { @@ -30,7 +31,7 @@ struct SchedScore { unsigned MemLatency = 0; // Only save mem latency. // We want mem latency small and hide big. Compare // memLatency - hide * Occ, smaller is better. - unsigned MixAlu = 0; // VAlu and SAlu can running parallel if Occ > 1. + unsigned MixAlu = 0; // VAlu and SAlu can running parallel if Occ > 1. unsigned Alu = 0; // avoid sequence of s_alu inst count less then occupancy. unsigned Lds = 0; // Todo: count lds. SchedScore() {} @@ -39,9 +40,9 @@ struct SchedScore { float computeScore() const; float computeScore2() const; - void sum(const SchedScore &s, unsigned loopDepth=0); + void sum(const SchedScore &s, unsigned loopDepth = 0); bool isBetter(const SchedScore &s) const; - bool isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc=1) const; + bool isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc = 1) const; // More latency can be hiden with ExtraOcc. 
unsigned latencyGain(unsigned TargetOccupancy, unsigned ExtraOcc) const; }; @@ -71,4 +72,4 @@ struct AMDGPULatencyTracker { SchedScore CollectLatency(llvm::MachineFunction &MF, const llvm::GCNSubtarget &ST, const llvm::MachineLoopInfo *MLI = nullptr); -} +} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp index a0f2a5d4dc121..b133659d8fb66 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp @@ -1,9 +1,9 @@ -#include "llvm/CodeGen/MachinePostDominators.h" -#include "llvm/CodeGen/SlotIndexes.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/SlotIndexes.h" -//#include "dxc/DXIL/DxilMetadataHelper.h" +// #include "dxc/DXIL/DxilMetadataHelper.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/raw_ostream.h" @@ -14,9 +14,9 @@ #include "llvm/Support/Debug.h" -#include "GCNRegPressure.h" #include "AMDGPUMIRUtils.h" #include "AMDGPUSubExpDag.h" +#include "GCNRegPressure.h" #include #define DEBUG_TYPE "xb-sub-exp-dag" @@ -27,37 +27,35 @@ namespace llvm { // Expression Dag. #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void SubExp::dump(const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) const { - dbgs() << "\nSubExp:\n"; - dbgs() << "input regs:\n"; - for (auto &input : inputLive) { - pressure::print_reg(input.first, MRI, SIRI, llvm::dbgs()); - dbgs() << "\n"; - } - dbgs() << "output regs:\n"; - for (auto &output : outputLive) { - pressure::print_reg(output.first, MRI, SIRI, llvm::dbgs()); - dbgs() << "\n"; - } +void SubExp::dump(const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI) const { + dbgs() << "\nSubExp:\n"; + dbgs() << "input regs:\n"; + for (auto &input : inputLive) { + pressure::print_reg(input.first, MRI, SIRI, llvm::dbgs()); + dbgs() << "\n"; + } + dbgs() << "output regs:\n"; + for (auto &output : outputLive) { + pressure::print_reg(output.first, MRI, SIRI, llvm::dbgs()); + dbgs() << "\n"; + } - for (MachineInstr *MI : SUnits) { - MI->dump(); - } - dbgs() << "End of SubExp\n"; + for (MachineInstr *MI : SUnits) { + MI->dump(); + } + dbgs() << "End of SubExp\n"; } #endif -bool SubExp::modifiesRegister(unsigned Reg, const SIRegisterInfo* SIRI) const -{ - for (const MachineInstr *MI : SUnits) - { - if (MI->modifiesRegister(Reg, SIRI)) - { - return true; - } +bool SubExp::modifiesRegister(unsigned Reg, const SIRegisterInfo *SIRI) const { + for (const MachineInstr *MI : SUnits) { + if (MI->modifiesRegister(Reg, SIRI)) { + return true; } + } - return false; + return false; } void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI, @@ -95,7 +93,9 @@ void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI, for (auto it = SUnits.rbegin(); it != SUnits.rend(); it++) { MachineInstr *MI = *it; - auto *ST = &MI->getMF()->getSubtarget(); // TODO: Better way to get this. + auto *ST = + &MI->getMF() + ->getSubtarget(); // TODO: Better way to get this. 
for (MachineOperand &MO : MI->operands()) { if (!MO.isReg()) continue; @@ -149,8 +149,8 @@ bool SubExp::isSafeToMove(const MachineRegisterInfo &MRI, bool bMoveUp) const { } ExpDag::ExpDag(const llvm::MachineRegisterInfo &MRI, - const llvm::SIRegisterInfo *SIRI, - const SIInstrInfo *SIII, const bool bJoinInput) + const llvm::SIRegisterInfo *SIRI, const SIInstrInfo *SIII, + const bool bJoinInput) : MRI(MRI), SIRI(SIRI), SIII(SIII), bJoinInputToSubExp(bJoinInput) {} template @@ -196,9 +196,9 @@ template void ExpDag::build>(const LiveSet &InputLiveReg, const LiveSet &OutputLiveReg, DenseSet &instRange); -template void ExpDag::build>(const LiveSet &InputLiveReg, - const LiveSet &OutputLiveReg, - std::vector &instRange); +template void ExpDag::build>( + const LiveSet &InputLiveReg, const LiveSet &OutputLiveReg, + std::vector &instRange); void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { @@ -311,7 +311,8 @@ void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, // UserMI should always be in same subExp. unsigned UseSubIdx = SubtreeClasses[UseSU->NodeNum]; if (UseSubIdx != OriginSubIdx) { - // When reg has multiple def, it is possible for user def in different subExp. + // When reg has multiple def, it is possible for user def in + // different subExp. if (MRI.getUniqueVRegDef(Reg)) llvm::report_fatal_error("user and def in different subExp"); break; @@ -470,9 +471,8 @@ void BlockExpDag::buildWithPressure() { buildPressure(StartLiveReg, EndLiveReg); } -void BlockExpDag::buildAvail( - const LiveSet &passThruSet, - DenseMap &DagAvailRegMap) { +void BlockExpDag::buildAvail(const LiveSet &passThruSet, + DenseMap &DagAvailRegMap) { DenseSet Processed; DenseSet WorkList; @@ -596,10 +596,10 @@ void BlockExpDag::buildPressure(const LiveSet &StartLiveReg, // Using pass thru as base because output of current SU should not // affect other output SUs. GCNUpwardRPTracker RP(*LIS); - RP.reset(BeginMI, &passThruSet, /*After*/true); + RP.reset(BeginMI, &passThruSet, /*After*/ true); MachineInstr *MI = SU.getInstr(); if (MI) { - RP.reset(*MI, &passThruSet, /*After*/true); + RP.reset(*MI, &passThruSet, /*After*/ true); RP.recede(*MI); } DagPressureMap[&SU] = RP.getLiveRegs(); @@ -639,9 +639,9 @@ void BlockExpDag::buildPressure(const LiveSet &StartLiveReg, GCNRPTracker::LiveRegSet SuccLive = DagPressureMap[SuccSU]; GCNUpwardRPTracker RP(*LIS); - RP.reset(BeginMI, &SuccLive, /*After*/true); + RP.reset(BeginMI, &SuccLive, /*After*/ true); if (MI) { - RP.reset(*MI, &SuccLive, /*After*/true); + RP.reset(*MI, &SuccLive, /*After*/ true); // Update SuccLive based on MI. RP.recede(*MI); } @@ -684,9 +684,7 @@ std::string ExpDag::getGraphNodeLabel(const SUnit *SU) const { } /// Return the label. -std::string ExpDag::getDAGName() const { - return "dag.exp"; -} +std::string ExpDag::getDAGName() const { return "dag.exp"; } /// viewGraph - Pop up a ghostview window with the reachable parts of the DAG /// rendered using 'dot'. @@ -707,7 +705,7 @@ void ExpDag::dump() { viewGraph(getDAGName(), "Exp Dag Graph for " + getDAGName()); } -} +} // namespace llvm // Expression Dag dump. 
namespace llvm { @@ -757,7 +755,8 @@ struct DOTGraphTraits : public DefaultDOTGraphTraits { SS << "SU:" << SU->NodeNum; return SS.str(); } - static std::string getNodeDescription(const SUnit *SU, const llvm::ExpDag *G) { + static std::string getNodeDescription(const SUnit *SU, + const llvm::ExpDag *G) { return G->getGraphNodeLabel(SU); } static std::string getNodeAttributes(const SUnit *N, @@ -804,7 +803,9 @@ void getRegBound(llvm::MachineBasicBlock *MBB, const GCNRPTracker::LiveRegSet outputLive = llvm::getLiveRegs(EndSlot, *LIS, MRI); - auto* ST = &MBB->getParent()->getSubtarget(); // TODO: Better way to get this. + auto *ST = + &MBB->getParent() + ->getSubtarget(); // TODO: Better way to get this. if (MBB->empty()) { GCNRegPressure MaxPressure = getRegPressure(MRI, outputLive); MaxSGPR = MaxPressure.getSGPRNum(); @@ -845,7 +846,7 @@ void getRegBound(llvm::MachineBasicBlock *MBB, auto SchedResult = hrbSched(SUnits, BotRoots, MRI, SIRI); GCNUpwardRPTracker RPTracker(*LIS); - RPTracker.reset(MBB->front(), &outputLive, /*After*/true); + RPTracker.reset(MBB->front(), &outputLive, /*After*/ true); for (auto it = SchedResult.rbegin(); it != SchedResult.rend(); it++) { const SUnit *SU = *it; if (!SU->isInstr()) @@ -1116,8 +1117,7 @@ SUnit *HRB::findHeir(SUnit *SU, std::vector &SUnits) { return Heir; } -HRB::Lineage HRB::buildChain(SUnit *Node, - std::vector &SUnits) { +HRB::Lineage HRB::buildChain(SUnit *Node, std::vector &SUnits) { HRB::Lineage chain; chain.addNode(Node); ChainedNodes.insert(Node); @@ -1754,7 +1754,7 @@ std::vector hrbSched(std::vector &SUnits, SUnit *SU = *it; if (!Color.isHead(SU)) { - continue; + continue; } Candidate = SU; // Remove Candidate from ReadyList. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h index c234f32370793..a7d29430b4276 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h @@ -4,7 +4,7 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/MC/LaneBitmask.h" -#include "llvm/CodeGen/ScheduleDAG.h" // For SUnit. +#include "llvm/CodeGen/ScheduleDAG.h" // For SUnit. namespace llvm { class MachineFunction; @@ -14,8 +14,7 @@ class SIRegisterInfo; class SIInstrInfo; class MachineInstr; class MachineBasicBlock; -template -class GraphWriter; +template class GraphWriter; class SUnit; class IntEqClasses; class Twine; @@ -55,13 +54,12 @@ struct SubExp { const llvm::SIRegisterInfo *SIRI); void dump(const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI) const; - bool modifiesRegister(unsigned Reg, const llvm::SIRegisterInfo* SIRI) const; + bool modifiesRegister(unsigned Reg, const llvm::SIRegisterInfo *SIRI) const; }; struct ExpDag { ExpDag(const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI, - const llvm::SIInstrInfo *SIII, - const bool bJoinInput); + const llvm::SIInstrInfo *SIII, const bool bJoinInput); const llvm::MachineRegisterInfo &MRI; const llvm::SIRegisterInfo *SIRI; const llvm::SIInstrInfo *SIII; @@ -83,13 +81,14 @@ struct ExpDag { std::string getDAGName() const; /// Adds custom features for a visualization of the ScheduleDAG. 
void addCustomGraphFeatures(llvm::GraphWriter &) const {} + private: - template - void initNodes(const LiveSet &InputLiveReg, T &insts); + template void initNodes(const LiveSet &InputLiveReg, T &insts); void addDataDep(const llvm::SIRegisterInfo *SIRI); void addCtrlDep(); void buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, - const llvm::SIRegisterInfo *SIRI, const llvm::SIInstrInfo *SIII); + const llvm::SIRegisterInfo *SIRI, + const llvm::SIInstrInfo *SIII); }; struct BlockExpDag : public ExpDag { @@ -103,11 +102,11 @@ struct BlockExpDag : public ExpDag { std::vector SubExps; void build(); void buildWithPressure(); + private: void buildAvail(const LiveSet &passThruSet, llvm::DenseMap &DagAvailRegMap); - void buildPressure(const LiveSet &StartLiveReg, - const LiveSet &EndLiveReg); + void buildPressure(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg); }; void getRegBound(llvm::MachineBasicBlock *MBB, @@ -194,4 +193,4 @@ std::vector hrbSched(std::vector &SUnits, const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI); -} +} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h index c9172bae2cb4a..c49590a7d8f7f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h @@ -1,4 +1,4 @@ -//===-- AMDGPUVMemDegreeDAG.h - Build degree about VMem on DAG --------------===// +//===-- AMDGPUVMemDegreeDAG.h - Build degree about VMem on DAG ------------===// // // The LLVM Compiler Infrastructure // @@ -14,9 +14,9 @@ //===----------------------------------------------------------------------===// #pragma once -#include #include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/ScheduleDAG.h" // For SUnit. +#include "llvm/CodeGen/ScheduleDAG.h" // For SUnit. +#include namespace llvm { class MachineBasicBlock; @@ -42,7 +42,6 @@ class SimpleDAG { void addCtrlDep(); }; - // Collect height/depth for high latency mem ld, which only update height/depth // when cross high latency mem ld. Call the height/depth as VMem degree here. // The rule is sample and its user should has different degree. @@ -60,15 +59,13 @@ class SimpleDAG { class VMemDegreeDAG { public: - VMemDegreeDAG(std::vector &Units, - const llvm::SIInstrInfo *TII) + VMemDegreeDAG(std::vector &Units, const llvm::SIInstrInfo *TII) : SUnits(Units), SIII(TII) {} std::vector &SUnits; // InstrInfo. const llvm::SIInstrInfo *SIII; void build(); - bool isHighLatency(const llvm::SUnit *SU) const; bool isHighLatency(const llvm::MachineInstr *MI) const; // height/depth based on Long latency inst. @@ -79,28 +76,24 @@ class VMemDegreeDAG { std::vector VMemFullDepth; llvm::SmallVector VMemSUs; llvm::SmallVector, 16> GroupedVMemSUs; - llvm::SmallVector, 16> GroupedVMemSUsByDepth; - + llvm::SmallVector, 16> + GroupedVMemSUsByDepth; void dump(); private: static constexpr unsigned kNoReg = -1; - - std::pair buildVMemDepthHeight(std::vector &VMemHeight, - std::vector &VMemDepth, bool bDataOnly); + std::pair + buildVMemDepthHeight(std::vector &VMemHeight, + std::vector &VMemDepth, bool bDataOnly); // Compute vmem height/depth. void buildVMemDepthHeight(); void buildVMemDataDepthHeight(); void groupVmemSUnits(); - }; - - // Split block based on vmem depth. 
void buildVMemDepth(llvm::MachineBasicBlock &MBB, llvm::VMemDegreeDAG &dag); -} - +} // namespace llvm From bf396df7b968e2c82e58504957a2fd9bacb3a307 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Tue, 11 Mar 2025 12:20:21 -0700 Subject: [PATCH 07/25] Added option to enable it in the target profile --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 564c92239acdf..ec39b385ecbd2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -394,6 +394,12 @@ static cl::opt cl::desc("Enable s_delay_alu insertion"), cl::init(true), cl::Hidden); +// Enable Hot block rematerialize +static cl::opt + EnableHotBlockRemat("amdgpu-enable-hot-block-remat", + cl::desc("Enable HotBlock Rematerialize optimization"), + cl::init(false), cl::Hidden); + // Enable GFX11+ VOPD static cl::opt EnableVOPD("amdgpu-enable-vopd", @@ -1523,6 +1529,10 @@ void GCNPassConfig::addOptimizedRegAlloc() { if (TM->getOptLevel() > CodeGenOptLevel::Less) insertPass(&MachineSchedulerID, &SIFormMemoryClausesID); + // Must be run before phi elimination + if (isPassEnabled(EnableHotBlockRemat)) + addPass(&AMDGPUHotBlockRematerializeID); + TargetPassConfig::addOptimizedRegAlloc(); } From c64c4e40dd8fe0391ac4cd135bbcf913380c46fe Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Tue, 11 Mar 2025 11:28:59 -0700 Subject: [PATCH 08/25] Fix PHI node handling in regpressure tracker --- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index a438ad00bc41d..390c2f05ffe69 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -549,22 +549,22 @@ bool GCNDownwardRPTracker::advanceBeforeNext(MachineInstr *MI, if (!S.liveAt(SI)) { if (It == LiveRegs.end()) { It = LiveRegs.find(MO.getReg()); - if (It == LiveRegs.end()) - llvm_unreachable("register isn't live"); } - auto PrevMask = It->second; - It->second &= ~S.LaneMask; - CurPressure.inc(MO.getReg(), PrevMask, It->second, *MRI); + if (It != LiveRegs.end()) { + auto PrevMask = It->second; + It->second &= ~S.LaneMask; + CurPressure.inc(MO.getReg(), PrevMask, It->second, *MRI); + } } } if (It != LiveRegs.end() && It->second.none()) LiveRegs.erase(It); } else if (!LI.liveAt(SI)) { auto It = LiveRegs.find(MO.getReg()); - if (It == LiveRegs.end()) - llvm_unreachable("register isn't live"); - CurPressure.inc(MO.getReg(), It->second, LaneBitmask::getNone(), *MRI); - LiveRegs.erase(It); + if (It != LiveRegs.end()) { + CurPressure.inc(MO.getReg(), It->second, LaneBitmask::getNone(), *MRI); + LiveRegs.erase(It); + } } } From 3dc22d43a3cda3abfd13bab02d5c75a948485cfa Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Tue, 11 Mar 2025 16:31:14 -0700 Subject: [PATCH 09/25] Fixed the PHI issue --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 16 +- llvm/test/CodeGen/AMDGPU/remat/phi.mir | 709 ++++++++++++++++++ 2 files changed, 724 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/remat/phi.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index 4656e28499a0d..2cd28513f10f3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ 
b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -204,9 +204,23 @@ FindInsertBlock(MachineInstr &DefMI, unsigned Reg, MachineDominatorTree *DT, return BB; } +// Maybe expensive to be called all over the place +bool IsUsedByPhi(MachineInstr *DefMI, MachineRegisterInfo &MRI) { + for (auto &Def : DefMI->defs()) { + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Def.getReg())) { + if (UseMI.isPHI()) + return true; + } + } + return false; +} + bool IsSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) { - unsigned OpNum = DefMI->getNumOperands(); + // Do not move PHI nodes + if (IsUsedByPhi(DefMI, MRI)) + return false; + unsigned OpNum = DefMI->getNumOperands(); // Only move DefMI which all operand is unique def. for (unsigned i = 0; i < OpNum; i++) { MachineOperand &Op = DefMI->getOperand(i); diff --git a/llvm/test/CodeGen/AMDGPU/remat/phi.mir b/llvm/test/CodeGen/AMDGPU/remat/phi.mir new file mode 100644 index 0000000000000..5ee563e7a633f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/phi.mir @@ -0,0 +1,709 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat | FileCheck %s + +# Check that none of these defs are moved to their uses since they're used by +# PHIS. +# CHECK: bb.0: +# CHECK: %[[#r3000:]]:sgpr_32 = S_MOV_B32 0 +# CHECK: %[[#r3001:]]:sgpr_32 = S_MOV_B32 1 +# CHECK: %[[#r3002:]]:sgpr_32 = S_MOV_B32 2 +# CHECK: %[[#r3003:]]:sgpr_32 = S_MOV_B32 3 +# CHECK: %[[#r3004:]]:sgpr_32 = S_MOV_B32 4 +# CHECK: %[[#r3005:]]:sgpr_32 = S_MOV_B32 5 +# CHECK: %[[#r3006:]]:sgpr_32 = S_MOV_B32 6 +# CHECK: %[[#r3007:]]:sgpr_32 = S_MOV_B32 7 +# CHECK: %[[#r3008:]]:sgpr_32 = S_MOV_B32 8 +# CHECK: %[[#r3009:]]:sgpr_32 = S_MOV_B32 9 +# CHECK: %[[#r3010:]]:sgpr_32 = S_MOV_B32 10 +# CHECK: %[[#r3011:]]:sgpr_32 = S_MOV_B32 11 +# CHECK: %[[#r3012:]]:sgpr_32 = S_MOV_B32 12 +# CHECK: %[[#r3013:]]:sgpr_32 = S_MOV_B32 13 +# CHECK: %[[#r3014:]]:sgpr_32 = S_MOV_B32 14 +# CHECK: %[[#r3015:]]:sgpr_32 = S_MOV_B32 15 +# CHECK: %[[#r3016:]]:sgpr_32 = S_MOV_B32 16 +# CHECK: %[[#r3017:]]:sgpr_32 = S_MOV_B32 17 +# CHECK: %[[#r3018:]]:sgpr_32 = S_MOV_B32 18 +# CHECK: %[[#r3019:]]:sgpr_32 = S_MOV_B32 19 +# CHECK: %[[#r3020:]]:sgpr_32 = S_MOV_B32 20 +# CHECK: %[[#r3021:]]:sgpr_32 = S_MOV_B32 21 +# CHECK: %[[#r3022:]]:sgpr_32 = S_MOV_B32 22 +# CHECK: %[[#r3023:]]:sgpr_32 = S_MOV_B32 23 +# CHECK: %[[#r3024:]]:sgpr_32 = S_MOV_B32 24 +# CHECK: %[[#r3025:]]:sgpr_32 = S_MOV_B32 25 +# CHECK: %[[#r3026:]]:sgpr_32 = S_MOV_B32 26 +# CHECK: %[[#r3027:]]:sgpr_32 = S_MOV_B32 27 +# CHECK: %[[#r3028:]]:sgpr_32 = S_MOV_B32 28 +# CHECK: %[[#r3029:]]:sgpr_32 = S_MOV_B32 29 +# CHECK: %[[#r3030:]]:sgpr_32 = S_MOV_B32 30 +# CHECK: %[[#r3031:]]:sgpr_32 = S_MOV_B32 31 +# CHECK: %[[#r3032:]]:sgpr_32 = S_MOV_B32 32 +# CHECK: %[[#r3033:]]:sgpr_32 = S_MOV_B32 33 +# CHECK: %[[#r3034:]]:sgpr_32 = S_MOV_B32 34 +# CHECK: %[[#r3035:]]:sgpr_32 = S_MOV_B32 35 +# CHECK: %[[#r3036:]]:sgpr_32 = S_MOV_B32 36 +# CHECK: %[[#r3037:]]:sgpr_32 = S_MOV_B32 37 +# CHECK: %[[#r3038:]]:sgpr_32 = S_MOV_B32 38 +# CHECK: %[[#r3039:]]:sgpr_32 = S_MOV_B32 39 +# CHECK: %[[#r3040:]]:sgpr_32 = S_MOV_B32 40 +# CHECK: %[[#r3041:]]:sgpr_32 = S_MOV_B32 41 +# CHECK: %[[#r3042:]]:sgpr_32 = S_MOV_B32 42 +# CHECK: %[[#r3043:]]:sgpr_32 = S_MOV_B32 43 +# CHECK: %[[#r3044:]]:sgpr_32 = S_MOV_B32 44 +# CHECK: %[[#r3045:]]:sgpr_32 = S_MOV_B32 45 +# CHECK: %[[#r3046:]]:sgpr_32 = S_MOV_B32 46 +# CHECK: %[[#r3047:]]:sgpr_32 = S_MOV_B32 47 +# CHECK: %[[#r3048:]]:sgpr_32 = S_MOV_B32 48 +# CHECK: 
%[[#r3049:]]:sgpr_32 = S_MOV_B32 49 +# CHECK: %[[#r3050:]]:sgpr_32 = S_MOV_B32 50 +# CHECK: %[[#r3051:]]:sgpr_32 = S_MOV_B32 51 +# CHECK: %[[#r3052:]]:sgpr_32 = S_MOV_B32 52 +# CHECK: %[[#r3053:]]:sgpr_32 = S_MOV_B32 53 +# CHECK: %[[#r3054:]]:sgpr_32 = S_MOV_B32 54 +# CHECK: %[[#r3055:]]:sgpr_32 = S_MOV_B32 55 +# CHECK: %[[#r3056:]]:sgpr_32 = S_MOV_B32 56 +# CHECK: %[[#r3057:]]:sgpr_32 = S_MOV_B32 57 +# CHECK: %[[#r3058:]]:sgpr_32 = S_MOV_B32 58 +# CHECK: %[[#r3059:]]:sgpr_32 = S_MOV_B32 59 +# CHECK: %[[#r3060:]]:sgpr_32 = S_MOV_B32 60 +# CHECK: %[[#r3061:]]:sgpr_32 = S_MOV_B32 61 +# CHECK: %[[#r3062:]]:sgpr_32 = S_MOV_B32 62 +# CHECK: %[[#r3063:]]:sgpr_32 = S_MOV_B32 63 +# CHECK: %[[#r3064:]]:sgpr_32 = S_MOV_B32 64 +# CHECK: %[[#r3065:]]:sgpr_32 = S_MOV_B32 65 +# CHECK: %[[#r3066:]]:sgpr_32 = S_MOV_B32 66 +# CHECK: %[[#r3067:]]:sgpr_32 = S_MOV_B32 67 +# CHECK: %[[#r3068:]]:sgpr_32 = S_MOV_B32 68 +# CHECK: %[[#r3069:]]:sgpr_32 = S_MOV_B32 69 +# CHECK: %[[#r3070:]]:sgpr_32 = S_MOV_B32 70 +# CHECK: %[[#r3071:]]:sgpr_32 = S_MOV_B32 71 +# CHECK: %[[#r3072:]]:sgpr_32 = S_MOV_B32 72 +# CHECK: %[[#r3073:]]:sgpr_32 = S_MOV_B32 73 +# CHECK: %[[#r3074:]]:sgpr_32 = S_MOV_B32 74 +# CHECK: %[[#r3075:]]:sgpr_32 = S_MOV_B32 75 +# CHECK: %[[#r3076:]]:sgpr_32 = S_MOV_B32 76 +# CHECK: %[[#r3077:]]:sgpr_32 = S_MOV_B32 77 +# CHECK: %[[#r3078:]]:sgpr_32 = S_MOV_B32 78 +# CHECK: %[[#r3079:]]:sgpr_32 = S_MOV_B32 79 +# CHECK: %[[#r3080:]]:sgpr_32 = S_MOV_B32 80 +# CHECK: %[[#r3081:]]:sgpr_32 = S_MOV_B32 81 +# CHECK: %[[#r3082:]]:sgpr_32 = S_MOV_B32 82 +# CHECK: %[[#r3083:]]:sgpr_32 = S_MOV_B32 83 +# CHECK: %[[#r3084:]]:sgpr_32 = S_MOV_B32 84 +# CHECK: %[[#r3085:]]:sgpr_32 = S_MOV_B32 85 +# CHECK: %[[#r3086:]]:sgpr_32 = S_MOV_B32 86 +# CHECK: %[[#r3087:]]:sgpr_32 = S_MOV_B32 87 +# CHECK: %[[#r3088:]]:sgpr_32 = S_MOV_B32 88 +# CHECK: %[[#r3089:]]:sgpr_32 = S_MOV_B32 89 +# CHECK: %[[#r3090:]]:sgpr_32 = S_MOV_B32 90 +# CHECK: %[[#r3091:]]:sgpr_32 = S_MOV_B32 91 +# CHECK: %[[#r3092:]]:sgpr_32 = S_MOV_B32 92 +# CHECK: %[[#r3093:]]:sgpr_32 = S_MOV_B32 93 +# CHECK: %[[#r3094:]]:sgpr_32 = S_MOV_B32 94 +# CHECK: %[[#r3095:]]:sgpr_32 = S_MOV_B32 95 +# CHECK: %[[#r3096:]]:sgpr_32 = S_MOV_B32 96 +# CHECK: %[[#r3097:]]:sgpr_32 = S_MOV_B32 97 +# CHECK: %[[#r3098:]]:sgpr_32 = S_MOV_B32 98 +# CHECK: %[[#r3099:]]:sgpr_32 = S_MOV_B32 99 +# CHECK: bb.1: +# CHECK: bb.2: + + +--- | + source_filename = ".\main.ll" + define amdgpu_ps void @main() #1 { + ret void + } + attributes #1 = { "target-cpu"="gfx1010" } + !llvm.ident = !{!0} + !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"} +... 
+--- +name: main +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } + - { reg: '$sgpr1' } + - { reg: '$sgpr2' } + - { reg: '$sgpr3' } + - { reg: '$sgpr4' } + - { reg: '$sgpr5' } + - { reg: '$sgpr6' } + - { reg: '$sgpr7' } + - { reg: '$sgpr8' } + - { reg: '$sgpr8' } + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1 + + %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1 + ; undef %0.sub0:sgpr_64 = COPY $sgpr0 + ; undef %0.sub1:sgpr_64 = COPY $sgpr1 + + %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3 + ; undef %1.sub0:sgpr_128 = COPY $sgpr4 + ; undef %1.sub1:sgpr_128 = COPY $sgpr5 + ; undef %1.sub2:sgpr_128 = COPY $sgpr6 + ; undef %1.sub3:sgpr_128 = COPY $sgpr7 + + + %2000:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2001:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2002:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2003:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2004:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2005:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2006:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2007:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2008:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2009:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2010:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2011:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2012:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2013:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2014:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2015:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2016:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2017:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2018:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2019:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2020:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2021:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2022:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2023:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2024:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2025:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2026:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2027:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2028:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2029:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2030:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2031:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2032:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2033:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2034:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2035:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2036:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2037:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2038:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2039:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2040:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2041:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2042:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2043:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2044:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2045:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2046:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2047:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2048:vgpr_32 = 
V_MOV_B32_e32 $vgpr0, implicit $exec + %2049:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2050:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2051:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2052:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2053:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2054:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2055:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2056:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2057:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2058:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2059:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2060:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2061:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2062:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2063:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2064:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2065:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2066:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2067:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2068:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2069:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2070:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2071:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2072:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2073:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2074:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2075:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2076:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2077:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2078:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2079:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2080:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2081:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2082:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2083:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2084:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2085:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2086:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2087:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2088:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2089:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2090:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2091:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2092:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2093:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2094:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2095:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2096:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2097:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2098:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2099:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %3000:sgpr_32 = S_MOV_B32 0 + %3001:sgpr_32 = S_MOV_B32 1 + %3002:sgpr_32 = S_MOV_B32 2 + %3003:sgpr_32 = S_MOV_B32 3 + %3004:sgpr_32 = S_MOV_B32 4 + %3005:sgpr_32 = S_MOV_B32 5 + %3006:sgpr_32 = S_MOV_B32 6 + %3007:sgpr_32 = S_MOV_B32 7 + %3008:sgpr_32 = S_MOV_B32 8 + %3009:sgpr_32 = S_MOV_B32 9 + %3010:sgpr_32 = S_MOV_B32 10 + %3011:sgpr_32 = S_MOV_B32 11 + %3012:sgpr_32 = S_MOV_B32 12 + %3013:sgpr_32 = S_MOV_B32 13 + %3014:sgpr_32 = S_MOV_B32 14 + %3015:sgpr_32 = S_MOV_B32 15 + %3016:sgpr_32 = S_MOV_B32 16 + %3017:sgpr_32 = S_MOV_B32 17 + %3018:sgpr_32 = S_MOV_B32 18 + %3019:sgpr_32 = S_MOV_B32 19 + %3020:sgpr_32 = S_MOV_B32 20 + %3021:sgpr_32 = S_MOV_B32 21 + %3022:sgpr_32 = S_MOV_B32 22 + 
%3023:sgpr_32 = S_MOV_B32 23 + %3024:sgpr_32 = S_MOV_B32 24 + %3025:sgpr_32 = S_MOV_B32 25 + %3026:sgpr_32 = S_MOV_B32 26 + %3027:sgpr_32 = S_MOV_B32 27 + %3028:sgpr_32 = S_MOV_B32 28 + %3029:sgpr_32 = S_MOV_B32 29 + %3030:sgpr_32 = S_MOV_B32 30 + %3031:sgpr_32 = S_MOV_B32 31 + %3032:sgpr_32 = S_MOV_B32 32 + %3033:sgpr_32 = S_MOV_B32 33 + %3034:sgpr_32 = S_MOV_B32 34 + %3035:sgpr_32 = S_MOV_B32 35 + %3036:sgpr_32 = S_MOV_B32 36 + %3037:sgpr_32 = S_MOV_B32 37 + %3038:sgpr_32 = S_MOV_B32 38 + %3039:sgpr_32 = S_MOV_B32 39 + %3040:sgpr_32 = S_MOV_B32 40 + %3041:sgpr_32 = S_MOV_B32 41 + %3042:sgpr_32 = S_MOV_B32 42 + %3043:sgpr_32 = S_MOV_B32 43 + %3044:sgpr_32 = S_MOV_B32 44 + %3045:sgpr_32 = S_MOV_B32 45 + %3046:sgpr_32 = S_MOV_B32 46 + %3047:sgpr_32 = S_MOV_B32 47 + %3048:sgpr_32 = S_MOV_B32 48 + %3049:sgpr_32 = S_MOV_B32 49 + %3050:sgpr_32 = S_MOV_B32 50 + %3051:sgpr_32 = S_MOV_B32 51 + %3052:sgpr_32 = S_MOV_B32 52 + %3053:sgpr_32 = S_MOV_B32 53 + %3054:sgpr_32 = S_MOV_B32 54 + %3055:sgpr_32 = S_MOV_B32 55 + %3056:sgpr_32 = S_MOV_B32 56 + %3057:sgpr_32 = S_MOV_B32 57 + %3058:sgpr_32 = S_MOV_B32 58 + %3059:sgpr_32 = S_MOV_B32 59 + %3060:sgpr_32 = S_MOV_B32 60 + %3061:sgpr_32 = S_MOV_B32 61 + %3062:sgpr_32 = S_MOV_B32 62 + %3063:sgpr_32 = S_MOV_B32 63 + %3064:sgpr_32 = S_MOV_B32 64 + %3065:sgpr_32 = S_MOV_B32 65 + %3066:sgpr_32 = S_MOV_B32 66 + %3067:sgpr_32 = S_MOV_B32 67 + %3068:sgpr_32 = S_MOV_B32 68 + %3069:sgpr_32 = S_MOV_B32 69 + %3070:sgpr_32 = S_MOV_B32 70 + %3071:sgpr_32 = S_MOV_B32 71 + %3072:sgpr_32 = S_MOV_B32 72 + %3073:sgpr_32 = S_MOV_B32 73 + %3074:sgpr_32 = S_MOV_B32 74 + %3075:sgpr_32 = S_MOV_B32 75 + %3076:sgpr_32 = S_MOV_B32 76 + %3077:sgpr_32 = S_MOV_B32 77 + %3078:sgpr_32 = S_MOV_B32 78 + %3079:sgpr_32 = S_MOV_B32 79 + %3080:sgpr_32 = S_MOV_B32 80 + %3081:sgpr_32 = S_MOV_B32 81 + %3082:sgpr_32 = S_MOV_B32 82 + %3083:sgpr_32 = S_MOV_B32 83 + %3084:sgpr_32 = S_MOV_B32 84 + %3085:sgpr_32 = S_MOV_B32 85 + %3086:sgpr_32 = S_MOV_B32 86 + %3087:sgpr_32 = S_MOV_B32 87 + %3088:sgpr_32 = S_MOV_B32 88 + %3089:sgpr_32 = S_MOV_B32 89 + %3090:sgpr_32 = S_MOV_B32 90 + %3091:sgpr_32 = S_MOV_B32 91 + %3092:sgpr_32 = S_MOV_B32 92 + %3093:sgpr_32 = S_MOV_B32 93 + %3094:sgpr_32 = S_MOV_B32 94 + %3095:sgpr_32 = S_MOV_B32 95 + %3096:sgpr_32 = S_MOV_B32 96 + %3097:sgpr_32 = S_MOV_B32 97 + %3098:sgpr_32 = S_MOV_B32 98 + %3099:sgpr_32 = S_MOV_B32 99 + + + %8000:vgpr_32 = IMPLICIT_DEF + %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %116:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + + %8001:vgpr_32 = COPY %8000 + %8002:vgpr_32 = COPY %8000 + %8003:vgpr_32 = COPY %8000 + %8004:vgpr_32 = COPY %8000 + %8005:vgpr_32 = COPY %8000 + %8006:vgpr_32 = COPY %8000 + %8007:vgpr_32 = COPY %8000 + %8008:vgpr_32 = COPY %8000 + %8009:vgpr_32 = COPY %8000 + %8010:vgpr_32 = COPY %8000 + %8011:vgpr_32 = COPY %8000 + %8012:vgpr_32 = COPY %8000 + %8013:vgpr_32 = COPY %8000 + %8014:vgpr_32 = COPY %8000 + %8015:vgpr_32 = COPY %8000 + %8016:vgpr_32 = COPY %8000 + %8017:vgpr_32 = COPY %8000 + + %9001:vgpr_32 = COPY %8001 + %9002:vgpr_32 = COPY %8002 + %9003:vgpr_32 = COPY %8003 + %9004:vgpr_32 = COPY %8004 + %9005:vgpr_32 = COPY %8005 + %9006:vgpr_32 = COPY %8006 + %9007:vgpr_32 = COPY %8007 + %9008:vgpr_32 = COPY %8008 + %9009:vgpr_32 = COPY %8009 + %9010:vgpr_32 = COPY %8010 + %9011:vgpr_32 = COPY %8011 + %9012:vgpr_32 = COPY %8012 + %9013:vgpr_32 = COPY %8013 
+ %9014:vgpr_32 = COPY %8014 + %9015:vgpr_32 = COPY %8015 + %9016:vgpr_32 = COPY %8016 + %9017:vgpr_32 = COPY %8017 + + S_BRANCH %bb.2 + + bb.2: + %5000:sgpr_32 = PHI %3000, %bb.0, %8001, %bb.1 + %5001:sgpr_32 = PHI %3001, %bb.0, %8001, %bb.1 + %5002:sgpr_32 = PHI %3002, %bb.0, %8001, %bb.1 + %5003:sgpr_32 = PHI %3003, %bb.0, %8001, %bb.1 + %5004:sgpr_32 = PHI %3004, %bb.0, %8001, %bb.1 + %5005:sgpr_32 = PHI %3005, %bb.0, %8001, %bb.1 + %5006:sgpr_32 = PHI %3006, %bb.0, %8001, %bb.1 + %5007:sgpr_32 = PHI %3007, %bb.0, %8001, %bb.1 + %5008:sgpr_32 = PHI %3008, %bb.0, %8001, %bb.1 + %5009:sgpr_32 = PHI %3009, %bb.0, %8001, %bb.1 + %5010:sgpr_32 = PHI %3010, %bb.0, %8001, %bb.1 + %5011:sgpr_32 = PHI %3011, %bb.0, %8001, %bb.1 + %5012:sgpr_32 = PHI %3012, %bb.0, %8001, %bb.1 + %5013:sgpr_32 = PHI %3013, %bb.0, %8001, %bb.1 + %5014:sgpr_32 = PHI %3014, %bb.0, %8001, %bb.1 + %5015:sgpr_32 = PHI %3015, %bb.0, %8001, %bb.1 + %5016:sgpr_32 = PHI %3016, %bb.0, %8001, %bb.1 + %5017:sgpr_32 = PHI %3017, %bb.0, %8001, %bb.1 + %5018:sgpr_32 = PHI %3018, %bb.0, %8001, %bb.1 + %5019:sgpr_32 = PHI %3019, %bb.0, %8001, %bb.1 + %5020:sgpr_32 = PHI %3020, %bb.0, %8001, %bb.1 + %5021:sgpr_32 = PHI %3021, %bb.0, %8001, %bb.1 + %5022:sgpr_32 = PHI %3022, %bb.0, %8001, %bb.1 + %5023:sgpr_32 = PHI %3023, %bb.0, %8001, %bb.1 + %5024:sgpr_32 = PHI %3024, %bb.0, %8001, %bb.1 + %5025:sgpr_32 = PHI %3025, %bb.0, %8001, %bb.1 + %5026:sgpr_32 = PHI %3026, %bb.0, %8001, %bb.1 + %5027:sgpr_32 = PHI %3027, %bb.0, %8001, %bb.1 + %5028:sgpr_32 = PHI %3028, %bb.0, %8001, %bb.1 + %5029:sgpr_32 = PHI %3029, %bb.0, %8001, %bb.1 + %5030:sgpr_32 = PHI %3030, %bb.0, %8001, %bb.1 + %5031:sgpr_32 = PHI %3031, %bb.0, %8001, %bb.1 + %5032:sgpr_32 = PHI %3032, %bb.0, %8001, %bb.1 + %5033:sgpr_32 = PHI %3033, %bb.0, %8001, %bb.1 + %5034:sgpr_32 = PHI %3034, %bb.0, %8001, %bb.1 + %5035:sgpr_32 = PHI %3035, %bb.0, %8001, %bb.1 + %5036:sgpr_32 = PHI %3036, %bb.0, %8001, %bb.1 + %5037:sgpr_32 = PHI %3037, %bb.0, %8001, %bb.1 + %5038:sgpr_32 = PHI %3038, %bb.0, %8001, %bb.1 + %5039:sgpr_32 = PHI %3039, %bb.0, %8001, %bb.1 + %5040:sgpr_32 = PHI %3040, %bb.0, %8001, %bb.1 + %5041:sgpr_32 = PHI %3041, %bb.0, %8001, %bb.1 + %5042:sgpr_32 = PHI %3042, %bb.0, %8001, %bb.1 + %5043:sgpr_32 = PHI %3043, %bb.0, %8001, %bb.1 + %5044:sgpr_32 = PHI %3044, %bb.0, %8001, %bb.1 + %5045:sgpr_32 = PHI %3045, %bb.0, %8001, %bb.1 + %5046:sgpr_32 = PHI %3046, %bb.0, %8001, %bb.1 + %5047:sgpr_32 = PHI %3047, %bb.0, %8001, %bb.1 + %5048:sgpr_32 = PHI %3048, %bb.0, %8001, %bb.1 + %5049:sgpr_32 = PHI %3049, %bb.0, %8001, %bb.1 + %5050:sgpr_32 = PHI %3050, %bb.0, %8001, %bb.1 + %5051:sgpr_32 = PHI %3051, %bb.0, %8001, %bb.1 + %5052:sgpr_32 = PHI %3052, %bb.0, %8001, %bb.1 + %5053:sgpr_32 = PHI %3053, %bb.0, %8001, %bb.1 + %5054:sgpr_32 = PHI %3054, %bb.0, %8001, %bb.1 + %5055:sgpr_32 = PHI %3055, %bb.0, %8001, %bb.1 + %5056:sgpr_32 = PHI %3056, %bb.0, %8001, %bb.1 + %5057:sgpr_32 = PHI %3057, %bb.0, %8001, %bb.1 + %5058:sgpr_32 = PHI %3058, %bb.0, %8001, %bb.1 + %5059:sgpr_32 = PHI %3059, %bb.0, %8001, %bb.1 + %5060:sgpr_32 = PHI %3060, %bb.0, %8001, %bb.1 + %5061:sgpr_32 = PHI %3061, %bb.0, %8001, %bb.1 + %5062:sgpr_32 = PHI %3062, %bb.0, %8001, %bb.1 + %5063:sgpr_32 = PHI %3063, %bb.0, %8001, %bb.1 + %5064:sgpr_32 = PHI %3064, %bb.0, %8001, %bb.1 + %5065:sgpr_32 = PHI %3065, %bb.0, %8001, %bb.1 + %5066:sgpr_32 = PHI %3066, %bb.0, %8001, %bb.1 + %5067:sgpr_32 = PHI %3067, %bb.0, %8001, %bb.1 + %5068:sgpr_32 = PHI %3068, %bb.0, %8001, %bb.1 + %5069:sgpr_32 = PHI %3069, 
%bb.0, %8001, %bb.1 + %5070:sgpr_32 = PHI %3070, %bb.0, %8001, %bb.1 + %5071:sgpr_32 = PHI %3071, %bb.0, %8001, %bb.1 + %5072:sgpr_32 = PHI %3072, %bb.0, %8001, %bb.1 + %5073:sgpr_32 = PHI %3073, %bb.0, %8001, %bb.1 + %5074:sgpr_32 = PHI %3074, %bb.0, %8001, %bb.1 + %5075:sgpr_32 = PHI %3075, %bb.0, %8001, %bb.1 + %5076:sgpr_32 = PHI %3076, %bb.0, %8001, %bb.1 + %5077:sgpr_32 = PHI %3077, %bb.0, %8001, %bb.1 + %5078:sgpr_32 = PHI %3078, %bb.0, %8001, %bb.1 + %5079:sgpr_32 = PHI %3079, %bb.0, %8001, %bb.1 + %5080:sgpr_32 = PHI %3080, %bb.0, %8001, %bb.1 + %5081:sgpr_32 = PHI %3081, %bb.0, %8001, %bb.1 + %5082:sgpr_32 = PHI %3082, %bb.0, %8001, %bb.1 + %5083:sgpr_32 = PHI %3083, %bb.0, %8001, %bb.1 + %5084:sgpr_32 = PHI %3084, %bb.0, %8001, %bb.1 + %5085:sgpr_32 = PHI %3085, %bb.0, %8001, %bb.1 + %5086:sgpr_32 = PHI %3086, %bb.0, %8001, %bb.1 + %5087:sgpr_32 = PHI %3087, %bb.0, %8001, %bb.1 + %5088:sgpr_32 = PHI %3088, %bb.0, %8001, %bb.1 + %5089:sgpr_32 = PHI %3089, %bb.0, %8001, %bb.1 + %5090:sgpr_32 = PHI %3090, %bb.0, %8001, %bb.1 + %5091:sgpr_32 = PHI %3091, %bb.0, %8001, %bb.1 + %5092:sgpr_32 = PHI %3092, %bb.0, %8001, %bb.1 + %5093:sgpr_32 = PHI %3093, %bb.0, %8001, %bb.1 + %5094:sgpr_32 = PHI %3094, %bb.0, %8001, %bb.1 + %5095:sgpr_32 = PHI %3095, %bb.0, %8001, %bb.1 + %5096:sgpr_32 = PHI %3096, %bb.0, %8001, %bb.1 + %5097:sgpr_32 = PHI %3097, %bb.0, %8001, %bb.1 + %5098:sgpr_32 = PHI %3098, %bb.0, %8001, %bb.1 + %5099:sgpr_32 = PHI %3099, %bb.0, %8001, %bb.1 + + + %3:vgpr_32 = IMPLICIT_DEF + + %6000:vgpr_32 = V_MOV_B32_e32 %5000, implicit $exec + %6001:vgpr_32 = V_MOV_B32_e32 %5001, implicit $exec + %6002:vgpr_32 = V_MOV_B32_e32 %5002, implicit $exec + %6003:vgpr_32 = V_MOV_B32_e32 %5003, implicit $exec + %6004:vgpr_32 = V_MOV_B32_e32 %5004, implicit $exec + %6005:vgpr_32 = V_MOV_B32_e32 %5005, implicit $exec + %6006:vgpr_32 = V_MOV_B32_e32 %5006, implicit $exec + %6007:vgpr_32 = V_MOV_B32_e32 %5007, implicit $exec + %6008:vgpr_32 = V_MOV_B32_e32 %5008, implicit $exec + %6009:vgpr_32 = V_MOV_B32_e32 %5009, implicit $exec + %6010:vgpr_32 = V_MOV_B32_e32 %5010, implicit $exec + %6011:vgpr_32 = V_MOV_B32_e32 %5011, implicit $exec + %6012:vgpr_32 = V_MOV_B32_e32 %5012, implicit $exec + %6013:vgpr_32 = V_MOV_B32_e32 %5013, implicit $exec + %6014:vgpr_32 = V_MOV_B32_e32 %5014, implicit $exec + %6015:vgpr_32 = V_MOV_B32_e32 %5015, implicit $exec + %6016:vgpr_32 = V_MOV_B32_e32 %5016, implicit $exec + %6017:vgpr_32 = V_MOV_B32_e32 %5017, implicit $exec + %6018:vgpr_32 = V_MOV_B32_e32 %5018, implicit $exec + %6019:vgpr_32 = V_MOV_B32_e32 %5019, implicit $exec + %6020:vgpr_32 = V_MOV_B32_e32 %5020, implicit $exec + %6021:vgpr_32 = V_MOV_B32_e32 %5021, implicit $exec + %6022:vgpr_32 = V_MOV_B32_e32 %5022, implicit $exec + %6023:vgpr_32 = V_MOV_B32_e32 %5023, implicit $exec + %6024:vgpr_32 = V_MOV_B32_e32 %5024, implicit $exec + %6025:vgpr_32 = V_MOV_B32_e32 %5025, implicit $exec + %6026:vgpr_32 = V_MOV_B32_e32 %5026, implicit $exec + %6027:vgpr_32 = V_MOV_B32_e32 %5027, implicit $exec + %6028:vgpr_32 = V_MOV_B32_e32 %5028, implicit $exec + %6029:vgpr_32 = V_MOV_B32_e32 %5029, implicit $exec + %6030:vgpr_32 = V_MOV_B32_e32 %5030, implicit $exec + %6031:vgpr_32 = V_MOV_B32_e32 %5031, implicit $exec + %6032:vgpr_32 = V_MOV_B32_e32 %5032, implicit $exec + %6033:vgpr_32 = V_MOV_B32_e32 %5033, implicit $exec + %6034:vgpr_32 = V_MOV_B32_e32 %5034, implicit $exec + %6035:vgpr_32 = V_MOV_B32_e32 %5035, implicit $exec + %6036:vgpr_32 = V_MOV_B32_e32 %5036, implicit $exec + %6037:vgpr_32 = V_MOV_B32_e32 
%5037, implicit $exec + %6038:vgpr_32 = V_MOV_B32_e32 %5038, implicit $exec + %6039:vgpr_32 = V_MOV_B32_e32 %5039, implicit $exec + %6040:vgpr_32 = V_MOV_B32_e32 %5040, implicit $exec + %6041:vgpr_32 = V_MOV_B32_e32 %5041, implicit $exec + %6042:vgpr_32 = V_MOV_B32_e32 %5042, implicit $exec + %6043:vgpr_32 = V_MOV_B32_e32 %5043, implicit $exec + %6044:vgpr_32 = V_MOV_B32_e32 %5044, implicit $exec + %6045:vgpr_32 = V_MOV_B32_e32 %5045, implicit $exec + %6046:vgpr_32 = V_MOV_B32_e32 %5046, implicit $exec + %6047:vgpr_32 = V_MOV_B32_e32 %5047, implicit $exec + %6048:vgpr_32 = V_MOV_B32_e32 %5048, implicit $exec + %6049:vgpr_32 = V_MOV_B32_e32 %5049, implicit $exec + %6050:vgpr_32 = V_MOV_B32_e32 %5050, implicit $exec + %6051:vgpr_32 = V_MOV_B32_e32 %5051, implicit $exec + %6052:vgpr_32 = V_MOV_B32_e32 %5052, implicit $exec + %6053:vgpr_32 = V_MOV_B32_e32 %5053, implicit $exec + %6054:vgpr_32 = V_MOV_B32_e32 %5054, implicit $exec + %6055:vgpr_32 = V_MOV_B32_e32 %5055, implicit $exec + %6056:vgpr_32 = V_MOV_B32_e32 %5056, implicit $exec + %6057:vgpr_32 = V_MOV_B32_e32 %5057, implicit $exec + %6058:vgpr_32 = V_MOV_B32_e32 %5058, implicit $exec + %6059:vgpr_32 = V_MOV_B32_e32 %5059, implicit $exec + %6060:vgpr_32 = V_MOV_B32_e32 %5060, implicit $exec + %6061:vgpr_32 = V_MOV_B32_e32 %5061, implicit $exec + %6062:vgpr_32 = V_MOV_B32_e32 %5062, implicit $exec + %6063:vgpr_32 = V_MOV_B32_e32 %5063, implicit $exec + %6064:vgpr_32 = V_MOV_B32_e32 %5064, implicit $exec + %6065:vgpr_32 = V_MOV_B32_e32 %5065, implicit $exec + %6066:vgpr_32 = V_MOV_B32_e32 %5066, implicit $exec + %6067:vgpr_32 = V_MOV_B32_e32 %5067, implicit $exec + %6068:vgpr_32 = V_MOV_B32_e32 %5068, implicit $exec + %6069:vgpr_32 = V_MOV_B32_e32 %5069, implicit $exec + %6070:vgpr_32 = V_MOV_B32_e32 %5070, implicit $exec + %6071:vgpr_32 = V_MOV_B32_e32 %5071, implicit $exec + %6072:vgpr_32 = V_MOV_B32_e32 %5072, implicit $exec + %6073:vgpr_32 = V_MOV_B32_e32 %5073, implicit $exec + %6074:vgpr_32 = V_MOV_B32_e32 %5074, implicit $exec + %6075:vgpr_32 = V_MOV_B32_e32 %5075, implicit $exec + %6076:vgpr_32 = V_MOV_B32_e32 %5076, implicit $exec + %6077:vgpr_32 = V_MOV_B32_e32 %5077, implicit $exec + %6078:vgpr_32 = V_MOV_B32_e32 %5078, implicit $exec + %6079:vgpr_32 = V_MOV_B32_e32 %5079, implicit $exec + %6080:vgpr_32 = V_MOV_B32_e32 %5080, implicit $exec + %6081:vgpr_32 = V_MOV_B32_e32 %5081, implicit $exec + %6082:vgpr_32 = V_MOV_B32_e32 %5082, implicit $exec + %6083:vgpr_32 = V_MOV_B32_e32 %5083, implicit $exec + %6084:vgpr_32 = V_MOV_B32_e32 %5084, implicit $exec + %6085:vgpr_32 = V_MOV_B32_e32 %5085, implicit $exec + %6086:vgpr_32 = V_MOV_B32_e32 %5086, implicit $exec + %6087:vgpr_32 = V_MOV_B32_e32 %5087, implicit $exec + %6088:vgpr_32 = V_MOV_B32_e32 %5088, implicit $exec + %6089:vgpr_32 = V_MOV_B32_e32 %5089, implicit $exec + %6090:vgpr_32 = V_MOV_B32_e32 %5090, implicit $exec + %6091:vgpr_32 = V_MOV_B32_e32 %5091, implicit $exec + %6092:vgpr_32 = V_MOV_B32_e32 %5092, implicit $exec + %6093:vgpr_32 = V_MOV_B32_e32 %5093, implicit $exec + %6094:vgpr_32 = V_MOV_B32_e32 %5094, implicit $exec + %6095:vgpr_32 = V_MOV_B32_e32 %5095, implicit $exec + %6096:vgpr_32 = V_MOV_B32_e32 %5096, implicit $exec + %6097:vgpr_32 = V_MOV_B32_e32 %5097, implicit $exec + %6098:vgpr_32 = V_MOV_B32_e32 %5098, implicit $exec + %6099:vgpr_32 = V_MOV_B32_e32 %5099, implicit $exec + EXP 0, %6000, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6001, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6002, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6003, %3, 
%3, %3, -1, -1, 15, implicit $exec + EXP 0, %6004, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6005, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6006, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6007, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6008, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6009, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6010, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6011, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6012, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6013, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6014, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6015, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6016, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6017, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6018, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6019, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6020, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6021, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6022, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6023, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6024, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6025, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6026, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6027, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6028, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6029, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6030, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6031, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6032, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6033, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6034, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6035, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6036, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6037, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6038, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6039, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6040, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6041, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6042, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6043, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6044, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6045, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6046, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6047, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6048, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6049, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6050, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6051, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6052, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6053, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6054, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6055, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6056, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6057, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6058, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6059, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6060, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6061, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6062, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6063, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6064, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6065, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6066, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6067, %3, %3, %3, -1, -1, 15, implicit $exec 
+ EXP 0, %6068, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6069, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6070, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6071, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6072, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6073, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6074, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6075, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6076, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6077, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6078, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6079, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6080, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6081, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6082, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6083, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6084, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6085, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6086, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6087, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6088, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6089, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6090, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6091, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6092, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6093, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6094, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6095, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6096, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6097, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6098, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6099, %3, %3, %3, -1, -1, 15, implicit $exec + + + S_ENDPGM 0 +... 
+ \ No newline at end of file From 29eca4aa9b360ec1e98ece2539cbdeba2f7c24dd Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Wed, 12 Mar 2025 10:03:01 -0700 Subject: [PATCH 10/25] Removed old forks of things --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 4 +- .../AMDGPU/AMDGPUMirDivergenceAnalysis.cpp | 2774 ----------------- .../AMDGPU/AMDGPUMirDivergenceAnalysis.h | 285 -- .../AMDGPUMirSyncDependenceAnalysis.cpp | 519 --- .../AMDGPU/AMDGPUMirSyncDependenceAnalysis.h | 101 - llvm/lib/Target/AMDGPU/CMakeLists.txt | 2 - 6 files changed, 1 insertion(+), 3684 deletions(-) delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index 2cd28513f10f3..e508ed2a6e2cd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -14,7 +14,6 @@ #include "AMDGPU.h" #include "AMDGPUMIRUtils.h" -#include "AMDGPUMirDivergenceAnalysis.h" #include "AMDGPUOccupancyAndLatencyHelper.h" #include "AMDGPUSubExpDag.h" #include "AMDGPUSubtarget.h" @@ -4620,8 +4619,7 @@ bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) { MachineUniformityInfo MachineUniformity = llvm::computeMachineUniformityInfo(MF, CI, *DT, /*HasBranchDivergence*/ true); - - // llvm::MirGPUDivergenceAnalysis DA(MF, *DT, *PDT, *MLI); + TotalUniformInsts.clear(); for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { if (MachineUniformity.isUniform(&MI)) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp deleted file mode 100644 index 21aa5db0c6f27..0000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp +++ /dev/null @@ -1,2774 +0,0 @@ -//===- MirDivergenceAnalysis.cpp -- Mir Divergence Analysis Implementation -==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file is based on Analysis/DivergenceAnalysis.cpp, -// The most important difference is -// introduction of the idea of "Bit-Divergence". -// -// The way booleans are represented in in AMD GPU is a 64-bit uint in a pair of -// scalar registers, where each bit represents a boolean value for one lane. If -// all active lanes have the same bool value (all 1's or all 0's), then we can -// generate a scalar branch, otherwise we must use exec mask to selectively -// execute lanes based on the boolean mask. When all values in a boolean mask -// are the same for all active lanes, we call that mask "bit-uniform", -// otherwise we call it "bit-divergent". This differs from the normal concept -// of "uniform" and "divergent", which represents whether the value may be -// different across the 64 lanes. 
A "bit-divergent" value is still "uniform" in -// the sense that it is the same 64-bit value from the perspective of all the -// lanes, but when used as branch condition, will cause the branch to be -// divergent, which will cause the uses of any values outside of the control -// flow region to be divergent. -// -// The original DA marks everything including bools as divergent or uniform -// based on the propagation of divergent sources. However, booleans in AMDGPU -// are in fact never "divergent". Comparison operations that receive divergent -// operands instead produce "bit-divergent" or "bit-uniform" 64-bit booleans. -// Between the definition of any boolean mask and its use (particularly in -// branches, cndmasks, or anything that specifially consumes booleans), there -// can be any arbitrary number and types of operations performed on it, -// including combining it with other boolean masks via bit operations. -// -// The XDA algorithm is a modified version of the original DA algorithm to -// simultaneously propagate regular divergence and bit-divergence. -// -// First off, XDA identifies all sources of divergence as well as -// bit-divergence and adds them to the worklist. Then, just like with LLVM DA, -// it pops values off of the worklist to propagate (bit-)divergence to all its -// users, unless the user is always (bit-)uniform when given (bit-)divergent -// operand. It's possible for a value to be marked as both divergent and -// bit-divergent, in which case the regular divergence will trump -// bit-divergence. -// -// The important difference in this propagation step is that there are special -// instructions that when given bit-divergent operands, produce divergent -// values and vice versa. -// -// An example is comparison: -// -// v0 = interp ... ; divergent -// v1 = interp ... ; divergent -// s[0:1] = v_cmp v0, v1 ; bit-divergent -// -// v0 and v1 are both divergent, but when propagating them, the v_cmp (and its -// result) is bit-divergent value instead of divergent. -// -// -// An example of the reverse: -// -// v0 = ... ; uniform -// s[0:1] = v_cmp v0, v1 ; bit-divergent -// ... -// branch s[0:1], label ; divergent! -// ... -// v1 = ... ; uniform -// ... -// -// label: -// v3 = phi v0, v1 ; divergent! because of divergent -// branch. -// -// The boolean value is bit-divergent. When passed to the branch as an operand, -// the branch becomes divergent, whose sync dependency will be computed as -// normal to mark the appropriate values divergent (see description in normal -// DA on how this works). -// -// Another difference is in MIR, some branch will be changed into exec update, -// so only propagate control flow divergent on branch inst will not cover exec -// control flow. -// For case like -// %163:sreg_64_xexec = S_MOV_B64 $exec -// bb.1: -//; predecessors: %bb.1, %bb.0 -// successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), -// %bb.2(50.00%) %162:vreg_512 = PHI %41:vreg_512, %bb.0, %40:vreg_512, %bb.1 -// %167:sgpr_32 = V_READFIRSTLANE_B32 %17:vgpr_32, implicit $exec -// %168:sreg_64 = V_CMP_EQ_U32_e64 %167:sgpr_32, %17:vgpr_32, implicit $exec -// %166:sreg_64 = S_AND_SAVEEXEC_B64 %168:sreg_64, implicit-def $exec, -// implicit-def $scc, implicit $exec -//... -// $exec = S_XOR_B64_term $exec, %166:sreg_64, implicit-def $scc -// S_CBRANCH_EXECNZ %bb.1, implicit $exec -// The ... code after SAVEEXEC will be divergent if %168 is divergent. -// The PHI should be divergent when %40 is inside the ... 
-// To propagate divergent from %168 to the PHI, need to start the propagate from -// SAVEEXEC which is the control flow by update exec. -// -// -// Original: -// This file implements a general divergence analysis for loop vectorization -// and GPU programs. It determines which branches and values in a loop or GPU -// program are divergent. It can help branch optimizations such as jump -// threading and loop unswitching to make better decisions. -// -// GPU programs typically use the SIMD execution model, where multiple threads -// in the same execution group have to execute in lock-step. Therefore, if the -// code contains divergent branches (i.e., threads in a group do not agree on -// which path of the branch to take), the group of threads has to execute all -// the paths from that branch with different subsets of threads enabled until -// they re-converge. -// -// Due to this execution model, some optimizations such as jump -// threading and loop unswitching can interfere with thread re-convergence. -// Therefore, an analysis that computes which branches in a GPU program are -// divergent can help the compiler to selectively run these optimizations. -// -// This implementation is derived from the Vectorization Analysis of the -// Region Vectorizer (RV). That implementation in turn is based on the approach -// described in -// -// Improving Performance of OpenCL on CPUs -// Ralf Karrenberg and Sebastian Hack -// CC '12 -// -// This DivergenceAnalysis implementation is generic in the sense that it does -// not itself identify original sources of divergence. -// Instead specialized adapter classes, (LoopDivergenceAnalysis) for loops and -// (GPUDivergenceAnalysis) for GPU programs, identify the sources of divergence -// (e.g., special variables that hold the thread ID or the iteration variable). -// -// The generic implementation propagates divergence to variables that are data -// or sync dependent on a source of divergence. -// -// While data dependency is a well-known concept, the notion of sync dependency -// is worth more explanation. Sync dependence characterizes the control flow -// aspect of the propagation of branch divergence. For example, -// -// %cond = icmp slt i32 %tid, 10 -// br i1 %cond, label %then, label %else -// then: -// br label %merge -// else: -// br label %merge -// merge: -// %a = phi i32 [ 0, %then ], [ 1, %else ] -// -// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid -// because %tid is not on its use-def chains, %a is sync dependent on %tid -// because the branch "br i1 %cond" depends on %tid and affects which value %a -// is assigned to. -// -// The sync dependence detection (which branch induces divergence in which join -// points) is implemented in the SyncDependenceAnalysis. -// -// The current DivergenceAnalysis implementation has the following limitations: -// 1. intra-procedural. It conservatively considers the arguments of a -// non-kernel-entry function and the return value of a function call as -// divergent. -// 2. memory as black box. It conservatively considers values loaded from -// generic or local address as divergent. This can be improved by leveraging -// pointer analysis and/or by modelling non-escaping memory objects in SSA -// as done in RV. 
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPUMirDivergenceAnalysis.h" -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "GCNSubtarget.h" -#include "SIInstrInfo.h" -#include "TargetInfo/AMDGPUTargetInfo.h" -#include "Utils/AMDGPUAsmUtils.h" -#include "Utils/AMDGPUBaseInfo.h" -// #include "llvm/Analysis/Passes.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/MachinePostDominators.h" -#include "llvm/Support/Debug.h" -// #include "newbe/cli/newbe_opts.h" // AMDGPU change. -#include "llvm/Support/raw_ostream.h" -#include - -using namespace llvm; - -#define DEBUG_TYPE "mir-divergence-analysis" - -namespace llvm { -bool isAMDGPUOpcodeDivergent(class MachineInstr *MI); -} - -// -// TODO: TableGen these -// -bool llvm::isAMDGPUOpcodeDivergent(class MachineInstr *MI) { - switch (MI->getOpcode()) { - // case R600::INTERP_LOAD_P0: - // case R600::INTERP_PAIR_XY: - // case R600::INTERP_PAIR_ZW: - // case R600::INTERP_VEC_LOAD: - // case R600::INTERP_XY: - // case R600::INTERP_ZW: - case AMDGPU::V_WRITELANE_B32: - - case AMDGPU::V_INTERP_MOV_F32: - case AMDGPU::V_INTERP_MOV_F32_e64: - case AMDGPU::V_INTERP_MOV_F32_e64_vi: - case AMDGPU::V_INTERP_MOV_F32_si: - case AMDGPU::V_INTERP_MOV_F32_vi: - case AMDGPU::V_INTERP_P1LL_F16: - case AMDGPU::V_INTERP_P1LL_F16_vi: - case AMDGPU::V_INTERP_P1LV_F16: - case AMDGPU::V_INTERP_P1LV_F16_vi: - case AMDGPU::V_INTERP_P1_F32: - case AMDGPU::V_INTERP_P1_F32_16bank: - case AMDGPU::V_INTERP_P1_F32_16bank_si: - case AMDGPU::V_INTERP_P1_F32_16bank_vi: - case AMDGPU::V_INTERP_P1_F32_e64: - case AMDGPU::V_INTERP_P1_F32_e64_vi: - case AMDGPU::V_INTERP_P1_F32_si: - case AMDGPU::V_INTERP_P1_F32_vi: - case AMDGPU::V_INTERP_P2_F16: - case AMDGPU::V_INTERP_P2_F16_vi: - case AMDGPU::V_INTERP_P2_F32: - case AMDGPU::V_INTERP_P2_F32_e64: - case AMDGPU::V_INTERP_P2_F32_e64_vi: - case AMDGPU::V_INTERP_P2_F32_si: - case AMDGPU::V_INTERP_P2_F32_vi: - - case AMDGPU::V_MBCNT_HI_U32_B32_e32: - case AMDGPU::V_MBCNT_HI_U32_B32_e32_gfx6_gfx7: - case AMDGPU::V_MBCNT_HI_U32_B32_e64: - case AMDGPU::V_MBCNT_HI_U32_B32_e64_gfx10: - case AMDGPU::V_MBCNT_HI_U32_B32_e64_gfx6_gfx7: - case AMDGPU::V_MBCNT_HI_U32_B32_e64_vi: - case AMDGPU::V_MBCNT_HI_U32_B32_sdwa: - case AMDGPU::V_MBCNT_LO_U32_B32_e32: - case AMDGPU::V_MBCNT_LO_U32_B32_e32_gfx6_gfx7: - case AMDGPU::V_MBCNT_LO_U32_B32_e64: - case AMDGPU::V_MBCNT_LO_U32_B32_e64_gfx10: - case AMDGPU::V_MBCNT_LO_U32_B32_e64_gfx6_gfx7: - case AMDGPU::V_MBCNT_LO_U32_B32_e64_vi: - case AMDGPU::V_MBCNT_LO_U32_B32_sdwa: - - case AMDGPU::BUFFER_ATOMIC_ADD_ADDR64: - case AMDGPU::BUFFER_ATOMIC_ADD_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_ADD_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN: - case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_gfx6_gfx7: - case 
AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_ADDR64: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_AND_ADDR64: - case AMDGPU::BUFFER_ATOMIC_AND_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_AND_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_AND_IDXEN: - case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_vi: - 
case AMDGPU::BUFFER_ATOMIC_AND_OFFEN: - case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_AND_OFFSET: - case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_AND_X2_ADDR64: - case AMDGPU::BUFFER_ATOMIC_AND_X2_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_AND_X2_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_X2_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN: - case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_gfx6_gfx7: - case 
AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_ADDR64: - case AMDGPU::BUFFER_ATOMIC_DEC_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_DEC_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN: - case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_RTN_gfx10: - case 
AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_ADDR64: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_INC_ADDR64: - case AMDGPU::BUFFER_ATOMIC_INC_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_INC_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_INC_IDXEN: - case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_RTN_gfx10: - case 
AMDGPU::BUFFER_ATOMIC_INC_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_INC_OFFEN: - case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_INC_OFFSET: - case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_INC_X2_ADDR64: - case AMDGPU::BUFFER_ATOMIC_INC_X2_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_INC_X2_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_X2_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN: - case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_OR_ADDR64: - case AMDGPU::BUFFER_ATOMIC_OR_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_OR_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_OR_IDXEN: - case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_RTN_gfx6_gfx7: - case 
AMDGPU::BUFFER_ATOMIC_OR_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_OR_OFFEN: - case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_OR_OFFSET: - case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_OR_X2_ADDR64: - case AMDGPU::BUFFER_ATOMIC_OR_X2_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_OR_X2_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_X2_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN: - case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_ADDR64: - case AMDGPU::BUFFER_ATOMIC_SMAX_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_SMAX_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN: - case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_RTN_vi: - case 
AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_ADDR64: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_ADDR64: - case AMDGPU::BUFFER_ATOMIC_SMIN_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_SMIN_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN: - case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_RTN_gfx6_gfx7: - case 
AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_ADDR64: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_ADDR64: - case AMDGPU::BUFFER_ATOMIC_SUB_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_SUB_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN: - case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_RTN_gfx10: - case 
AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_ADDR64: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_ADDR64: - case AMDGPU::BUFFER_ATOMIC_SWAP_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_SWAP_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN: - case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_RTN_gfx10: - case 
AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_ADDR64: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_ADDR64: - case AMDGPU::BUFFER_ATOMIC_UMAX_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_UMAX_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN: - case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_RTN: - case 
AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_ADDR64: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_ADDR64: - case AMDGPU::BUFFER_ATOMIC_UMIN_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_UMIN_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN: - case 
AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_ADDR64: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_ADDR64: - case AMDGPU::BUFFER_ATOMIC_XOR_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_XOR_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_vi: - case 
AMDGPU::BUFFER_ATOMIC_XOR_IDXEN: - case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_ADDR64: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_vi: - - case AMDGPU::IMAGE_ATOMIC_ADD_V1_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_ADD_V1_V1_si: - case AMDGPU::IMAGE_ATOMIC_ADD_V1_V1_vi: - case AMDGPU::IMAGE_ATOMIC_ADD_V2_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_ADD_V2_V1_si: - case AMDGPU::IMAGE_ATOMIC_ADD_V2_V1_vi: - case AMDGPU::IMAGE_ATOMIC_ADD_V1_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_ADD_V1_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_ADD_V1_V2_si: - case AMDGPU::IMAGE_ATOMIC_ADD_V1_V2_vi: - case AMDGPU::IMAGE_ATOMIC_ADD_V2_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_ADD_V2_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_ADD_V2_V2_si: - case AMDGPU::IMAGE_ATOMIC_ADD_V2_V2_vi: - 
case AMDGPU::IMAGE_ATOMIC_ADD_V1_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_ADD_V1_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_ADD_V1_V4_si: - case AMDGPU::IMAGE_ATOMIC_ADD_V1_V4_vi: - case AMDGPU::IMAGE_ATOMIC_ADD_V2_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_ADD_V2_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_ADD_V2_V4_si: - case AMDGPU::IMAGE_ATOMIC_ADD_V2_V4_vi: - case AMDGPU::IMAGE_ATOMIC_AND_V1_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_AND_V1_V1_si: - case AMDGPU::IMAGE_ATOMIC_AND_V1_V1_vi: - case AMDGPU::IMAGE_ATOMIC_AND_V2_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_AND_V2_V1_si: - case AMDGPU::IMAGE_ATOMIC_AND_V2_V1_vi: - case AMDGPU::IMAGE_ATOMIC_AND_V1_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_AND_V1_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_AND_V1_V2_si: - case AMDGPU::IMAGE_ATOMIC_AND_V1_V2_vi: - case AMDGPU::IMAGE_ATOMIC_AND_V2_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_AND_V2_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_AND_V2_V2_si: - case AMDGPU::IMAGE_ATOMIC_AND_V2_V2_vi: - case AMDGPU::IMAGE_ATOMIC_AND_V1_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_AND_V1_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_AND_V1_V4_si: - case AMDGPU::IMAGE_ATOMIC_AND_V1_V4_vi: - case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_si: - case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_vi: - // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_gfx10: - // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_si: - // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_vi: - case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_si: - case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_vi: - // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_gfx10: - // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_nsa_gfx10: - // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_si: - // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_vi: - case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_si: - case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_vi: - // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_gfx10: - // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_nsa_gfx10: - // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_si: - // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_vi: - case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_si: - case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_vi: - case AMDGPU::IMAGE_ATOMIC_DEC_V1_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_DEC_V1_V1_si: - case AMDGPU::IMAGE_ATOMIC_DEC_V1_V1_vi: - case AMDGPU::IMAGE_ATOMIC_DEC_V2_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_DEC_V2_V1_si: - case AMDGPU::IMAGE_ATOMIC_DEC_V2_V1_vi: - case AMDGPU::IMAGE_ATOMIC_DEC_V1_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_DEC_V1_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_DEC_V1_V2_si: - case AMDGPU::IMAGE_ATOMIC_DEC_V1_V2_vi: - case AMDGPU::IMAGE_ATOMIC_DEC_V2_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_DEC_V2_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_DEC_V2_V2_si: - case AMDGPU::IMAGE_ATOMIC_DEC_V2_V2_vi: - case AMDGPU::IMAGE_ATOMIC_DEC_V1_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_DEC_V1_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_DEC_V1_V4_si: - case AMDGPU::IMAGE_ATOMIC_DEC_V1_V4_vi: - case AMDGPU::IMAGE_ATOMIC_DEC_V2_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_DEC_V2_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_DEC_V2_V4_si: - case AMDGPU::IMAGE_ATOMIC_DEC_V2_V4_vi: - case AMDGPU::IMAGE_ATOMIC_INC_V1_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_INC_V1_V1_si: - case AMDGPU::IMAGE_ATOMIC_INC_V1_V1_vi: - case 
AMDGPU::IMAGE_ATOMIC_INC_V2_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_INC_V2_V1_si: - case AMDGPU::IMAGE_ATOMIC_INC_V2_V1_vi: - case AMDGPU::IMAGE_ATOMIC_INC_V1_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_INC_V1_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_INC_V1_V2_si: - case AMDGPU::IMAGE_ATOMIC_INC_V1_V2_vi: - case AMDGPU::IMAGE_ATOMIC_INC_V2_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_INC_V2_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_INC_V2_V2_si: - case AMDGPU::IMAGE_ATOMIC_INC_V2_V2_vi: - case AMDGPU::IMAGE_ATOMIC_INC_V1_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_INC_V1_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_INC_V1_V4_si: - case AMDGPU::IMAGE_ATOMIC_INC_V1_V4_vi: - case AMDGPU::IMAGE_ATOMIC_INC_V2_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_INC_V2_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_INC_V2_V4_si: - case AMDGPU::IMAGE_ATOMIC_INC_V2_V4_vi: - case AMDGPU::IMAGE_ATOMIC_OR_V1_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_OR_V1_V1_si: - case AMDGPU::IMAGE_ATOMIC_OR_V1_V1_vi: - case AMDGPU::IMAGE_ATOMIC_OR_V2_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_OR_V2_V1_si: - case AMDGPU::IMAGE_ATOMIC_OR_V2_V1_vi: - case AMDGPU::IMAGE_ATOMIC_OR_V1_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_OR_V1_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_OR_V1_V2_si: - case AMDGPU::IMAGE_ATOMIC_OR_V1_V2_vi: - case AMDGPU::IMAGE_ATOMIC_OR_V2_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_OR_V2_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_OR_V2_V2_si: - case AMDGPU::IMAGE_ATOMIC_OR_V2_V2_vi: - case AMDGPU::IMAGE_ATOMIC_OR_V1_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_OR_V1_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_OR_V1_V4_si: - case AMDGPU::IMAGE_ATOMIC_OR_V1_V4_vi: - case AMDGPU::IMAGE_ATOMIC_OR_V2_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_OR_V2_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_OR_V2_V4_si: - case AMDGPU::IMAGE_ATOMIC_OR_V2_V4_vi: - case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V1_si: - case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V1_vi: - case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V1_si: - case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V1_vi: - case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V2_si: - case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V2_vi: - case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V2_si: - case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V2_vi: - case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V4_si: - case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V4_vi: - case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V4_si: - case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V4_vi: - case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V1_si: - case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V1_vi: - case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V1_si: - case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V1_vi: - case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V2_si: - case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V2_vi: - case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V2_si: - case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V2_vi: - case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V4_si: - case 
AMDGPU::IMAGE_ATOMIC_SMIN_V1_V4_vi: - case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V4_si: - case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V4_vi: - case AMDGPU::IMAGE_ATOMIC_SUB_V1_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_SUB_V1_V1_si: - case AMDGPU::IMAGE_ATOMIC_SUB_V1_V1_vi: - case AMDGPU::IMAGE_ATOMIC_SUB_V2_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_SUB_V2_V1_si: - case AMDGPU::IMAGE_ATOMIC_SUB_V2_V1_vi: - case AMDGPU::IMAGE_ATOMIC_SUB_V1_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_SUB_V1_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SUB_V1_V2_si: - case AMDGPU::IMAGE_ATOMIC_SUB_V1_V2_vi: - case AMDGPU::IMAGE_ATOMIC_SUB_V2_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_SUB_V2_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SUB_V2_V2_si: - case AMDGPU::IMAGE_ATOMIC_SUB_V2_V2_vi: - case AMDGPU::IMAGE_ATOMIC_SUB_V1_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_SUB_V1_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SUB_V1_V4_si: - case AMDGPU::IMAGE_ATOMIC_SUB_V1_V4_vi: - case AMDGPU::IMAGE_ATOMIC_SUB_V2_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_SUB_V2_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SUB_V2_V4_si: - case AMDGPU::IMAGE_ATOMIC_SUB_V2_V4_vi: - case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V1_si: - case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V1_vi: - case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V1_si: - case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V1_vi: - case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V2_si: - case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V2_vi: - case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V2_si: - case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V2_vi: - case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V4_si: - case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V4_vi: - case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V4_si: - case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V4_vi: - case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V1_si: - case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V1_vi: - case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V1_si: - case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V1_vi: - case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V2_si: - case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V2_vi: - case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V2_si: - case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V2_vi: - case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V4_si: - case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V4_vi: - case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V4_si: - case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V4_vi: - case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V1_si: - case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V1_vi: - case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V1_si: - case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V1_vi: - case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V2_nsa_gfx10: - case 
AMDGPU::IMAGE_ATOMIC_UMIN_V1_V2_si: - case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V2_vi: - case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V2_si: - case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V2_vi: - case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V4_si: - case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V4_vi: - case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V4_si: - case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V4_vi: - case AMDGPU::IMAGE_ATOMIC_XOR_V1_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_XOR_V1_V1_si: - case AMDGPU::IMAGE_ATOMIC_XOR_V1_V1_vi: - case AMDGPU::IMAGE_ATOMIC_XOR_V2_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_XOR_V2_V1_si: - case AMDGPU::IMAGE_ATOMIC_XOR_V2_V1_vi: - case AMDGPU::IMAGE_ATOMIC_XOR_V1_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_XOR_V1_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_XOR_V1_V2_si: - case AMDGPU::IMAGE_ATOMIC_XOR_V1_V2_vi: - case AMDGPU::IMAGE_ATOMIC_XOR_V2_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_XOR_V2_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_XOR_V2_V2_si: - case AMDGPU::IMAGE_ATOMIC_XOR_V2_V2_vi: - case AMDGPU::IMAGE_ATOMIC_XOR_V1_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_XOR_V1_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_XOR_V1_V4_si: - case AMDGPU::IMAGE_ATOMIC_XOR_V1_V4_vi: - case AMDGPU::IMAGE_ATOMIC_XOR_V2_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_XOR_V2_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_XOR_V2_V4_si: - case AMDGPU::IMAGE_ATOMIC_XOR_V2_V4_vi: - - case AMDGPU::SI_PS_LIVE: - - case AMDGPU::DS_SWIZZLE_B32: - case AMDGPU::DS_SWIZZLE_B32_gfx10: - case AMDGPU::DS_SWIZZLE_B32_gfx6_gfx7: - case AMDGPU::DS_SWIZZLE_B32_vi: - - return true; - - default: - break; - } - return false; -} - -namespace { -bool hasImmOperandWithVal(const MachineInstr *MI, uint16_t srcNameIdx, - uint16_t srcModNameIdx, uint64_t Val) { - unsigned Op = MI->getOpcode(); - unsigned srcIdx = AMDGPU::getNamedOperandIdx(Op, srcNameIdx); - if (srcIdx == -1) - return false; - const MachineOperand &srcMO = MI->getOperand(srcIdx); - if (srcMO.isImm() && srcMO.getImm() == Val) { - - unsigned modIdx = AMDGPU::getNamedOperandIdx(Op, srcModNameIdx); - if (modIdx == -1) - return true; - - const MachineOperand &modMO = MI->getOperand(modIdx); - if (modMO.getImm() == 0) - return true; - } - return false; -} - -bool isConstant(const MachineInstr *MI) { - unsigned Op = MI->getOpcode(); - switch (Op) { - default: - break; - case AMDGPU::V_OR_B32_e32: - case AMDGPU::V_OR_B32_e64: { - // Check special case or -1, which will get result -1. - const uint64_t kImm = -1; - if (hasImmOperandWithVal(MI, AMDGPU::OpName::src0, - AMDGPU::OpName::src0_modifiers, kImm)) - return true; - if (hasImmOperandWithVal(MI, AMDGPU::OpName::src1, - AMDGPU::OpName::src1_modifiers, kImm)) - return true; - } break; - case AMDGPU::S_OR_B32: - case AMDGPU::S_OR_B64: { - // Check special case or -1, which will get result -1. - const uint64_t kImm = -1; - if (hasImmOperandWithVal(MI, AMDGPU::OpName::src0, - AMDGPU::OpName::src0_modifiers, kImm)) - return true; - if (hasImmOperandWithVal(MI, AMDGPU::OpName::src1, - AMDGPU::OpName::src1_modifiers, kImm)) - return true; - } break; - case AMDGPU::S_AND_B32: - case AMDGPU::S_AND_B64: - case AMDGPU::V_AND_B32_e32: - case AMDGPU::V_AND_B32_e64: { - // Check special case and 0, which will get result 0. 
-    const uint64_t kImm = 0;
-    if (hasImmOperandWithVal(MI, AMDGPU::OpName::src0,
-                             AMDGPU::OpName::src0_modifiers, kImm))
-      return true;
-    if (hasImmOperandWithVal(MI, AMDGPU::OpName::src1,
-                             AMDGPU::OpName::src1_modifiers, kImm))
-      return true;
-  } break;
-  }
-  return false;
-}
-
-bool writeBoolDst(const MachineInstr *MI, const SIRegisterInfo *SIRI,
-                  const MachineRegisterInfo &MRI) {
-  const auto *BoolRC = SIRI->getBoolRC();
-  for (const MachineOperand &MO : MI->operands()) {
-    if (!MO.isReg())
-      continue;
-    if (MO.isUse())
-      continue;
-    unsigned Reg = MO.getReg();
-    if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO || Reg == AMDGPU::VCC ||
-        Reg == AMDGPU::VCC_LO)
-      return true;
-
-    // Check if the written register class overlaps the bool register class.
-    //
-    // Note that this check is insufficient to catch all of the cases where
-    // a "bool" value could be created (for example writing to a register
-    // pair s[0:1], then using s0 as a bool value in wave32).
-    //
-    // The underlying problem is that we have two notions of divergence
-    // (bit divergence and wave divergence) but the algorithm only propagates
-    // wave divergence. Bit divergence is important for bools because it
-    // determines whether a branch is uniform or not (and thus catches cases
-    // where a uniform value is used outside of a divergent control flow
-    // region). For bool values the algorithm treats normally uniform values
-    // (i.e. scalar registers) as divergent in order to try to propagate bit
-    // divergence.
-    //
-    // To fix all the possible bugs here we would need to actually propagate
-    // bit divergence as well as wave divergence. That is a bigger fix; this
-    // check should cover most cases of treating a bool value as divergent.
-    const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg);
-    if (SIRI->getCommonSubClass(BoolRC, RC))
-      return true;
-  }
-  return false;
-}
-
-bool isAlwaysUniformMI(const MachineInstr *MI, const SIInstrInfo *SIII,
-                       const SIRegisterInfo *SIRI,
-                       const MachineRegisterInfo &MRI) {
-  unsigned Op = MI->getOpcode();
-  switch (Op) {
-  default:
-    // Mark all scalar-unit instructions as always uniform unless they write a
-    // bool destination. This does not mean the result is bit uniform; the
-    // branch/exec-region checks use isBitUniform for that. A bool may live in
-    // an SGPR and still be divergent, since it packs one bit per lane into a
-    // single 32/64-bit SGPR.
-    if (SIII->isScalarUnit(*MI) && !writeBoolDst(MI, SIRI, MRI) &&
-        !MI->isTerminator())
-      return true;
-    break;
-  // case AMDGPU::AMDGPU_MAKE_UNIFORM:
-  // case AMDGPU::AMDGPU_WAVE_READ_LANE_FIRST:
-  case AMDGPU::V_READFIRSTLANE_B32:
-  case AMDGPU::V_READLANE_B32:
-    // case AMDGPU::AMDGPU_WAVE_ACTIVE_BALLOT_W32:
-    // case AMDGPU::AMDGPU_WAVE_ACTIVE_BALLOT_W64:
-    // A bool produced by readfirstlane is a single bit, which means it is
-    // bit uniform.
-    return true;
-  case AMDGPU::S_OR_B32:
-  case AMDGPU::S_OR_B64: {
-    // Check the special case of OR with -1, which always yields -1.
-    if (isConstant(MI))
-      return true;
-
-    return !writeBoolDst(MI, SIRI, MRI);
-  } break;
-  case AMDGPU::V_OR_B32_e32:
-  case AMDGPU::V_OR_B32_e64: {
-    // Check the special case of OR with -1, which always yields -1.
-    if (isConstant(MI))
-      return true;
-  } break;
-  case AMDGPU::S_AND_B32:
-  case AMDGPU::S_AND_B64: {
-    // Check the special case of AND with 0, which always yields 0.
-    if (isConstant(MI))
-      return true;
-
-    return !writeBoolDst(MI, SIRI, MRI);
-  } break;
-  case AMDGPU::V_AND_B32_e32:
-  case AMDGPU::V_AND_B32_e64: {
-    // Check the special case of AND with 0, which always yields 0.
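// Illustrative aside (editorial sketch, not code from this patch): the reason
// OR with an all-ones immediate and AND with a zero immediate are treated as
// producing a uniform result is that the absorbing element fixes the value of
// every lane regardless of how divergent the other operand is. The identities
// can be sanity-checked on the host:
//
//   static_assert((0x12345678u | ~0u) == ~0u, "x | -1 == -1 for any lane x");
//   static_assert((0x12345678u & 0u) == 0u, "x & 0 == 0 for any lane x");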
- if (isConstant(MI)) - return true; - } break; - } - return false; -} - -bool isPhysicalReg(MachineRegisterInfo &MRI, Register reg) { - return reg.isPhysical(); - ; -} - -bool isRegClass(MachineRegisterInfo &MRI, unsigned reg, unsigned regClassID) { - return MRI.getRegClass(reg)->getID() == regClassID; -} - -// For input reg of MF, vgpr will be divergent. -bool isDivergentInputReg(unsigned Reg, MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI) { - if (isPhysicalReg(MRI, Reg)) { - unsigned vir_reg = MRI.getLiveInVirtReg(Reg); - if (SIRI->isVGPR(MRI, vir_reg)) - return true; - } else { - if (SIRI->isVGPR(MRI, Reg)) - return true; - } - return false; -} - -bool isSourceOfDivergence(MachineInstr *MI, MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { - // if (MI->getAMDGPUFlag(MachineInstr::AMDGPUMIFlag::IsDivergent)) - // return true; - if (isAMDGPUOpcodeDivergent(MI)) - return true; - - if (isAlwaysUniformMI(MI, SIII, SIRI, MRI)) - return false; - - // If the instruction is neither guaranteed to - // be uniform or divergent, check whether any - // of its operands are passed in to the shader as - // args through vector regs. - // - // This makes them divergent. - for (MachineOperand &op : MI->operands()) { - if (!op.isReg()) - continue; - if (op.isDef()) - continue; - unsigned reg = op.getReg(); - if (MRI.isLiveIn(reg)) { - if (isDivergentInputReg(reg, MRI, SIRI)) - return true; - } - } - - return false; -} - -// For VCC, try to find the nearest define inside same BB. -const MachineInstr *findPhysicalDefineInSameMBB(const MachineInstr *MI, - unsigned PhyReg) { - const MachineBasicBlock *MBB = MI->getParent(); - auto it = MI->getReverseIterator(); - for (it++; it != MBB->rend(); it++) { - const MachineInstr &TmpMI = *it; - for (const MachineOperand &DefMO : TmpMI.operands()) { - if (!DefMO.isReg()) - continue; - if (DefMO.isUse()) - continue; - if (DefMO.getReg() == PhyReg) - return &TmpMI; - } - } - return nullptr; -} - -bool isWriteExec(const MachineInstr *MI) { - for (const MachineOperand &MO : MI->operands()) { - if (!MO.isReg()) - continue; - if (MO.isUse()) - continue; - unsigned Reg = MO.getReg(); - if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO) - return true; - } - return false; -} - -bool isVCndMask(unsigned Opcode) { - switch (Opcode) { - default: - return false; - case AMDGPU::V_CNDMASK_B32_e32: - case AMDGPU::V_CNDMASK_B32_e64: - case AMDGPU::V_CNDMASK_B32_dpp: - case AMDGPU::V_CNDMASK_B32_sdwa: - case AMDGPU::V_CNDMASK_B64_PSEUDO: - return true; - } -} - -bool isExecRegionOp(unsigned Op) { - switch (Op) { - default: - return false; - case AMDGPU::COPY: - case AMDGPU::S_MOV_B32: - case AMDGPU::S_MOV_B64: - return true; - } -} - -bool isRestoreExec(const MachineInstr *MI) { - unsigned Op = MI->getOpcode(); - if (!isExecRegionOp(Op)) - return false; - - return isWriteExec(MI); -} - -const MachineInstr * -findExecRegionBeginFromRegionEnd(const MachineInstr *MI, - const MachineRegisterInfo &MRI) { - const MachineOperand &MO = MI->getOperand(1); - if (!MO.isReg()) - return nullptr; - unsigned Reg = MO.getReg(); - const MachineInstr *Def = MRI.getUniqueVRegDef(Reg); - if (!Def) - return nullptr; - // Make sure the def is S_MOV Reg, Exec. 
- if (!isExecRegionOp(Def->getOpcode())) - return nullptr; - const MachineOperand &ExecMO = Def->getOperand(1); - if (!ExecMO.isReg()) - return nullptr; - unsigned ExecReg = ExecMO.getReg(); - if (ExecReg == AMDGPU::EXEC || ExecReg == AMDGPU::EXEC_LO) - return Def; - else - return nullptr; -} - -bool isInsideExecRegion(const MachineInstr &MI, const MachineInstr &RegionBegin, - const MachineInstr &RegionEnd, - const MachineDominatorTree &DT, - const MachinePostDominatorTree &PDT) { - if (!DT.dominates(&RegionBegin, &MI)) - return false; - - const MachineBasicBlock *MBB = MI.getParent(); - const MachineBasicBlock *RegionEndMBB = RegionEnd.getParent(); - if (MBB != RegionEndMBB) { - return PDT.dominates(RegionEndMBB, MBB); - } else { - // MachineLoop through the basic block until we find A or B. - MachineBasicBlock::const_iterator I = MBB->begin(); - for (; I != MI.getIterator() && I != RegionEnd.getIterator(); ++I) - /*empty*/; - - // RegionEnd post-dominates MI if MI is found first in the basic block. - return I == MI.getIterator(); - } -} - -bool isInsideExecRegion(const MachineBasicBlock &MBB, - const MachineInstr &RegionBegin, - const MachineInstr &RegionEnd, - const MachineDominatorTree &DT, - const MachinePostDominatorTree &PDT) { - const MachineBasicBlock *RegionBeginMBB = RegionBegin.getParent(); - const MachineBasicBlock *RegionEndMBB = RegionEnd.getParent(); - if (!DT.dominates(RegionBeginMBB, &MBB)) - return false; - return PDT.dominates(RegionEndMBB, &MBB); -} - -// Map from BB to nearest Exec Region. How to build? Add every MBB unless -// already has smaller region? Then when hit saveExec, propagate leaked users of -// define inside the exec region. - -} // namespace - -namespace llvm { -// class DivergenceAnalysis -DivergenceAnalysis::DivergenceAnalysis( - const MachineFunction &F, const MachineLoop *RegionLoop, - const MachineDominatorTree &DT, const MachinePostDominatorTree &PDT, - const MachineLoopInfo &LI, SyncDependenceAnalysis &SDA, bool IsLCSSAForm, - // AMDGPU change begin. - DivergentJoinMapTy &JoinMap - // AMDGPU change end. - ) - : F(F), MRI(F.getRegInfo()), RegionLoop(RegionLoop), DT(DT), PDT(PDT), - LI(LI), SDA(SDA), DivergentJoinMap(JoinMap), // AMDGPU change - IsLCSSAForm(IsLCSSAForm) { - const GCNSubtarget *ST = &F.getSubtarget(); - SIRI = ST->getRegisterInfo(); - SIII = ST->getInstrInfo(); -} - -void DivergenceAnalysis::markDivergent(const ValueTy DivVal) { - assert(!isAlwaysUniform(DivVal) && "cannot be a divergent"); - // AMDGPU change begin. - LLVM_DEBUG(const GCNSubtarget *ST = &F.getSubtarget(); - const SIRegisterInfo *SIRI = ST->getRegisterInfo(); - dbgs() << "\t MarkDivergent :"; printReg(DivVal, SIRI);); - // AMDGPU change end. - DivergentValues.insert(DivVal); -} - -// Mir change. -void DivergenceAnalysis::markDivergent(const MachineInstr &I) { - for (const MachineOperand &DstMO : I.defs()) { - unsigned Reg = DstMO.getReg(); - markDivergent(Reg); - } - DivergentInsts.insert(&I); -} - -void DivergenceAnalysis::addUniformOverride(const ValueTy UniVal) { - // TODO: support uniform multi-def. 
- if (MRI.getUniqueVRegDef(UniVal) == nullptr) - return; - - UniformOverrides.insert(UniVal); -} - -void DivergenceAnalysis::addUniformOverride(const MachineInstr &I) { - for (const MachineOperand &DstMO : I.defs()) { - unsigned Reg = DstMO.getReg(); - addUniformOverride(Reg); - } - UniformOverridesInsts.insert(&I); -} - -bool DivergenceAnalysis::isBitUniform( - const MachineInstr &I, const llvm::MachineOperand &UseMO, - llvm::DenseMap &Processed) const { - if (UseMO.isImm()) { - uint64_t val = UseMO.getImm(); - // 0 and -1 are OK since all lanes are still the same. - if (val == 0 || val == -1) - return true; - else - return false; - } - if (!UseMO.isReg()) - return true; - unsigned Reg = UseMO.getReg(); - // Exec is always bituniform, because all active lanes are 1. - if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO || - // SCC only has 1 bit. Always bituniform. - Reg == AMDGPU::SCC) - return true; - - const MachineInstr *UseMI = nullptr; - if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO) { - // Try to find define of this VCC. - UseMI = findPhysicalDefineInSameMBB(&I, Reg); - } else { - UseMI = MRI.getUniqueVRegDef(Reg); - } - if (!UseMI) { - return false; - } - - bool bResult = isBitUniform(*UseMI, Processed); - Processed[UseMI] = bResult; - return bResult; -} - -bool DivergenceAnalysis::isBitUniform( - const MachineInstr &I, - llvm::DenseMap &Processed) const { - auto it = Processed.find(&I); - if (it != Processed.end()) - return it->second; - // For branch on MIR, need to make sure all activi lanes are the same. - // cmp of uniform value will make sure all active lanes are the same. - // Imm is also the same for all active lanes. - if (isDivergent(I)) - return false; - // Uniform cmp is bit uniform. - if (I.isCompare()) - return true; - if (isConstant(&I)) - return true; - - // Conservatively consider bituniform to be false. - Processed[&I] = false; - - // If all operand is bit uniform, then result is bit uniform. - bool bAllOperandBitUniform = true; - for (const MachineOperand &UseMO : I.uses()) { - if (isBitUniform(I, UseMO, Processed)) - continue; - bAllOperandBitUniform = false; - break; - } - return bAllOperandBitUniform; -} - -bool DivergenceAnalysis::updateTerminator(const MachineInstr &Term) const { - if (Term.getParent()->succ_size() <= 1) - return false; - switch (Term.getOpcode()) { - default: { - if (updateNormalInstruction(Term)) - return true; - llvm::DenseMap Processed; - // Check bit uniform here if not divergent. - return !isBitUniform(Term, Processed); - } - // case AMDGPU::AMDGPU_CALL_INDIRECT: - case AMDGPU::SI_CALL: - return true; - } -} - -bool DivergenceAnalysis::updateNormalInstruction(const MachineInstr &I) const { - // TODO function calls with side effects, etc - if (UniformOverridesInsts.find(&I) != UniformOverridesInsts.end()) - return false; - if (DivergentInsts.find(&I) != DivergentInsts.end()) - return true; - for (const auto &Op : I.uses()) { - if (!Op.isReg()) - continue; - Register Reg = Op.getReg(); - if (Reg.isPhysical()) { - if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO || Reg == AMDGPU::SCC) - continue; - else if (const MachineInstr *DefMI = - findPhysicalDefineInSameMBB(Op.getParent(), Reg)) { - if (isDivergent(*DefMI)) - return true; - } else { - // If cannot find def in same MBB, just treat it as divergent. 
- return true; - } - } else { - if (isDivergent(Op.getReg())) - return true; - } - } - return false; -} - -bool DivergenceAnalysis::isTemporalDivergent( - const MachineBasicBlock &ObservingBlock, const ValueTy Val, - const MachineBasicBlock &IncomingBlock) const { // AMDGPU change - const MachineBasicBlock *DefBlock = - &IncomingBlock; // AMDGPU change: Take def point as incoming block for - // constants. - const auto *Inst = MRI.getUniqueVRegDef(Val); - if (Inst == nullptr) - return true; - if (Inst) - DefBlock = Inst->getParent(); - - // check whether any divergent loop carrying Val terminates before control - // proceeds to ObservingBlock - for (const auto *MachineLoop = LI.getLoopFor(DefBlock); // AMDGPU change - MachineLoop != RegionLoop && !MachineLoop->contains(&ObservingBlock); - MachineLoop = MachineLoop->getParentLoop()) { - if (DivergentLoops.find(MachineLoop) != DivergentLoops.end()) - return true; - } - - return false; -} - -// AMDGPU CHANGE BEGIN -static bool HasIncomingUndefValue(const PHINode_ *Phi) { - for (unsigned I = 1, E = Phi->getNumOperands(); I != E; I += 2) { - const MachineOperand &Op = Phi->getOperand(I); - if (Op.isUndef()) - return true; - } - return false; -} - -// For case like -// %163:sreg_64_xexec = S_MOV_B64 $exec -// bb.1: -//; predecessors: %bb.1, %bb.0 -// successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), -// %bb.2(50.00%) %162:vreg_512 = PHI %41:vreg_512, %bb.0, %40:vreg_512, %bb.1 -// %167:sgpr_32 = V_READFIRSTLANE_B32 %17:vgpr_32, implicit $exec -// %168:sreg_64 = V_CMP_EQ_U32_e64 %167:sgpr_32, %17:vgpr_32, implicit $exec -// %166:sreg_64 = S_AND_SAVEEXEC_B64 %168:sreg_64, implicit-def $exec, -// implicit-def $scc, implicit $exec -//... -// $exec = S_XOR_B64_term $exec, %166:sreg_64, implicit-def $scc -// S_CBRANCH_EXECNZ %bb.1, implicit $exec -// The ... code after SAVEEXEC will be divergent if %168 is divergent. -// Return the SaveExec which affect MI. -// If not exist, return nullptr. -static const MachineInstr * -findSaveExec(const MachineInstr *MI, - const SmallVector &SaveExecs) { - // No save exec. - if (SaveExecs.empty()) - return nullptr; - if (SaveExecs.size() > 1) - llvm::report_fatal_error( - "Not support case where, MBB has more than one SaveExec"); - const MachineInstr *SaveExec = SaveExecs.front(); - const MachineBasicBlock *MBB = SaveExec->getParent(); - // Make sure MI is after SaveExec by check it is not before SaveExec. - // Assume MBB.begin to SaveExec is short here. - bool bIsAfterSaveExec = true; - for (auto it = MBB->begin(); it != SaveExec->getIterator(); it++) { - if (MI == it) { - bIsAfterSaveExec = false; - break; - } - } - // Not affect by save exec. - if (!bIsAfterSaveExec) - return nullptr; - - return SaveExec; -} - -// When a Phi's parent isJoinDivergent,the case make phi divergent is that 2 -// incoming values merged from different path of a divergent branch. -// isJoinDivergentOnlyOnSameIncomingValue will check for all -// combinations of incoming values except the BB with same incoming value, -// because if values are same then even divergent branch is not divergent. -// For example phi a:A, b:B, a:C. -// It will check (A,B) (B,C) but not (A, C) Because A -// and C has same value a. -// If only (A,C) is sharing divergent branch, -// then phi a:A, b:B, a:C is still uniform. -// DivergentJoinMap saving MachineBasicBlock pairs which on different path of a -// divergent branch and joined at one block. 
-// For example, -// A -// / \ -// | \ -// | \ -// B / -// | \ / -// | \ / -// C D -// | / -// \ / -// E -// If A is uniform branch, B is divergent branch. Then only (C, D) will be saved -// in DivergentJoinMap. -// DivergentJoinMap is build with updateDisjointMap in -// SyncDependenceAnalysis.cpp when SyncDependenceAnalysis::join_block is called. -// It will only run on divergent branch, so (A, B) is not in -// DivergentDisjointMap when A is uniform. -static bool isJoinDivergentOnlyOnSameIncomingValue( - const PHINode_ &Phi, const DivergenceAnalysis *pDA, - const MachineDominatorTree &DT, DivergentJoinMapTy &DivergentJoinMap) { - // for phi which join divergent, if the incoming values from divergent - // branch are the same, the phi is still uniform. - // A - // | \ - // | \ - // B \ - // |\ \ - // | \ | - // C D E - // | / / - // \/ / - // \ / - // F - // for phi in F like. - // phi (a:C, a:D, b:E) - // If A is uniform branch, B is non-uniform branch, phi is uniform. - SmallDenseSet ValueToBlockMap; - for (unsigned I = 1, E = Phi.getNumOperands(); I != E; I += 2) { - const MachineOperand &Op = Phi.getOperand(I); - if (!Op.isReg()) - continue; - unsigned Reg = Op.getReg(); - if (pDA->isDivergent(Reg)) - return false; - - ValueToBlockMap.insert(Reg); - } - unsigned NumIncoming = (Phi.getNumOperands() - 1) / 2; - // When there's same incoming value from different incoming block. - // If divergent select is only on same value, then it is still uniform. - if (ValueToBlockMap.size() != NumIncoming) { - // When a phi is on divergent join block, there is incoming block which is - // comeing from different path of a divergent branch. - // Check all combination here. - for (unsigned i = 0; i < NumIncoming; i++) { - MachineBasicBlock *BB0 = Phi.getOperand(2 + 2 * i).getMBB(); - const MachineOperand &MO0 = Phi.getOperand(1 + 2 * i); - for (unsigned j = i + 1; j < NumIncoming; j++) { - MachineBasicBlock *BB1 = Phi.getOperand(2 + 2 * j).getMBB(); - const MachineOperand &MO1 = Phi.getOperand(1 + 2 * j); - // If value match, no divergent. - if (MO0.isImm() && MO1.isImm() && MO0.getImm() == MO1.getImm()) - continue; - if (MO0.isReg() && MO1.isReg() && MO0.getReg() == MO1.getReg() && - MO0.getSubReg() == MO1.getSubReg()) - continue; - - // If BB and BB2 is from divergent disjoint, then they will - // divergent join on phi. - // This is for case like - // A - // / \ - // | \ - // | \ - // B / - // | \ / - // | \ / - // C D - // | / - // \ / - // E - // - // phi(a:C, b:D) - // When nearestCommonDominator is A, but B also can be divergent - // disjoint for C and D. - if (DivergentJoinMap[BB0].count(BB1)) - return false; - } - } - return true; - } else { - return false; - } -} -// AMDGPU CHANGE END - -bool DivergenceAnalysis::updatePHINode(const PHINode_ &Phi) const { - // AMDGPU CHANGE BEGIN - // Do not mark phis with undef as incoming values as uniform. - // When promoting to scalar we will readfirstlane on - // the phi output. If some of the inputs are undef then - // this could replace a well defined vector value with an - // undefined scalar value. - if (HasIncomingUndefValue(&Phi)) - return true; - // AMDGPU CHANGE END - - // joining divergent disjoint path in Phi parent block - if (isJoinDivergent(*Phi.getParent())) { - // AMDGPU CHANGE BEGIN - if (true /*TODO: ENABLE_AGGRESSIVE_UNIFORM_ANALYSIS*/) { - // Continue if the divergent join only on same incoming value. 
- if (!isJoinDivergentOnlyOnSameIncomingValue(Phi, this, DT, - DivergentJoinMap)) - return true; - } else - // AMDGPU CHANGE END - return true; - } - - // An incoming value could be divergent by itself. - // Otherwise, an incoming value could be uniform within the loop - // that carries its definition but it may appear divergent - // from outside the loop. This happens when divergent loop exits - // drop definitions of that uniform value in different iterations. - // - // for (int i = 0; i < n; ++i) { // 'i' is uniform inside the loop - // if (i % thread_id == 0) break; // divergent loop exit - // } - // int divI = i; // divI is divergent - for (unsigned I = 1, E = Phi.getNumOperands(); I != E; I += 2) { - const MachineOperand &Op = Phi.getOperand(I); - if (!Op.isReg()) - continue; - - unsigned Reg = Op.getReg(); - const MachineOperand &BB = Phi.getOperand(I + 1); - if (isDivergent(Reg) || - isTemporalDivergent(*Phi.getParent(), Reg, *BB.getMBB())) - return true; - } - - return false; -} - -bool DivergenceAnalysis::updateVCndMask(const MachineInstr &VCndMask) const { - // VCndMask require the Cond bituniform to be uniform. - unsigned Op = VCndMask.getOpcode(); - unsigned src0Idx = AMDGPU::getNamedOperandIdx(Op, AMDGPU::OpName::src0); - unsigned src1Idx = AMDGPU::getNamedOperandIdx(Op, AMDGPU::OpName::src1); - unsigned src2Idx = AMDGPU::getNamedOperandIdx(Op, AMDGPU::OpName::src2); - - const MachineOperand &src0 = VCndMask.getOperand(src0Idx); - const MachineOperand &src1 = VCndMask.getOperand(src1Idx); - - const MachineOperand &cond = VCndMask.getOperand(src2Idx); - - if (isDivergent(src0)) - return true; - - // If src0 == src1, then return src0 divergent. - if (src0.isReg() && src1.isReg() && src0.getReg() == src1.getReg()) { - if (src0.getSubReg() == src1.getSubReg() && - SIII->hasModifiersSet(VCndMask, AMDGPU::OpName::src0_modifiers) == - SIII->hasModifiersSet(VCndMask, AMDGPU::OpName::src1_modifiers)) - return false; - } - - if (isDivergent(src1)) - return true; - - llvm::DenseMap Processed; - return !isBitUniform(VCndMask, cond, Processed); -} - -bool DivergenceAnalysis::inRegion(const MachineInstr &I) const { - return I.getParent() && inRegion(*I.getParent()); -} - -bool DivergenceAnalysis::inRegion(const MachineBasicBlock &BB) const { - return (!RegionLoop && BB.getParent() == &F) || RegionLoop->contains(&BB); -} - -// marks all users of loop-carried values of the loop headed by LoopHeader as -// divergent -void DivergenceAnalysis::taintLoopLiveOuts( - const MachineBasicBlock &LoopHeader) { - auto *DivLoop = LI.getLoopFor(&LoopHeader); - assert(DivLoop && "loopHeader is not actually part of a loop"); - - SmallVector TaintStack; - DivLoop->getExitBlocks(TaintStack); - - // Otherwise potential users of loop-carried values could be anywhere in the - // dominance region of DivLoop (including its fringes for phi nodes) - DenseSet Visited; - for (auto *Block : TaintStack) { - Visited.insert(Block); - } - Visited.insert(&LoopHeader); - - while (!TaintStack.empty()) { - auto *UserBlock = TaintStack.back(); - TaintStack.pop_back(); - - // don't spread divergence beyond the region - if (!inRegion(*UserBlock)) - continue; - - assert(!DivLoop->contains(UserBlock) && - "irreducible control flow detected"); - - // phi nodes at the fringes of the dominance region - if (!DT.dominates(&LoopHeader, UserBlock)) { - // all PHI nodes of UserBlock become divergent - pushPHINodes(*UserBlock); - continue; - } - - // taint outside users of values carried by DivLoop - for (auto &I : *UserBlock) { - if 
(isAlwaysUniformMI(&I, SIII, SIRI, MRI)) - continue; - if (isDivergent(I)) - continue; - - for (auto &Op : I.uses()) { - if (!Op.isReg()) - continue; - unsigned OpReg = Op.getReg(); - MachineInstr *OpInst = MRI.getUniqueVRegDef(OpReg); - if (!OpInst) - continue; - if (DivLoop->contains(OpInst->getParent())) { - markDivergent(I); - pushUsers(I); - break; - } - } - } - - // visit all blocks in the dominance region - for (auto *SuccBlock : UserBlock->successors()) { - if (!Visited.insert(SuccBlock).second) { - continue; - } - TaintStack.push_back(SuccBlock); - } - } -} - -void DivergenceAnalysis::pushInstruction(const MachineInstr &I) { - Worklist.push_back(&I); -} -void DivergenceAnalysis::pushPHINodes(const MachineBasicBlock &Block) { - for (const auto &Phi : Block.phis()) { - if (isDivergent(Phi)) - continue; - pushInstruction(Phi); - } -} - -void DivergenceAnalysis::pushUsers(const ValueTy V) { - for (const auto &UserInst : MRI.use_nodbg_instructions(V)) { - - if (isDivergent(UserInst)) - continue; - - // only compute divergent inside loop - if (!inRegion(UserInst)) - continue; - - Worklist.push_back(&UserInst); - } -} -void DivergenceAnalysis::pushUsers(const MachineInstr &I) { - for (const auto &DstMO : I.defs()) { - unsigned Reg = DstMO.getReg(); - pushUsers(Reg); - } -} - -bool DivergenceAnalysis::propagateJoinDivergence( - const MachineBasicBlock &JoinBlock, const MachineLoop *BranchLoop) { - LLVM_DEBUG(dbgs() << "\tpropJoinDiv " << JoinBlock.getName() << "\n"); - - // ignore divergence outside the region - if (!inRegion(JoinBlock)) { - return false; - } - - // push non-divergent phi nodes in JoinBlock to the worklist - pushPHINodes(JoinBlock); - - // JoinBlock is a divergent loop exit - if (BranchLoop && !BranchLoop->contains(&JoinBlock)) { - return true; - } - - // disjoint-paths divergent at JoinBlock - markBlockJoinDivergent(JoinBlock); - return false; -} - -void DivergenceAnalysis::propagateBranchDivergence(const MachineInstr &Term) { - LLVM_DEBUG(dbgs() << "propBranchDiv " << Term.getParent()->getName() << "\n"); - - markDivergent(Term); - - const auto *BranchLoop = LI.getLoopFor(Term.getParent()); - - // whether there is a divergent loop exit from BranchLoop (if any) - bool IsBranchLoopDivergent = false; - - // iterate over all blocks reachable by disjoint from Term within the loop - // also iterates over loop exits that become divergent due to Term. - for (const auto *JoinBlock : SDA.join_blocks(Term)) { - IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop); - } - - // Branch loop is a divergent loop due to the divergent branch in Term - if (IsBranchLoopDivergent) { - assert(BranchLoop); - if (!DivergentLoops.insert(BranchLoop).second) { - return; - } - propagateLoopDivergence(*BranchLoop); - } -} - -void DivergenceAnalysis::propagateLoopDivergence( - const MachineLoop &ExitingLoop) { - LLVM_DEBUG(dbgs() << "propLoopDiv " << ExitingLoop.getHeader()->getNumber() - << "\n"); - - // don't propagate beyond region - if (!inRegion(*ExitingLoop.getHeader())) - return; - - const auto *BranchLoop = ExitingLoop.getParentLoop(); - - // Uses of loop-carried values could occur anywhere - // within the dominance region of the definition. All loop-carried - // definitions are dominated by the loop header (reducible control). - // Thus all users have to be in the dominance region of the loop header, - // except PHI nodes that can also live at the fringes of the dom region - // (incoming defining value). 
- if (!IsLCSSAForm) - taintLoopLiveOuts(*ExitingLoop.getHeader()); - - // whether there is a divergent loop exit from BranchLoop (if any) - bool IsBranchLoopDivergent = false; - - // iterate over all blocks reachable by disjoint paths from exits of - // ExitingLoop also iterates over loop exits (of BranchLoop) that in turn - // become divergent. - for (const auto *JoinBlock : SDA.join_blocks(ExitingLoop)) { - IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop); - } - - // Branch loop is a divergent due to divergent loop exit in ExitingLoop - if (IsBranchLoopDivergent) { - assert(BranchLoop); - if (!DivergentLoops.insert(BranchLoop).second) { - return; - } - propagateLoopDivergence(*BranchLoop); - } -} - -// For case like -// %149:sreg_64_xexec = S_MOV_B64 $exec -// -// bb.3: -//; predecessors: %bb.3, %bb.2 -// successors: %bb.3(0x40000000), %bb.4(0x40000000); %bb.3(50.00%), -// %bb.4(50.00%) -// -// %148:vreg_512 = PHI %56:vreg_512, %bb.2, %55:vreg_512, %bb.3 -// %153:sgpr_32 = V_READFIRSTLANE_B32 %36:vgpr_32, implicit $exec -// %154:sreg_64 = V_CMP_EQ_U32_e64 %153:sgpr_32, %36:vgpr_32, implicit $exec -// %152:sreg_64 = S_AND_SAVEEXEC_B64 %154:sreg_64, implicit-def $exec, -// implicit-def $scc, implicit $exec $m0 = S_MOV_B32 %153:sgpr_32 %55:vreg_512 -// = V_MOVRELD_B32_V16 %148:vreg_512(tied-def 0), -2, 0, implicit $m0, implicit -// $exec $exec = S_XOR_B64_term $exec, %152:sreg_64, implicit-def $scc -// S_CBRANCH_EXECNZ %bb.3, implicit $exec -// -// bb.4: -//; predecessors: %bb.3 -// successors: %bb.5(0x80000000); %bb.5(100.00%) -// -// $exec = S_MOV_B64 %149:sreg_64_xexec - -// bb.3 is inside exec region which exec is saved by %149. -// %152:sreg_64 = S_AND_SAVEEXEC_B64 will update the exec which cause divergence -// when it is not bituniform. Everything inside the exec region need to be -// scaned. Out region or phi use should be marked as divergent and add users to -// worklist. -void DivergenceAnalysis::propagateExecControlFlowDivergence( - const MachineInstr &SaveExec) { - const MachineBasicBlock *MBB = SaveExec.getParent(); - auto it = ExecRegionMap.find(MBB); - if (it == ExecRegionMap.end()) - return; - ExecRegion &Region = *it->second; - // One region only need to propagate once. - if (Region.bPropagated) - return; - Region.bPropagated = true; - // Scan all MIs in the region. Mark out region or phi use as divergent and add - // their users to worklist. - auto propagateExecDivergence = [this, Region](const MachineInstr *MI) { - for (const auto &DstMO : MI->defs()) { - Register Reg = DstMO.getReg(); - // Only VCC/Exec/m0. - // Exec always uniform. Assume VCC and m0 not cross region. - if (Reg.isPhysical()) - continue; - for (const auto &UserInst : MRI.use_nodbg_instructions(Reg)) { - - if (isDivergent(UserInst)) - continue; - - // only propagate user outside of region or phi which will not be - // guarded by saveExec. - if (UserInst.getOpcode() != AMDGPU::PHI && - isInsideExecRegion(UserInst, *Region.begin, *Region.end, DT, PDT)) { - continue; - } - // Write exec is not divergent. 
- if (isWriteExec(&UserInst)) - continue; - - markDivergent(UserInst); - pushUsers(UserInst); - } - } - }; - const MachineBasicBlock *RegionBeginMBB = Region.begin->getParent(); - const MachineBasicBlock *RegionEndMBB = Region.end->getParent(); - if (RegionBeginMBB != RegionEndMBB) { - auto it = Region.begin->getIterator(); - for (it++; it != RegionBeginMBB->end(); it++) { - const MachineInstr &MI = *it; - propagateExecDivergence(&MI); - } - - // All blocks between RegionBeginMBB and RegionEndMBB. - for (const MachineBasicBlock *MBB : Region.blocks) { - for (const MachineInstr &MI : *MBB) { - propagateExecDivergence(&MI); - } - } - - for (auto it = RegionEndMBB->begin(); it != Region.end->getIterator(); - it++) { - const MachineInstr &MI = *it; - propagateExecDivergence(&MI); - } - - } else { - auto it = Region.begin->getIterator(); - for (it++; it != Region.end->getIterator(); it++) { - const MachineInstr &MI = *it; - propagateExecDivergence(&MI); - } - } -} - -void DivergenceAnalysis::compute() { - SmallVector ExecRegions; - // Build exec regions. - // Add VCndMask for non-bituniform caused by input sreg. - for (const MachineBasicBlock &MBB : F) { - for (const MachineInstr &Term : MBB.terminators()) { - if (updateTerminator(Term)) - pushInstruction(Term); - } - - for (const MachineInstr &I : MBB) { - unsigned Opcode = I.getOpcode(); - if (isVCndMask(Opcode)) { - // Cond for CndMask needs bit uniform check. - // Add it to worklist to check bit uniform from input. - pushInstruction(I); - } else if (isRestoreExec(&I)) { - const MachineInstr *RegionBegin = - findExecRegionBeginFromRegionEnd(&I, MRI); - if (RegionBegin) { - ExecRegions.emplace_back(ExecRegion(RegionBegin, &I)); - } - } - } - } - - // Build exec region map. - for (const MachineBasicBlock &MBB : F) { - for (ExecRegion &Region : ExecRegions) { - if (isInsideExecRegion(MBB, *Region.begin, *Region.end, DT, PDT)) { - // Add block to region. - if (&MBB != Region.begin->getParent() && - &MBB != Region.end->getParent()) - Region.blocks.emplace_back(&MBB); - // Update ExecRegionMap. - auto it = ExecRegionMap.find(&MBB); - if (it == ExecRegionMap.end()) { - ExecRegionMap[&MBB] = &Region; - } else { - // When MBB inside multiple regions, save the smallest one. - if (isInsideExecRegion(*Region.begin, *it->second->begin, - *it->second->end, DT, PDT)) { - ExecRegionMap[&MBB] = &Region; - } - } - } - } - } - - for (auto DivVal : DivergentValues) { - LLVM_DEBUG(dbgs() << "\t sourceOfDivergence :"; printReg(DivVal, SIRI); - dbgs() << "\n";); - pushUsers(DivVal); - } - - // propagate divergence - while (!Worklist.empty()) { - const MachineInstr *I = Worklist.back(); - Worklist.pop_back(); - - // maintain uniformity of overrides - if (isAlwaysUniformMI(I, SIII, SIRI, MRI)) { - // If used by terminators, and not bit uniform. - // Add terminator. - SmallVector TermUsers; - for (const auto &DstMO : I->defs()) { - unsigned Reg = DstMO.getReg(); - for (const auto &UserInst : MRI.use_nodbg_instructions(Reg)) { - - if (isDivergent(UserInst)) - continue; - // Only check terminator here. 
- if (!UserInst.isTerminator()) - continue; - - // only compute divergent inside loop - if (!inRegion(UserInst)) - continue; - - TermUsers.emplace_back(&UserInst); - } - } - - if (!TermUsers.empty()) { - llvm::DenseMap Processed; - if (!isBitUniform(*I, Processed)) { - for (const MachineInstr *Term : TermUsers) { - Worklist.emplace_back(Term); - } - } - } - - continue; - } - - bool WasDivergent = isDivergent(*I); - if (WasDivergent) - continue; - - // propagate divergence caused by terminator - if (I->isTerminator()) { - if (updateTerminator(*I)) { - // propagate control divergence to affected instructions - propagateBranchDivergence(*I); - continue; - } - } - - // update divergence of I due to divergent operands - bool DivergentUpd = false; - unsigned Opcode = I->getOpcode(); - switch (I->getOpcode()) { - default: - if (isVCndMask(Opcode)) { - DivergentUpd = updateVCndMask(*I); - } else { - DivergentUpd = updateNormalInstruction(*I); - llvm::DenseMap Processed; - if ((DivergentUpd || !isBitUniform(*I, Processed)) && isWriteExec(I)) { - // propagate exec control divergence to affected instructions. - propagateExecControlFlowDivergence(*I); - } - } - break; - case AMDGPU::PHI: - DivergentUpd = updatePHINode(*I); - break; - } - - // propagate value divergence to users - if (DivergentUpd) { - markDivergent(*I); - pushUsers(*I); - } - } -} - -bool DivergenceAnalysis::isAlwaysUniform(const ValueTy V) const { - return UniformOverrides.find(V) != UniformOverrides.end(); -} - -bool DivergenceAnalysis::isDivergent(const ValueTy V) const { - return DivergentValues.find(V) != DivergentValues.end(); -} - -bool DivergenceAnalysis::isDivergent(const MachineOperand &MO) const { - if (!MO.isReg()) - return false; - Register Reg = MO.getReg(); - if (Reg.isPhysical()) { - const MachineInstr *MI = MO.getParent(); - if (MI) - return isDivergent(!MI); - - } else { - return isDivergent(Reg); - } - return true; -} - -bool DivergenceAnalysis::isDivergent(const MachineInstr &I) const { - if (UniformOverridesInsts.find(&I) != UniformOverridesInsts.end()) - return false; - if (DivergentInsts.find(&I) != DivergentInsts.end()) - return true; - for (const MachineOperand &DstMO : I.defs()) { - unsigned Reg = DstMO.getReg(); - if (isDivergent(Reg)) - return true; - } - return false; -} - -void DivergenceAnalysis::print(raw_ostream &OS, const Module_ *) const { - // iterate instructions using instructions() to ensure a deterministic order. 
- for (auto &MBB : F) - for (auto &I : MBB) { - if (isDivergent(I)) - OS << "DIVERGENT:" << I; - // AMDGPU changes begin - else - OS << "UNIFORM:" << I; - // AMDGPU changes end - } -} - -// class GPUDivergenceAnalysis -MirGPUDivergenceAnalysis::MirGPUDivergenceAnalysis( - MachineFunction &F, const MachineDominatorTree &DT, - const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI) - : SDA(DT, PDT, LI, /*AMDGPU change*/ DivergentJoinMap), - DA(F, nullptr, DT, PDT, LI, SDA, false, - /*AMDGPU change*/ DivergentJoinMap) { - MachineRegisterInfo &MRI = F.getRegInfo(); - const GCNSubtarget *ST = &F.getSubtarget(); - const SIRegisterInfo *SIRI = ST->getRegisterInfo(); - const SIInstrInfo *SIII = ST->getInstrInfo(); - for (auto &MBB : F) - for (auto &I : MBB) { - if (isSourceOfDivergence(&I, MRI, SIRI, SIII)) { - DA.markDivergent(I); - } else if (isAlwaysUniformMI(&I, SIII, SIRI, MRI)) { - DA.addUniformOverride(I); - } - } - for (auto &ArgIt : F.getRegInfo().liveins()) { - unsigned Reg = ArgIt.first; - if (isDivergentInputReg(Reg, MRI, SIRI)) { - DA.markDivergent(Reg); - } - } - - DA.compute(); -} - -bool MirGPUDivergenceAnalysis::isDivergent(const MachineInstr *I) const { - return DA.isDivergent(*I); -} - -void MirGPUDivergenceAnalysis::print(raw_ostream &OS, - const Module_ *mod) const { - OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n"; - DA.print(OS, mod); - OS << "}\n"; -} - -} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h deleted file mode 100644 index e721ac323255e..0000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h +++ /dev/null @@ -1,285 +0,0 @@ -//===- AMDGPUMirDivergenceAnalysis.h - Mir Divergence Analysis -*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// \file -// The divergence analysis determines which instructions and branches are -// divergent given a set of divergent source instructions. -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include "AMDGPUMirSyncDependenceAnalysis.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/Pass.h" -#include - -namespace llvm { -class raw_ostream; -class TargetTransformInfo; -class MachineRegisterInfo; -class SIInstrInfo; -class SIRegisterInfo; -class MachineOperand; -class MachineBasicBlock; - -using Module_ = void; -class TargetTransformInfo; -using ValueTy = unsigned; -using PHINode_ = MachineInstr; - -/// \brief Generic divergence analysis for reducible CFGs. -/// -/// This analysis propagates divergence in a data-parallel context from sources -/// of divergence to all users. It requires reducible CFGs. All assignments -/// should be in SSA form. -class DivergenceAnalysis { -public: - /// \brief This instance will analyze the whole function \p F or the loop \p - /// RegionLoop. - /// - /// \param RegionLoop if non-null the analysis is restricted to \p RegionLoop. - /// Otherwise the whole function is analyzed. - /// \param IsLCSSAForm whether the analysis may assume that the IR in the - /// region in in LCSSA form. 
- DivergenceAnalysis(const llvm::MachineFunction &F, - const MachineLoop *RegionLoop, - const MachineDominatorTree &DT, - const MachinePostDominatorTree &PDT, - const MachineLoopInfo &LI, SyncDependenceAnalysis &SDA, - bool IsLCSSAForm, - // AMDGPU change begin. - DivergentJoinMapTy &JoinMap - // AMDGPU change end. - ); - - /// \brief The loop that defines the analyzed region (if any). - const MachineLoop *getRegionLoop() const { return RegionLoop; } - const llvm::MachineFunction &getFunction() const { return F; } - - /// \brief Whether \p BB is part of the region. - bool inRegion(const MachineBasicBlock &BB) const; - /// \brief Whether \p I is part of the region. - bool inRegion(const MachineInstr &I) const; - - /// \brief Mark \p UniVal as a value that is always uniform. - void addUniformOverride(const ValueTy UniVal); - void addUniformOverride(const MachineInstr &I); - - /// \brief Mark \p DivVal as a value that is always divergent. - void markDivergent(const ValueTy DivVal); - void markDivergent(const MachineInstr &I); - - /// \brief Propagate divergence to all instructions in the region. - /// Divergence is seeded by calls to \p markDivergent. - void compute(); - - /// \brief Whether any value was marked or analyzed to be divergent. - bool hasDetectedDivergence() const { return !DivergentValues.empty(); } - - /// \brief Whether \p Val will always return a uniform value regardless of its - /// operands - bool isAlwaysUniform(const ValueTy Val) const; - - /// \brief Whether \p Val is a divergent value - bool isDivergent(const ValueTy Val) const; - bool isDivergent(const MachineInstr &I) const; - - void print(llvm::raw_ostream &OS, const Module_ *) const; - -private: - bool isDivergent(const llvm::MachineOperand &MO) const; - bool updateTerminator(const MachineInstr &Term) const; - bool updatePHINode(const PHINode_ &Phi) const; - bool updateVCndMask(const MachineInstr &VCndMask) const; - bool - isBitUniform(const MachineInstr &I, - llvm::DenseMap &Processed) const; - bool - isBitUniform(const MachineInstr &I, const llvm::MachineOperand &UseMO, - llvm::DenseMap &Processed) const; - - /// \brief Computes whether \p Inst is divergent based on the - /// divergence of its operands. - /// - /// \returns Whether \p Inst is divergent. - /// - /// This should only be called for non-phi, non-terminator instructions. - bool updateNormalInstruction(const MachineInstr &Inst) const; - - /// \brief Mark users of live-out users as divergent. - /// - /// \param LoopHeader the header of the divergent loop. - /// - /// Marks all users of live-out values of the loop headed by \p LoopHeader - /// as divergent and puts them on the worklist. - void taintLoopLiveOuts(const MachineBasicBlock &LoopHeader); - - /// \brief Push all users of \p Val (in the region) to the worklist - void pushUsers(const ValueTy I); - void pushUsers(const MachineInstr &I); - - void pushInstruction(const MachineInstr &I); - /// \brief Push all phi nodes in @block to the worklist - void pushPHINodes(const MachineBasicBlock &Block); - - /// \brief Mark \p Block as join divergent - /// - /// A block is join divergent if two threads may reach it from different - /// incoming blocks at the same time. - void markBlockJoinDivergent(const MachineBasicBlock &Block) { - DivergentJoinBlocks.insert(&Block); - } - - /// \brief Whether \p Val is divergent when read in \p ObservingBlock. 
- bool isTemporalDivergent( - const MachineBasicBlock &ObservingBlock, const ValueTy Val, - const MachineBasicBlock &incomingBlock) const; // AMDGPU change - - /// \brief Whether \p Block is join divergent - /// - /// (see markBlockJoinDivergent). - bool isJoinDivergent(const MachineBasicBlock &Block) const { - return DivergentJoinBlocks.find(&Block) != DivergentJoinBlocks.end(); - } - - /// \brief Propagate control-induced divergence to users (phi nodes and - /// instructions). - // - // \param JoinBlock is a divergent loop exit or join point of two disjoint - // paths. - // \returns Whether \p JoinBlock is a divergent loop exit of \p TermLoop. - bool propagateJoinDivergence(const MachineBasicBlock &JoinBlock, - const MachineLoop *TermLoop); - - /// \brief Propagate induced value divergence due to control divergence in \p - /// Term. - void propagateBranchDivergence(const MachineInstr &Term); - - /// \brief Propagate induced value divergence due to exec update caused by \p - /// SaveExec. - void propagateExecControlFlowDivergence(const MachineInstr &SaveExec); - - /// \brief Propagate divergent caused by a divergent loop exit. - /// - /// \param ExitingLoop is a divergent loop. - void propagateLoopDivergence(const MachineLoop &ExitingLoop); - -private: - const llvm::MachineFunction &F; - const llvm::MachineRegisterInfo &MRI; - const llvm::SIRegisterInfo *SIRI; - const llvm::SIInstrInfo *SIII; - // If regionLoop != nullptr, analysis is only performed within \p RegionLoop. - // Otw, analyze the whole function - const MachineLoop *RegionLoop; - - const MachineDominatorTree &DT; - const MachinePostDominatorTree &PDT; - const MachineLoopInfo &LI; - - // Recognized divergent loops - llvm::DenseSet DivergentLoops; - - // AMDGPU change begin - // Save block pair which divergent disjoint. - // A - // | \ - // | \ - // B C - // | / - // D - // When A is divergent branch, B and C are divergent join at D. - // Then DivergentJoinMap[B].count(C) > 0 and - // DivergentJoinMap[C].count(B) > 0. - DivergentJoinMapTy &DivergentJoinMap; - // AMDGPU change end - - // The SDA links divergent branches to divergent control-flow joins. - SyncDependenceAnalysis &SDA; - - // Use simplified code path for LCSSA form. - bool IsLCSSAForm; - - // Set of known-uniform values. - llvm::DenseSet UniformOverrides; - llvm::DenseSet UniformOverridesInsts; - - // Blocks with joining divergent control from different predecessors. - llvm::DenseSet DivergentJoinBlocks; - - // Detected/marked divergent values. - llvm::DenseSet DivergentValues; - llvm::DenseSet DivergentInsts; - - // Mir change for EXEC control flow. - // Map from MBB to the exec region it belongs too. - // A exec region is begin with - // S_MOV_B64 sreg, exec - // end with - // S_MOV_B64 exec, sreg - // Inside the region, exec might be updated to make control flow with exec. - struct ExecRegion { - const llvm::MachineInstr *begin; - const llvm::MachineInstr *end; - std::vector blocks; - bool bPropagated = false; - ExecRegion(const llvm::MachineInstr *b, const llvm::MachineInstr *e) - : begin(b), end(e), bPropagated(false) {} - }; - llvm::DenseMap ExecRegionMap; - - // Internal worklist for divergence propagation. - std::vector Worklist; -}; - -/// \brief Divergence analysis frontend for GPU kernels. -class MirGPUDivergenceAnalysis { - // AMDGPU change begin - // Save block pair which divergent disjoint. - // A - // | \ - // | \ - // B C - // | / - // D - // When A is divergent branch, B and C are divergent join at D. 
- // Then DivergentJoinMap[B].count(C) > 0 and - // DivergentJoinMap[C].count(B) > 0. - DivergentJoinMapTy DivergentJoinMap; - // AMDGPU change end - SyncDependenceAnalysis SDA; - DivergenceAnalysis DA; - -public: - /// Runs the divergence analysis on @F, a GPU kernel - MirGPUDivergenceAnalysis(llvm::MachineFunction &F, - const MachineDominatorTree &DT, - const MachinePostDominatorTree &PDT, - const MachineLoopInfo &LI); - - /// Whether any divergence was detected. - bool hasDivergence() const { return DA.hasDetectedDivergence(); } - - /// The GPU kernel this analysis result is for - const llvm::MachineFunction &getFunction() const { return DA.getFunction(); } - - /// Whether \p I is divergent. - bool isDivergent(const MachineInstr *I) const; - - /// Whether \p I is uniform/non-divergent - bool isUniform(const MachineInstr *I) const { return !isDivergent(I); } - - /// Print all divergent values in the kernel. - void print(llvm::raw_ostream &OS, const Module_ *) const; -}; - -} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp deleted file mode 100644 index 302939c76a4df..0000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp +++ /dev/null @@ -1,519 +0,0 @@ -//===- MirSyncDependenceAnalysis.cpp - Mir Divergent Branch Dependence -//Calculation -//--===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file is based on Analysis/MirSyncDependenceAnalysis.cpp, just change -// MachineBasicBlock to MachineBasicBlock. -// This file implements an algorithm that returns for a divergent branch -// the set of basic blocks whose phi nodes become divergent due to divergent -// control. These are the blocks that are reachable by two disjoint paths from -// the branch or loop exits that have a reaching path that is disjoint from a -// path to the loop latch. -// -// The SyncDependenceAnalysis is used in the DivergenceAnalysis to model -// control-induced divergence in phi nodes. -// -// -- Summary -- -// The SyncDependenceAnalysis lazily computes sync dependences [3]. -// The analysis evaluates the disjoint path criterion [2] by a reduction -// to SSA construction. The SSA construction algorithm is implemented as -// a simple data-flow analysis [1]. -// -// [1] "A Simple, Fast Dominance Algorithm", SPI '01, Cooper, Harvey and Kennedy -// [2] "Efficiently Computing Static Single Assignment Form -// and the Control Dependence Graph", TOPLAS '91, -// Cytron, Ferrante, Rosen, Wegman and Zadeck -// [3] "Improving Performance of OpenCL on CPUs", CC '12, Karrenberg and Hack -// [4] "Divergence Analysis", TOPLAS '13, Sampaio, Souza, Collange and Pereira -// -// -- Sync dependence -- -// Sync dependence [4] characterizes the control flow aspect of the -// propagation of branch divergence. For example, -// -// %cond = icmp slt i32 %tid, 10 -// br i1 %cond, label %then, label %else -// then: -// br label %merge -// else: -// br label %merge -// merge: -// %a = phi i32 [ 0, %then ], [ 1, %else ] -// -// Suppose %tid holds the thread ID. 
Although %a is not data dependent on %tid -// because %tid is not on its use-def chains, %a is sync dependent on %tid -// because the branch "br i1 %cond" depends on %tid and affects which value %a -// is assigned to. -// -// -- Reduction to SSA construction -- -// There are two disjoint paths from A to X, if a certain variant of SSA -// construction places a phi node in X under the following set-up scheme [2]. -// -// This variant of SSA construction ignores incoming undef values. -// That is paths from the entry without a definition do not result in -// phi nodes. -// -// entry -// / \ -// A \ -// / \ Y -// B C / -// \ / \ / -// D E -// \ / -// F -// Assume that A contains a divergent branch. We are interested -// in the set of all blocks where each block is reachable from A -// via two disjoint paths. This would be the set {D, F} in this -// case. -// To generally reduce this query to SSA construction we introduce -// a virtual variable x and assign to x different values in each -// successor block of A. -// entry -// / \ -// A \ -// / \ Y -// x = 0 x = 1 / -// \ / \ / -// D E -// \ / -// F -// Our flavor of SSA construction for x will construct the following -// entry -// / \ -// A \ -// / \ Y -// x0 = 0 x1 = 1 / -// \ / \ / -// x2=phi E -// \ / -// x3=phi -// The blocks D and F contain phi nodes and are thus each reachable -// by two disjoins paths from A. -// -// -- Remarks -- -// In case of loop exits we need to check the disjoint path criterion for loops -// [2]. To this end, we check whether the definition of x differs between the -// loop exit and the loop header (_after_ SSA construction). -// -//===----------------------------------------------------------------------===// -#include "AMDGPUMirSyncDependenceAnalysis.h" -#include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/MachinePostDominators.h" - -#include -#include - -#define DEBUG_TYPE "sync-dependence" - -namespace llvm { - -ConstBlockSet SyncDependenceAnalysis::EmptyBlockSet; - -SyncDependenceAnalysis::SyncDependenceAnalysis( - const MachineDominatorTree &DT, const MachinePostDominatorTree &PDT, - const MachineLoopInfo &LI, - // AMDGPU change begin. - DivergentJoinMapTy &JoinMap - // AMDGPU change end. - ) - : FuncRPOT(DT.getRoot()->getParent()), DT(DT), PDT(PDT), LI(LI), - // AMDGPU change begin. - DivergentJoinMap(JoinMap) -// AMDGPU change end. -{} - -SyncDependenceAnalysis::~SyncDependenceAnalysis() {} - -using FunctionRPOT = ReversePostOrderTraversal; - -// divergence propagator for reducible CFGs -struct DivergencePropagator { - const FunctionRPOT &FuncRPOT; - const MachineDominatorTree &DT; - const MachinePostDominatorTree &PDT; - const MachineLoopInfo &LI; - - // identified join points - std::unique_ptr JoinBlocks; - - // reached loop exits (by a path disjoint to a path to the loop header) - SmallPtrSet ReachedLoopExits; - - // if DefMap[B] == C then C is the dominating definition at block B - // if DefMap[B] ~ undef then we haven't seen B yet - // if DefMap[B] == B then B is a join point of disjoint paths from X or B is - // an immediate successor of X (initial value). 
- using DefiningBlockMap = - std::map; - DefiningBlockMap DefMap; - - // all blocks with pending visits - std::unordered_set PendingUpdates; - - DivergencePropagator(const FunctionRPOT &FuncRPOT, - const MachineDominatorTree &DT, - const MachinePostDominatorTree &PDT, - const MachineLoopInfo &LI) - : FuncRPOT(FuncRPOT), DT(DT), PDT(PDT), LI(LI), - JoinBlocks(new ConstBlockSet) {} - - // set the definition at @block and mark @block as pending for a visit - void addPending(const MachineBasicBlock &Block, - const MachineBasicBlock &DefBlock) { - bool WasAdded = DefMap.emplace(&Block, &DefBlock).second; - if (WasAdded) - PendingUpdates.insert(&Block); - } - - void printDefs(raw_ostream &Out) { - Out << "Propagator::DefMap {\n"; - for (const auto *Block : FuncRPOT) { - auto It = DefMap.find(Block); - Out << Block->getName() << " : "; - if (It == DefMap.end()) { - Out << "\n"; - } else { - const auto *DefBlock = It->second; - Out << (DefBlock ? DefBlock->getName() : "") << "\n"; - } - } - Out << "}\n"; - } - - // process @succBlock with reaching definition @defBlock - // the original divergent branch was in @parentLoop (if any) - void visitSuccessor(const MachineBasicBlock &SuccBlock, - const MachineLoop *ParentLoop, - const MachineBasicBlock &DefBlock) { - - // @succBlock is a loop exit - if (ParentLoop && !ParentLoop->contains(&SuccBlock)) { - DefMap.emplace(&SuccBlock, &DefBlock); - ReachedLoopExits.insert(&SuccBlock); - return; - } - - // first reaching def? - auto ItLastDef = DefMap.find(&SuccBlock); - if (ItLastDef == DefMap.end()) { - addPending(SuccBlock, DefBlock); - return; - } - - // a join of at least two definitions - if (ItLastDef->second != &DefBlock) { - // do we know this join already? - if (!JoinBlocks->insert(&SuccBlock).second) - return; - - // update the definition - addPending(SuccBlock, SuccBlock); - } - } - - // find all blocks reachable by two disjoint paths from @rootTerm. - // This method works for both divergent terminators and loops with - // divergent exits. - // @rootBlock is either the block containing the branch or the header of the - // divergent loop. - // @nodeSuccessors is the set of successors of the node (MachineLoop or - // Terminator) headed by @rootBlock. - // @parentLoop is the parent loop of the MachineLoop or the loop that contains - // the Terminator. - template - std::unique_ptr computeJoinPoints( - const MachineBasicBlock &RootBlock, SuccessorIterable NodeSuccessors, - const MachineLoop *ParentLoop, const MachineBasicBlock *PdBoundBlock) { - assert(JoinBlocks); - - // bootstrap with branch targets - for (const auto *SuccBlock : NodeSuccessors) { - DefMap.emplace(SuccBlock, SuccBlock); - - if (ParentLoop && !ParentLoop->contains(SuccBlock)) { - // immediate loop exit from node. 
- ReachedLoopExits.insert(SuccBlock); - continue; - } else { - // regular successor - PendingUpdates.insert(SuccBlock); - } - } - - auto ItBeginRPO = FuncRPOT.begin(); - - // skip until term (TODO RPOT won't let us start at @term directly) - for (; *ItBeginRPO != &RootBlock; ++ItBeginRPO) { - } - - auto ItEndRPO = FuncRPOT.end(); - assert(ItBeginRPO != ItEndRPO); - - // propagate definitions at the immediate successors of the node in RPO - auto ItBlockRPO = ItBeginRPO; - while (++ItBlockRPO != ItEndRPO && *ItBlockRPO != PdBoundBlock) { - const auto *Block = *ItBlockRPO; - - // skip @block if not pending update - auto ItPending = PendingUpdates.find(Block); - if (ItPending == PendingUpdates.end()) - continue; - PendingUpdates.erase(ItPending); - - // propagate definition at @block to its successors - auto ItDef = DefMap.find(Block); - const auto *DefBlock = ItDef->second; - assert(DefBlock); - - auto *BlockLoop = LI.getLoopFor(Block); - if (ParentLoop && - (ParentLoop != BlockLoop && ParentLoop->contains(BlockLoop))) { - // if the successor is the header of a nested loop pretend its a - // single node with the loop's exits as successors - SmallVector BlockLoopExits; - BlockLoop->getExitBlocks(BlockLoopExits); - for (const auto *BlockLoopExit : BlockLoopExits) { - visitSuccessor(*BlockLoopExit, ParentLoop, *DefBlock); - } - - } else { - // the successors are either on the same loop level or loop exits - for (const auto *SuccBlock : Block->successors()) { - visitSuccessor(*SuccBlock, ParentLoop, *DefBlock); - } - } - } - - // We need to know the definition at the parent loop header to decide - // whether the definition at the header is different from the definition at - // the loop exits, which would indicate a divergent loop exits. - // - // A // loop header - // | - // B // nested loop header - // | - // C -> X (exit from B loop) -..-> (A latch) - // | - // D -> back to B (B latch) - // | - // proper exit from both loops - // - // D post-dominates B as it is the only proper exit from the "A loop". - // If C has a divergent branch, propagation will therefore stop at D. - // That implies that B will never receive a definition. - // But that definition can only be the same as at D (D itself in thise case) - // because all paths to anywhere have to pass through D. - // - const MachineBasicBlock *ParentLoopHeader = - ParentLoop ? ParentLoop->getHeader() : nullptr; - if (ParentLoop && ParentLoop->contains(PdBoundBlock)) { - DefMap[ParentLoopHeader] = DefMap[PdBoundBlock]; - } - - // analyze reached loop exits - if (!ReachedLoopExits.empty()) { - assert(ParentLoop); - const auto *HeaderDefBlock = DefMap[ParentLoopHeader]; - LLVM_DEBUG(printDefs(dbgs())); - - // AMDGPU CHANGE: Allow null HeaderDefBlock - // Because of the way they walk the blocks (a reverse post order traversal - // stopping at the immediate post dominator) it is possible that - // they will reach a loop exit, but not the loop header. - // - // We conservatively mark the exit blocks as divergent join points - // in this case. - // - // Problem CFG is below: - // - // +--> A - // | / \ - // | B C - // | | / | - // +--L P - // - // In this cfg, C is the RootBlock and P is C's post-dominator. - // It will only visit L and P and then stop because it hits the - // post dominator. Most loops do not hit this case because the - // loop exiting block (C) will branch directly back to the loop - // header. 
- // - if (HeaderDefBlock) { - for (const auto *ExitBlock : ReachedLoopExits) { - auto ItExitDef = DefMap.find(ExitBlock); - assert((ItExitDef != DefMap.end()) && - "no reaching def at reachable loop exit"); - if (ItExitDef->second != HeaderDefBlock) { - JoinBlocks->insert(ExitBlock); - } - } - } else { - for (const auto *ExitBlock : ReachedLoopExits) { - JoinBlocks->insert(ExitBlock); - } - } - } - - return std::move(JoinBlocks); - } -}; - -// AMDGPU change begin. -// For all join blocks caused by divergent RootBlock, the prevs of a join block -// which are in DefMap or the RootBlock are divergent join each other on the -// join block because of divergent RootBlock. -static void -updateJoinMap(const MachineBasicBlock *RootBlock, - DenseMap> &JoinMap, - DivergencePropagator::DefiningBlockMap &DefMap, - ConstBlockSet &JoinBlocks) { - for (const MachineBasicBlock *JoinBB : JoinBlocks) { - // makr divergent join for all pred pair which in DefMap. - for (auto predIt = JoinBB->pred_begin(); predIt != JoinBB->pred_end(); - predIt++) { - auto predIt2 = predIt; - const MachineBasicBlock *pred = *predIt; - if (DefMap.count(pred) == 0 && pred != RootBlock) - continue; - - for (predIt2++; predIt2 != JoinBB->pred_end(); predIt2++) { - const MachineBasicBlock *pred2 = *predIt2; - if (DefMap.count(pred2) == 0 && pred2 != RootBlock) - continue; - - JoinMap[pred].insert(pred2); - JoinMap[pred2].insert(pred); - LLVM_DEBUG(dbgs() << "joint_bb0: " << pred->getName() - << " joint_bb1: " << pred2->getName() << "\n";); - } - } - } -} -// AMDGPU change end. - -const ConstBlockSet & -SyncDependenceAnalysis::join_blocks(const MachineLoop &MachineLoop) { - using LoopExitVec = SmallVector; - LoopExitVec LoopExits; - MachineLoop.getExitBlocks(LoopExits); - if (LoopExits.size() < 1) { - return EmptyBlockSet; - } - - // already available in cache? - auto ItCached = CachedLoopExitJoins.find(&MachineLoop); - if (ItCached != CachedLoopExitJoins.end()) { - return *ItCached->second; - } - - // dont propagte beyond the immediate post dom of the loop - const auto *PdNode = - PDT.getNode(const_cast(MachineLoop.getHeader())); - const auto *IpdNode = PdNode->getIDom(); - const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; - while (PdBoundBlock && MachineLoop.contains(PdBoundBlock)) { - IpdNode = IpdNode->getIDom(); - PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; - } - - // compute all join points - DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI}; - auto JoinBlocks = Propagator.computeJoinPoints( - *MachineLoop.getHeader(), LoopExits, MachineLoop.getParentLoop(), - PdBoundBlock); - - // AMDGPU change begin. - // Save divergent join pairs. - updateJoinMap(MachineLoop.getHeader(), DivergentJoinMap, Propagator.DefMap, - *JoinBlocks.get()); - // AMDGPU change end. - - auto ItInserted = - CachedLoopExitJoins.emplace(&MachineLoop, std::move(JoinBlocks)); - assert(ItInserted.second); - return *ItInserted.first->second; -} - -const ConstBlockSet & -SyncDependenceAnalysis::join_blocks(const MachineInstr &Term) { - // trivial case - if (Term.getParent()->succ_size() < 1) { - return EmptyBlockSet; - } - - // already available in cache? - auto ItCached = CachedBranchJoins.find(&Term); - if (ItCached != CachedBranchJoins.end()) - return *ItCached->second; - - // dont propagate beyond the immediate post dominator of the branch - const auto *PdNode = - PDT.getNode(const_cast(Term.getParent())); - const auto *IpdNode = PdNode->getIDom(); - const auto *PdBoundBlock = IpdNode ? 
IpdNode->getBlock() : nullptr; - - // compute all join points - DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI}; - const auto &TermBlock = *Term.getParent(); - - // AMDGPU CHANGE - // Make sure the post-dominator is outside the loop for the loop header. - // Otherwise, we may not find all the join blocks in the loop - // because the search stops too early. Some join points can be reached - // after the post-dominator! - // - // Problem CFG is below: - // - // +--> A - // | / \ - // | B P - // | | / | - // +--L X - // - // In this cfg, A is the loop header and P is A's post-dominator. - // The algorithm to mark join points does an Reverse Post Order walk - // from A and stops when it reaches the post dominator. It would not - // mark the phi node in L as divergent even when A had a divergent branch. - // The fix we made was to make the join point search continue all the way - // to the loops post dominator (which is X in this example). - // - // NOTE: They already made this change for the loop case above, but for - // a different bug apparently. See - // SyncDependenceAnalysis::join_blocks(MachineLoop&) - // - const MachineLoop *MachineLoop = LI.getLoopFor(&TermBlock); - if (MachineLoop && (MachineLoop->getHeader() == &TermBlock)) { - while (PdBoundBlock && MachineLoop->contains(PdBoundBlock)) { - IpdNode = IpdNode->getIDom(); - PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; - } - } - - auto JoinBlocks = Propagator.computeJoinPoints( - TermBlock, Term.getParent()->successors(), MachineLoop, PdBoundBlock); - - // AMDGPU change begin. - // Save divergent join pairs. - updateJoinMap(&TermBlock, DivergentJoinMap, Propagator.DefMap, - *JoinBlocks.get()); - // AMDGPU change end. - - auto ItInserted = CachedBranchJoins.emplace(&Term, std::move(JoinBlocks)); - assert(ItInserted.second); - return *ItInserted.first->second; -} - -} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h deleted file mode 100644 index 92059d85b848a..0000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h +++ /dev/null @@ -1,101 +0,0 @@ -//===- MirSyncDependenceAnalysis.h - MirDivergent Branch Dependence -*- C++ -//-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// \file -// This file defines the SyncDependenceAnalysis class, which computes for -// every divergent branch the set of phi nodes that the branch will make -// divergent. -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFunction.h" -#include -#include - -namespace llvm { -class MachineBasicBlock; -class MachineDominatorTree; -class MachineLoop; -class MachinePostDominatorTree; -class MachineLoopInfo; -class MachineFunction; -class MachineInstr; - -using DivergentJoinMapTy = - llvm::DenseMap>; - -using ConstBlockSet = llvm::SmallPtrSet; - -/// \brief Relates points of divergent control to join points in -/// reducible CFGs. 
-/// -/// This analysis relates points of divergent control to points of converging -/// divergent control. The analysis requires all loops to be reducible. -class SyncDependenceAnalysis { - void visitSuccessor(const MachineBasicBlock &succBlock, - const MachineLoop *termLoop, - const MachineBasicBlock *defBlock); - -public: - bool inRegion(const MachineBasicBlock &BB) const; - - ~SyncDependenceAnalysis(); - SyncDependenceAnalysis(const MachineDominatorTree &DT, - const MachinePostDominatorTree &PDT, - const MachineLoopInfo &LI, - // AMDGPU change begin - DivergentJoinMapTy &JoinMap - // AMDGPU change end - ); - - /// \brief Computes divergent join points and loop exits caused by branch - /// divergence in \p Term. - /// - /// The set of blocks which are reachable by disjoint paths from \p Term. - /// The set also contains loop exits if there two disjoint paths: - /// one from \p Term to the loop exit and another from \p Term to the loop - /// header. Those exit blocks are added to the returned set. - /// If L is the parent loop of \p Term and an exit of L is in the returned - /// set then L is a divergent loop. - const ConstBlockSet &join_blocks(const MachineInstr &Term); - - /// \brief Computes divergent join points and loop exits (in the surrounding - /// loop) caused by the divergent loop exits of\p MachineLoop. - /// - /// The set of blocks which are reachable by disjoint paths from the - /// loop exits of \p MachineLoop. - /// This treats the loop as a single node in \p MachineLoop's parent loop. - /// The returned set has the same properties as for join_blocks(TermInst&). - const ConstBlockSet &join_blocks(const MachineLoop &MachineLoop); - -private: - static ConstBlockSet EmptyBlockSet; - - llvm::ReversePostOrderTraversal FuncRPOT; - const MachineDominatorTree &DT; - const MachinePostDominatorTree &PDT; - const MachineLoopInfo &LI; - // AMDGPU change begin. - DivergentJoinMapTy &DivergentJoinMap; - // AMDGPU change end. - std::map> - CachedLoopExitJoins; - std::map> - CachedBranchJoins; -}; - -} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index f089b210c8849..eac9b57dc9973 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -83,8 +83,6 @@ add_llvm_target(AMDGPUCodeGen AMDGPUMCInstLower.cpp AMDGPUMemoryUtils.cpp AMDGPUMIRUtils.cpp - AMDGPUMirDivergenceAnalysis.cpp - AMDGPUMirSyncDependenceAnalysis.cpp AMDGPUIGroupLP.cpp AMDGPUMCResourceInfo.cpp AMDGPUMarkLastScratchLoad.cpp From 6b011fbeac70b6f6e7473cf77109ac627fdda811 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Wed, 12 Mar 2025 13:14:10 -0700 Subject: [PATCH 11/25] Clang format and warnings. --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 34 +++++++------------ 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index e508ed2a6e2cd..591cfef570d74 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -339,7 +339,6 @@ unsigned CollectFnPressure(MachineFunction &MF, LiveIntervals *LIS, if (!LIS->hasInterval(Reg)) continue; - LaneBitmask LiveMask; const auto &LI = LIS->getInterval(Reg); // Skip local live interval to make live input/ouput faster. 
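// Illustrative sketch: a minimal, hypothetical driver for the sync-dependence
// interface declared in the AMDGPUMirSyncDependenceAnalysis.h shown above.
// It assumes the machine dominator tree, post-dominator tree, and loop info
// are already available from the pass manager; the function name and the
// dbgs() printing are illustrative only. The SyncDependenceAnalysis
// constructor, DivergentJoinMapTy, and join_blocks() are the pieces taken
// from that header; everything else is an assumption, not part of the patch.
#include "AMDGPUMirSyncDependenceAnalysis.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

static void dumpDivergentJoins(MachineFunction &MF,
                               const MachineDominatorTree &DT,
                               const MachinePostDominatorTree &PDT,
                               const MachineLoopInfo &LI) {
  DivergentJoinMapTy JoinMap;
  SyncDependenceAnalysis SDA(DT, PDT, LI, JoinMap);
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &Term : MBB.terminators()) {
      // join_blocks(Term) returns the blocks reachable from Term by two
      // disjoint paths, i.e. the blocks whose phi nodes become divergent if
      // Term is a divergent branch. As a side effect, the analysis records
      // divergent-join predecessor pairs in JoinMap.
      for (const MachineBasicBlock *Join : SDA.join_blocks(Term))
        dbgs() << "MBB" << MBB.getNumber() << " joins at MBB"
               << Join->getNumber() << "\n";
    }
  }
}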
@@ -506,15 +505,11 @@ struct RematNode { Clone, }; RematNode() - : Reg(0), DefMI(nullptr), Kind(RematKind::Candidate), - InsertPointMI(nullptr), InsertBlock(nullptr), Size(0) {} + : Reg(0), DefMI(nullptr), InsertBlock(nullptr), InsertPointMI(nullptr), + Kind(RematKind::Candidate), Size(0) {} RematNode(unsigned R, MachineInstr *MI, unsigned S) - : Reg(R), DefMI(MI), Kind(RematKind::Candidate), InsertPointMI(nullptr), - InsertBlock(nullptr), Size(S) {} - RematNode(const RematNode &N) - : Reg(N.Reg), DefMI(N.DefMI), Kind(N.Kind), - InsertPointMI(N.InsertPointMI), InsertBlock(N.InsertBlock), - Size(N.Size) {} + : Reg(R), DefMI(MI), InsertBlock(nullptr), InsertPointMI(nullptr), + Kind(RematKind::Candidate), Size(S) {} unsigned Reg; MachineInstr *DefMI; MachineBasicBlock *InsertBlock; @@ -528,10 +523,10 @@ struct RematNode { struct BlockLiveInfo { MachineBasicBlock *BB; - unsigned maxSReg; - unsigned maxVReg; + unsigned MaxSReg; + unsigned MaxVReg; // Input live is the live reg which cross block. - const GCNRPTracker::LiveRegSet inputLive; + const GCNRPTracker::LiveRegSet InputLive; }; // Skip live reg remated to other block. @@ -893,7 +888,7 @@ void AddCloneCandidate(std::vector &cloneList, // Group user in same blocks. std::vector UserSetList(cloneList.size()); - for (int i = 0; i < cloneList.size(); i++) { + for (size_t i = 0; i < cloneList.size(); i++) { auto *Node = cloneList[i]; unsigned Reg = Node->Reg; MachineInstr *DefMI = Node->DefMI; @@ -1010,7 +1005,7 @@ DenseMap reduceClonedMBBs( // Collect hot blocks which Exp is live in. DenseSet hotBlockSet; for (BlockLiveInfo &hotBlock : hotBlocks) { - if (hotBlock.inputLive.count(Reg)) { + if (hotBlock.InputLive.count(Reg)) { hotBlockSet.insert(hotBlock.BB); } } @@ -1411,7 +1406,7 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, // entry block. if (MBB != EntryMBB) hotBlocks.emplace_back(LiveInfo); - GCNRPTracker::LiveRegSet CandidateRegs = LiveInfo.inputLive; + GCNRPTracker::LiveRegSet CandidateRegs = LiveInfo.InputLive; // Update reg pressure based on remat list. InstSet VReducedInsts; @@ -1552,7 +1547,7 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, } // TODO: what to do when cannot reach target? if (newRematSCnt > 0) { - if (newRematSCnt <= NearTargetRegLimit) { + if ((unsigned)newRematSCnt <= NearTargetRegLimit) { bNearTarget = true; } else { if (!bSGPRSpill) @@ -2838,7 +2833,7 @@ collectUniformVgprs(Remat *Remat, MachineFunction &MF, MachineRegisterInfo &MRI, continue; unsigned dstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst); - if (dstIdx == -1) + if (dstIdx == (unsigned)-1) continue; MachineOperand &DstMO = MI.getOperand(dstIdx); if (DstMO.getSubReg() != 0) @@ -2899,8 +2894,6 @@ bool collectVToSCrossHotSpot( } // Try to make all possible vtos to reduce vpressure. 
- int VExtra = VPressure - VLimit; - const GCNRPTracker::LiveRegSet &CurLives = Tracker.getLiveRegs(); for (auto it : CurLives) { unsigned Reg = it.first; @@ -2908,7 +2901,6 @@ bool collectVToSCrossHotSpot( if (UniformIt == UniformMap.end()) continue; VToSMap[UniformIt->first] = UniformIt->second; - VExtra--; bUpdated = true; } } @@ -4252,7 +4244,7 @@ bool perBlockPassthruRemat(Remat *Remat, std::vector &hotBlocks, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { bool bUpdated = false; - bool bCanClone = EnableSubExpClone | EnableSubExpAggressive; + bool bCanClone = EnableSubExpClone || EnableSubExpAggressive; SlotIndexes *slotIndexes = LIS->getSlotIndexes(); // Sort hot blocks by pressure first. From eb4f8c19817be23c7e05f62b02c3b0320b840eb7 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Thu, 13 Mar 2025 13:29:19 -0700 Subject: [PATCH 12/25] First batch of formatting changes --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 1169 ++++++++--------- llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp | 60 +- llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h | 26 +- 3 files changed, 618 insertions(+), 637 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index 591cfef570d74..ed7093f85823d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -71,7 +71,7 @@ class AMDGPUHotBlockRematerialize : public MachineFunctionPass { DenseSet TotalUniformInsts; DenseSet SafeToRemoveInsts; DenseSet DivergentInsts; - void RemoveInst(const MachineInstr *MI) { + void removeInst(const MachineInstr *MI) { TotalUniformInsts.erase(MI); SafeToRemoveInsts.erase(MI); DivergentInsts.erase(MI); @@ -102,8 +102,8 @@ typedef AMDGPUHotBlockRematerialize Remat; // Util functions. namespace { -MachineBasicBlock *nearest_common_dominator(MachineDominatorTree *DT, - BlockSet &Blocks) { +MachineBasicBlock *NearestCommonDominator(MachineDominatorTree *DT, + BlockSet &Blocks) { auto I = Blocks.begin(), E = Blocks.end(); MachineBasicBlock *DomB = cast(*(I++)); @@ -150,9 +150,9 @@ MachineBasicBlock *nearest_common_dominator(MachineDominatorTree *DT, return DomB; } -MachineBasicBlock *find_non_loop_dominator(MachineBasicBlock *BB, - MachineDominatorTree *DT, - MachineLoopInfo *LI) { +MachineBasicBlock *findNonLoopDominator(MachineBasicBlock *BB, + MachineDominatorTree *DT, + MachineLoopInfo *LI) { while (LI->getLoopDepth(BB) > 0) { MachineDomTreeNode *N = DT->getNode(BB); if (N == nullptr) @@ -168,9 +168,9 @@ MachineBasicBlock *find_non_loop_dominator(MachineBasicBlock *BB, } MachineBasicBlock * -FindInsertBlock(MachineInstr &DefMI, unsigned Reg, MachineDominatorTree *DT, +findInsertBlock(MachineInstr &DefMI, unsigned Reg, MachineDominatorTree *DT, MachinePostDominatorTree *PDT, MachineLoopInfo *MLI, - const MachineRegisterInfo &MRI, bool bMemBound) { + const MachineRegisterInfo &MRI, bool MemBound) { BlockSet BBSet; for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { @@ -181,14 +181,14 @@ FindInsertBlock(MachineInstr &DefMI, unsigned Reg, MachineDominatorTree *DT, MachineBasicBlock *BB = *BBSet.begin(); if (BBSet.size() > 1) { - MachineBasicBlock *BDom = nearest_common_dominator(DT, BBSet); + MachineBasicBlock *BDom = NearestCommonDominator(DT, BBSet); if (!BDom) return nullptr; BB = BDom; } // Try to find non loop dominator. 
- if (!bMemBound) { - BB = find_non_loop_dominator(BB, DT, MLI); + if (!MemBound) { + BB = findNonLoopDominator(BB, DT, MLI); } if (!BB) return nullptr; @@ -204,7 +204,7 @@ FindInsertBlock(MachineInstr &DefMI, unsigned Reg, MachineDominatorTree *DT, } // Maybe expensive to be called all over the place -bool IsUsedByPhi(MachineInstr *DefMI, MachineRegisterInfo &MRI) { +bool isUsedByPhi(MachineInstr *DefMI, MachineRegisterInfo &MRI) { for (auto &Def : DefMI->defs()) { for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Def.getReg())) { if (UseMI.isPHI()) @@ -214,9 +214,9 @@ bool IsUsedByPhi(MachineInstr *DefMI, MachineRegisterInfo &MRI) { return false; } -bool IsSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) { +bool isSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) { // Do not move PHI nodes - if (IsUsedByPhi(DefMI, MRI)) + if (isUsedByPhi(DefMI, MRI)) return false; unsigned OpNum = DefMI->getNumOperands(); @@ -235,18 +235,18 @@ bool IsSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) { // SGPR has alignment requirment, cannot get accurate reg number. const unsigned NearTargetRegLimit = 10; -bool nearSgprSpill(unsigned maxSPressure, const GCNSubtarget *ST, +bool nearSgprSpill(unsigned MaxSPressure, const GCNSubtarget *ST, MachineFunction &MF) { - unsigned maxSGPR = ST->getAddressableNumSGPRs(); + unsigned MaxSGPR = ST->getAddressableNumSGPRs(); const SIMachineFunctionInfo *MFI = MF.getInfo(); - unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); + Register ScratchRSrcReg = MFI->getScratchRSrcReg(); if (ScratchRSrcReg) - maxSGPR -= 4; + MaxSGPR -= 4; const unsigned AlignmentDelta = 3; - maxSGPR -= AlignmentDelta; + MaxSGPR -= AlignmentDelta; - return maxSPressure > maxSGPR; + return MaxSPressure > MaxSGPR; } struct RematStatus { @@ -258,9 +258,9 @@ struct RematStatus { unsigned InputPhysicalVPressure; unsigned InputPhysicalSPressure; // More occupancy can help more than latency cost to reach it. - bool bMemBound; + bool MemBound; // abs(VTargetOcc-STargetOcc) > 1. 
- bool bNotBalance; + bool NotBalance; DenseMap MBBPressureMap; DenseMap MBBInputLiveMap; DenseMap MBBOutputLiveMap; @@ -270,10 +270,9 @@ struct RematStatus { DenseSet MemWriteMBBSet; }; -unsigned CollectMBBPressure(MachineBasicBlock &MBB, LiveIntervals *LIS, - const MachineRegisterInfo &MRI, - const GCNSubtarget *ST, unsigned &maxVPressure, - unsigned &maxSPressure, RematStatus &status) { +unsigned collectMBBPressure(MachineBasicBlock &MBB, LiveIntervals *LIS, + const GCNSubtarget *ST, unsigned &MaxVPressure, + unsigned &MaxSPressure, RematStatus &Status) { // Skip processing current block if it has only debug instructions if (MBB.getFirstNonDebugInstr() == MBB.end()) return ST->getOccupancyWithNumVGPRs(0); @@ -284,32 +283,32 @@ unsigned CollectMBBPressure(MachineBasicBlock &MBB, LiveIntervals *LIS, if (!llvm::GetNonDebugMBBEnd(BBEnd, MBB)) return ST->getOccupancyWithNumVGPRs(0); - GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[&MBB]; - RPTracker.reset(*BBEnd, &outputLive, true); + GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[&MBB]; + RPTracker.reset(*BBEnd, &OutputLive, true); for (auto I = MBB.rbegin(), B = MBB.rend(); I != B;) { MachineInstr &MI = (*I++); RPTracker.recede(MI); if (MI.mayStore() || (MI.isBarrier() && MI.getOpcode() != AMDGPU::S_BRANCH)) - status.MemWriteMBBSet.insert(&MBB); + Status.MemWriteMBBSet.insert(&MBB); } GCNRegPressure RP = RPTracker.getMaxPressureAndReset(); - unsigned sPressure = RP.getMaxSGPR(); - if (sPressure > maxSPressure) { - maxSPressure = sPressure; + unsigned SPressure = RP.getMaxSGPR(); + if (SPressure > MaxSPressure) { + MaxSPressure = SPressure; } - if (RP.getVGPRNum(ST->hasGFX90AInsts()) > maxVPressure) { - maxVPressure = RP.getVGPRNum(ST->hasGFX90AInsts()); + if (RP.getVGPRNum(ST->hasGFX90AInsts()) > MaxVPressure) { + MaxVPressure = RP.getVGPRNum(ST->hasGFX90AInsts()); } - status.MBBPressureMap[&MBB] = RP; + Status.MBBPressureMap[&MBB] = RP; return RP.getOccupancy(*ST); } -unsigned CollectFnPressure(MachineFunction &MF, LiveIntervals *LIS, +unsigned collectFnPressure(MachineFunction &MF, LiveIntervals *LIS, const MachineRegisterInfo &MRI, - const GCNSubtarget *ST, unsigned &maxVPressure, - unsigned &maxSPressure, RematStatus &status) { + const GCNSubtarget *ST, unsigned &MaxVPressure, + unsigned &MaxSPressure, RematStatus &Status) { unsigned TgtOcc = ST->getOccupancyWithWorkGroupSizes(MF).second; // If only have one block, input/ouput virtual live set are empty. 
if (MF.size() > 1) { @@ -345,22 +344,22 @@ unsigned CollectFnPressure(MachineFunction &MF, LiveIntervals *LIS, if (llvm::isLocalLiveInterval(LI, SlotIndexes)) continue; - for (auto inputIt : MBBInputSlotMap) { - MachineBasicBlock *MBB = inputIt.first; - auto SI = inputIt.second; + for (auto InputIt : MBBInputSlotMap) { + MachineBasicBlock *MBB = InputIt.first; + auto SI = InputIt.second; auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI); if (LiveMask.any()) - status.MBBInputLiveMap[MBB][Reg] |= LiveMask; + Status.MBBInputLiveMap[MBB][Reg] |= LiveMask; } - for (auto outputIt : MBBOutputSlotMap) { - MachineBasicBlock *MBB = outputIt.first; - auto SI = outputIt.second; + for (auto OutputIt : MBBOutputSlotMap) { + MachineBasicBlock *MBB = OutputIt.first; + auto SI = OutputIt.second; auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI); if (LiveMask.any()) - status.MBBOutputLiveMap[MBB][Reg] |= LiveMask; + Status.MBBOutputLiveMap[MBB][Reg] |= LiveMask; } } } @@ -368,70 +367,70 @@ unsigned CollectFnPressure(MachineFunction &MF, LiveIntervals *LIS, LLVM_DEBUG( const SIRegisterInfo *SIRI = ST->getRegisterInfo(); dbgs() << "output live"; for (auto &it - : status.MBBOutputLiveMap) { + : Status.MBBOutputLiveMap) { unsigned Idx = it.first->getNumber(); auto LiveReg = it.second; dbgs() << "MBB" << Idx << ":"; llvm::dumpLiveSet(LiveReg, SIRI); } dbgs() << "input live"; for (auto &it - : status.MBBInputLiveMap) { + : Status.MBBInputLiveMap) { unsigned Idx = it.first->getNumber(); auto LiveReg = it.second; dbgs() << "MBB" << Idx << ":"; llvm::dumpLiveSet(LiveReg, SIRI); }); - for (auto it = MF.begin(); it != MF.end(); ++it) { - MachineBasicBlock &MBB = *it; - unsigned Occ = CollectMBBPressure(MBB, LIS, MRI, ST, maxVPressure, - maxSPressure, status); + for (auto It = MF.begin(); It != MF.end(); ++It) { + MachineBasicBlock &MBB = *It; + unsigned Occ = + collectMBBPressure(MBB, LIS, ST, MaxVPressure, MaxSPressure, Status); if (TgtOcc > Occ) TgtOcc = Occ; } return TgtOcc; } -RematStatus GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, +RematStatus getRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS, const MachineRegisterInfo &MRI, const GCNSubtarget *ST) { - unsigned maxSPressure = 0; - unsigned maxVPressure = 0; - RematStatus status; + unsigned MaxSPressure = 0; + unsigned MaxVPressure = 0; + RematStatus Status; unsigned TgtOcc = - CollectFnPressure(MF, LIS, MRI, ST, maxVPressure, maxSPressure, status); + collectFnPressure(MF, LIS, MRI, ST, MaxVPressure, MaxSPressure, Status); const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second; if (TgtOcc >= MaxOcc) { - status.TargetOcc = TgtOcc; - status.TargetVLimit = 0; - status.TargetSLimit = 0; - status.MaxVPressure = 0; - status.MaxSPressure = 0; - status.InputPhysicalVPressure = 0; - status.InputPhysicalSPressure = 0; - status.bMemBound = false; - status.bNotBalance = false; - return status; + Status.TargetOcc = TgtOcc; + Status.TargetVLimit = 0; + Status.TargetSLimit = 0; + Status.MaxVPressure = 0; + Status.MaxSPressure = 0; + Status.InputPhysicalVPressure = 0; + Status.InputPhysicalSPressure = 0; + Status.MemBound = false; + Status.NotBalance = false; + return Status; } - maxSPressure += RegForVCC; - maxVPressure = std::min(maxVPressure, ST->getMaxNumVGPRs(MF)); - unsigned STgtOcc = ST->getOccupancyWithNumSGPRs(maxSPressure); - unsigned VTgtOcc = ST->getOccupancyWithNumVGPRs(maxVPressure); + MaxSPressure += RegForVCC; + MaxVPressure = std::min(MaxVPressure, ST->getMaxNumVGPRs(MF)); + unsigned STgtOcc = 
ST->getOccupancyWithNumSGPRs(MaxSPressure); + unsigned VTgtOcc = ST->getOccupancyWithNumVGPRs(MaxVPressure); - llvm::SchedScore totalScore = llvm::CollectLatency(MF, *ST, MLI); - bool bMemBound = - totalScore.isMemBound(TgtOcc, std::max(STgtOcc, VTgtOcc) - TgtOcc); + llvm::SchedScore TotalScore = llvm::CollectLatency(MF, *ST, MLI); + bool MemBound = + TotalScore.isMemBound(TgtOcc, std::max(STgtOcc, VTgtOcc) - TgtOcc); - bool bNotBalance = false; + bool NotBalance = false; const unsigned MaxOccupancy = ST->AMDGPUSubtarget::getMaxWavesPerEU(); // Currently, only sgpr bound can be fixed with remat. if (STgtOcc < VTgtOcc) { - unsigned bigOcc = std::max(STgtOcc, VTgtOcc); - // Change TgtOcc to bigOcc in case sgpr and vgpr is not balance. - if (bigOcc > TgtOcc) { - TgtOcc = bigOcc; - bNotBalance = true; + unsigned BigOcc = std::max(STgtOcc, VTgtOcc); + // Change TgtOcc to BigOcc in case sgpr and vgpr are not balanced. + if (BigOcc > TgtOcc) { + TgtOcc = BigOcc; + NotBalance = true; if (TgtOcc >= MaxOccupancy) TgtOcc = MaxOccupancy - 1; } @@ -440,34 +439,34 @@ RematStatus GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, // Collect input physical pressure. const SIRegisterInfo *SIRI = ST->getRegisterInfo(); - unsigned vInputPressure = 0; - uint64_t sInputMask = 0; - for (const auto &livein : MRI.liveins()) { - const Register Reg = livein.first; + unsigned VInputPressure = 0; + uint64_t SInputMask = 0; + for (const auto &Livein : MRI.liveins()) { + const Register Reg = Livein.first; const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg); assert(Reg.isPhysical() && "input must be physical reg"); unsigned RegSize = RC->getLaneMask().getNumLanes(); if (SIRI->isVGPR(MRI, Reg)) { - vInputPressure += RegSize; + VInputPressure += RegSize; } else { unsigned RegIndex = SIRI->getHWRegIndex(Reg); - uint64_t mask = ((1 << RegSize) - 1) << RegIndex; - sInputMask |= mask; + uint64_t Mask = ((1 << RegSize) - 1) << RegIndex; + SInputMask |= Mask; } } // SGPR need to align to 4 for the 4dowrd/8dword descriptors which cause high // pressure. - unsigned sInputPressure = 0; - uint64_t mask = 0xf; - while (mask != 0) { - if (mask & sInputMask) { - sInputPressure += 4; + unsigned SInputPressure = 0; + uint64_t Mask = 0xf; + while (Mask != 0) { + if (Mask & SInputMask) { + SInputPressure += 4; } - mask = mask << 4; + Mask = Mask << 4; } // If balanced, try next occupancy. - TgtOcc = bNotBalance ? TgtOcc : (TgtOcc + 1); + TgtOcc = NotBalance ?
TgtOcc : (TgtOcc + 1); auto CC = MF.getFunction().getCallingConv(); bool IsPsCs = CC == CallingConv::AMDGPU_CS || CC == CallingConv::AMDGPU_PS; @@ -481,16 +480,16 @@ RematStatus GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, unsigned SLimit = ST->getMaxNumSGPRs(TgtOcc, true); unsigned VLimit = ST->getMaxNumVGPRs(TgtOcc); - status.TargetOcc = TgtOcc; - status.TargetVLimit = VLimit; - status.TargetSLimit = SLimit; - status.MaxVPressure = maxVPressure; - status.MaxSPressure = maxSPressure; - status.InputPhysicalVPressure = vInputPressure; - status.InputPhysicalSPressure = sInputPressure; - status.bMemBound = bMemBound; - status.bNotBalance = bNotBalance; - return status; + Status.TargetOcc = TgtOcc; + Status.TargetVLimit = VLimit; + Status.TargetSLimit = SLimit; + Status.MaxVPressure = MaxVPressure; + Status.MaxSPressure = MaxSPressure; + Status.InputPhysicalVPressure = VInputPressure; + Status.InputPhysicalSPressure = SInputPressure; + Status.MemBound = MemBound; + Status.NotBalance = NotBalance; + return Status; } } // namespace @@ -530,22 +529,22 @@ struct BlockLiveInfo { }; // Skip live reg remated to other block. -void UpdateLiveInfo(MapVector &RematMap, +void updateLiveInfo(MapVector &RematMap, GCNRPTracker::LiveRegSet &LiveSet, - const GCNRPTracker::LiveRegSet &inputLive, + const GCNRPTracker::LiveRegSet &InputLive, MachineBasicBlock *CurBB, DenseMap &RPOTIndexMap) { - for (auto &it : RematMap) { - unsigned Reg = it.first; + for (auto &It : RematMap) { + unsigned Reg = It.first; // Skip reg not in live set. if (!LiveSet.count(Reg)) continue; // Skip reg already in input set. - // Input set will be taken care in GetReducedSize. - if (inputLive.count(Reg)) + // Input set will be taken care in getReducedSize. + if (InputLive.count(Reg)) continue; - auto &Node = it.second; + auto &Node = It.second; if (Node.Kind == RematNode::RematKind::OneDefOneUse) { MachineBasicBlock *InsertBB = Node.InsertBlock; // If LiveInfo.BB is after InsertBB in Reverse post order, the def is @@ -562,7 +561,7 @@ void UpdateLiveInfo(MapVector &RematMap, } } -int GetSharedReducedSize(InstSet &ReducedInsts, bool bVGPR, +int getSharedReducedSize(InstSet &ReducedInsts, bool IsVGPR, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) { @@ -586,8 +585,7 @@ int GetSharedReducedSize(InstSet &ReducedInsts, bool bVGPR, if (!Reg.isVirtual()) continue; - bool isVGPR = SIRI->isVGPR(MRI, MO.getReg()); - if (bVGPR != isVGPR) { + if (IsVGPR != SIRI->isVGPR(MRI, MO.getReg())) { // Not support mix of v and s when remat now. 
continue; } @@ -623,20 +621,19 @@ int GetSharedReducedSize(InstSet &ReducedInsts, bool bVGPR, return SharedSize; } -int GetReducedSize(MapVector &RematMap, bool bVGPR, +int getReducedSize(MapVector &RematMap, GCNRPTracker::LiveRegSet &CanidateSet, InstSet &ReducedInsts, - const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - BlockLiveInfo &LiveInfo, + const MachineRegisterInfo &MRI, BlockLiveInfo &LiveInfo, DenseMap &RPOTIndexMap) { int ReducedSize = 0; - for (auto &it : RematMap) { - unsigned Reg = it.first; + for (auto &It : RematMap) { + Register Reg = It.first; if (!CanidateSet.count(Reg)) continue; - bool bReduced = false; - auto &Node = it.second; + bool IsReduced = false; + auto &Node = It.second; if (Node.Kind == RematNode::RematKind::OneDefOneUse) { MachineBasicBlock *InsertBB = Node.InsertBlock; // If LiveInfo.BB is before InsertBB in Reverse post order, the def is @@ -644,19 +641,19 @@ int GetReducedSize(MapVector &RematMap, bool bVGPR, unsigned LiveBBIndex = RPOTIndexMap[LiveInfo.BB]; unsigned InsertBBIndex = RPOTIndexMap[InsertBB]; if (LiveBBIndex < InsertBBIndex) - bReduced = true; + IsReduced = true; } else { // Clone. - bReduced = true; + IsReduced = true; // If has use in LiveInfo.BB, could not reduce from input live. for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { if (UseMI.getParent() == LiveInfo.BB) { - bReduced = false; + IsReduced = false; break; } } } - if (bReduced) { + if (IsReduced) { ReducedSize += Node.Size; ReducedInsts.insert(Node.DefMI); } @@ -668,11 +665,9 @@ int GetReducedSize(MapVector &RematMap, bool bVGPR, return ReducedSize; } -int RematGain(MachineInstr *DefMI, unsigned Reg, - GCNRPTracker::LiveRegSet &CandidateRegSet, - const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - bool bVGPR) { - int rematSize = SIRI->getRegSizeInBits(*MRI.getRegClass(Reg)); +int rematGain(MachineInstr *DefMI, unsigned Reg, const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, bool IsVGPR) { + int RematSize = SIRI->getRegSizeInBits(*MRI.getRegClass(Reg)); for (MachineOperand &MO : DefMI->operands()) { if (MO.isImm()) continue; @@ -688,32 +683,31 @@ int RematGain(MachineInstr *DefMI, unsigned Reg, // Don't move user of VCC. if (MO.getReg() == AMDGPU::VCC) { - rematSize = 0; + RematSize = 0; break; } Register Reg = MO.getReg(); // Don't move physical register use. if (Reg.isPhysical()) { - rematSize = 0; + RematSize = 0; break; } - bool isVGPR = SIRI->isVGPR(MRI, Reg); - if (bVGPR != isVGPR) { + if (IsVGPR != SIRI->isVGPR(MRI, Reg)) { // Not support mix of v and s when remat now. // TODO: count possible pressure change here. - rematSize = 0; + RematSize = 0; break; } - bool bSingleDef = MRI.hasOneDef(Reg); - if (!bSingleDef) { - bSingleDef = llvm::IsSub0Sub1SingleDef(Reg, MRI); + bool IsSingleDef = MRI.hasOneDef(Reg); + if (!IsSingleDef) { + IsSingleDef = llvm::IsSub0Sub1SingleDef(Reg, MRI); } - if (bSingleDef) { - // The reg might share with other candidates, but not check it here. - // Count share reg in GetReducedSize. + if (IsSingleDef) { + // The reg might share with other candidates, but do not check it here. + // Count share reg in getReducedSize. if (EnableAggressive) { // In case of aggressive remat, treat multi use reg as shared reg and // ignore size of shared reg.
@@ -725,72 +719,71 @@ int RematGain(MachineInstr *DefMI, unsigned Reg, if (OpRC) OpRC = SIRI->getSubRegisterClass(OpRC, SubIdx); } - int inputSize = SIRI->getRegSizeInBits(*OpRC); + int InputSize = SIRI->getRegSizeInBits(*OpRC); // If input not live in hotspot, move it cross hotspot should have // less reg then DefMi. - if (rematSize > inputSize) { - rematSize -= inputSize; + if (RematSize > InputSize) { + RematSize -= InputSize; continue; } } - rematSize = 0; + RematSize = 0; break; } - return rematSize; + return RematSize; } -void BuildRematCandiates(std::vector &Candidates, +void buildRematCandiates(std::vector &Candidates, GCNRPTracker::LiveRegSet &CandidateRegSet, DenseSet &PinnedRegSet, const MachineRegisterInfo &MRI, const SIInstrInfo *SIII, const SIRegisterInfo *SIRI, - bool bVGPR) { + bool IsVGPR) { - for (auto liveRegIt : CandidateRegSet) { - unsigned Reg = liveRegIt.first; + for (auto LiveRegIt : CandidateRegSet) { + unsigned Reg = LiveRegIt.first; // Skip unsafe reg. if (PinnedRegSet.count(Reg)) continue; - bool isVGPR = SIRI->isVGPR(MRI, Reg); - if (isVGPR != bVGPR) + if (SIRI->isVGPR(MRI, Reg) != IsVGPR) continue; - bool bSafeCandidate = true; + bool IsSafeCandidate = true; MachineInstr *MI = MRI.getUniqueVRegDef(Reg); if (MI) { - if (bVGPR) { + if (IsVGPR) { // Only remat valu now. if (!SIII->isVALU(MI->getOpcode()) && MI->getOpcode() != AMDGPU::COPY) - bSafeCandidate = false; + IsSafeCandidate = false; if (MI->getOpcode() == AMDGPU::COPY) { // Make sure src is unique define. if (MI->getOperand(1).isReg() && nullptr == MRI.getUniqueVRegDef(MI->getOperand(1).getReg())) - bSafeCandidate = false; + IsSafeCandidate = false; } else { // Skip convergent valu. if (MI->isConvergent()) - bSafeCandidate = false; + IsSafeCandidate = false; } } // Skip inst has more than 1 def. if (MI->getDesc().NumDefs > 1) - bSafeCandidate = false; + IsSafeCandidate = false; } else { - bSafeCandidate = false; + IsSafeCandidate = false; } - if (bSafeCandidate) { - int gain = RematGain(MI, Reg, CandidateRegSet, MRI, SIRI, bVGPR); - if (gain > 0) { - Candidates.emplace_back(RematNode(Reg, MI, gain >> 5)); + if (IsSafeCandidate) { + int Gain = rematGain(MI, Reg, MRI, SIRI, IsVGPR); + if (Gain > 0) { + Candidates.emplace_back(RematNode(Reg, MI, Gain >> 5)); } else { - bSafeCandidate = false; + IsSafeCandidate = false; } } // Save unsafe reg. 
- if (!bSafeCandidate) + if (!IsSafeCandidate) PinnedRegSet.insert(Reg); } @@ -812,57 +805,57 @@ bool isImplicitDefUse(MachineInstr *DefMI, MachineInstr *UseMI) { return false; auto *TRI = DefMI->getMF()->getSubtarget().getRegisterInfo(); - for (MachineOperand &def : DefMI->implicit_operands()) { - if (!def.isReg()) + for (MachineOperand &Def : DefMI->implicit_operands()) { + if (!Def.isReg()) continue; - if (def.isUse()) + if (Def.isUse()) continue; - unsigned Reg = def.getReg(); + Register Reg = Def.getReg(); if (UseMI->readsRegister(Reg, TRI)) return true; } return false; } -void AddOneDefOneUseCandidate(RematNode &Node, +void addOneDefOneUseCandidate(RematNode &Node, std::vector &RematList, - MachineRegisterInfo &MRI, int &rematCnt, + MachineRegisterInfo &MRI, int &RematCnt, MachineDominatorTree *DT, MachinePostDominatorTree *PDT, - MachineLoopInfo *MLI, bool bVGPR, - bool bMemBound) { + MachineLoopInfo *MLI, bool IsVGPR, + bool MemBound) { unsigned Reg = Node.Reg; MachineInstr *DefMI = Node.DefMI; - unsigned size = Node.Size; + unsigned Size = Node.Size; MachineInstr *UseMI = &*MRI.use_nodbg_instructions(Reg).begin(); MachineBasicBlock *InsertBB = UseMI->getParent(); // For VGPR, always move next to the only user to avoid wqm or exec issue. - // But doing this will cause issue when DefMI is in wqm but single user not in + // But doing this will cause issues when DefMI is in wqm but the single user is not in // wqm. Disable VGPR remat for now. // TODO: make sure single user don't need wqm. - if (!bVGPR) { + if (!IsVGPR) { if (MachineBasicBlock *NewInsertBB = - FindInsertBlock(*DefMI, Reg, DT, PDT, MLI, MRI, bMemBound)) { + findInsertBlock(*DefMI, Reg, DT, PDT, MLI, MRI, MemBound)) { if (InsertBB != NewInsertBB) { InsertBB = NewInsertBB; // If can find a non-loop insert block, go to the insert block. if (DefMI->getParent() != InsertBB) { if (!InsertBB->empty()) { - auto it = InsertBB->getFirstNonPHI(); - it = skipDebugInstructionsForward(it, InsertBB->end()); - if (it == InsertBB->end()) + auto It = InsertBB->getFirstNonPHI(); + It = skipDebugInstructionsForward(It, InsertBB->end()); + if (It == InsertBB->end()) UseMI = nullptr; else - UseMI = &*it; + UseMI = &*It; } } } } } - if (bVGPR) { + if (IsVGPR) { // Don't count reg in same block for valu. if (UseMI->getParent() == DefMI->getParent()) return; @@ -877,27 +870,26 @@ void AddOneDefOneUseCandidate(RematNode &Node, Node.InsertPointMI = UseMI; Node.Kind = RematNode::RematKind::OneDefOneUse; RematList.emplace_back(Node); - rematCnt += size; + RematCnt += Size; } -void AddCloneCandidate(std::vector &cloneList, +void addCloneCandidate(std::vector &CloneList, std::vector &RematList, DenseSet &PinnedRegSet, - MachineRegisterInfo &MRI, int &rematCnt, - SlotIndexes *SlotIndexes, MachineFunction &MF) { + MachineRegisterInfo &MRI, int &RematCnt) { // Group user in same blocks. - std::vector UserSetList(cloneList.size()); + std::vector UserSetList(CloneList.size()); - for (size_t i = 0; i < cloneList.size(); i++) { - auto *Node = cloneList[i]; + for (size_t i = 0; i < CloneList.size(); i++) { + auto *Node = CloneList[i]; unsigned Reg = Node->Reg; MachineInstr *DefMI = Node->DefMI; // Group user in same blocks.
BlockSet &UserSet = UserSetList[i]; - for (auto useIt = MRI.use_instr_nodbg_begin(Reg); - useIt != MRI.use_instr_nodbg_end();) { - MachineInstr &UseMI = *(useIt++); + for (auto UseIt = MRI.use_instr_nodbg_begin(Reg); + UseIt != MRI.use_instr_nodbg_end();) { + MachineInstr &UseMI = *(UseIt++); UserSet.insert(UseMI.getParent()); } @@ -912,36 +904,34 @@ void AddCloneCandidate(std::vector &cloneList, } } - int size = Node->Size; - size <<= 16; + int Size = Node->Size; + Size <<= 16; // Pack userSet size to size. - size |= UserSet.size(); - Node->UserCount = size; + Size |= UserSet.size(); + Node->UserCount = Size; } - std::sort(cloneList.begin(), cloneList.end(), + std::sort(CloneList.begin(), CloneList.end(), // Sort based on userSet size. - [](const RematNode *a, const RematNode *b) { - static constexpr int mask = 0xffff; - return (a->UserCount & mask) < (b->UserCount & mask); + [](const RematNode *A, const RematNode *B) { + static constexpr int Mask = 0xffff; + return (A->UserCount & Mask) < (B->UserCount & Mask); }); - for (RematNode *Node : cloneList) { + for (RematNode *Node : CloneList) { Node->Kind = RematNode::RematKind::Clone; RematList.emplace_back(*Node); - rematCnt += Node->Size; + RematCnt += Node->Size; } } -int FilterRematCandiates(std::vector &Candidates, +int filterRematCandiates(std::vector &Candidates, std::vector &RematList, DenseSet &PinnedRegSet, MachineDominatorTree *DT, MachinePostDominatorTree *PDT, MachineLoopInfo *MLI, - MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - MachineFunction &MF, SlotIndexes *SlotIndexes, - bool bVGPR, bool bMemBound) { - int rematCnt = 0; + MachineRegisterInfo &MRI, bool IsVGPR, bool MemBound) { + int RematCnt = 0; // Work one def one use first. for (auto &Node : Candidates) { unsigned Reg = Node.Reg; @@ -949,17 +939,17 @@ int FilterRematCandiates(std::vector &Candidates, continue; } MachineInstr *DefMI = Node.DefMI; - if (!IsSafeToMove(DefMI, MRI)) { + if (!isSafeToMove(DefMI, MRI)) { PinnedRegSet.insert(Reg); continue; } - AddOneDefOneUseCandidate(Node, RematList, MRI, rematCnt, DT, PDT, MLI, - bVGPR, bMemBound); + addOneDefOneUseCandidate(Node, RematList, MRI, RematCnt, DT, PDT, MLI, + IsVGPR, MemBound); } - if (!bVGPR) { - std::vector cloneList; + if (!IsVGPR) { + std::vector CloneList; // Try multi use case. for (auto &Node : Candidates) { unsigned Reg = Node.Reg; @@ -967,23 +957,22 @@ int FilterRematCandiates(std::vector &Candidates, continue; } MachineInstr *DefMI = Node.DefMI; - if (!IsSafeToMove(DefMI, MRI)) { + if (!isSafeToMove(DefMI, MRI)) { PinnedRegSet.insert(Reg); continue; } // Clone for each user. 
- cloneList.emplace_back(&Node); + CloneList.emplace_back(&Node); } - AddCloneCandidate(cloneList, RematList, PinnedRegSet, MRI, rematCnt, - SlotIndexes, MF); + addCloneCandidate(CloneList, RematList, PinnedRegSet, MRI, RematCnt); } - return rematCnt; + return RematCnt; } -void updateUsers(unsigned Reg, unsigned NewReg, bool bSubRegDef, +void updateUsers(unsigned Reg, unsigned NewReg, bool IsSubRegDef, SmallVector &userMIs) { for (MachineInstr *UseMI : userMIs) { for (MachineOperand &MO : UseMI->operands()) { @@ -991,7 +980,7 @@ void updateUsers(unsigned Reg, unsigned NewReg, bool bSubRegDef, continue; if (MO.getReg() == Reg) { MO.setReg(NewReg); - if (bSubRegDef) + if (IsSubRegDef) MO.setSubReg(0); } } @@ -1001,7 +990,7 @@ void updateUsers(unsigned Reg, unsigned NewReg, bool bSubRegDef, DenseMap reduceClonedMBBs( unsigned Reg, BlockMap> &userBlocks, DenseSet &UserMBBSet, - std::vector &hotBlocks, MachineDominatorTree *pDT) { + std::vector &hotBlocks, MachineDominatorTree *DT) { // Collect hot blocks which Exp is live in. DenseSet hotBlockSet; for (BlockLiveInfo &hotBlock : hotBlocks) { @@ -1020,22 +1009,22 @@ DenseMap reduceClonedMBBs( if (hotBlockSet.count(MBB)) continue; - bool bDomAllHotBlocks = true; - bool bDomedByAllHotBlocks = true; + bool IsDomAllHotBlocks = true; + bool IsDomedByAllHotBlocks = true; for (MachineBasicBlock *hotMBB : hotBlockSet) { - if (!pDT->dominates(MBB, hotMBB)) { - bDomAllHotBlocks = false; + if (!DT->dominates(MBB, hotMBB)) { + IsDomAllHotBlocks = false; } - if (!pDT->dominates(hotMBB, MBB)) { - bDomedByAllHotBlocks = false; + if (!DT->dominates(hotMBB, MBB)) { + IsDomedByAllHotBlocks = false; } - if (!bDomAllHotBlocks && !bDomedByAllHotBlocks) { + if (!IsDomAllHotBlocks && !IsDomedByAllHotBlocks) { break; } } - if (bDomAllHotBlocks) { + if (IsDomAllHotBlocks) { userBlocks.erase(MBB); - } else if (bDomedByAllHotBlocks) { + } else if (IsDomedByAllHotBlocks) { afterHotRangeMBBs.insert(MBB); } } @@ -1049,7 +1038,7 @@ DenseMap reduceClonedMBBs( MachineBasicBlock *MBB2 = it2; if (MBB == MBB2) continue; - if (pDT->dominates(MBB, MBB2)) { + if (DT->dominates(MBB, MBB2)) { auto &Dom = DomMap[MBB]; Dom.insert(MBB2); auto &Dom2 = DomMap[MBB2]; @@ -1113,7 +1102,7 @@ static bool WillSmashSccAtLocation(MachineInstr *MI, MachineBasicBlock *MBB, void ApplyCloneRemat(Remat *Remat, RematNode &Node, std::vector &hotBlocks, - MachineDominatorTree *pDT, MachineRegisterInfo &MRI, + MachineDominatorTree *DT, MachineRegisterInfo &MRI, SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, MachineFunction &MF) { unsigned Reg = Node.Reg; @@ -1123,10 +1112,10 @@ void ApplyCloneRemat(Remat *Remat, RematNode &Node, const MCInstrDesc &Desc = DefMI->getDesc(); const TargetRegisterClass *RC = MRI.getRegClass(Reg); // When the unique def has subReg, just create newReg for the subReg part. 
- bool bSubRegDef = false; + bool IsSubRegDef = false; if (DefOp.getSubReg() != 0) { RC = SIRI->getSubRegisterClass(RC, DefOp.getSubReg()); - bSubRegDef = true; + IsSubRegDef = true; } const DebugLoc DL = DefMI->getDebugLoc(); unsigned OpNum = DefMI->getNumOperands(); @@ -1144,7 +1133,7 @@ void ApplyCloneRemat(Remat *Remat, RematNode &Node, } DenseMap DomMap = - reduceClonedMBBs(Reg, UserMap, UserMBBSet, hotBlocks, pDT); + reduceClonedMBBs(Reg, UserMap, UserMBBSet, hotBlocks, DT); for (auto useIt : UserMap) { MachineBasicBlock *MBB = useIt.first; @@ -1185,14 +1174,14 @@ void ApplyCloneRemat(Remat *Remat, RematNode &Node, SlotIndexes->insertMachineInstrInMaps(*NewDef); SmallVector &userMIs = useIt.second; - updateUsers(Reg, NewReg, bSubRegDef, userMIs); + updateUsers(Reg, NewReg, IsSubRegDef, userMIs); // update users in dom MBBs. auto domMapIt = DomMap.find(MBB); if (domMapIt != DomMap.end()) { for (MachineBasicBlock *UpdateMBB : domMapIt->second) { SmallVector &userMIs = UserMap[UpdateMBB]; - updateUsers(Reg, NewReg, bSubRegDef, userMIs); + updateUsers(Reg, NewReg, IsSubRegDef, userMIs); } } @@ -1200,7 +1189,7 @@ void ApplyCloneRemat(Remat *Remat, RematNode &Node, } if (MRI.use_empty(Reg)) { SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI); - Remat->RemoveInst(DefMI); + Remat->removeInst(DefMI); DefMI->eraseFromParent(); } } @@ -1235,9 +1224,9 @@ void ApplyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI, slotIndexes->insertMachineInstrInMaps(*DefMI); } -void ApplyRemat(Remat *Remat, MapVector &RematMap, +void ApplyRemat(Remat *Remat, MapVector &RematMap, std::vector &hotBlocks, - MachineDominatorTree *pDT, SlotIndexes *slotIndexes, + MachineDominatorTree *DT, SlotIndexes *slotIndexes, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, MachineFunction &MF) { std::vector UpdateList; @@ -1257,13 +1246,13 @@ void ApplyRemat(Remat *Remat, MapVector &RematMap, if (Node.Kind == RematNode::RematKind::OneDefOneUse) { ApplyOneDefOneUseRemat(Node, MRI, slotIndexes, SIRI, SIII); } else if (Node.Kind == RematNode::RematKind::Clone) { - ApplyCloneRemat(Remat, Node, hotBlocks, pDT, MRI, slotIndexes, SIRI, SIII, + ApplyCloneRemat(Remat, Node, hotBlocks, DT, MRI, slotIndexes, SIRI, SIII, MF); } } } -void dumpRematMap(MapVector &RematMap, +void dumpRematMap(MapVector &RematMap, const SIRegisterInfo *SIRI) { dbgs() << "\n rematMap: \n"; for (auto it : RematMap) { @@ -1276,8 +1265,8 @@ void dumpRematMap(MapVector &RematMap, int DebugBlockIndex = 42; void dumpHotBlock(const GCNRPTracker::LiveRegSet &LiveSet, - MapVector &VRematMap, - MapVector &SRematMap, int BlockIndex, + MapVector &VRematMap, + MapVector &SRematMap, int BlockIndex, const SIRegisterInfo *SIRI) { if (DebugBlockIndex != BlockIndex) return; @@ -1303,8 +1292,8 @@ void dumpCandidates(std::vector &RematCandidates, int BlockIndex, } // namespace bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, - LiveIntervals *LIS, MachineDominatorTree *pDT, - MachinePostDominatorTree *pPDT, bool &bNearTarget) { + LiveIntervals *LIS, MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, bool &IsNearTarget) { const GCNSubtarget *ST = &MF.getSubtarget(); const SIInstrInfo *SIII = ST->getInstrInfo(); @@ -1318,8 +1307,8 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, auto &MRI = MF.getRegInfo(); - bool bUpdated = false; - RematStatus status = GetRematStatus(MF, MLI, LIS, MRI, ST); + bool IsUpdated = false; + RematStatus status = getRematStatus(MF, MLI, LIS, MRI, 
ST); const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second; if (status.TargetOcc >= MaxOcc) @@ -1333,16 +1322,16 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, if (EnableAggressive) rematSCnt += NearTargetRegLimit; - bool bSGPRSpill = false; + bool IsSGPRSpill = false; if (rematSCnt > 0) { - bSGPRSpill = nearSgprSpill(status.MaxSPressure, ST, MF); + IsSGPRSpill = nearSgprSpill(status.MaxSPressure, ST, MF); } - bool bForceRematSgpr = bSGPRSpill | status.bNotBalance; + bool IsForceRematSgpr = IsSGPRSpill | status.NotBalance; // If bound by lds, skip. if (status.TargetOcc > ST->getOccupancyWithWorkGroupSizes(MF).second && - !bForceRematSgpr) + !IsForceRematSgpr) return false; MachineBasicBlock *EntryMBB = &MF.front(); @@ -1350,8 +1339,8 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, auto *SlotIndexes = LIS->getSlotIndexes(); // Reg which already marked remat. - MapVector VRematMap; - MapVector SRematMap; + MapVector VRematMap; + MapVector SRematMap; // Reg which cannot move around to remat. DenseSet PinnedRegSet; std::vector hotBlocks; @@ -1382,8 +1371,8 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, Tracker.advance(); auto LISLR = Tracker.getLiveRegs(); // Update live set for things already remated. - UpdateLiveInfo(VRematMap, LISLR, inputLive, MBB, RPOTIndexMap); - UpdateLiveInfo(SRematMap, LISLR, inputLive, MBB, RPOTIndexMap); + updateLiveInfo(VRematMap, LISLR, inputLive, MBB, RPOTIndexMap); + updateLiveInfo(SRematMap, LISLR, inputLive, MBB, RPOTIndexMap); const GCNRPTracker::LiveRegSet &liveSet = LISLR; unsigned VPressure = 0; @@ -1411,38 +1400,35 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, // Update reg pressure based on remat list. InstSet VReducedInsts; InstSet SReducedInsts; - int VReduced = - GetReducedSize(VRematMap, /*bVGPR*/ true, CandidateRegs, VReducedInsts, - MRI, SIRI, LiveInfo, RPOTIndexMap); - int SReduced = - GetReducedSize(SRematMap, /*bVGPR*/ false, CandidateRegs, SReducedInsts, - MRI, SIRI, LiveInfo, RPOTIndexMap); + int VReduced = getReducedSize(VRematMap, CandidateRegs, VReducedInsts, MRI, + LiveInfo, RPOTIndexMap); + int SReduced = getReducedSize(SRematMap, CandidateRegs, SReducedInsts, MRI, + LiveInfo, RPOTIndexMap); // Calculate size need to be remat. int rematVCnt = maxVPressure - VReduced - VLimit; int rematSCnt = maxSPressure - SReduced - SLimit; - bool bSGPRSpill = false; + bool IsSGPRSpill = false; if (rematSCnt > 0) { - bSGPRSpill = nearSgprSpill(maxSPressure, ST, MF); + IsSGPRSpill = nearSgprSpill(maxSPressure, ST, MF); } - bool bForceRematSgpr = bSGPRSpill | status.bNotBalance; + bool IsForceRematSgpr = IsSGPRSpill || status.NotBalance; // Try to add candidates into remat list. int newRematSCnt = 0; if (rematSCnt > 0) { // Build candidate nodes. std::vector SRematCandidates; - BuildRematCandiates(SRematCandidates, CandidateRegs, PinnedRegSet, MRI, - SIII, SIRI, /*bVGPR*/ false); + buildRematCandiates(SRematCandidates, CandidateRegs, PinnedRegSet, MRI, + SIII, SIRI, /*IsVGPR*/ false); LLVM_DEBUG(dumpCandidates(SRematCandidates, MBB->getNumber(), SIRI)); std::vector SRematList; // Filter candidates. 
- newRematSCnt = - FilterRematCandiates(SRematCandidates, SRematList, PinnedRegSet, pDT, - pPDT, MLI, MRI, SIRI, MF, SlotIndexes, - /*bVGPR*/ false, status.bMemBound); + newRematSCnt = filterRematCandiates(SRematCandidates, SRematList, + PinnedRegSet, DT, PDT, MLI, MRI, + /*IsVGPR*/ false, status.MemBound); if (newRematSCnt > rematSCnt) { // Has enough remat node to cover rematCnt. int rematCnt = 0; @@ -1460,51 +1446,49 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, } // Check shared size. int SharedReducedSize = - GetSharedReducedSize(SReducedInsts, /*bVGPR*/ false, MRI, SIRI); + getSharedReducedSize(SReducedInsts, /*IsVGPR*/ false, MRI, SIRI); if (((newRematSCnt + SharedReducedSize) + (int)NearTargetRegLimit) >= rematSCnt) { for (RematNode &Node : SRematList) { SRematMap[Node.Reg] = Node; } } else { - if (!bForceRematSgpr) { + if (!IsForceRematSgpr) return false; - } else { - for (RematNode &Node : SRematList) { - SRematMap[Node.Reg] = Node; - } - // Find local one def one use candidates. - for (MachineInstr &MI : *MBB) { - if (MI.isDebugInstr()) - continue; - if (MI.getDesc().NumDefs != 1) - continue; - MachineOperand &DstMO = MI.getOperand(0); - Register Reg = DstMO.getReg(); - if (!SIRI->isSGPRReg(MRI, Reg)) - continue; - if (!MRI.hasOneNonDBGUse(Reg)) - continue; - if (!MRI.hasOneDef(Reg)) - continue; - if (Reg.isPhysical()) - continue; - MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(Reg); - if (UseMI.getParent() != MBB) + for (RematNode &Node : SRematList) { + SRematMap[Node.Reg] = Node; + } + // Find local one def one use candidates. + for (MachineInstr &MI : *MBB) { + if (MI.isDebugInstr()) + continue; + if (MI.getDesc().NumDefs != 1) + continue; + MachineOperand &DstMO = MI.getOperand(0); + Register Reg = DstMO.getReg(); + if (!SIRI->isSGPRReg(MRI, Reg)) + continue; + if (!MRI.hasOneNonDBGUse(Reg)) + continue; + if (!MRI.hasOneDef(Reg)) + continue; + if (Reg.isPhysical()) + continue; + MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(Reg); + if (UseMI.getParent() != MBB) + continue; + int gain = rematGain(&MI, Reg, MRI, SIRI, + /*IsVGPR*/ false); + if (gain > 0) { + // Skip case when DefMI has implicit define which used by UseMI. + if (isImplicitDefUse(&MI, &UseMI)) { continue; - int gain = RematGain(&MI, Reg, CandidateRegs, MRI, SIRI, - /*bVGPR*/ false); - if (gain > 0) { - // Skip case when DefMI has implicit define which used by UseMI. - if (isImplicitDefUse(&MI, &UseMI)) { - continue; - } - RematNode Node = {Reg, &MI, (unsigned)gain >> 5}; - Node.InsertPointMI = &UseMI; - Node.Kind = RematNode::RematKind::OneDefOneUse; - SRematMap[Reg] = Node; - SharedReducedSize += Node.Size; } + RematNode Node = {Reg, &MI, (unsigned)gain >> 5}; + Node.InsertPointMI = &UseMI; + Node.Kind = RematNode::RematKind::OneDefOneUse; + SRematMap[Reg] = Node; + SharedReducedSize += Node.Size; } } } @@ -1518,57 +1502,57 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, // Remat these common live range. // Apply the remat. - int newRematVCnt = 0; + int NewRematVCnt = 0; if (rematVCnt > 0) { // TODO: V remat. } - bool bNeedSRemat = rematSCnt > 0; - bool bNeedVRemat = rematVCnt > 0; + bool NeedSRemat = rematSCnt > 0; + bool NeedVRemat = rematVCnt > 0; // If sgpr spill, always do remat. 
- bool bSRematOK = - (newRematSCnt <= 0 && !SRematMap.empty()) || bForceRematSgpr; - bool bVRematOK = - (status.bNotBalance || newRematVCnt <= 0) && !VRematMap.empty(); - if (bNeedSRemat && bNeedVRemat) { - if (bVRematOK && bSRematOK) { - bUpdated = true; - } else if (bSGPRSpill) { - bUpdated = true; + bool IsSRematOK = + (newRematSCnt <= 0 && !SRematMap.empty()) || IsForceRematSgpr; + bool IsVRematOK = + (status.NotBalance || NewRematVCnt <= 0) && !VRematMap.empty(); + if (NeedSRemat && NeedVRemat) { + if (IsVRematOK && IsSRematOK) { + IsUpdated = true; + } else if (IsSGPRSpill) { + IsUpdated = true; } - } else if (bNeedSRemat) { - if (bSRematOK) { - bUpdated = true; + } else if (NeedSRemat) { + if (IsSRematOK) { + IsUpdated = true; } - } else if (bNeedVRemat) { - if (bVRematOK) { - bUpdated = true; + } else if (NeedVRemat) { + if (IsVRematOK) { + IsUpdated = true; } } // TODO: what to do when cannot reach target? if (newRematSCnt > 0) { if ((unsigned)newRematSCnt <= NearTargetRegLimit) { - bNearTarget = true; + IsNearTarget = true; } else { - if (!bSGPRSpill) + if (!IsSGPRSpill) return false; } } } if (SRematMap.empty() && VRematMap.empty()) { - return bUpdated; + return IsUpdated; } if (!SRematMap.empty()) { - bUpdated = true; - ApplyRemat(Remat, SRematMap, hotBlocks, pDT, SlotIndexes, MRI, SIRI, SIII, + IsUpdated = true; + ApplyRemat(Remat, SRematMap, hotBlocks, DT, SlotIndexes, MRI, SIRI, SIII, MF); LLVM_DEBUG(llvm::dbgs() << "after hotremat"; MF.print(dbgs());); } // Balance between vector and scalar if possible. - return bUpdated; + return IsUpdated; } namespace { @@ -1622,10 +1606,10 @@ static bool isConvergent(Remat *Remat, const MachineInstr &MI) { bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, - bool bSink) { + bool IsSink) { if (Reg.isPhysical()) return false; - bool bVGPR = SIRI->isVGPR(MRI, Reg); + bool IsVGPR = SIRI->isVGPR(MRI, Reg); MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); if (!DefMI) @@ -1667,7 +1651,7 @@ bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI, } } - if (bVGPR && bSink) { + if (IsVGPR && IsSink) { // Skip mem related inst. 
if (DefMI->mayLoadOrStore()) { return false; @@ -1686,7 +1670,7 @@ std::vector buildSubExpFromCandidates( Remat *Remat, GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, const MachineRegisterInfo &MRI, SlotIndexes *slotIndexes, - GCNRPTracker::LiveRegSet &unUsedPassThrus, bool bAllowPartialUseInSubExp) { + GCNRPTracker::LiveRegSet &unUsedPassThrus, bool AllowPartialUseInSubExp) { InstSet CandidateDefs; DenseSet RemovedCandidates; std::vector CandidateRegs; @@ -1715,7 +1699,7 @@ std::vector buildSubExpFromCandidates( LLVM_DEBUG(dbgs() << "\nCandidate Defs:\n";); for (unsigned Reg : CandidateRegs) { MachineInstr *MI = MRI.getUniqueVRegDef(Reg); - bool bHasNoCandidatesSameBlockUser = false; + bool IsHasNoCandidatesSameBlockUser = false; for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { if (UseMI.getParent() == MI->getParent()) { if (UseMI.getNumExplicitDefs() == 1) { @@ -1725,14 +1709,14 @@ std::vector buildSubExpFromCandidates( RemovedCandidates.count(UserDefReg) == 0) continue; } - if (!bAllowPartialUseInSubExp) - bHasNoCandidatesSameBlockUser = true; + if (!AllowPartialUseInSubExp) + IsHasNoCandidatesSameBlockUser = true; else PartialCandidates.insert(MI); break; } } - if (bHasNoCandidatesSameBlockUser) { + if (IsHasNoCandidatesSameBlockUser) { RemovedCandidates.insert(Reg); continue; } @@ -1761,15 +1745,15 @@ std::vector buildSubExpFromCandidates( // Skip if MI is not safe to move. if (MI.getNumDefs() != 1) { // allow to move unused implicit def. - bool bDeadImplictDef = false; + bool IsDeadImplictDef = false; for (MachineOperand &MO : MI.implicit_operands()) { if (!MO.isReg()) continue; if (!MO.isDef()) continue; - bDeadImplictDef = MO.isDead(); + IsDeadImplictDef = MO.isDead(); } - if (!bDeadImplictDef) + if (!IsDeadImplictDef) continue; } @@ -1783,24 +1767,24 @@ std::vector buildSubExpFromCandidates( break; } - if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*bSink*/ true)) + if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*IsSink*/ true)) continue; // If all users of MI are in candidate defs, add MI into candidate defs. // If part of user of MI is in candidate defs, add MI into candidate defs // when allow partialUse. - bool bAllUserInCandidate = true; - bool bHasCandidateUser = false; + bool IsAllUserInCandidate = true; + bool IsHasCandidateUser = false; for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { if (CandidateDefs.count(&UseMI) == 0) - bAllUserInCandidate = false; + IsAllUserInCandidate = false; else - bHasCandidateUser = true; + IsHasCandidateUser = true; } - if (!bHasCandidateUser) + if (!IsHasCandidateUser) continue; - if (!bAllUserInCandidate) { - if (!bAllowPartialUseInSubExp) + if (!IsAllUserInCandidate) { + if (!AllowPartialUseInSubExp) continue; PartialCandidates.insert(&MI); } @@ -1834,10 +1818,9 @@ std::vector buildSubExpFromCandidates( std::vector defs; defs.reserve(CandidateDefs.size()); for (MachineInstr &MI : *MBB) { - MachineInstr *pMI = &MI; - if (CandidateDefs.count(pMI) == 0) + if (CandidateDefs.count(&MI) == 0) continue; - defs.emplace_back(pMI); + defs.emplace_back(&MI); } LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; for (MachineInstr *MI @@ -1847,13 +1830,13 @@ std::vector buildSubExpFromCandidates( // Build SubExp with CandidateDefs as Nodes, CandidateInput as input // Candidates as output. 
- ExpDag dag(MRI, SIRI, SIII, /*bJoinInput*/ true); + ExpDag dag(MRI, SIRI, SIII, /*IsJoinInput*/ true); dag.build(CandidateInput, Candidates, defs); - if (bAllowPartialUseInSubExp) { + if (AllowPartialUseInSubExp) { for (auto &subExp : dag.SubExps) { for (auto *MI : subExp.SUnits) { if (PartialCandidates.count(MI)) { - subExp.bCloneOnly = true; + subExp.IsCloneOnly = true; break; } } @@ -1881,7 +1864,7 @@ std::vector buildSubExpFromCandidatesTopBottom( continue; assert(UseMBB == MBB && "block mismatch"); // If all operands in CandidateRegs, add to candidateDefs. - bool bHasOpRegNotInCandidates = false; + bool IsHasOpRegNotInCandidates = false; for (MachineOperand &MO : UseMI.operands()) { if (!MO.isReg()) continue; @@ -1891,11 +1874,11 @@ std::vector buildSubExpFromCandidatesTopBottom( if (MO.isImplicit() && OpReg.isPhysical()) continue; if (Candidates.count(OpReg) == 0) { - bHasOpRegNotInCandidates = true; + IsHasOpRegNotInCandidates = true; break; } } - if (bHasOpRegNotInCandidates) + if (IsHasOpRegNotInCandidates) continue; LLVM_DEBUG(UseMI.dump()); @@ -1948,11 +1931,11 @@ std::vector buildSubExpFromCandidatesTopBottom( } // Still use bsink to skip mem load/store. - // if (!isSafeCandidate(Reg, MRI, SIRI, SIII, /*bSink*/true)) + // if (!isSafeCandidate(Reg, MRI, SIRI, SIII, /*IsSink*/true)) // continue; // If all user of MI is in candidate defs, add MI into candidate defs. - bool bAllOperandInCandidate = true; + bool IsAllOperandInCandidate = true; for (MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; @@ -1966,22 +1949,22 @@ std::vector buildSubExpFromCandidatesTopBottom( (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO)) continue; if (OpReg.isPhysical()) { - bAllOperandInCandidate = false; + IsAllOperandInCandidate = false; break; } MachineInstr *OpMI = MRI.getUniqueVRegDef(OpReg); if (!OpMI) { - bAllOperandInCandidate = false; + IsAllOperandInCandidate = false; break; } if (CandidateDefs.count(OpMI) == 0) { - bAllOperandInCandidate = false; + IsAllOperandInCandidate = false; break; } if (MO.isTied()) continue; } - if (!bAllOperandInCandidate) + if (!IsAllOperandInCandidate) continue; LLVM_DEBUG(llvm::dbgs() << "Add local candidates:"; pressure::print_reg(Reg, MRI, SIRI, llvm::dbgs());); @@ -2023,10 +2006,9 @@ std::vector buildSubExpFromCandidatesTopBottom( std::vector defs; defs.reserve(CandidateDefs.size()); for (MachineInstr &MI : *MBB) { - MachineInstr *pMI = &MI; - if (CandidateDefs.count(pMI) == 0) + if (CandidateDefs.count(&MI) == 0) continue; - defs.emplace_back(pMI); + defs.emplace_back(&MI); } LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; for (MachineInstr *MI @@ -2042,7 +2024,7 @@ std::vector buildSubExpFromCandidatesTopBottom( // Input is Candidates, output is? // Build SubExp with CandidateDefs as Nodes, CandidateInput as input // Candidates as output. 
- ExpDag dag(MRI, SIRI, SIII, /*bJoinInput*/ true); + ExpDag dag(MRI, SIRI, SIII, /*IsJoinInput*/ true); dag.build(Candidates, LocalCandidates, defs); return dag.SubExps; } @@ -2060,7 +2042,7 @@ void print_vreg(Register Reg, const MachineRegisterInfo &MRI) { MachineBasicBlock *FindTargetBlock(unsigned Reg, MachineBasicBlock *FromBB, const MachineRegisterInfo &MRI, - MachineDominatorTree *pDT) { + MachineDominatorTree *DT) { BlockSet userBlocks; for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { MachineBasicBlock *UserBB = UseMI.getParent(); @@ -2073,8 +2055,8 @@ MachineBasicBlock *FindTargetBlock(unsigned Reg, MachineBasicBlock *FromBB, } if (userBlocks.empty()) return nullptr; - MachineBasicBlock *userBlock = nearest_common_dominator(pDT, userBlocks); - if (!pDT->dominates(FromBB, userBlock)) { + MachineBasicBlock *userBlock = NearestCommonDominator(DT, userBlocks); + if (!DT->dominates(FromBB, userBlock)) { return nullptr; } if (userBlock == FromBB) @@ -2083,7 +2065,7 @@ MachineBasicBlock *FindTargetBlock(unsigned Reg, MachineBasicBlock *FromBB, } void ApplySubExpMoveNearUser(SubExp &Exp, const MachineRegisterInfo &MRI, - MachineDominatorTree *pDT, + MachineDominatorTree *DT, SlotIndexes *slotIndexes, const SIInstrInfo *SIII, const SIRegisterInfo *SIRI) { // Move from bottom. @@ -2094,7 +2076,7 @@ void ApplySubExpMoveNearUser(SubExp &Exp, const MachineRegisterInfo &MRI, continue; unsigned Reg = DefMI->getOperand(0).getReg(); - MachineBasicBlock *ToBB = FindTargetBlock(Reg, FromBB, MRI, pDT); + MachineBasicBlock *ToBB = FindTargetBlock(Reg, FromBB, MRI, DT); if (!ToBB) continue; @@ -2118,7 +2100,7 @@ void ApplySubExpMoveNearUser(SubExp &Exp, const MachineRegisterInfo &MRI, } void ApplySubExpMoveNearDefine(SubExp &Exp, MachineRegisterInfo &MRI, - MachineDominatorTree *pDT, + MachineDominatorTree *DT, SlotIndexes *slotIndexes, const SIInstrInfo *SIII, const SIRegisterInfo *SIRI) { @@ -2172,18 +2154,18 @@ DenseSet buildCloneSet(ExpDag &dag, continue; MachineInstr *MI = SU.getInstr(); if (dagBottoms.find(&SU) != dagBottoms.end()) { - bool bUsed = false; + bool IsUsed = false; // For bottom SU, if in usedOutput, add to copySet; for (MachineOperand &DefMO : MI->defs()) { if (!DefMO.isReg()) continue; unsigned Reg = DefMO.getReg(); if (usedOutput.count(Reg) > 0) { - bUsed = true; + IsUsed = true; break; } } - if (bUsed) { + if (IsUsed) { copySet.insert(MI); continue; } @@ -2192,16 +2174,16 @@ DenseSet buildCloneSet(ExpDag &dag, } // If any SuccNode is in copySet, add to copySet. - bool bSuccCopied = false; + bool IsSuccCopied = false; for (SDep &SucDep : SU.Succs) { SUnit *SucSU = SucDep.getSUnit(); MachineInstr *SuccMI = SucSU->getInstr(); if (copySet.count(SuccMI) > 0) { - bSuccCopied = true; + IsSuccCopied = true; break; } } - if (bSuccCopied) + if (IsSuccCopied) copySet.insert(MI); } return copySet; @@ -2237,7 +2219,7 @@ DenseMap reduceClonedMBBs( SubExp &Exp, MapVector> &userBlocks, DenseMap &userBlocksLiveRegs, - std::vector &hotBlocks, MachineDominatorTree *pDT) { + std::vector &hotBlocks, MachineDominatorTree *DT) { // Collect hot blocks which Exp is live in. 
DenseSet hotBlockSet; for (HotBlock &hotBlock : hotBlocks) { @@ -2260,22 +2242,22 @@ DenseMap reduceClonedMBBs( if (hotBlockSet.count(MBB)) continue; - bool bDomAllHotBlocks = true; - bool bDomedByAllHotBlocks = true; + bool IsDomAllHotBlocks = true; + bool IsDomedByAllHotBlocks = true; for (MachineBasicBlock *hotMBB : hotBlockSet) { - if (!pDT->dominates(MBB, hotMBB)) { - bDomAllHotBlocks = false; + if (!DT->dominates(MBB, hotMBB)) { + IsDomAllHotBlocks = false; } - if (!pDT->dominates(hotMBB, MBB)) { - bDomedByAllHotBlocks = false; + if (!DT->dominates(hotMBB, MBB)) { + IsDomedByAllHotBlocks = false; } - if (!bDomAllHotBlocks && !bDomedByAllHotBlocks) { + if (!IsDomAllHotBlocks && !IsDomedByAllHotBlocks) { break; } } - if (bDomAllHotBlocks) { + if (IsDomAllHotBlocks) { userBlocks.erase(MBB); - } else if (bDomedByAllHotBlocks) { + } else if (IsDomedByAllHotBlocks) { afterHotRangeMBBs.insert(MBB); } } @@ -2289,7 +2271,7 @@ DenseMap reduceClonedMBBs( MachineBasicBlock *MBB2 = it2; if (MBB == MBB2) continue; - if (pDT->dominates(MBB, MBB2)) { + if (DT->dominates(MBB, MBB2)) { auto &Dom = DomMap[MBB]; Dom.insert(MBB2); auto &Dom2 = DomMap[MBB2]; @@ -2315,7 +2297,7 @@ DenseMap reduceClonedMBBs( } void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, - MachineDominatorTree *pDT, + MachineDominatorTree *DT, MachineRegisterInfo &MRI, SlotIndexes *slotIndexes, const SIInstrInfo *SIII, const SIRegisterInfo *SIRI) { @@ -2341,7 +2323,7 @@ void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, } } // Build dag for SubExp to help remove unused inst when clone. - ExpDag dag(MRI, SIRI, SIII, /*bJoinInput*/ true); + ExpDag dag(MRI, SIRI, SIII, /*IsJoinInput*/ true); dag.build(Exp.inputLive, Exp.outputLive, Exp.SUnits); DenseSet dagBottoms; for (SUnit &SU : dag.SUnits) { @@ -2369,7 +2351,7 @@ void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, // For userBlocks which dominated by all hotBlocks, they could share clones // because once after hot block, the pressure is OK. DenseMap DomMap = - reduceClonedMBBs(Exp, userBlocks, userBlocksLiveRegs, hotBlocks, pDT); + reduceClonedMBBs(Exp, userBlocks, userBlocksLiveRegs, hotBlocks, DT); // Sort to make stable order. std::sort( @@ -2379,7 +2361,7 @@ void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, return it0.first->getNumber() < it1.first->getNumber(); }); - const bool bModifiesScc = Exp.modifiesRegister(AMDGPU::SCC, SIRI); + const bool IsModifiesScc = Exp.modifiesRegister(AMDGPU::SCC, SIRI); // Clone for each userBlocks. Not share clone thru dom tree which cannot help // reg pressure. @@ -2395,7 +2377,7 @@ void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, DenseMap RegMap; auto insertPtr = MBB->getFirstNonPHI(); // If Exp has scc read/write, make sure MBB not have scc in liveins. - if (bModifiesScc && llvm::IsSccLiveAt(MBB, insertPtr)) + if (IsModifiesScc && llvm::IsSccLiveAt(MBB, insertPtr)) continue; MachineFunction *MF = MBB->getParent(); for (auto it = Exp.SUnits.begin(); it != Exp.SUnits.end(); it++) { @@ -2484,7 +2466,7 @@ void ApplySubExpCloneNearUserInBlock( } SlotIndex hotSlot = slotIndexes->getInstructionIndex(*hotMI).getBaseIndex(); - const bool bModifiesScc = Exp.modifiesRegister(AMDGPU::SCC, SIRI); + const bool IsModifiesScc = Exp.modifiesRegister(AMDGPU::SCC, SIRI); for (unsigned Reg : Exp.BottomRegs) { @@ -2504,7 +2486,7 @@ void ApplySubExpCloneNearUserInBlock( continue; // Do not overwrite a live scc. 
- if (bModifiesScc && llvm::IsSccLiveAt(UserBB, &UseMI)) + if (IsModifiesScc && llvm::IsSccLiveAt(UserBB, &UseMI)) continue; useMIs.emplace_back(&UseMI); @@ -2677,7 +2659,7 @@ bool collectPacifist(MachineInstr &MI, return false; } - bool bHasDef = false; + bool IsHasDef = false; for (MachineOperand &MO : MI.defs()) { Register Reg = MO.getReg(); @@ -2688,10 +2670,10 @@ bool collectPacifist(MachineInstr &MI, getInBlockUniqueDef(Reg, MI.getParent(), inputLive, outputLive, MRI)) return false; - bHasDef = true; + IsHasDef = true; } // If no def, it will not increase pressure, don't mark it. - return bHasDef; + return IsHasDef; } static MachineInstr *findFirstAliasingLoadOrStoreInMBB(MachineInstr &MI, @@ -2769,7 +2751,7 @@ bool tryHoldPacifist(MachineBasicBlock &MBB, LiveIntervals *LIS, LLVM_DEBUG(dbgs() << "pacifist end\n"); SlotIndexes *slotIndexes = LIS->getSlotIndexes(); - bool bUpdated = false; + bool IsUpdated = false; // Move pacifist to its first user. // for (MachineInstr *MI : pacifistList) { @@ -2813,10 +2795,10 @@ bool tryHoldPacifist(MachineBasicBlock &MBB, LiveIntervals *LIS, MBB.insert(insertPoint, MI); LIS->handleMove(*MI); - bUpdated = true; + IsUpdated = true; } - return bUpdated; + return IsUpdated; } DenseMap @@ -2862,16 +2844,15 @@ collectUniformVgprs(Remat *Remat, MachineFunction &MF, MachineRegisterInfo &MRI, bool collectVToSCrossHotSpot( MachineBasicBlock &MBB, RematStatus &status, DenseMap &UniformMap, - SmallMapVector &VToSMap, LiveIntervals *LIS, - MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - const SIInstrInfo *SIII) { + SmallMapVector &VToSMap, LiveIntervals *LIS) +{ unsigned VLimit = status.TargetVLimit; unsigned SLimit = status.TargetSLimit; auto &ST = MBB.getParent()->getSubtarget(); GCNDownwardRPTracker Tracker(*LIS); - bool bUpdated = false; + bool IsUpdated = false; const auto inputLive = status.MBBInputLiveMap[&MBB]; Tracker.reset(*MBB.begin(), &inputLive); for (MachineInstr &MI : MBB) { @@ -2901,10 +2882,10 @@ bool collectVToSCrossHotSpot( if (UniformIt == UniformMap.end()) continue; VToSMap[UniformIt->first] = UniformIt->second; - bUpdated = true; + IsUpdated = true; } } - return bUpdated; + return IsUpdated; } // Return true if the user is outside of the def's loop. @@ -2927,8 +2908,7 @@ bool rematUniformVgprToSgpr( for (auto &hotBlock : hotBlocks) { MachineBasicBlock &MBB = *hotBlock.MBB; - collectVToSCrossHotSpot(MBB, status, UniformVgprMap, VToSMap, LIS, MRI, - SIRI, SIII); + collectVToSCrossHotSpot(MBB, status, UniformVgprMap, VToSMap, LIS); } if (VToSMap.empty()) @@ -2969,7 +2949,7 @@ bool rematUniformVgprToSgpr( for (MachineInstr *userMI : userMIs) { const auto &Desc = userMI->getDesc(); - bool bIllegal = false; + bool IsIllegal = false; for (unsigned i = 0; i < userMI->getNumOperands(); i++) { MachineOperand &MO = userMI->getOperand(i); if (!MO.isReg()) @@ -2979,7 +2959,7 @@ bool rematUniformVgprToSgpr( if (MO.getReg() != Reg) continue; if (i >= Desc.getNumOperands()) { - bIllegal = true; + IsIllegal = true; break; } @@ -2997,7 +2977,7 @@ bool rematUniformVgprToSgpr( // consider not have limit on reg class. 
} } - if (bIllegal) + if (IsIllegal) continue; auto rit = userMI->getReverseIterator(); @@ -3084,7 +3064,7 @@ bool collectRematableHotReg( } bool tryRemat(MachineBasicBlock &MBB, MachineInstr *hotMI, - std::vector &inBlockCloneSubExps, bool bVGPR, + std::vector &inBlockCloneSubExps, bool IsVGPR, const GCNRPTracker::LiveRegSet &inputLive, const GCNRPTracker::LiveRegSet &outputLive, DenseSet &hotSet, int vDistance, int sDistance, @@ -3138,7 +3118,7 @@ bool tryRemat(MachineBasicBlock &MBB, MachineInstr *hotMI, // If the def reg is in hot reg. // Add to output. if (hotLive.find(DefReg) != hotLive.end()) { - bool bUserIsHot = false; + bool IsUserIsHot = false; for (MachineInstr &UseMI : MRI.use_nodbg_instructions(DefReg)) { if (UseMI.getParent() != &MBB) continue; @@ -3148,12 +3128,12 @@ bool tryRemat(MachineBasicBlock &MBB, MachineInstr *hotMI, const auto &useSI = LIS->getInstructionIndex(UseMI).getBaseIndex(); // When has a hot user after hotMI, remat it may not help. if (useSI > SI) { - bUserIsHot = true; + IsUserIsHot = true; break; } } - if (bUserIsHot) + if (IsUserIsHot) continue; outputSet[DefReg]; LLVM_DEBUG(dbgs() << "hotRemat:"); @@ -3174,37 +3154,37 @@ bool tryRemat(MachineBasicBlock &MBB, MachineInstr *hotMI, // Build SubExp with pureHotRematList as Nodes, hotLive as input // rematHot as output. // Not join input when build ExpDag to get small subExps. - ExpDag dag(MRI, SIRI, SIII, /*bJoinInput*/ false); + ExpDag dag(MRI, SIRI, SIII, /*IsJoinInput*/ false); dag.build(hotLive, outputSet, pureHotRematList); // Find best subExp add to inBlockCloneSubExps. // Sort by size of subExp. std::sort(dag.SubExps.begin(), dag.SubExps.end(), - [](const SubExp &a, const SubExp &b) { - return a.SUnits.size() < b.SUnits.size(); + [](const SubExp &A, const SubExp &B) { + return A.SUnits.size() < B.SUnits.size(); }); std::vector cloneSubExps; - int distance = bVGPR ? vDistance : sDistance; + int distance = IsVGPR ? vDistance : sDistance; for (SubExp &subExp : dag.SubExps) { - if (subExp.bNotSafeToCopy) + if (subExp.IsNotSafeToCopy) continue; - if (bVGPR) { + if (IsVGPR) { if (subExp.vOutputSize == 0) continue; } else { if (subExp.sOutputSize == 0) continue; } - if (!subExp.isSafeToMove(MRI, /*bMoveUp*/ false)) + if (!subExp.isSafeToMove(MRI, /*IsMoveUp*/ false)) continue; - // Not clone big subExp. + // Do not clone a big subExp. if (subExp.SUnits.size() > 10) continue; // Do not allow remat in the block when the expression has a memory op and // the block has a write. We could allow this in some cases with better // analysis. - if (subExp.bHasMemInst && MemWriteMBBSet.count(&MBB)) + if (subExp.IsHasMemInst && MemWriteMBBSet.count(&MBB)) continue; - if (bVGPR) { + if (IsVGPR) { distance -= subExp.vOutputSize; } else { distance -= subExp.sOutputSize; @@ -3282,7 +3262,7 @@ bool tryRematInHotSpot( if (vDistance > 0 && hotVMI) { // Use hotVMI when apply. inBlockHotSInstMap[&MBB] = nullptr; - if (tryRemat(MBB, hotVMI, inBlockCloneSubExps, /*bVGPR*/ true, inputLive, + if (tryRemat(MBB, hotVMI, inBlockCloneSubExps, /*IsVGPR*/ true, inputLive, outputLive, hotSet, vDistance, sDistance, VLimit, SLimit, status.MemWriteMBBSet, LIS, MRI, SIRI, SIII)) return true; @@ -3292,7 +3272,7 @@ bool tryRematInHotSpot( // Use hotSMI when apply.
inBlockHotSInstMap[&MBB] = hotSMI; inBlockHotVInstMap[&MBB] = nullptr; - return tryRemat(MBB, hotSMI, inBlockCloneSubExps, /*bVGPR*/ false, + return tryRemat(MBB, hotSMI, inBlockCloneSubExps, /*IsVGPR*/ false, inputLive, outputLive, hotSet, vDistance, sDistance, VLimit, SLimit, status.MemWriteMBBSet, LIS, MRI, SIRI, SIII); } @@ -3308,7 +3288,7 @@ void sortSubExpCandidates(std::vector &subExpCandidates) { struct SortNode { SubExp Exp; unsigned Depth; - bool bDepthDirty; + bool IsDepthDirty; SmallDenseSet Preds; SmallDenseSet Succs; }; @@ -3342,10 +3322,10 @@ void sortSubExpCandidates(std::vector &subExpCandidates) { auto &outExps = outIt->second; for (SubExp *inExp : inExps) { for (SubExp *outExp : outExps) { - if (inExp->bHoist != outExp->bHoist) { + if (inExp->IsHoist != outExp->IsHoist) { // Different direction. // If output (def) move up, input(use) move down, nothing happens. - if (outExp->bHoist) + if (outExp->IsHoist) continue; // Canot input(use) move up, output(def) move down. // Choose the exp which save more. @@ -3359,7 +3339,7 @@ void sortSubExpCandidates(std::vector &subExpCandidates) { continue; } // Link outExp to inExp. - if (inExp->bHoist) { + if (inExp->IsHoist) { sortMap[outExp].Preds.insert(inExp); sortMap[inExp].Succs.insert(outExp); } else { @@ -3378,8 +3358,8 @@ void sortSubExpCandidates(std::vector &subExpCandidates) { SortNode &Node = sortMap[&Exp]; Node.Depth = 0; Node.Exp = Exp; - Node.bDepthDirty = !Node.Preds.empty(); - if (!Node.bDepthDirty) + Node.IsDepthDirty = !Node.Preds.empty(); + if (!Node.IsDepthDirty) WorkList.emplace_back(&Exp); } // Calc depth. @@ -3389,16 +3369,16 @@ void sortSubExpCandidates(std::vector &subExpCandidates) { for (SubExp *Succ : Node.Succs) { SortNode &SuccNode = sortMap[Succ]; SuccNode.Depth = std::max(SuccNode.Depth, Node.Depth + 1); - bool bAllPrevClean = true; + bool IsAllPrevClean = true; for (SubExp *Prev : SuccNode.Preds) { SortNode &PrevNode = sortMap[Prev]; - if (PrevNode.bDepthDirty) { - bAllPrevClean = false; + if (PrevNode.IsDepthDirty) { + IsAllPrevClean = false; break; } } - if (bAllPrevClean) { - SuccNode.bDepthDirty = false; + if (IsAllPrevClean) { + SuccNode.IsDepthDirty = false; WorkList.push_back(Succ); } } @@ -3435,12 +3415,12 @@ bool pressureHigher(unsigned maxV0, unsigned maxS0, unsigned maxV1, unsigned STgtOcc1 = ST->getOccupancyWithNumSGPRs(maxS1); unsigned Occ0 = std::min(VTgtOcc0, STgtOcc0); unsigned Occ1 = std::min(VTgtOcc1, STgtOcc1); - // big occupancy is low pressure. + // Big occupancy means low pressure. if (Occ0 > Occ1) return false; if (Occ0 < Occ1) return true; - // When sgpr bound, big sgpr is high pressure. + // When sgpr bound, a bigger sgpr count means higher pressure.
if (VTgtOcc0 > STgtOcc0 && VTgtOcc1 > STgtOcc1) { return maxS0 > maxS1; } @@ -3453,9 +3433,9 @@ bool canHelpPressureWhenSink( SubExp &subExp, const GCNRPTracker::LiveRegSet &passThrus, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, const MachineLoopInfo *MLI, - MachineDominatorTree *pDT, bool bCanClone, bool bSgprBound) { + MachineDominatorTree *DT, bool IsCanClone, bool IsSgprBound) { LLVM_DEBUG(subExp.dump(MRI, SIRI)); - if (!subExp.isSafeToMove(MRI, /*bMoveUp*/ false)) + if (!subExp.isSafeToMove(MRI, /*IsMoveUp*/ false)) return false; // Update input size to ignore lives in which already in @@ -3475,7 +3455,7 @@ bool canHelpPressureWhenSink( if (subExp.vInputSize > subExp.vOutputSize) return false; - if (subExp.sInputSize > subExp.sOutputSize && bSgprBound) + if (subExp.sInputSize > subExp.sOutputSize && IsSgprBound) return false; if (subExp.sInputSize >= subExp.sOutputSize && @@ -3496,20 +3476,20 @@ bool canHelpPressureWhenSink( } if (userBlocks.empty()) return false; - MachineBasicBlock *userBlock = nearest_common_dominator(pDT, userBlocks); - if (!pDT->dominates(subExp.FromBB, userBlock)) { + MachineBasicBlock *userBlock = NearestCommonDominator(DT, userBlocks); + if (!DT->dominates(subExp.FromBB, userBlock)) { return false; } if (userBlock == subExp.FromBB && // When allow clone, could go clone path if cannot move subExp. - !bCanClone) + !IsCanClone) return false; subExp.ToBB = userBlock; if (auto *toLoop = MLI->getLoopFor(userBlock)) { auto *fromLoop = MLI->getLoopFor(subExp.FromBB); if (!fromLoop || fromLoop->getLoopDepth() < toLoop->getLoopDepth()) - subExp.bMoveIntoLoop = true; + subExp.IsMoveIntoLoop = true; } else if (auto *fromLoop = MLI->getLoopFor(subExp.FromBB)) { auto *toLoop = MLI->getLoopFor(userBlock); // not safe to move out of loop. @@ -3523,12 +3503,12 @@ bool canHelpPressureWhenSink( bool canHelpPressureWhenHoist(SubExp &subExp, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, - const MachineLoopInfo *MLI, bool bSgprBound) { - if (!subExp.isSafeToMove(MRI, /*bMoveUp*/ true)) + const MachineLoopInfo *MLI, bool IsSgprBound) { + if (!subExp.isSafeToMove(MRI, /*IsMoveUp*/ true)) return false; if (subExp.vInputSize < subExp.vOutputSize) return false; - if (subExp.sInputSize < subExp.sOutputSize && bSgprBound) + if (subExp.sInputSize < subExp.sOutputSize && IsSgprBound) return false; if (subExp.sInputSize <= subExp.sOutputSize && @@ -3584,7 +3564,7 @@ groupPassThruByDefBlock(Remat *Remat, const GCNRPTracker::LiveRegSet &passThrus, LLVM_DEBUG(print_vreg(Reg, MRI)); LLVM_DEBUG(if (SIRI->isSGPRReg(MRI, Reg)) dbgs() << " sgpr "; else dbgs() << " vgpr ";); - if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*bSink*/ true)) { + if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*IsSink*/ true)) { LLVM_DEBUG(dbgs() << " is not safe\n"); continue; } @@ -3631,7 +3611,7 @@ collectPassThrus(MachineBasicBlock *MBB, const GCNRPTracker::LiveRegSet &outputLive, const GCNRPTracker::LiveRegSet &usedPassThrus, const GCNRPTracker::LiveRegSet &liveRegCandidates, - MachineRegisterInfo &MRI, bool bCanClone) { + MachineRegisterInfo &MRI, bool IsCanClone) { GCNRPTracker::LiveRegSet passThrus; llvm::mergeLiveRegSet(passThrus, inputLive); llvm::andLiveRegSet(passThrus, outputLive); @@ -3655,17 +3635,17 @@ collectPassThrus(MachineBasicBlock *MBB, } DenseSet UseMBBs; // Allow use for pass thru if clone is OK. 
- if (!bCanClone) { + if (!IsCanClone) { for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { MachineBasicBlock *UserMBB = UseMI.getParent(); UseMBBs.insert(UserMBB); } } - bool bW = DefMBBs.count(MBB) > 0; - bool bR = UseMBBs.count(MBB) > 0; + bool IsW = DefMBBs.count(MBB) > 0; + bool IsR = UseMBBs.count(MBB) > 0; - bool bPassThru = !bW && !bR; - if (!bPassThru) + bool IsPassThru = !IsW && !IsR; + if (!IsPassThru) passThrus.erase(Reg); } return passThrus; @@ -3682,7 +3662,7 @@ SubExp buildFreeSubExp(Remat *Remat, SubExp &subExp, SmallDenseSet freeInstUseRegs; SmallVector freeInsts; for (MachineInstr *MI : subExp.SUnits) { - bool bIsFree = true; + bool IsFree = true; // Check all use regs are free. for (MachineOperand &MO : MI->uses()) { if (!MO.isReg()) @@ -3691,7 +3671,7 @@ SubExp buildFreeSubExp(Remat *Remat, SubExp &subExp, if (MO.isImplicit() && Reg == AMDGPU::EXEC) continue; if (MRI.getUniqueVRegDef(Reg) == nullptr) { - bIsFree = false; + IsFree = false; break; } // Skip local pass thrus unless it is free. @@ -3699,18 +3679,18 @@ SubExp buildFreeSubExp(Remat *Remat, SubExp &subExp, continue; if (freeRegs.count(Reg)) continue; - bIsFree = false; + IsFree = false; break; } // Check def is unique. for (MachineOperand &MO : MI->defs()) { unsigned Reg = MO.getReg(); if (MRI.getUniqueVRegDef(Reg) == nullptr) { - bIsFree = false; + IsFree = false; break; } } - if (!bIsFree) + if (!IsFree) continue; // Save inst as free inst. freeInsts.emplace_back(MI); @@ -3730,20 +3710,20 @@ SubExp buildFreeSubExp(Remat *Remat, SubExp &subExp, } // Then remove local inst has no output use. for (MachineInstr *MI : freeInsts) { - bool bIsFreeUsed = false; + bool IsFreeUsed = false; for (MachineOperand &MO : MI->defs()) { unsigned Reg = MO.getReg(); // Used as freeInst or output. - bIsFreeUsed |= + IsFreeUsed |= freeInstUseRegs.count(Reg) > 0 || subExp.BottomRegs.count(Reg); } - if (!bIsFreeUsed) + if (!IsFreeUsed) continue; freeExp.SUnits.emplace_back(MI); } if (freeExp.SUnits.empty()) { // mark has terminator to make it unsafe. - freeExp.bHasTerminatorInst = true; + freeExp.IsHasTerminatorInst = true; return freeExp; } // Build BottomRegs and TopRegs for freeExp. @@ -3760,7 +3740,7 @@ SubExp buildFreeSubExp(Remat *Remat, SubExp &subExp, freeExp.FromBB = subExp.FromBB; freeExp.ToBB = subExp.ToBB; // must be clone since is partial of subExp. - freeExp.bCloneOnly = true; + freeExp.IsCloneOnly = true; // Calc reg for freeExp. for (unsigned Reg : freeExp.TopRegs) { @@ -3785,10 +3765,10 @@ std::vector buildSubExpCandidates( GCNRPTracker::LiveRegSet &passThrus, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, const MachineLoopInfo *MLI, SlotIndexes *slotIndexes, - MachineDominatorTree *pDT, bool bCanClone, bool bSgprBound, + MachineDominatorTree *DT, bool IsCanClone, bool IsSgprBound, GCNRPTracker::LiveRegSet &unUsedPassThrus, DenseSet &MemWriteMBBSet, - bool bAllowPartialUseInSubExp) { + bool AllowPartialUseInSubExp) { std::vector subExpCandidates; // Build exp dag on define blocks. // Save profit candidates into list. @@ -3799,40 +3779,40 @@ std::vector buildSubExpCandidates( // Go up on the dag until reach share node. auto subExps = buildSubExpFromCandidates( Remat, DefInMBB, DefMBB, SIRI, SIII, MRI, slotIndexes, unUsedPassThrus, - bAllowPartialUseInSubExp); + AllowPartialUseInSubExp); for (SubExp &subExp : subExps) { - if (subExp.bHasMemInst) { + if (subExp.IsHasMemInst) { // Skip when memory ld/st inst need to cross MBB which write memory. 
// TODO: check all MBBs in between FromBB and ToBB not write memory. // Currently just skip when any memory write exist. if (!MemWriteMBBSet.empty()) { MachineBasicBlock *FromBB = subExp.FromBB; MachineBasicBlock *ToBB = subExp.ToBB; - if (subExp.bHoist) { + if (subExp.IsHoist) { FromBB = subExp.ToBB; ToBB = subExp.FromBB; } - bool bCrossMemWriteMBB = false; + bool IsCrossMemWriteMBB = false; for (MachineBasicBlock *MemMBB : MemWriteMBBSet) { - if (pDT->dominates(ToBB, MemMBB)) + if (DT->dominates(ToBB, MemMBB)) continue; - if (pDT->dominates(MemMBB, FromBB)) + if (DT->dominates(MemMBB, FromBB)) continue; - bCrossMemWriteMBB = true; + IsCrossMemWriteMBB = true; break; } - if (bCrossMemWriteMBB) + if (IsCrossMemWriteMBB) continue; } } - if (!canHelpPressureWhenSink(subExp, passThrus, MRI, SIRI, SIII, MLI, pDT, - bCanClone, bSgprBound)) { - if (bAllowPartialUseInSubExp && - subExp.isSafeToMove(MRI, /*bMoveUp*/ false)) { + if (!canHelpPressureWhenSink(subExp, passThrus, MRI, SIRI, SIII, MLI, DT, + IsCanClone, IsSgprBound)) { + if (AllowPartialUseInSubExp && + subExp.isSafeToMove(MRI, /*IsMoveUp*/ false)) { SubExp freeSubExp = buildFreeSubExp(Remat, subExp, passThrus, MRI, SIRI); if (canHelpPressureWhenSink(freeSubExp, passThrus, MRI, SIRI, SIII, - MLI, pDT, bCanClone, bSgprBound)) { + MLI, DT, IsCanClone, IsSgprBound)) { subExpCandidates.emplace_back(freeSubExp); } } @@ -3848,28 +3828,28 @@ std::vector buildSubExpCandidates( std::pair calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates, GCNRPTracker::LiveRegSet &inputLive, - GCNRPTracker::LiveRegSet &outputLive, bool bVOutBound, - bool bSOutBound, bool bCanClone, MachineDominatorTree *pDT, + GCNRPTracker::LiveRegSet &outputLive, bool IsVOutBound, + bool IsSOutBound, bool IsCanClone, MachineDominatorTree *DT, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) { int vgpr = 0; int sgpr = 0; MachineBasicBlock *MBB = hotBB.MBB; // Sink saving. for (SubExp &Exp : subExpCandidates) { - if (Exp.bHoist) { + if (Exp.IsHoist) { // ToMBB -> MBB -> FromMBB. // If ToMBB not dom hot block, reg will not live in MBB. - if (!pDT->dominates(Exp.ToBB, MBB)) + if (!DT->dominates(Exp.ToBB, MBB)) continue; } else { // If FromBB not dom hot block, reg will not live in MBB. - if (!pDT->dominates(Exp.FromBB, MBB)) + if (!DT->dominates(Exp.FromBB, MBB)) continue; // When subExp is from hotBB, check output instead of input. if (Exp.FromBB == MBB) { - if (bVOutBound && Exp.vOutputSize < Exp.vInputSize) + if (IsVOutBound && Exp.vOutputSize < Exp.vInputSize) continue; - if (bSOutBound && Exp.sOutputSize < Exp.sInputSize) + if (IsSOutBound && Exp.sOutputSize < Exp.sInputSize) continue; vgpr += Exp.vInputSize; vgpr -= Exp.vOutputSize; @@ -3884,18 +3864,18 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates, // If subExp is to hotBB, it is crossing output instead of input. GCNRPTracker::LiveRegSet &crossLive = MBB == ToMBB ? outputLive : inputLive; - bool bClone = false; + bool IsClone = false; GCNRPTracker::LiveRegSet newInput; - if (!Exp.bMoveIntoLoop) { - if (Exp.bHoist) { + if (!Exp.IsMoveIntoLoop) { + if (Exp.IsHoist) { // If FromBB dom hot block, it will not change live for MBB. - if (Exp.FromBB != MBB && pDT->dominates(Exp.FromBB, MBB)) + if (Exp.FromBB != MBB && DT->dominates(Exp.FromBB, MBB)) continue; } else { // If ToBB dom hot block, it will not change live for MBB. 
- if (ToMBB != MBB && pDT->dominates(ToMBB, MBB)) {
- if (bCanClone && !Exp.bNotSafeToCopy) {
- bClone = true;
+ if (ToMBB != MBB && DT->dominates(ToMBB, MBB)) {
+ if (IsCanClone && !Exp.IsNotSafeToCopy) {
+ IsClone = true;
 } else {
 continue;
 }
@@ -3909,27 +3889,27 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates,
 if (crossLive.find(Reg) != crossLive.end())
 MBBBeginMask = crossLive[Reg];
 // Check mask which live in both BeginSlot and exp output when sink to
- // kill the output. Check mask which not live in BeginSlot but live in
+ // kill the output. Check mask which not live in BeginSlot but live in
 // exp output when hoist to live the output.
- LaneBitmask profitMask =
- Exp.bHoist ? (outMask & (~MBBBeginMask)) : (outMask & MBBBeginMask);
+ LaneBitmask profitMask = Exp.IsHoist ? (outMask & (~MBBBeginMask))
+ : (outMask & MBBBeginMask);
 if (MBBBeginMask.any()) {
 unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI);
 LLVM_DEBUG(std::string movStr =
- Exp.bHoist ? "output hoist:" : "output sink:";
+ Exp.IsHoist ? "output hoist:" : "output sink:";
 dbgs() << movStr << Register::virtReg2Index(Reg) << " " << Size);
 // Exp out live at block input.
 // It will descrease live for MBB when sink and increase when hoist.
 if (SIRI->isVGPR(MRI, Reg)) {
 LLVM_DEBUG(dbgs() << "v\n");
- if (Exp.bHoist)
+ if (Exp.IsHoist)
 vgprDiff += Size;
 else
 vgprDiff -= Size;
 } else {
 LLVM_DEBUG(dbgs() << "s\n");
- if (Exp.bHoist)
+ if (Exp.IsHoist)
 sgprDiff += Size;
 else
 sgprDiff -= Size;
@@ -3943,11 +3923,11 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates,
 LaneBitmask MBBBeginMask;
 if (crossLive.find(Reg) != crossLive.end())
 MBBBeginMask = crossLive[Reg];
- // Check mask which not live in BeginSlot but live in exp input when
+ // Check mask which not live in BeginSlot but live in exp input when
 // sink to live the input. Check mask which live in both BeginSlot and
 // exp output when hoist to kill the input.
 LaneBitmask profitMask =
- Exp.bHoist ? (inMask & MBBBeginMask) : (inMask & (~MBBBeginMask));
+ Exp.IsHoist ? (inMask & MBBBeginMask) : (inMask & (~MBBBeginMask));
 if (profitMask.any()) {
 // Update input live to avoid count same input more than once.
 newInput[Reg] |= inMask;
@@ -3956,17 +3936,17 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates,
 unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI);
 LLVM_DEBUG(
- std::string movStr = Exp.bHoist ? "input hoist:" : "input sink:";
+ std::string movStr = Exp.IsHoist ? "input hoist:" : "input sink:";
 dbgs() << movStr << Register::virtReg2Index(Reg) << " " << Size);
 if (SIRI->isVGPR(MRI, Reg)) {
 LLVM_DEBUG(dbgs() << "v\n");
- if (Exp.bHoist)
+ if (Exp.IsHoist)
 vgprDiff -= Size;
 else
 vgprDiff += Size;
 } else {
 LLVM_DEBUG(dbgs() << "s\n");
- if (Exp.bHoist)
+ if (Exp.IsHoist)
 sgprDiff -= Size;
 else
 sgprDiff += Size;
@@ -3981,15 +3961,15 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates,
 // Hoist into loop is not supported now.
for (auto outIt : Exp.outputLive) {
 unsigned Reg = outIt.first;
- bool bDomUser = false;
+ bool IsDomUser = false;
 for (MachineInstr &MI : MRI.use_nodbg_instructions(Reg)) {
 MachineBasicBlock *UserMBB = MI.getParent();
- if (pDT->dominates(MBB, UserMBB)) {
- bDomUser = true;
+ if (DT->dominates(MBB, UserMBB)) {
+ IsDomUser = true;
 break;
 }
 }
- if (bDomUser)
+ if (IsDomUser)
 continue;
 LaneBitmask outMask = outIt.second;
@@ -4019,7 +3999,7 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates,
 LaneBitmask MBBBeginMask;
 if (inputLive.find(Reg) != inputLive.end())
 MBBBeginMask = inputLive[Reg];
- // Check mask which not live in BeginSlot but live in exp input.
+ // Check mask which not live in BeginSlot but live in exp input.
 LaneBitmask profitMask = inMask & (~MBBBeginMask);
 if (profitMask.any()) {
 // Update input live to avoid count same input more than once.
@@ -4041,16 +4021,16 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates,
 }
 }
- if (bVOutBound && vgprDiff > 0)
+ if (IsVOutBound && vgprDiff > 0)
 continue;
- if (bSOutBound && sgprDiff > 0)
+ if (IsSOutBound && sgprDiff > 0)
 continue;
 llvm::mergeLiveRegSet(crossLive, newInput);
 vgpr += vgprDiff;
 sgpr += sgprDiff;
- if (bClone)
- Exp.bCloneOnly = true;
+ if (IsClone)
+ Exp.IsCloneOnly = true;
 }
 return std::make_pair(vgpr, sgpr);
@@ -4062,7 +4042,7 @@ void addExpCandidates(std::vector &subExpCandidates,
 subExpCandidates.insert(subExpCandidates.end(), subExps.begin(),
 subExps.end());
 for (auto &Exp : subExps) {
- if (Exp.bHoist) {
+ if (Exp.IsHoist) {
 for (auto &Reg : Exp.TopRegs) {
 usedRegs[Reg];
 }
@@ -4087,19 +4067,19 @@ bool tryToAddSubExps(
 GCNRPTracker::LiveRegSet &passThrus, GCNRPTracker::LiveRegSet &usedRegs,
 MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
 const SIInstrInfo *SIII, const MachineLoopInfo *MLI,
- SlotIndexes *slotIndexes, LiveIntervals *LIS, MachineDominatorTree *pDT,
- bool bCanClone, bool bVOutBound, bool bSOutBound,
- GCNRPTracker::LiveRegSet &unUsedPassThrus, bool bAllowPartialUseInSubExp) {
+ SlotIndexes *slotIndexes, LiveIntervals *LIS, MachineDominatorTree *DT,
+ bool IsCanClone, bool IsVOutBound, bool IsSOutBound,
+ GCNRPTracker::LiveRegSet &unUsedPassThrus, bool AllowPartialUseInSubExp) {
 std::vector partialSubExps = buildSubExpCandidates(
- Remat, Candidates, passThrus, MRI, SIRI, SIII, MLI, slotIndexes, pDT,
- bCanClone, bSOutBound, unUsedPassThrus, status.MemWriteMBBSet,
- bAllowPartialUseInSubExp);
+ Remat, Candidates, passThrus, MRI, SIRI, SIII, MLI, slotIndexes, DT,
+ IsCanClone, IsSOutBound, unUsedPassThrus, status.MemWriteMBBSet,
+ AllowPartialUseInSubExp);
 GCNRPTracker::LiveRegSet tmpSavingInputLive = savingInputLive;
 GCNRPTracker::LiveRegSet tmpSavingOutputLive = savingOutputLive;
 std::pair curSaving = calculateSaving(
 hotBB, partialSubExps, tmpSavingInputLive, tmpSavingOutputLive,
- bVOutBound, bSOutBound, bCanClone, pDT, MRI, SIRI);
+ IsVOutBound, IsSOutBound, IsCanClone, DT, MRI, SIRI);
 const int VLimit = status.TargetVLimit;
 const int SLimit = status.TargetSLimit;
@@ -4114,7 +4094,7 @@ bool tryToAddSubExps(
 }
 if (EnableSubExpAggressive) {
- // Build candidates from passThrus but not used in partialSubExps.
+ // Build candidates from passThrus but not used in partialSubExps.
 GCNRPTracker::LiveRegSet sinkUsedRegs;
 for (auto &Exp : partialSubExps) {
 for (auto &Reg : Exp.BottomRegs) {
@@ -4130,7 +4110,7 @@ bool tryToAddSubExps(
 if (usedRegs.count(Reg))
 continue;
 // Skip unsafe reg.
- if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*bSink*/ false)) {
+ if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*IsSink*/ false)) {
 LLVM_DEBUG(dbgs() << " is not safe to hoist\n");
 continue;
 }
@@ -4165,16 +4145,17 @@ bool tryToAddSubExps(
 auto subExps = buildSubExpFromCandidatesTopBottom(
 Remat, UseInMBB, UseMBB, SIRI, SIII, MRI, slotIndexes);
 for (SubExp &subExp : subExps) {
- if (!canHelpPressureWhenHoist(subExp, MRI, SIRI, SIII, MLI, bSOutBound))
+ if (!canHelpPressureWhenHoist(subExp, MRI, SIRI, SIII, MLI,
+ IsSOutBound))
 continue;
- subExp.bHoist = true;
+ subExp.IsHoist = true;
 hoistSubExpCandidates.emplace_back(subExp);
 }
 }
 std::pair hoistSaving = calculateSaving(
 hotBB, hoistSubExpCandidates, tmpSavingInputLive, tmpSavingOutputLive,
- bVOutBound, bSOutBound, bCanClone, pDT, MRI, SIRI);
+ IsVOutBound, IsSOutBound, IsCanClone, DT, MRI, SIRI);
 int hoistVgpr = vgpr + hoistSaving.first;
 int hoistSgpr = sgpr + hoistSaving.second;
@@ -4182,7 +4163,7 @@ bool tryToAddSubExps(
 if ((hoistVgpr <= VLimit && hoistSgpr <= SLimit) ||
 // If status not balance, do the remat even cannot reach target.
 // TODO: check the result not help even one occupancy.
- (!hoistSubExpCandidates.empty() && !status.bNotBalance &&
+ (!hoistSubExpCandidates.empty() && !status.NotBalance &&
 TargetOccupancy != 0)) {
 // nrmSubExps can help reach target occupancy, add it to
 // subExpCandidates.
@@ -4195,8 +4176,8 @@ bool tryToAddSubExps(
 if (EnableVmemDegree &&
 // Only expect vmem when last tryToAddSubExps.
- // If not, bAllowPartialUseInSubExp will no chance to be true.
- (bAllowPartialUseInSubExp || !EnableSubExpAggressive)) {
+ // If not, AllowPartialUseInSubExp will have no chance to be true.
+ (AllowPartialUseInSubExp || !EnableSubExpAggressive)) {
 // Assume vmemLdSize could be optimized by not parallel.
 if (((vgpr - hotBB.vmemLdInputSize) <= VLimit ||
 (vgpr - hotBB.vmemLdOutputSize) <= VLimit) &&
@@ -4218,11 +4199,11 @@ bool tryToAddSubExps(
 inBlockHotSInstMap, LIS, MRI, SIRI, SIII)) {
 // return false always when not allow partialUseInSubExp, it will try again
 // with partialUseInSubExp enabled.
- if (!bAllowPartialUseInSubExp)
+ if (!AllowPartialUseInSubExp)
 return false;
 // If status not balance, do the remat even cannot reach target.
 // TODO: check the result not help even one occupancy.
- if (!status.bNotBalance && TargetOccupancy == 0)
+ if (!status.NotBalance && TargetOccupancy == 0)
 return false;
 }
 // nrmSubExps can help reach target occupancy, add it to
@@ -4234,17 +4215,17 @@ bool tryToAddSubExps(
 // Remat passthru regs per hot block.
 // Reason to do it per block is to make sure passthru reuse is precise.
 // If try remat on all hot blocks together, the passthru might be on one block,
-// but the reuse in on another block which the reg is not passthru there.
+// but the reuse is on another block where the reg is not passthru.
 bool perBlockPassthruRemat(Remat *Remat, std::vector &hotBlocks,
 RematStatus &status,
 GCNRPTracker::LiveRegSet &liveRegCandidates,
 const GCNSubtarget *ST, LiveIntervals *LIS,
 const MachineLoopInfo *MLI,
- MachineDominatorTree *pDT, MachineRegisterInfo &MRI,
+ MachineDominatorTree *DT, MachineRegisterInfo &MRI,
 const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
- bool bUpdated = false;
- bool bCanClone = EnableSubExpClone || EnableSubExpAggressive;
+ bool IsUpdated = false;
+ bool IsCanClone = EnableSubExpClone || EnableSubExpAggressive;
 SlotIndexes *slotIndexes = LIS->getSlotIndexes();
 // Sort hot blocks by pressure first.
@@ -4285,15 +4266,15 @@ bool perBlockPassthruRemat(Remat *Remat, std::vector &hotBlocks,
 const int PressureDelta = -1;
 int vgpr = it.maxPressures.first - PressureDelta;
 int sgpr = it.maxPressures.second;
- bool bVOutBound = vgpr > VLimit;
- bool bSOutBound = sgpr > SLimit;
+ bool IsVOutBound = vgpr > VLimit;
+ bool IsSOutBound = sgpr > SLimit;
 // savingInputLive is used to calculate saving which will be modified to
 // avoid count same input multiple times.
 GCNRPTracker::LiveRegSet savingInputLive = inputLive;
 GCNRPTracker::LiveRegSet savingOutputLive = outputLive;
 std::pair curSaving =
 calculateSaving(it, subExpCandidates, savingInputLive, savingOutputLive,
- bVOutBound, bSOutBound, bCanClone, pDT, MRI, SIRI);
+ IsVOutBound, IsSOutBound, IsCanClone, DT, MRI, SIRI);
 vgpr += curSaving.first;
 sgpr += curSaving.second;
@@ -4304,7 +4285,7 @@ bool perBlockPassthruRemat(Remat *Remat, std::vector &hotBlocks,
 // Collect pass thru regs.
 GCNRPTracker::LiveRegSet passThrus =
 collectPassThrus(MBB, inputLive, outputLive, usedPassThrus,
- liveRegCandidates, MRI, bCanClone);
+ liveRegCandidates, MRI, IsCanClone);
 // Group pass thru regs by def MBB.
 SmallVector>
@@ -4314,34 +4295,34 @@ bool perBlockPassthruRemat(Remat *Remat, std::vector &hotBlocks,
 // subExp.
 GCNRPTracker::LiveRegSet unusedPassThrus;
 // Build exp dag on define blocks.
- bool bAllowPartialUseInSubExp = false;
+ bool AllowPartialUseInSubExp = false;
 if (tryToAddSubExps(
 Remat, it, status, subExpCandidates, inBlockCloneSubExps,
 inBlockHotVInstMap, inBlockHotSInstMap, Candidates, vgpr, sgpr,
 savingInputLive, savingOutputLive, passThrus, usedRegs, MRI, SIRI,
- SIII, MLI, slotIndexes, LIS, pDT, bCanClone, bVOutBound, bSOutBound,
- unusedPassThrus, bAllowPartialUseInSubExp)) {
+ SIII, MLI, slotIndexes, LIS, DT, IsCanClone, IsVOutBound,
+ IsSOutBound, unusedPassThrus, AllowPartialUseInSubExp)) {
 // Remove unusedPassThrus from passThrus first.
 llvm::andNotLiveRegSet(passThrus, unusedPassThrus);
 llvm::mergeLiveRegSet(usedPassThrus, passThrus);
 continue;
 }
 // If cannot clone, don't need to try partialUseInSubExp which must clone.
- if (!bCanClone)
+ if (!IsCanClone)
 return false;
- // Partial use subExp may result big alu count caused by clone.
+ // Partial use subExp may result in a big alu count caused by clone.
 // Only try it when enable aggressive remat.
 if (!EnableSubExpAggressive)
 return false;
- bAllowPartialUseInSubExp = true;
+ AllowPartialUseInSubExp = true;
 if (!tryToAddSubExps(
 Remat, it, status, subExpCandidates, inBlockCloneSubExps,
 inBlockHotVInstMap, inBlockHotSInstMap, Candidates, vgpr, sgpr,
 savingInputLive, savingOutputLive, passThrus, usedRegs, MRI, SIRI,
- SIII, MLI, slotIndexes, LIS, pDT, bCanClone, bVOutBound, bSOutBound,
- unusedPassThrus, bAllowPartialUseInSubExp)) {
+ SIII, MLI, slotIndexes, LIS, DT, IsCanClone, IsVOutBound,
+ IsSOutBound, unusedPassThrus, AllowPartialUseInSubExp)) {
 return false;
 }
 // Just merge all passThrus after tryToAddSubExps allow partialUseInSubExp.
@@ -4360,14 +4341,14 @@ bool perBlockPassthruRemat(Remat *Remat, std::vector &hotBlocks, if (Exp.SUnits.empty()) continue; LLVM_DEBUG(Exp.dump(MRI, SIRI)); - if (Exp.bHoist) { - ApplySubExpMoveNearDefine(Exp, MRI, pDT, slotIndexes, SIII, SIRI); + if (Exp.IsHoist) { + ApplySubExpMoveNearDefine(Exp, MRI, DT, slotIndexes, SIII, SIRI); } else { - if (Exp.bCloneOnly) - ApplySubExpCloneNearUser(Exp, hotBlocks, pDT, MRI, slotIndexes, SIII, + if (Exp.IsCloneOnly) + ApplySubExpCloneNearUser(Exp, hotBlocks, DT, MRI, slotIndexes, SIII, SIRI); else - ApplySubExpMoveNearUser(Exp, MRI, pDT, slotIndexes, SIII, SIRI); + ApplySubExpMoveNearUser(Exp, MRI, DT, slotIndexes, SIII, SIRI); } } @@ -4378,10 +4359,10 @@ bool perBlockPassthruRemat(Remat *Remat, std::vector &hotBlocks, } // Try to see possible occupancy could reach, then dicide a target. // Apply remat. - bUpdated = subExpCandidates.size(); + IsUpdated = subExpCandidates.size(); } - return bUpdated; + return IsUpdated; } int getVMemLdSize(MachineBasicBlock &MBB, const SIInstrInfo *SIII, @@ -4389,8 +4370,8 @@ int getVMemLdSize(MachineBasicBlock &MBB, const SIInstrInfo *SIII, int vmemLdSize = 0; // Collect vmemLd when enable split. for (MachineInstr &MI : MBB) { - bool bIsHighLatency = SIII->isHighLatencyInstruction(MI); - if (!bIsHighLatency) + bool IsHighLatency = SIII->isHighLatencyInstruction(MI); + if (!IsHighLatency) continue; if (!(MI.mayLoad() && // Skip case like atomic which not return value. @@ -4408,8 +4389,8 @@ int getVMemLdSize(MachineBasicBlock &MBB, const SIInstrInfo *SIII, } // namespace bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, - LiveIntervals *LIS, MachineDominatorTree *pDT, - MachinePostDominatorTree *pPDT, AliasAnalysis *AA) { + LiveIntervals *LIS, MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, AliasAnalysis *AA) { if (MF.size() < 2) return false; const GCNSubtarget *ST = &MF.getSubtarget(); @@ -4419,7 +4400,7 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, auto &MRI = MF.getRegInfo(); - RematStatus status = GetRematStatus(MF, MLI, LIS, MRI, ST); + RematStatus status = getRematStatus(MF, MLI, LIS, MRI, ST); const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second; if (status.TargetOcc >= MaxOcc) @@ -4431,20 +4412,20 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, int rematVCnt = status.MaxVPressure - VLimit; int rematSCnt = status.MaxSPressure - SLimit; - bool bSGPRSpill = false; + bool IsSGPRSpill = false; if (rematSCnt > 0) { - bSGPRSpill = nearSgprSpill(status.MaxSPressure, ST, MF); + IsSGPRSpill = nearSgprSpill(status.MaxSPressure, ST, MF); } // If bound by lds, skip. if ((status.TargetOcc + 1) > ST->getOccupancyWithWorkGroupSizes(MF).second && - !bSGPRSpill) + !IsSGPRSpill) return false; - bool bBothOutLimit = rematVCnt > 0 && rematSCnt > 0; + bool IsBothOutLimit = rematVCnt > 0 && rematSCnt > 0; // TODO: use check wqm and support vreg remat. - bool bCheckWQM = MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS; - rematVCnt = bCheckWQM & false; + bool IsCheckWQM = MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS; + rematVCnt = IsCheckWQM & false; // Remat on every hot block. 
@@ -4467,8 +4448,8 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, if (tryHoldPacifist(MBB, LIS, MRI, SIRI, SIII, AA, status)) { maxLocalVPressure = 0; maxLocalSPressure = 0; - CollectMBBPressure(MBB, LIS, MRI, ST, maxLocalVPressure, - maxLocalSPressure, status); + collectMBBPressure(MBB, LIS, ST, maxLocalVPressure, maxLocalSPressure, + status); maxLocalSPressure += RegForVCC; } @@ -4476,7 +4457,7 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, continue; // When both vgpr sgpr out limit, only help vgpr. - if (bBothOutLimit && maxLocalVPressure <= VLimit) + if (IsBothOutLimit && maxLocalVPressure <= VLimit) continue; GCNRPTracker::LiveRegSet liveSet; hotBlocks.push_back({&MBB, liveSet, @@ -4513,8 +4494,8 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, continue; if (Pred->empty()) continue; - bool bIsHighLatency = SIII->isHighLatencyInstruction(Pred->back()); - if (!bIsHighLatency) + bool IsHighLatency = SIII->isHighLatencyInstruction(Pred->back()); + if (!IsHighLatency) continue; int vmemLdSize = getVMemLdSize(*Pred, SIII, SIRI, MRI); it.vmemLdInputSize = vmemLdSize; @@ -4527,14 +4508,14 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, hotBlocks, LIS, MRI, SIRI, SIII, MLI)) { // Rebuild LIS. LIS->reanalyze(MF); - status = GetRematStatus(MF, MLI, LIS, MRI, ST); - bool bSgprSpilled = nearSgprSpill(status.MaxSPressure, ST, MF); - if (bSgprSpilled) { - bool bNearTarget = false; - hotBlockRemat(Remat, MF, MLI, LIS, pDT, pPDT, bNearTarget); + status = getRematStatus(MF, MLI, LIS, MRI, ST); + bool IsSgprSpilled = nearSgprSpill(status.MaxSPressure, ST, MF); + if (IsSgprSpilled) { + bool IsNearTarget = false; + hotBlockRemat(Remat, MF, MLI, LIS, DT, PDT, IsNearTarget); // Rebuild LIS. LIS->reanalyze(MF); - status = GetRematStatus(MF, MLI, LIS, MRI, ST); + status = getRematStatus(MF, MLI, LIS, MRI, ST); } for (auto &it : hotBlocks) { @@ -4586,11 +4567,11 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, } } - bool bUpdated = + bool IsUpdated = perBlockPassthruRemat(Remat, hotBlocks, status, liveRegCandidates, ST, - LIS, MLI, pDT, MRI, SIRI, SIII); + LIS, MLI, DT, MRI, SIRI, SIII); - return bUpdated; + return IsUpdated; } bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) { @@ -4623,21 +4604,21 @@ bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) { // LLVM_DEBUG(pressure::write_pressure(MF, LIS, R"(D:\Temp\d.json)")); // For non-cs/ps, set target occ as 4. - bool bNearTarget = false; - bool bFinalUpdated = false; - bool bUpdated = hotBlockRemat(this, MF, MLI, LIS, DT, PDT, bNearTarget); - bFinalUpdated |= bUpdated; + bool IsNearTarget = false; + bool IsFinalUpdated = false; + bool IsUpdated = hotBlockRemat(this, MF, MLI, LIS, DT, PDT, IsNearTarget); + IsFinalUpdated |= IsUpdated; if (EnableSubExp) { - if (bUpdated) { + if (IsUpdated) { // Rebuild LIS. 
LIS->reanalyze(MF); } - bUpdated = GroupRemat(this, MF, MLI, LIS, DT, PDT, AA); + IsUpdated = GroupRemat(this, MF, MLI, LIS, DT, PDT, AA); - bFinalUpdated |= bUpdated; + IsFinalUpdated |= IsUpdated; } - return bFinalUpdated; + return IsFinalUpdated; } INITIALIZE_PASS_BEGIN(AMDGPUHotBlockRematerialize, DEBUG_TYPE, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp index b133659d8fb66..be24bfce2851c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp @@ -102,7 +102,7 @@ void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI, Register Reg = MO.getReg(); if (!Reg.isVirtual()) { if (Reg == AMDGPU::SCC) - bTouchSCC = true; + IsTouchSCC = true; continue; } @@ -132,12 +132,12 @@ void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI, } } -bool SubExp::isSafeToMove(const MachineRegisterInfo &MRI, bool bMoveUp) const { - if (bMultiDefOutput) +bool SubExp::isSafeToMove(const MachineRegisterInfo &MRI, bool IsMoveUp) const { + if (IsMultiDefOutput) return false; - if (bHasTerminatorInst) + if (IsHasTerminatorInst) return false; - if (bUseIncomingReg) + if (IsUseIncomingReg) return false; // Input should be single def. @@ -150,8 +150,8 @@ bool SubExp::isSafeToMove(const MachineRegisterInfo &MRI, bool bMoveUp) const { ExpDag::ExpDag(const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI, const SIInstrInfo *SIII, - const bool bJoinInput) - : MRI(MRI), SIRI(SIRI), SIII(SIII), bJoinInputToSubExp(bJoinInput) {} + const bool IsJoinInput) + : MRI(MRI), SIRI(SIRI), SIII(SIII), IsJoinInputToSubExp(IsJoinInput) {} template void ExpDag::initNodes(const LiveSet &InputLiveReg, T &insts) { @@ -209,12 +209,12 @@ void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, passThruInputs.emplace_back(SU.NodeNum); continue; } - if (!bJoinInputToSubExp && !SU.isInstr()) + if (!IsJoinInputToSubExp && !SU.isInstr()) continue; // Join prev. for (SDep &PreDep : SU.Preds) { SUnit *PreSU = PreDep.getSUnit(); - if (!bJoinInputToSubExp && !PreSU->isInstr()) + if (!IsJoinInputToSubExp && !PreSU->isInstr()) continue; SubtreeClasses.join(SU.NodeNum, PreSU->NodeNum); } @@ -266,7 +266,7 @@ void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, continue; unsigned Reg = MO.getReg(); if (MRI.getLiveInPhysReg(Reg) || MRI.getLiveInVirtReg(Reg)) { - Exp.bUseIncomingReg = true; + Exp.IsUseIncomingReg = true; } } @@ -274,13 +274,13 @@ void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, if (SU.NumSuccsLeft == 0) { Exp.BottomRoots.insert(MI); if (MI->isTerminator()) - Exp.bHasTerminatorInst = true; + Exp.IsHasTerminatorInst = true; } if (MI->isNotDuplicable()) - Exp.bNotSafeToCopy = true; + Exp.IsNotSafeToCopy = true; // Skip Scalar mem access since no scalar store. if (MI->mayLoadOrStore() && !SIII->isSMRD(*MI)) { - Exp.bHasMemInst = true; + Exp.IsHasMemInst = true; } // Add bottom regs. for (MachineOperand &MO : MI->operands()) { @@ -295,16 +295,16 @@ void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, if (SU.NumSuccsLeft) { // For SU which has used in current blk. // Check if used in other blks or subExps. - bool bUsedInOtherBlk = false; + bool IsUsedInOtherBlk = false; for (auto &UserMI : MRI.use_nodbg_instructions(Reg)) { if (UserMI.getParent() != MBB) { - bUsedInOtherBlk = true; + IsUsedInOtherBlk = true; break; } auto suIt = MISUnitMap.find(&UserMI); // When UserMI is not in dag, treat it as other block. 
if (suIt == MISUnitMap.end()) { - bUsedInOtherBlk = true; + IsUsedInOtherBlk = true; break; } SUnit *UseSU = suIt->second; @@ -318,12 +318,12 @@ void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, break; } } - if (!bUsedInOtherBlk) + if (!IsUsedInOtherBlk) continue; } Exp.BottomRegs.insert(Reg); if (!MRI.getUniqueVRegDef(Reg)) { - Exp.bMultiDefOutput = true; + Exp.IsMultiDefOutput = true; } } } @@ -435,7 +435,7 @@ BlockExpDag::BlockExpDag(llvm::MachineBasicBlock *B, llvm::LiveIntervals *LIS, const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI, const llvm::SIInstrInfo *SIII) - : ExpDag(MRI, SIRI, SIII, /*bJoinInput*/ true), LIS(LIS), MBB(B) {} + : ExpDag(MRI, SIRI, SIII, /*IsJoinInput*/ true), LIS(LIS), MBB(B) {} void BlockExpDag::build() { auto *SlotIndexes = LIS->getSlotIndexes(); @@ -503,7 +503,7 @@ void BlockExpDag::buildAvail(const LiveSet &passThruSet, } } while (!WorkList.empty()) { - bool bUpdated = false; + bool IsUpdated = false; SmallVector ReadyNodes; for (SUnit *SU : WorkList) { if (SU->NumPredsLeft > 0) @@ -511,7 +511,7 @@ void BlockExpDag::buildAvail(const LiveSet &passThruSet, ReadyNodes.emplace_back(SU); // Ready, move it to Processed. Processed.insert(SU); - bUpdated = true; + IsUpdated = true; // Only update 1 node once. // Order of schedle here should not affect pressure. break; @@ -613,7 +613,7 @@ void BlockExpDag::buildPressure(const LiveSet &StartLiveReg, } while (!WorkList.empty()) { - bool bUpdated = false; + bool IsUpdated = false; SmallVector ReadyNodes; for (SUnit *SU : WorkList) { if (SU->NumSuccsLeft > 0) @@ -621,7 +621,7 @@ void BlockExpDag::buildPressure(const LiveSet &StartLiveReg, ReadyNodes.emplace_back(SU); // Ready, move it to Processed. Processed.insert(SU); - bUpdated = true; + IsUpdated = true; // Only update 1 node once. // Order of schedle here should not affect pressure. break; @@ -977,7 +977,7 @@ void HRB::buildLinear(std::vector &SUnits) { continue; if (ChainedNodes.count(SU) > 0) continue; - bRecomputeHeight = false; + IsRecomputeHeight = false; Lineage lineage = buildChain(SU, SUnits); // Remove chained nodes from worklist. @@ -992,7 +992,7 @@ void HRB::buildLinear(std::vector &SUnits) { Lineages.emplace_back(lineage); - if (bRecomputeHeight) { + if (IsRecomputeHeight) { // Update height from tail. SUnit *tail = lineage.Nodes.back(); tail->setDepthDirty(); @@ -1111,7 +1111,7 @@ SUnit *HRB::findHeir(SUnit *SU, std::vector &SUnits) { // Update height if need. 
unsigned Height = Succ->getHeight(); if (Height <= HeriHeight) { - bRecomputeHeight = true; + IsRecomputeHeight = true; } } return Heir; @@ -1345,9 +1345,9 @@ bool HRB::tryFuse(Lineage &a, Lineage &b, std::vector &SUnits) { void HRB::fusionLineages(std::vector &SUnits) { if (Lineages.empty()) return; - bool bUpdated = true; - while (bUpdated) { - bUpdated = false; + bool IsUpdated = true; + while (IsUpdated) { + IsUpdated = false; int size = Lineages.size(); for (int i = 0; i < size; i++) { Lineage &a = Lineages[i]; @@ -1359,7 +1359,7 @@ void HRB::fusionLineages(std::vector &SUnits) { if (b.length() == 0) continue; if (tryFuse(a, b, SUnits)) { - bUpdated = true; + IsUpdated = true; if (a.length() == 0) break; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h index a7d29430b4276..952126798b1de 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h @@ -28,17 +28,17 @@ struct SubExp { llvm::DenseSet TopRegs; llvm::DenseSet BottomRoots; llvm::DenseSet BottomRegs; - bool bMultiDefOutput = false; - bool bHasTerminatorInst = false; - bool bUseIncomingReg = false; - bool bMoveIntoLoop = false; - bool bNotSafeToCopy = false; - bool bHasMemInst = false; - bool bHoist = false; + bool IsMultiDefOutput = false; + bool IsHasTerminatorInst = false; + bool IsUseIncomingReg = false; + bool IsMoveIntoLoop = false; + bool IsNotSafeToCopy = false; + bool IsHasMemInst = false; + bool IsHoist = false; // If temp/out reg is used by inst not in the subExp, cannot move since not // all users will be move. But OK to clone. - bool bCloneOnly = false; - bool bTouchSCC = false; + bool IsCloneOnly = false; + bool IsTouchSCC = false; llvm::MachineBasicBlock *FromBB; llvm::MachineBasicBlock *ToBB; unsigned sInputSize; @@ -49,7 +49,7 @@ struct SubExp { unsigned vMaxSize; LiveSet inputLive; LiveSet outputLive; - bool isSafeToMove(const llvm::MachineRegisterInfo &MRI, bool bMoveUp) const; + bool isSafeToMove(const llvm::MachineRegisterInfo &MRI, bool IsMoveUp) const; void calcMaxPressure(const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI); void dump(const llvm::MachineRegisterInfo &MRI, @@ -59,11 +59,11 @@ struct SubExp { struct ExpDag { ExpDag(const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI, - const llvm::SIInstrInfo *SIII, const bool bJoinInput); + const llvm::SIInstrInfo *SIII, const bool IsJoinInput); const llvm::MachineRegisterInfo &MRI; const llvm::SIRegisterInfo *SIRI; const llvm::SIInstrInfo *SIII; - const bool bJoinInputToSubExp; + const bool IsJoinInputToSubExp; std::vector SUnits; ///< The scheduling units. 
llvm::DenseMap MISUnitMap; @@ -181,7 +181,7 @@ class HRB { llvm::DenseSet ChainedNodes; llvm::DenseMap> ReachMap; - bool bRecomputeHeight = false; + bool IsRecomputeHeight = false; std::vector Lineages; ColorResult Color; const llvm::MachineRegisterInfo &MRI; From 78ab7f34417b7207547f84c3a02ec7dbc939b0e8 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Thu, 13 Mar 2025 15:53:20 -0700 Subject: [PATCH 13/25] Batch 2 --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 2244 ++++++++--------- 1 file changed, 1105 insertions(+), 1139 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index ed7093f85823d..4c46cee69a038 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -102,7 +102,7 @@ typedef AMDGPUHotBlockRematerialize Remat; // Util functions. namespace { -MachineBasicBlock *NearestCommonDominator(MachineDominatorTree *DT, +MachineBasicBlock *nearestCommonDominator(MachineDominatorTree *DT, BlockSet &Blocks) { auto I = Blocks.begin(), E = Blocks.end(); @@ -181,7 +181,7 @@ findInsertBlock(MachineInstr &DefMI, unsigned Reg, MachineDominatorTree *DT, MachineBasicBlock *BB = *BBSet.begin(); if (BBSet.size() > 1) { - MachineBasicBlock *BDom = NearestCommonDominator(DT, BBSet); + MachineBasicBlock *BDom = nearestCommonDominator(DT, BBSet); if (!BDom) return nullptr; BB = BDom; @@ -194,7 +194,7 @@ findInsertBlock(MachineInstr &DefMI, unsigned Reg, MachineDominatorTree *DT, return nullptr; // If BB is already a hot block, move to BB will not help. - // hotBlockRemat will fail it when process BB. + // hotBlockRemat will fail It when process BB. // Must reachable from DefMI. if (!llvm::reach_block(DefMI.getParent(), DT, PDT, MLI, BB)) @@ -221,8 +221,8 @@ bool isSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) { unsigned OpNum = DefMI->getNumOperands(); // Only move DefMI which all operand is unique def. - for (unsigned i = 0; i < OpNum; i++) { - MachineOperand &Op = DefMI->getOperand(i); + for (unsigned I = 0; I < OpNum; I++) { + MachineOperand &Op = DefMI->getOperand(I); if (!Op.isReg()) continue; if (!MRI.getUniqueVRegDef(Op.getReg()) && @@ -257,7 +257,7 @@ struct RematStatus { unsigned MaxSPressure; unsigned InputPhysicalVPressure; unsigned InputPhysicalSPressure; - // More occupancy can help more than latency cost to reach it. + // More occupancy can help more than latency cost to reach It. bool MemBound; // abs(VTargetOcc-STargetOcc) > 1. 
bool NotBalance; @@ -273,7 +273,7 @@ struct RematStatus { unsigned collectMBBPressure(MachineBasicBlock &MBB, LiveIntervals *LIS, const GCNSubtarget *ST, unsigned &MaxVPressure, unsigned &MaxSPressure, RematStatus &Status) { - // Skip processing current block if it has only debug instructions + // Skip processing current block if It has only debug instructions if (MBB.getFirstNonDebugInstr() == MBB.end()) return ST->getOccupancyWithNumVGPRs(0); auto BBEnd = MBB.rbegin(); @@ -366,17 +366,17 @@ unsigned collectFnPressure(MachineFunction &MF, LiveIntervals *LIS, LLVM_DEBUG( const SIRegisterInfo *SIRI = ST->getRegisterInfo(); - dbgs() << "output live"; for (auto &it + dbgs() << "output live"; for (auto &It : Status.MBBOutputLiveMap) { - unsigned Idx = it.first->getNumber(); - auto LiveReg = it.second; + unsigned Idx = It.first->getNumber(); + auto LiveReg = It.second; dbgs() << "MBB" << Idx << ":"; llvm::dumpLiveSet(LiveReg, SIRI); } dbgs() << "input live"; - for (auto &it + for (auto &It : Status.MBBInputLiveMap) { - unsigned Idx = it.first->getNumber(); - auto LiveReg = it.second; + unsigned Idx = It.first->getNumber(); + auto LiveReg = It.second; dbgs() << "MBB" << Idx << ":"; llvm::dumpLiveSet(LiveReg, SIRI); }); @@ -548,7 +548,7 @@ void updateLiveInfo(MapVector &RematMap, if (Node.Kind == RematNode::RematKind::OneDefOneUse) { MachineBasicBlock *InsertBB = Node.InsertBlock; // If LiveInfo.BB is after InsertBB in Reverse post order, the def is - // still before LiveInfo.BB, it is still live. + // still before LiveInfo.BB, It is still live. unsigned LiveBBIndex = RPOTIndexMap[CurBB]; unsigned InsertBBIndex = RPOTIndexMap[InsertBB]; if (LiveBBIndex > InsertBBIndex) { @@ -607,8 +607,8 @@ int getSharedReducedSize(InstSet &ReducedInsts, bool IsVGPR, unsigned PrevMask = SharedRegIt->second.getAsInteger(); if (unsigned SharedMask = (PrevMask & Mask)) { // Some thing is shared. - for (int i = 0; i < MOSize; i++) { - if (SharedMask & (1 << i)) { + for (int I = 0; I < MOSize; I++) { + if (SharedMask & (1 << I)) { SharedSize += 1; } } @@ -637,7 +637,7 @@ int getReducedSize(MapVector &RematMap, if (Node.Kind == RematNode::RematKind::OneDefOneUse) { MachineBasicBlock *InsertBB = Node.InsertBlock; // If LiveInfo.BB is before InsertBB in Reverse post order, the def is - // moved after LiveInfo.BB, it is not live anymore. + // moved after LiveInfo.BB, It is not live anymore. unsigned LiveBBIndex = RPOTIndexMap[LiveInfo.BB]; unsigned InsertBBIndex = RPOTIndexMap[InsertBB]; if (LiveBBIndex < InsertBBIndex) @@ -706,7 +706,7 @@ int rematGain(MachineInstr *DefMI, unsigned Reg, const MachineRegisterInfo &MRI, } if (IsSingleDef) { - // The reg might share with other candidates, check it here. + // The reg might share with other candidates, check It here. // Count share reg in getReducedSize. if (EnableAggressive) { // In case of aggressive remat, treat multi use reg as shared reg and @@ -720,7 +720,7 @@ int rematGain(MachineInstr *DefMI, unsigned Reg, const MachineRegisterInfo &MRI, OpRC = SIRI->getSubRegisterClass(OpRC, SubIdx); } int InputSize = SIRI->getRegSizeInBits(*OpRC); - // If input not live in hotspot, move it cross hotspot should have + // If input not live in hotspot, move It cross hotspot should have // less reg then DefMi. if (RematSize > InputSize) { RematSize -= InputSize; @@ -789,7 +789,7 @@ void buildRematCandiates(std::vector &Candidates, // Sort by gain. 
std::sort(Candidates.begin(), Candidates.end(), - [](RematNode &i, RematNode &j) { return i.Size > j.Size; }); + [](RematNode &I, RematNode &J) { return I.Size > J.Size; }); } // For case like @@ -799,7 +799,7 @@ void buildRematCandiates(std::vector &Candidates, // xb.uniform %2489:sreg_32_xm0 = S_CSELECT_B32 %477:sreg_32_xm0, 16, implicit // killed $scc; xb.uniform // Sink S_AND right before S_CSELECT will overwrite SCC. -// To avoid it, skip case when DefMI and UseMI has implicit define use. +// To avoid It, skip case when DefMI and UseMI has implicit define use. bool isImplicitDefUse(MachineInstr *DefMI, MachineInstr *UseMI) { if (DefMI->getDesc().NumImplicitDefs == 0) return false; @@ -880,12 +880,12 @@ void addCloneCandidate(std::vector &CloneList, // Group user in same blocks. std::vector UserSetList(CloneList.size()); - for (size_t i = 0; i < CloneList.size(); i++) { - auto *Node = CloneList[i]; + for (size_t I = 0; I < CloneList.size(); I++) { + auto *Node = CloneList[I]; unsigned Reg = Node->Reg; MachineInstr *DefMI = Node->DefMI; // Group user in same blocks. - BlockSet &UserSet = UserSetList[i]; + BlockSet &UserSet = UserSetList[I]; for (auto UseIt = MRI.use_instr_nodbg_begin(Reg); UseIt != MRI.use_instr_nodbg_end();) { @@ -973,8 +973,8 @@ int filterRematCandiates(std::vector &Candidates, } void updateUsers(unsigned Reg, unsigned NewReg, bool IsSubRegDef, - SmallVector &userMIs) { - for (MachineInstr *UseMI : userMIs) { + SmallVector &UserMIs) { + for (MachineInstr *UseMI : UserMIs) { for (MachineOperand &MO : UseMI->operands()) { if (!MO.isReg()) continue; @@ -988,14 +988,14 @@ void updateUsers(unsigned Reg, unsigned NewReg, bool IsSubRegDef, } DenseMap reduceClonedMBBs( - unsigned Reg, BlockMap> &userBlocks, + unsigned Reg, BlockMap> &UserBlocks, DenseSet &UserMBBSet, - std::vector &hotBlocks, MachineDominatorTree *DT) { + std::vector &HotBlocks, MachineDominatorTree *DT) { // Collect hot blocks which Exp is live in. - DenseSet hotBlockSet; - for (BlockLiveInfo &hotBlock : hotBlocks) { - if (hotBlock.InputLive.count(Reg)) { - hotBlockSet.insert(hotBlock.BB); + DenseSet HotBlockSet; + for (BlockLiveInfo &HotBlock : HotBlocks) { + if (HotBlock.InputLive.count(Reg)) { + HotBlockSet.insert(HotBlock.BB); } } @@ -1003,19 +1003,19 @@ DenseMap reduceClonedMBBs( // the value not cross hotBlocks when later blocks are cloned. // For userBlocks which dominated by all hotBlocks, they could share clones // because once after hot block, the pressure is OK. - DenseSet afterHotRangeMBBs; + DenseSet AfterHotRangeMBBs; for (MachineBasicBlock *MBB : UserMBBSet) { // Always clone in hot block. - if (hotBlockSet.count(MBB)) + if (HotBlockSet.count(MBB)) continue; bool IsDomAllHotBlocks = true; bool IsDomedByAllHotBlocks = true; - for (MachineBasicBlock *hotMBB : hotBlockSet) { - if (!DT->dominates(MBB, hotMBB)) { + for (MachineBasicBlock *HotMBB : HotBlockSet) { + if (!DT->dominates(MBB, HotMBB)) { IsDomAllHotBlocks = false; } - if (!DT->dominates(hotMBB, MBB)) { + if (!DT->dominates(HotMBB, MBB)) { IsDomedByAllHotBlocks = false; } if (!IsDomAllHotBlocks && !IsDomedByAllHotBlocks) { @@ -1023,19 +1023,17 @@ DenseMap reduceClonedMBBs( } } if (IsDomAllHotBlocks) { - userBlocks.erase(MBB); + UserBlocks.erase(MBB); } else if (IsDomedByAllHotBlocks) { - afterHotRangeMBBs.insert(MBB); + AfterHotRangeMBBs.insert(MBB); } } // Split after hotRange block set by domtree. 
DenseMap DomMap; - if (!afterHotRangeMBBs.empty()) { - for (auto it : afterHotRangeMBBs) { - MachineBasicBlock *MBB = it; - for (auto it2 : afterHotRangeMBBs) { - MachineBasicBlock *MBB2 = it2; + if (!AfterHotRangeMBBs.empty()) { + for (MachineBasicBlock *MBB : AfterHotRangeMBBs) { + for (MachineBasicBlock *MBB2 : AfterHotRangeMBBs) { if (MBB == MBB2) continue; if (DT->dominates(MBB, MBB2)) { @@ -1046,13 +1044,12 @@ DenseMap reduceClonedMBBs( } } } - for (auto it : afterHotRangeMBBs) { - MachineBasicBlock *MBB = it; + for (MachineBasicBlock *MBB : AfterHotRangeMBBs) { auto &Dom = DomMap[MBB]; - for (MachineBasicBlock *domedMBB : Dom) { + for (MachineBasicBlock *DomedMBB : Dom) { // Remove domedMBB. - DomMap.erase(domedMBB); - UserMBBSet.erase(domedMBB); + DomMap.erase(DomedMBB); + UserMBBSet.erase(DomedMBB); } } } @@ -1062,7 +1059,7 @@ DenseMap reduceClonedMBBs( // Look for an earlier insert point if the InstructionToMove // writes to scc and scc is live at the CurrentInsertPoint. -static MachineBasicBlock::iterator AdjustInsertPointToAvoidSccSmash( +static MachineBasicBlock::iterator adjustInsertPointToAvoidSccSmash( MachineInstr *InstructionToMove, MachineBasicBlock *MBB, MachineBasicBlock::iterator CurrentInsertPoint, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { @@ -1078,7 +1075,7 @@ static MachineBasicBlock::iterator AdjustInsertPointToAvoidSccSmash( // Look for an earlier insert point if the SubExp // writes to scc and scc is live at the CurrentInsertPoint. -static MachineBasicBlock::iterator AdjustInsertPointForSubExpToAvoidSccSmash( +static MachineBasicBlock::iterator adjustInsertPointForSubExpToAvoidSccSmash( const SubExp &SubExpToMove, MachineBasicBlock *MBB, MachineBasicBlock::iterator CurrentInsertPoint, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { @@ -1092,7 +1089,7 @@ static MachineBasicBlock::iterator AdjustInsertPointForSubExpToAvoidSccSmash( } // Return trun if moving MI to Location will smash a live scc value. -static bool WillSmashSccAtLocation(MachineInstr *MI, MachineBasicBlock *MBB, +static bool willSmashSccAtLocation(MachineInstr *MI, MachineBasicBlock *MBB, MachineBasicBlock::iterator Location) { // It is ok to pass nullptr to `modifiesRegister` for TRI here since // SCC has no subreg/suprereg relationships. @@ -1100,8 +1097,8 @@ static bool WillSmashSccAtLocation(MachineInstr *MI, MachineBasicBlock *MBB, llvm::IsSccLiveAt(MBB, Location); } -void ApplyCloneRemat(Remat *Remat, RematNode &Node, - std::vector &hotBlocks, +void applyCloneRemat(Remat *Remat, RematNode &Node, + std::vector &HotBlocks, MachineDominatorTree *DT, MachineRegisterInfo &MRI, SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, MachineFunction &MF) { @@ -1125,18 +1122,18 @@ void ApplyCloneRemat(Remat *Remat, RematNode &Node, // Group user in same blocks. 
BlockMap> UserMap; DenseSet UserMBBSet; - for (auto useIt = MRI.use_instr_nodbg_begin(Reg); - useIt != MRI.use_instr_nodbg_end();) { - MachineInstr &UseMI = *(useIt++); + for (auto UseIt = MRI.use_instr_nodbg_begin(Reg); + UseIt != MRI.use_instr_nodbg_end();) { + MachineInstr &UseMI = *(UseIt++); UserMap[UseMI.getParent()].emplace_back(&UseMI); UserMBBSet.insert(UseMI.getParent()); } DenseMap DomMap = - reduceClonedMBBs(Reg, UserMap, UserMBBSet, hotBlocks, DT); + reduceClonedMBBs(Reg, UserMap, UserMBBSet, HotBlocks, DT); - for (auto useIt : UserMap) { - MachineBasicBlock *MBB = useIt.first; + for (auto UseIt : UserMap) { + MachineBasicBlock *MBB = UseIt.first; // Skip same block uses. if (MBB == DefMI->getParent()) { continue; @@ -1145,24 +1142,24 @@ void ApplyCloneRemat(Remat *Remat, RematNode &Node, if (UserMBBSet.count(MBB) == 0) continue; - unsigned NewReg = MRI.createVirtualRegister(RC); + Register NewReg = MRI.createVirtualRegister(RC); auto NewDef = BuildMI(MF, DL, Desc).addDef(NewReg); - for (unsigned i = 1; i < OpNum; i++) { - NewDef = NewDef.add(DefMI->getOperand(i)); + for (unsigned I = 1; I < OpNum; I++) { + NewDef = NewDef.add(DefMI->getOperand(I)); } - MachineInstr *InsertPointMI = useIt.second.front(); - SlotIndex lastSlot = SlotIndexes->getInstructionIndex(*InsertPointMI); + MachineInstr *InsertPointMI = UseIt.second.front(); + SlotIndex LastSlot = SlotIndexes->getInstructionIndex(*InsertPointMI); - for (MachineInstr *UseMI : useIt.second) { - SlotIndex slot = SlotIndexes->getInstructionIndex(*UseMI); - if (lastSlot > slot) { - lastSlot = slot; + for (MachineInstr *UseMI : UseIt.second) { + SlotIndex Slot = SlotIndexes->getInstructionIndex(*UseMI); + if (LastSlot > Slot) { + LastSlot = Slot; InsertPointMI = UseMI; } } - MachineBasicBlock::iterator InsertPoint = AdjustInsertPointToAvoidSccSmash( + MachineBasicBlock::iterator InsertPoint = adjustInsertPointToAvoidSccSmash( DefMI, InsertPointMI->getParent(), InsertPointMI, MRI, SIRI, SIII); for (MachineMemOperand *MO : DefMI->memoperands()) { @@ -1173,15 +1170,15 @@ void ApplyCloneRemat(Remat *Remat, RematNode &Node, SlotIndexes->insertMachineInstrInMaps(*NewDef); - SmallVector &userMIs = useIt.second; - updateUsers(Reg, NewReg, IsSubRegDef, userMIs); + SmallVector &UserMIs = UseIt.second; + updateUsers(Reg, NewReg, IsSubRegDef, UserMIs); // update users in dom MBBs. - auto domMapIt = DomMap.find(MBB); - if (domMapIt != DomMap.end()) { - for (MachineBasicBlock *UpdateMBB : domMapIt->second) { - SmallVector &userMIs = UserMap[UpdateMBB]; - updateUsers(Reg, NewReg, IsSubRegDef, userMIs); + auto DomMapIt = DomMap.find(MBB); + if (DomMapIt != DomMap.end()) { + for (MachineBasicBlock *UpdateMBB : DomMapIt->second) { + SmallVector &UserMIs = UserMap[UpdateMBB]; + updateUsers(Reg, NewReg, IsSubRegDef, UserMIs); } } @@ -1194,8 +1191,8 @@ void ApplyCloneRemat(Remat *Remat, RematNode &Node, } } -void ApplyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI, - SlotIndexes *slotIndexes, +void applyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI, + SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { MachineInstr *DefMI = Node.DefMI; @@ -1212,7 +1209,7 @@ void ApplyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI, MBB = Node.InsertBlock; } - InsertPoint = AdjustInsertPointToAvoidSccSmash(DefMI, MBB, InsertPoint, MRI, + InsertPoint = adjustInsertPointToAvoidSccSmash(DefMI, MBB, InsertPoint, MRI, SIRI, SIII); // Move instruction to new location. 
@@ -1220,33 +1217,33 @@ void ApplyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI, InsertPoint->getParent()->insert(InsertPoint, DefMI); // Update slot index. - slotIndexes->removeSingleMachineInstrFromMaps(*DefMI); - slotIndexes->insertMachineInstrInMaps(*DefMI); + SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI); + SlotIndexes->insertMachineInstrInMaps(*DefMI); } -void ApplyRemat(Remat *Remat, MapVector &RematMap, - std::vector &hotBlocks, - MachineDominatorTree *DT, SlotIndexes *slotIndexes, - MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - const SIInstrInfo *SIII, MachineFunction &MF) { +void applyRemat(Remat *Remat, MapVector &RematMap, + std::vector &HotBlocks, MachineDominatorTree *DT, + SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, + MachineFunction &MF) { std::vector UpdateList; - for (auto &it : RematMap) { - UpdateList.emplace_back(it.second); + for (auto &It : RematMap) { + UpdateList.emplace_back(It.second); } // Sort update list with slotIndex to make sure def moved before use. - // If use moved before def, it might not be the first use anymore. + // If use moved before def, It might not be the first use anymore. std::sort(UpdateList.begin(), UpdateList.end(), - [&slotIndexes](RematNode &i, RematNode &j) { - SlotIndex a = slotIndexes->getInstructionIndex(*i.DefMI); - SlotIndex b = slotIndexes->getInstructionIndex(*j.DefMI); - return a < b; + [&SlotIndexes](RematNode &I, RematNode &J) { + SlotIndex A = SlotIndexes->getInstructionIndex(*I.DefMI); + SlotIndex B = SlotIndexes->getInstructionIndex(*J.DefMI); + return A < B; }); for (RematNode &Node : UpdateList) { if (Node.Kind == RematNode::RematKind::OneDefOneUse) { - ApplyOneDefOneUseRemat(Node, MRI, slotIndexes, SIRI, SIII); + applyOneDefOneUseRemat(Node, MRI, SlotIndexes, SIRI, SIII); } else if (Node.Kind == RematNode::RematKind::Clone) { - ApplyCloneRemat(Remat, Node, hotBlocks, DT, MRI, slotIndexes, SIRI, SIII, + applyCloneRemat(Remat, Node, HotBlocks, DT, MRI, SlotIndexes, SIRI, SIII, MF); } } @@ -1255,8 +1252,8 @@ void ApplyRemat(Remat *Remat, MapVector &RematMap, void dumpRematMap(MapVector &RematMap, const SIRegisterInfo *SIRI) { dbgs() << "\n rematMap: \n"; - for (auto it : RematMap) { - int Reg = it.first; + for (auto It : RematMap) { + int Reg = It.first; dbgs() << printReg(Reg, SIRI); dbgs() << "\n"; } @@ -1308,29 +1305,29 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, auto &MRI = MF.getRegInfo(); bool IsUpdated = false; - RematStatus status = getRematStatus(MF, MLI, LIS, MRI, ST); + RematStatus Status = getRematStatus(MF, MLI, LIS, MRI, ST); const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second; - if (status.TargetOcc >= MaxOcc) + if (Status.TargetOcc >= MaxOcc) return false; - unsigned VLimit = status.TargetVLimit; - unsigned SLimit = status.TargetSLimit; + unsigned VLimit = Status.TargetVLimit; + unsigned SLimit = Status.TargetSLimit; - int rematSCnt = status.MaxSPressure - SLimit; + int RematSCnt = Status.MaxSPressure - SLimit; // when agressive sgpr remat, reserve some for allocation lost. 
if (EnableAggressive) - rematSCnt += NearTargetRegLimit; + RematSCnt += NearTargetRegLimit; bool IsSGPRSpill = false; - if (rematSCnt > 0) { - IsSGPRSpill = nearSgprSpill(status.MaxSPressure, ST, MF); + if (RematSCnt > 0) { + IsSGPRSpill = nearSgprSpill(Status.MaxSPressure, ST, MF); } - bool IsForceRematSgpr = IsSGPRSpill | status.NotBalance; + const bool IsForceRematSgpr = IsSGPRSpill || Status.NotBalance; // If bound by lds, skip. - if (status.TargetOcc > ST->getOccupancyWithWorkGroupSizes(MF).second && + if (Status.TargetOcc > ST->getOccupancyWithWorkGroupSizes(MF).second && !IsForceRematSgpr) return false; @@ -1343,27 +1340,27 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, MapVector SRematMap; // Reg which cannot move around to remat. DenseSet PinnedRegSet; - std::vector hotBlocks; - for (auto it = po_begin(EntryMBB); it != po_end(EntryMBB); it++) { - MachineBasicBlock *MBB = *it; - auto &RP = status.MBBPressureMap[MBB]; + std::vector HotBlocks; + for (auto It = po_begin(EntryMBB); It != po_end(EntryMBB); It++) { + MachineBasicBlock *MBB = *It; + auto &RP = Status.MBBPressureMap[MBB]; // ignore block not hot. - if (RP.getVGPRNum(ST->hasGFX90AInsts()) < status.TargetVLimit && - (RP.getMaxSGPR() + RegForVCC + status.InputPhysicalSPressure) < - status.TargetSLimit) + if (RP.getVGPRNum(ST->hasGFX90AInsts()) < Status.TargetVLimit && + (RP.getMaxSGPR() + RegForVCC + Status.InputPhysicalSPressure) < + Status.TargetSLimit) continue; // Collect reg pressure. - unsigned maxVPressure = 0; - unsigned maxSPressure = 0; - const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[MBB]; + unsigned MaxVPressure = 0; + unsigned MaxSPressure = 0; + const GCNRPTracker::LiveRegSet InputLive = Status.MBBInputLiveMap[MBB]; - const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[MBB]; + const GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[MBB]; LLVM_DEBUG( - dumpHotBlock(inputLive, VRematMap, SRematMap, MBB->getNumber(), SIRI)); + dumpHotBlock(InputLive, VRematMap, SRematMap, MBB->getNumber(), SIRI)); GCNDownwardRPTracker Tracker(*LIS); - Tracker.reset(*MBB->begin(), &inputLive); + Tracker.reset(*MBB->begin(), &InputLive); for (MachineInstr &MI : *MBB) { if (MI.isDebugInstr()) @@ -1371,30 +1368,30 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, Tracker.advance(); auto LISLR = Tracker.getLiveRegs(); // Update live set for things already remated. - updateLiveInfo(VRematMap, LISLR, inputLive, MBB, RPOTIndexMap); - updateLiveInfo(SRematMap, LISLR, inputLive, MBB, RPOTIndexMap); + updateLiveInfo(VRematMap, LISLR, InputLive, MBB, RPOTIndexMap); + updateLiveInfo(SRematMap, LISLR, InputLive, MBB, RPOTIndexMap); - const GCNRPTracker::LiveRegSet &liveSet = LISLR; + const GCNRPTracker::LiveRegSet &LiveSet = LISLR; unsigned VPressure = 0; unsigned SPressure = 0; - CollectLiveSetPressure(liveSet, MRI, SIRI, VPressure, SPressure); - if (maxVPressure < VPressure) - maxVPressure = VPressure; - if (maxSPressure < SPressure) - maxSPressure = SPressure; - } - maxSPressure += RegForVCC + status.InputPhysicalSPressure; - if (maxVPressure <= VLimit && maxSPressure <= SLimit) + CollectLiveSetPressure(LiveSet, MRI, SIRI, VPressure, SPressure); + if (MaxVPressure < VPressure) + MaxVPressure = VPressure; + if (MaxSPressure < SPressure) + MaxSPressure = SPressure; + } + MaxSPressure += RegForVCC + Status.InputPhysicalSPressure; + if (MaxVPressure <= VLimit && MaxSPressure <= SLimit) continue; // Build block live info. 
// Use outputLive for EntryMBB. - BlockLiveInfo LiveInfo = {MBB, maxSPressure, maxVPressure, - MBB != EntryMBB ? inputLive : outputLive}; + BlockLiveInfo LiveInfo = {MBB, MaxSPressure, MaxVPressure, + MBB != EntryMBB ? InputLive : OutputLive}; // Skip entry block when save hotBlock to reduce clone because not clone in // entry block. if (MBB != EntryMBB) - hotBlocks.emplace_back(LiveInfo); + HotBlocks.emplace_back(LiveInfo); GCNRPTracker::LiveRegSet CandidateRegs = LiveInfo.InputLive; // Update reg pressure based on remat list. @@ -1406,18 +1403,18 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveInfo, RPOTIndexMap); // Calculate size need to be remat. - int rematVCnt = maxVPressure - VReduced - VLimit; - int rematSCnt = maxSPressure - SReduced - SLimit; + int RematVCnt = MaxVPressure - VReduced - VLimit; + int RematSCnt = MaxSPressure - SReduced - SLimit; bool IsSGPRSpill = false; - if (rematSCnt > 0) { - IsSGPRSpill = nearSgprSpill(maxSPressure, ST, MF); + if (RematSCnt > 0) { + IsSGPRSpill = nearSgprSpill(MaxSPressure, ST, MF); } - bool IsForceRematSgpr = IsSGPRSpill || status.NotBalance; + bool IsForceRematSgpr = IsSGPRSpill || Status.NotBalance; // Try to add candidates into remat list. - int newRematSCnt = 0; - if (rematSCnt > 0) { + int NewRematSCnt = 0; + if (RematSCnt > 0) { // Build candidate nodes. std::vector SRematCandidates; buildRematCandiates(SRematCandidates, CandidateRegs, PinnedRegSet, MRI, @@ -1426,19 +1423,19 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LLVM_DEBUG(dumpCandidates(SRematCandidates, MBB->getNumber(), SIRI)); std::vector SRematList; // Filter candidates. - newRematSCnt = filterRematCandiates(SRematCandidates, SRematList, + NewRematSCnt = filterRematCandiates(SRematCandidates, SRematList, PinnedRegSet, DT, PDT, MLI, MRI, - /*IsVGPR*/ false, status.MemBound); - if (newRematSCnt > rematSCnt) { + /*IsVGPR*/ false, Status.MemBound); + if (NewRematSCnt > RematSCnt) { // Has enough remat node to cover rematCnt. - int rematCnt = 0; + int RematCnt = 0; for (RematNode &Node : SRematList) { SRematMap[Node.Reg] = Node; - rematCnt += Node.Size; - if (rematCnt > rematSCnt && !EnableAggressive) + RematCnt += Node.Size; + if (RematCnt > RematSCnt && !EnableAggressive) break; } - newRematSCnt = 0; + NewRematSCnt = 0; } else { for (RematNode &Node : SRematList) { @@ -1447,8 +1444,8 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, // Check shared size. int SharedReducedSize = getSharedReducedSize(SReducedInsts, /*IsVGPR*/ false, MRI, SIRI); - if (((newRematSCnt + SharedReducedSize) + (int)NearTargetRegLimit) >= - rematSCnt) { + if (((NewRematSCnt + SharedReducedSize) + (int)NearTargetRegLimit) >= + RematSCnt) { for (RematNode &Node : SRematList) { SRematMap[Node.Reg] = Node; } @@ -1477,14 +1474,14 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(Reg); if (UseMI.getParent() != MBB) continue; - int gain = rematGain(&MI, Reg, MRI, SIRI, + int Gain = rematGain(&MI, Reg, MRI, SIRI, /*IsVGPR*/ false); - if (gain > 0) { + if (Gain > 0) { // Skip case when DefMI has implicit define which used by UseMI. 
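// A small self-contained sketch of the per-block budget math and the greedy
// fill above, with made-up numbers (the pressure/limit values are illustrative
// only, and the EnableAggressive path, which keeps adding nodes instead of
// breaking early, is omitted). ToyRematNode is a stand-in, not RematNode.
#include <vector>

namespace remat_budget_sketch {

struct ToyRematNode {
  unsigned Reg;
  int Size; // number of 32-bit registers this node frees in the hot block
};

// Mirrors the loop that fills SRematMap until RematCnt exceeds RematSCnt.
inline std::vector<ToyRematNode>
pickNodes(const std::vector<ToyRematNode> &Candidates, int RematSCnt) {
  std::vector<ToyRematNode> Picked;
  int RematCnt = 0;
  for (const ToyRematNode &Node : Candidates) {
    Picked.push_back(Node);
    RematCnt += Node.Size;
    if (RematCnt > RematSCnt) // enough pressure saved, stop early
      break;
  }
  return Picked;
}

inline void example() {
  // Say the block peaks at 112 SGPRs, already-remated values save 4, and the
  // target limit is 104: RematSCnt = 112 - 4 - 104 = 4 registers to recover.
  int MaxSPressure = 112, SReduced = 4, SLimit = 104;
  int RematSCnt = MaxSPressure - SReduced - SLimit;
  std::vector<ToyRematNode> Candidates = {{10, 2}, {11, 2}, {12, 1}};
  auto Picked = pickNodes(Candidates, RematSCnt); // picks all three: 2+2+1 > 4
  (void)Picked;
}

} // namespace remat_budget_sketch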
if (isImplicitDefUse(&MI, &UseMI)) { continue; } - RematNode Node = {Reg, &MI, (unsigned)gain >> 5}; + RematNode Node = {Reg, &MI, (unsigned)Gain >> 5}; Node.InsertPointMI = &UseMI; Node.Kind = RematNode::RematKind::OneDefOneUse; SRematMap[Reg] = Node; @@ -1492,7 +1489,7 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, } } } - newRematSCnt = rematSCnt - newRematSCnt - SharedReducedSize; + NewRematSCnt = RematSCnt - NewRematSCnt - SharedReducedSize; } } // If works, continue. @@ -1503,17 +1500,17 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, // Apply the remat. int NewRematVCnt = 0; - if (rematVCnt > 0) { + if (RematVCnt > 0) { // TODO: V remat. } - bool NeedSRemat = rematSCnt > 0; - bool NeedVRemat = rematVCnt > 0; + bool NeedSRemat = RematSCnt > 0; + bool NeedVRemat = RematVCnt > 0; // If sgpr spill, always do remat. bool IsSRematOK = - (newRematSCnt <= 0 && !SRematMap.empty()) || IsForceRematSgpr; + (NewRematSCnt <= 0 && !SRematMap.empty()) || IsForceRematSgpr; bool IsVRematOK = - (status.NotBalance || NewRematVCnt <= 0) && !VRematMap.empty(); + (Status.NotBalance || NewRematVCnt <= 0) && !VRematMap.empty(); if (NeedSRemat && NeedVRemat) { if (IsVRematOK && IsSRematOK) { IsUpdated = true; @@ -1530,8 +1527,8 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, } } // TODO: what to do when cannot reach target? - if (newRematSCnt > 0) { - if ((unsigned)newRematSCnt <= NearTargetRegLimit) { + if (NewRematSCnt > 0) { + if ((unsigned)NewRematSCnt <= NearTargetRegLimit) { IsNearTarget = true; } else { if (!IsSGPRSpill) @@ -1546,7 +1543,7 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, if (!SRematMap.empty()) { IsUpdated = true; - ApplyRemat(Remat, SRematMap, hotBlocks, DT, SlotIndexes, MRI, SIRI, SIII, + applyRemat(Remat, SRematMap, HotBlocks, DT, SlotIndexes, MRI, SIRI, SIII, MF); LLVM_DEBUG(llvm::dbgs() << "after hotremat"; MF.print(dbgs());); } @@ -1567,7 +1564,7 @@ bool isPhyRegUniqueDef(unsigned Reg, const MachineRegisterInfo &MRI) { return DefMIs.size() == 1; } -static bool IsImplicitUseOfReg(const MachineOperand &MO, unsigned Reg) { +static bool isImplicitUseOfReg(const MachineOperand &MO, unsigned Reg) { if (!MO.isImplicit() || !MO.isUse() || !MO.isReg()) { return false; } @@ -1575,15 +1572,7 @@ static bool IsImplicitUseOfReg(const MachineOperand &MO, unsigned Reg) { return MO.getReg() == Reg; } -static bool IsImplicitDefOfReg(const MachineOperand &MO, unsigned Reg) { - if (!MO.isImplicit() || !MO.isDef() || !MO.isReg()) { - return false; - } - - return MO.getReg() == Reg; -} - -static bool IsSafeRematCandidateUser(const MachineInstr *UseMI, +static bool isSafeRematCandidateUser(const MachineInstr *UseMI, const SIInstrInfo *SIII) { // Make sure UseMI is not wqm like sample. if (SIII->isWQM(UseMI->getOpcode())) @@ -1628,17 +1617,17 @@ bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI, unsigned OpNum = DefMI->getNumOperands(); // Only move DefMI which all operand is unique def. 
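// Worked example of the `(unsigned)Gain >> 5` size used for the one-def-one-use
// node above. This assumes rematGain() reports its saving in bits, which is
// what the shift by 5 (a divide by 32) suggests; if that assumption is wrong,
// the constant below is only illustrative.
#include <cassert>

namespace remat_gain_sketch {

inline void gainExample() {
  unsigned GainInBits = 128;             // e.g. a 4 x 32-bit register tuple
  unsigned SizeInRegs = GainInBits >> 5; // 128 / 32 = 4 registers
  assert(SizeInRegs == 4);
}

} // namespace remat_gain_sketch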
- for (unsigned i = 0; i < OpNum; i++) { - MachineOperand &Op = DefMI->getOperand(i); + for (unsigned I = 0; I < OpNum; I++) { + MachineOperand &Op = DefMI->getOperand(I); if (!Op.isReg()) continue; Register OpReg = Op.getReg(); - if (IsImplicitUseOfReg(Op, AMDGPU::EXEC) || - IsImplicitUseOfReg(Op, AMDGPU::EXEC_LO)) + if (isImplicitUseOfReg(Op, AMDGPU::EXEC) || + isImplicitUseOfReg(Op, AMDGPU::EXEC_LO)) continue; - if (IsImplicitUseOfReg(Op, AMDGPU::MODE)) + if (isImplicitUseOfReg(Op, AMDGPU::MODE)) continue; - if (IsImplicitUseOfReg(Op, AMDGPU::M0) && isPhyRegUniqueDef(OpReg, MRI)) + if (isImplicitUseOfReg(Op, AMDGPU::M0) && isPhyRegUniqueDef(OpReg, MRI)) continue; // Alow unused scc define. if (Op.isImplicit() && Op.isDead() && Op.isDef()) @@ -1658,7 +1647,7 @@ bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI, } for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { - if (!IsSafeRematCandidateUser(&UseMI, SIII)) + if (!isSafeRematCandidateUser(&UseMI, SIII)) return false; } } @@ -1669,30 +1658,30 @@ bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI, std::vector buildSubExpFromCandidates( Remat *Remat, GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, - const MachineRegisterInfo &MRI, SlotIndexes *slotIndexes, - GCNRPTracker::LiveRegSet &unUsedPassThrus, bool AllowPartialUseInSubExp) { + const MachineRegisterInfo &MRI, SlotIndexes *SlotIndexes, + GCNRPTracker::LiveRegSet &UnusedPassThrus, bool AllowPartialUseInSubExp) { InstSet CandidateDefs; DenseSet RemovedCandidates; std::vector CandidateRegs; CandidateRegs.reserve(Candidates.size()); - for (auto it : Candidates) { - unsigned Reg = it.first; + for (auto It : Candidates) { + unsigned Reg = It.first; CandidateRegs.emplace_back(Reg); } // Sort candidate by defMI order to make sure defMI has dependent check after // all its dependent node. std::sort(CandidateRegs.begin(), CandidateRegs.end(), - [&MRI, &slotIndexes](const unsigned a, unsigned b) { - MachineInstr *MIa = MRI.getUniqueVRegDef(a); + [&MRI, &SlotIndexes](const unsigned A, unsigned B) { + MachineInstr *MIa = MRI.getUniqueVRegDef(A); - MachineInstr *MIb = MRI.getUniqueVRegDef(b); + MachineInstr *MIb = MRI.getUniqueVRegDef(B); // Later instr first. return !SlotIndex::isEarlierInstr( - slotIndexes->getInstructionIndex(*MIa), - slotIndexes->getInstructionIndex(*MIb)); + SlotIndexes->getInstructionIndex(*MIa), + SlotIndexes->getInstructionIndex(*MIb)); }); - // If Candidate def has user in MBB, add it when allow partial candidates. + // If Candidate def has user in MBB, add It when allow partial candidates. // And the subExp has the define could only be clone, cannot move cross blocks // because user in MBB. DenseSet PartialCandidates; @@ -1704,7 +1693,7 @@ std::vector buildSubExpFromCandidates( if (UseMI.getParent() == MI->getParent()) { if (UseMI.getNumExplicitDefs() == 1) { // Skip user which already in Candidates. - unsigned UserDefReg = UseMI.getOperand(0).getReg(); + Register UserDefReg = UseMI.getOperand(0).getReg(); if (Candidates.count(UserDefReg) > 0 && RemovedCandidates.count(UserDefReg) == 0) continue; @@ -1728,14 +1717,14 @@ std::vector buildSubExpFromCandidates( if (CandidateDefs.empty()) return std::vector(); for (unsigned Reg : RemovedCandidates) { - unUsedPassThrus[Reg] = Candidates[Reg]; + UnusedPassThrus[Reg] = Candidates[Reg]; Candidates.erase(Reg); } // iterate MBB backward. // add inst which only used for candidate defines. 
- for (auto it = MBB->rbegin(); it != MBB->rend(); it++) { - MachineInstr &MI = *it; + for (auto It = MBB->rbegin(); It != MBB->rend(); It++) { + MachineInstr &MI = *It; if (CandidateDefs.count(&MI) > 0) { continue; } @@ -1815,45 +1804,45 @@ std::vector buildSubExpFromCandidates( } // Build defs in order. - std::vector defs; - defs.reserve(CandidateDefs.size()); + std::vector Defs; + Defs.reserve(CandidateDefs.size()); for (MachineInstr &MI : *MBB) { if (CandidateDefs.count(&MI) == 0) continue; - defs.emplace_back(&MI); + Defs.emplace_back(&MI); } LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; for (MachineInstr *MI - : defs) { + : Defs) { MI->dump(); } dbgs() << "\nFinished Candidate Defs End\n";); // Build SubExp with CandidateDefs as Nodes, CandidateInput as input // Candidates as output. - ExpDag dag(MRI, SIRI, SIII, /*IsJoinInput*/ true); - dag.build(CandidateInput, Candidates, defs); + ExpDag Dag(MRI, SIRI, SIII, /*IsJoinInput*/ true); + Dag.build(CandidateInput, Candidates, Defs); if (AllowPartialUseInSubExp) { - for (auto &subExp : dag.SubExps) { - for (auto *MI : subExp.SUnits) { + for (auto &SubExp : Dag.SubExps) { + for (auto *MI : SubExp.SUnits) { if (PartialCandidates.count(MI)) { - subExp.IsCloneOnly = true; + SubExp.IsCloneOnly = true; break; } } } } - return dag.SubExps; + return Dag.SubExps; } std::vector buildSubExpFromCandidatesTopBottom( Remat *Remat, GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, - const MachineRegisterInfo &MRI, SlotIndexes *slotIndexes) { + const MachineRegisterInfo &MRI) { InstSet CandidateDefs; LLVM_DEBUG(dbgs() << "\nCandidate Defs:\n";); - for (auto it : Candidates) { - unsigned Reg = it.first; + for (auto It : Candidates) { + unsigned Reg = It.first; MachineInstr *MI = MRI.getUniqueVRegDef(Reg); for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { @@ -1893,8 +1882,8 @@ std::vector buildSubExpFromCandidatesTopBottom( // iterate MBB. GCNRPTracker::LiveRegSet LocalCandidates = Candidates; // add inst which only used by candidate defines. - for (auto it = MBB->begin(); it != MBB->end(); it++) { - MachineInstr &MI = *it; + for (auto It = MBB->begin(); It != MBB->end(); It++) { + MachineInstr &MI = *It; if (CandidateDefs.count(&MI) > 0) { for (MachineOperand &MO : MI.operands()) { if (!MO.isReg()) @@ -2003,33 +1992,33 @@ std::vector buildSubExpFromCandidatesTopBottom( } // Build defs in order. - std::vector defs; - defs.reserve(CandidateDefs.size()); + std::vector Defs; + Defs.reserve(CandidateDefs.size()); for (MachineInstr &MI : *MBB) { if (CandidateDefs.count(&MI) == 0) continue; - defs.emplace_back(&MI); + Defs.emplace_back(&MI); } LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; for (MachineInstr *MI - : defs) { + : Defs) { MI->dump(); } dbgs() << "\nFinished Candidate Defs End\n";); - LLVM_DEBUG(dbgs() << "\nLocalCandidates:\n"; for (auto it + LLVM_DEBUG(dbgs() << "\nLocalCandidates:\n"; for (auto It : LocalCandidates) { - pressure::print_reg(it.first, MRI, SIRI, llvm::dbgs()); + pressure::print_reg(It.first, MRI, SIRI, llvm::dbgs()); } dbgs() << "\nLocalCandidates End\n";); // Make sure all input reg are uniqueDef. // Input is Candidates, output is? // Build SubExp with CandidateDefs as Nodes, CandidateInput as input // Candidates as output. 
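// A minimal sketch of the backward "only used by already-collected candidate
// defs" scan above, on a toy instruction list (ToyInst is a hypothetical
// stand-in, not MachineInstr; physical registers, lane masks, and the partial
// use / IsCloneOnly handling of the real code are omitted). An instruction
// joins the candidate-def set when every user of every value it defines is
// already in the set, so the whole group can later move or clone as one subExp.
#include <map>
#include <set>
#include <vector>

namespace subexp_scan_sketch {

struct ToyInst {
  int Id;
  std::vector<int> Defs; // values this instruction defines
  std::vector<int> Uses; // values it reads
};

inline std::set<int> growCandidateDefs(const std::vector<ToyInst> &Block,
                                       std::set<int> CandidateDefs) {
  // Map each value to the ids of instructions that read it.
  std::multimap<int, int> UsersOf;
  for (const ToyInst &I : Block)
    for (int V : I.Uses)
      UsersOf.insert({V, I.Id});

  // Walk the block backward, mirroring the rbegin()/rend() loop above, so a
  // feeder is only considered after all of its users have been classified.
  for (auto It = Block.rbegin(); It != Block.rend(); ++It) {
    if (CandidateDefs.count(It->Id))
      continue;
    bool OnlyFeedsCandidates = !It->Defs.empty();
    for (int V : It->Defs)
      for (auto R = UsersOf.equal_range(V); R.first != R.second; ++R.first)
        if (!CandidateDefs.count(R.first->second))
          OnlyFeedsCandidates = false;
    if (OnlyFeedsCandidates)
      CandidateDefs.insert(It->Id);
  }
  return CandidateDefs;
}

} // namespace subexp_scan_sketch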
- ExpDag dag(MRI, SIRI, SIII, /*IsJoinInput*/ true); - dag.build(Candidates, LocalCandidates, defs); - return dag.SubExps; + ExpDag Dag(MRI, SIRI, SIII, /*IsJoinInput*/ true); + Dag.build(Candidates, LocalCandidates, Defs); + return Dag.SubExps; } -void print_vreg(Register Reg, const MachineRegisterInfo &MRI) { +void printVreg(Register Reg, const MachineRegisterInfo &MRI) { if (Reg.isVirtual()) { StringRef Name = MRI.getVRegName(Reg); if (Name != "") { @@ -2040,50 +2029,49 @@ void print_vreg(Register Reg, const MachineRegisterInfo &MRI) { } } -MachineBasicBlock *FindTargetBlock(unsigned Reg, MachineBasicBlock *FromBB, +MachineBasicBlock *findTargetBlock(unsigned Reg, MachineBasicBlock *FromBB, const MachineRegisterInfo &MRI, MachineDominatorTree *DT) { - BlockSet userBlocks; + BlockSet UserBlocks; for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { MachineBasicBlock *UserBB = UseMI.getParent(); // Skip current BB. if (UserBB != FromBB) - userBlocks.insert(UserBB); + UserBlocks.insert(UserBB); else // When has user in FromBB, userBlock will be FromBB. return nullptr; } - if (userBlocks.empty()) + if (UserBlocks.empty()) return nullptr; - MachineBasicBlock *userBlock = NearestCommonDominator(DT, userBlocks); - if (!DT->dominates(FromBB, userBlock)) { + MachineBasicBlock *UserBlock = nearestCommonDominator(DT, UserBlocks); + if (!DT->dominates(FromBB, UserBlock)) { return nullptr; } - if (userBlock == FromBB) + if (UserBlock == FromBB) return nullptr; - return userBlock; + return UserBlock; } -void ApplySubExpMoveNearUser(SubExp &Exp, const MachineRegisterInfo &MRI, +void applySubExpMoveNearUser(SubExp &Exp, const MachineRegisterInfo &MRI, MachineDominatorTree *DT, - SlotIndexes *slotIndexes, const SIInstrInfo *SIII, - const SIRegisterInfo *SIRI) { + SlotIndexes *SlotIndexes) { // Move from bottom. MachineBasicBlock *FromBB = Exp.FromBB; - for (auto it = Exp.SUnits.rbegin(); it != Exp.SUnits.rend(); it++) { - MachineInstr *DefMI = *it; + for (auto It = Exp.SUnits.rbegin(); It != Exp.SUnits.rend(); It++) { + MachineInstr *DefMI = *It; if (DefMI->getNumExplicitDefs() != 1) continue; - unsigned Reg = DefMI->getOperand(0).getReg(); - MachineBasicBlock *ToBB = FindTargetBlock(Reg, FromBB, MRI, DT); + Register Reg = DefMI->getOperand(0).getReg(); + MachineBasicBlock *ToBB = findTargetBlock(Reg, FromBB, MRI, DT); if (!ToBB) continue; // Do not overwrite a live scc. MachineBasicBlock::iterator InsertPoint = ToBB->SkipPHIsAndLabels(ToBB->begin()); - if (WillSmashSccAtLocation(DefMI, ToBB, InsertPoint)) + if (willSmashSccAtLocation(DefMI, ToBB, InsertPoint)) continue; DefMI->removeFromParent(); @@ -2094,14 +2082,13 @@ void ApplySubExpMoveNearUser(SubExp &Exp, const MachineRegisterInfo &MRI, if (DefMI->isDebugInstr()) continue; // Update slot index. - slotIndexes->removeSingleMachineInstrFromMaps(*DefMI); - slotIndexes->insertMachineInstrInMaps(*DefMI); + SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI); + SlotIndexes->insertMachineInstrInMaps(*DefMI); } } -void ApplySubExpMoveNearDefine(SubExp &Exp, MachineRegisterInfo &MRI, - MachineDominatorTree *DT, - SlotIndexes *slotIndexes, +void applySubExpMoveNearDefine(SubExp &Exp, MachineRegisterInfo &MRI, + SlotIndexes *SlotIndexes, const SIInstrInfo *SIII, const SIRegisterInfo *SIRI) { // Move from top. 
@@ -2119,11 +2106,11 @@ void ApplySubExpMoveNearDefine(SubExp &Exp, MachineRegisterInfo &MRI, Terminator = ToBB->end(); } - Terminator = AdjustInsertPointForSubExpToAvoidSccSmash(Exp, ToBB, Terminator, + Terminator = adjustInsertPointForSubExpToAvoidSccSmash(Exp, ToBB, Terminator, MRI, SIRI, SIII); - for (auto it = Exp.SUnits.begin(); it != Exp.SUnits.end(); it++) { - MachineInstr *DefMI = *it; + for (auto It = Exp.SUnits.begin(); It != Exp.SUnits.end(); It++) { + MachineInstr *DefMI = *It; if (DefMI->getNumExplicitDefs() != 1) continue; if (SIII->isEXP(DefMI->getOpcode())) @@ -2138,38 +2125,38 @@ void ApplySubExpMoveNearDefine(SubExp &Exp, MachineRegisterInfo &MRI, if (DefMI->isDebugInstr()) continue; // Update slot index. - slotIndexes->removeSingleMachineInstrFromMaps(*DefMI); - slotIndexes->insertMachineInstrInMaps(*DefMI); + SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI); + SlotIndexes->insertMachineInstrInMaps(*DefMI); } } -DenseSet buildCloneSet(ExpDag &dag, - DenseSet &dagBottoms, - GCNRPTracker::LiveRegSet &usedOutput) { - DenseSet copySet; - for (auto it = dag.SUnits.rbegin(); it != dag.SUnits.rend(); it++) { - SUnit &SU = *it; +DenseSet buildCloneSet(ExpDag &Dag, + DenseSet &DagBottoms, + GCNRPTracker::LiveRegSet &UsedOutput) { + DenseSet CopySet; + for (auto It = Dag.SUnits.rbegin(); It != Dag.SUnits.rend(); It++) { + SUnit &SU = *It; // Skip non-inst node. if (!SU.isInstr()) continue; MachineInstr *MI = SU.getInstr(); - if (dagBottoms.find(&SU) != dagBottoms.end()) { + if (DagBottoms.find(&SU) != DagBottoms.end()) { bool IsUsed = false; // For bottom SU, if in usedOutput, add to copySet; for (MachineOperand &DefMO : MI->defs()) { if (!DefMO.isReg()) continue; - unsigned Reg = DefMO.getReg(); - if (usedOutput.count(Reg) > 0) { + Register Reg = DefMO.getReg(); + if (UsedOutput.count(Reg) > 0) { IsUsed = true; break; } } if (IsUsed) { - copySet.insert(MI); + CopySet.insert(MI); continue; } - // bottom SU may still have succNode when it used both inExp and outExp. + // bottom SU may still have succNode when It used both inExp and outExp. // So continue check succNode. } @@ -2178,29 +2165,29 @@ DenseSet buildCloneSet(ExpDag &dag, for (SDep &SucDep : SU.Succs) { SUnit *SucSU = SucDep.getSUnit(); MachineInstr *SuccMI = SucSU->getInstr(); - if (copySet.count(SuccMI) > 0) { + if (CopySet.count(SuccMI) > 0) { IsSuccCopied = true; break; } } if (IsSuccCopied) - copySet.insert(MI); + CopySet.insert(MI); } - return copySet; + return CopySet; } -void updateUsers(SmallVector &userMIs, +void updateUsers(SmallVector &UserMIs, DenseMap &RegMap) { - for (MachineInstr *UserMI : userMIs) { + for (MachineInstr *UserMI : UserMIs) { for (MachineOperand &MO : UserMI->uses()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - auto it = RegMap.find(Reg); - if (it == RegMap.end()) + Register Reg = MO.getReg(); + auto It = RegMap.find(Reg); + if (It == RegMap.end()) continue; - unsigned NewReg = it->second; + unsigned NewReg = It->second; MO.setReg(NewReg); } } @@ -2208,24 +2195,24 @@ void updateUsers(SmallVector &userMIs, struct HotBlock { MachineBasicBlock *MBB = nullptr; - GCNRPTracker::LiveRegSet inputLive; - std::pair maxPressures; + GCNRPTracker::LiveRegSet InputLive; + std::pair MaxPressures; // Info about vmemLd. 
- int vmemLdInputSize; - int vmemLdOutputSize; + int VmemLdInputSize; + int VmemLdOutputSize; }; DenseMap reduceClonedMBBs( SubExp &Exp, - MapVector> &userBlocks, - DenseMap &userBlocksLiveRegs, - std::vector &hotBlocks, MachineDominatorTree *DT) { + MapVector> &UserBlocks, + DenseMap &UserBlocksLiveRegs, + std::vector &HotBlocks, MachineDominatorTree *DT) { // Collect hot blocks which Exp is live in. - DenseSet hotBlockSet; - for (HotBlock &hotBlock : hotBlocks) { + DenseSet HotBlockSet; + for (HotBlock &HotBlock : HotBlocks) { for (unsigned Reg : Exp.BottomRegs) { - if (hotBlock.inputLive.count(Reg)) { - hotBlockSet.insert(hotBlock.MBB); + if (HotBlock.InputLive.count(Reg)) { + HotBlockSet.insert(HotBlock.MBB); break; } } @@ -2235,20 +2222,20 @@ DenseMap reduceClonedMBBs( // the value not cross hotBlocks when later blocks are cloned. // For userBlocks which dominated by all hotBlocks, they could share clones // because once after hot block, the pressure is OK. - DenseSet afterHotRangeMBBs; - for (auto it : userBlocksLiveRegs) { - MachineBasicBlock *MBB = it.first; + DenseSet AfterHotRangeMBBs; + for (auto It : UserBlocksLiveRegs) { + MachineBasicBlock *MBB = It.first; // Always clone in hot block. - if (hotBlockSet.count(MBB)) + if (HotBlockSet.count(MBB)) continue; bool IsDomAllHotBlocks = true; bool IsDomedByAllHotBlocks = true; - for (MachineBasicBlock *hotMBB : hotBlockSet) { - if (!DT->dominates(MBB, hotMBB)) { + for (MachineBasicBlock *HotMBB : HotBlockSet) { + if (!DT->dominates(MBB, HotMBB)) { IsDomAllHotBlocks = false; } - if (!DT->dominates(hotMBB, MBB)) { + if (!DT->dominates(HotMBB, MBB)) { IsDomedByAllHotBlocks = false; } if (!IsDomAllHotBlocks && !IsDomedByAllHotBlocks) { @@ -2256,19 +2243,17 @@ DenseMap reduceClonedMBBs( } } if (IsDomAllHotBlocks) { - userBlocks.erase(MBB); + UserBlocks.erase(MBB); } else if (IsDomedByAllHotBlocks) { - afterHotRangeMBBs.insert(MBB); + AfterHotRangeMBBs.insert(MBB); } } // Split after hotRange block set by domtree. DenseMap DomMap; - if (!afterHotRangeMBBs.empty()) { - for (auto it : afterHotRangeMBBs) { - MachineBasicBlock *MBB = it; - for (auto it2 : afterHotRangeMBBs) { - MachineBasicBlock *MBB2 = it2; + if (!AfterHotRangeMBBs.empty()) { + for (MachineBasicBlock *MBB : AfterHotRangeMBBs) { + for (MachineBasicBlock *MBB2 : AfterHotRangeMBBs) { if (MBB == MBB2) continue; if (DT->dominates(MBB, MBB2)) { @@ -2279,16 +2264,15 @@ DenseMap reduceClonedMBBs( } } } - for (auto it : afterHotRangeMBBs) { - MachineBasicBlock *MBB = it; - auto &usedOutput = userBlocksLiveRegs[MBB]; + for (MachineBasicBlock *MBB : AfterHotRangeMBBs) { + auto &UsedOutput = UserBlocksLiveRegs[MBB]; auto &Dom = DomMap[MBB]; - for (MachineBasicBlock *domedMBB : Dom) { + for (MachineBasicBlock *DomedMBB : Dom) { // Merge domed use to MBB use. - mergeLiveRegSet(usedOutput, userBlocksLiveRegs[domedMBB]); + mergeLiveRegSet(UsedOutput, UserBlocksLiveRegs[DomedMBB]); // Remove domedMBB. 
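// A compact sketch of the user-block classification performed by
// reduceClonedMBBs above, with a stand-in dominates() relation (plain block
// ids and a caller-provided predicate instead of the real
// MachineDominatorTree). User blocks that dominate every hot block need no
// clone at all, because the pressure spike only happens later; user blocks
// dominated by every hot block sit past the hot range and may share one clone.
#include <functional>
#include <set>

namespace clone_reduce_sketch {

enum class CloneAction { NoCloneNeeded, CloneHere, ShareAfterHotRange };

inline CloneAction
classifyUserBlock(int UserBB, const std::set<int> &HotBlocks,
                  const std::function<bool(int, int)> &Dominates) {
  if (HotBlocks.count(UserBB))
    return CloneAction::CloneHere; // always clone inside a hot block
  bool DomAllHot = true, DomedByAllHot = true;
  for (int Hot : HotBlocks) {
    if (!Dominates(UserBB, Hot))
      DomAllHot = false;
    if (!Dominates(Hot, UserBB))
      DomedByAllHot = false;
  }
  if (DomAllHot)
    return CloneAction::NoCloneNeeded; // before the hot range; keep the def
  if (DomedByAllHot)
    return CloneAction::ShareAfterHotRange; // past the hot range; clones merge
  return CloneAction::CloneHere;
}

} // namespace clone_reduce_sketch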
- DomMap.erase(domedMBB); - userBlocksLiveRegs.erase(domedMBB); + DomMap.erase(DomedMBB); + UserBlocksLiveRegs.erase(DomedMBB); } } } @@ -2296,13 +2280,13 @@ DenseMap reduceClonedMBBs( return DomMap; } -void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, +void applySubExpCloneNearUser(SubExp &Exp, std::vector &HotBlocks, MachineDominatorTree *DT, MachineRegisterInfo &MRI, - SlotIndexes *slotIndexes, const SIInstrInfo *SIII, + SlotIndexes *SlotIndexes, const SIInstrInfo *SIII, const SIRegisterInfo *SIRI) { - MapVector> userBlocks; - DenseMap userBlocksLiveRegs; + MapVector> UserBlocks; + DenseMap UserBlocksLiveRegs; for (unsigned Reg : Exp.BottomRegs) { for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { MachineBasicBlock *UserBB = UseMI.getParent(); @@ -2310,36 +2294,36 @@ void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, if (UserBB == Exp.FromBB) continue; - userBlocks[UserBB].emplace_back(&UseMI); - auto &userLives = userBlocksLiveRegs[UserBB]; + UserBlocks[UserBB].emplace_back(&UseMI); + auto &UserLives = UserBlocksLiveRegs[UserBB]; for (MachineOperand &MO : UseMI.uses()) { if (!MO.isReg()) continue; - unsigned UseReg = MO.getReg(); + Register UseReg = MO.getReg(); if (Reg != UseReg) continue; - userLives[Reg] |= getRegMask(MO, MRI); + UserLives[Reg] |= getRegMask(MO, MRI); } } } // Build dag for SubExp to help remove unused inst when clone. - ExpDag dag(MRI, SIRI, SIII, /*IsJoinInput*/ true); - dag.build(Exp.inputLive, Exp.outputLive, Exp.SUnits); - DenseSet dagBottoms; - for (SUnit &SU : dag.SUnits) { + ExpDag Dag(MRI, SIRI, SIII, /*IsJoinInput*/ true); + Dag.build(Exp.inputLive, Exp.outputLive, Exp.SUnits); + DenseSet DagBottoms; + for (SUnit &SU : Dag.SUnits) { if (!SU.isInstr()) continue; if (SU.NumSuccs == 0) { - dagBottoms.insert(&SU); + DagBottoms.insert(&SU); } else { MachineInstr *MI = SU.getInstr(); // Add SU which def value in Exp.outputLive. for (MachineOperand &DefMO : MI->defs()) { if (!DefMO.isReg()) continue; - unsigned Reg = DefMO.getReg(); + Register Reg = DefMO.getReg(); if (Exp.BottomRegs.count(Reg) > 0) { - dagBottoms.insert(&SU); + DagBottoms.insert(&SU); break; } } @@ -2351,46 +2335,46 @@ void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, // For userBlocks which dominated by all hotBlocks, they could share clones // because once after hot block, the pressure is OK. DenseMap DomMap = - reduceClonedMBBs(Exp, userBlocks, userBlocksLiveRegs, hotBlocks, DT); + reduceClonedMBBs(Exp, UserBlocks, UserBlocksLiveRegs, HotBlocks, DT); // Sort to make stable order. std::sort( - userBlocks.begin(), userBlocks.end(), - [](std::pair> &it0, - std::pair> &it1) { - return it0.first->getNumber() < it1.first->getNumber(); + UserBlocks.begin(), UserBlocks.end(), + [](std::pair> &It0, + std::pair> &It1) { + return It0.first->getNumber() < It1.first->getNumber(); }); const bool IsModifiesScc = Exp.modifiesRegister(AMDGPU::SCC, SIRI); // Clone for each userBlocks. Not share clone thru dom tree which cannot help // reg pressure. - for (auto it : userBlocks) { - MachineBasicBlock *MBB = it.first; + for (auto It : UserBlocks) { + MachineBasicBlock *MBB = It.first; // Skip MBB which share clone from other MBBs. - if (userBlocksLiveRegs.count(MBB) == 0) + if (UserBlocksLiveRegs.count(MBB) == 0) continue; - auto &usedOutput = userBlocksLiveRegs[MBB]; - auto copySet = buildCloneSet(dag, dagBottoms, usedOutput); + auto &UsedOutput = UserBlocksLiveRegs[MBB]; + auto CopySet = buildCloneSet(Dag, DagBottoms, UsedOutput); // Clone to MBB. 
// Create new regs first. DenseMap RegMap; - auto insertPtr = MBB->getFirstNonPHI(); + auto InsertPtr = MBB->getFirstNonPHI(); // If Exp has scc read/write, make sure MBB not have scc in liveins. - if (IsModifiesScc && llvm::IsSccLiveAt(MBB, insertPtr)) + if (IsModifiesScc && llvm::IsSccLiveAt(MBB, InsertPtr)) continue; MachineFunction *MF = MBB->getParent(); - for (auto it = Exp.SUnits.begin(); it != Exp.SUnits.end(); it++) { - MachineInstr *DefMI = *it; + for (auto It = Exp.SUnits.begin(); It != Exp.SUnits.end(); It++) { + MachineInstr *DefMI = *It; // Not clone if already in MBB. if (DefMI->getParent() == MBB) continue; // Not clone if not used for MBB. - if (copySet.count(DefMI) == 0) + if (CopySet.count(DefMI) == 0) continue; auto ClonedMI = - BuildMI(*MBB, insertPtr, DefMI->getDebugLoc(), DefMI->getDesc()); + BuildMI(*MBB, InsertPtr, DefMI->getDebugLoc(), DefMI->getDesc()); for (MachineOperand &Def : DefMI->defs()) { Register Reg = Def.getReg(); @@ -2399,7 +2383,7 @@ void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, continue; ClonedMI.addDef(Reg, 0, Def.getSubReg()); } else { - unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg)); + Register NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg)); RegMap[Reg] = NewReg; ClonedMI.addDef(NewReg, 0, Def.getSubReg()); } @@ -2413,11 +2397,11 @@ void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, continue; ClonedMI.addReg(Reg, 0, MO.getSubReg()); } else { - auto it = RegMap.find(Reg); - if (it == RegMap.end()) { + auto It = RegMap.find(Reg); + if (It == RegMap.end()) { ClonedMI.addReg(Reg, 0, MO.getSubReg()); } else { - ClonedMI.addReg(it->second, 0, MO.getSubReg()); + ClonedMI.addReg(It->second, 0, MO.getSubReg()); } } } else { @@ -2426,7 +2410,7 @@ void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, } MachineInstr *NewDef = ClonedMI.getInstr(); - slotIndexes->insertMachineInstrInMaps(*NewDef); + SlotIndexes->insertMachineInstrInMaps(*NewDef); // Set mem operand for (MachineMemOperand *MO : DefMI->memoperands()) { NewDef->addMemOperand(*MF, MO); @@ -2434,43 +2418,43 @@ void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, } // update users in MBB. - SmallVector &userMIs = it.second; - updateUsers(userMIs, RegMap); + SmallVector &UserMIs = It.second; + updateUsers(UserMIs, RegMap); // update users in dom MBBs. - auto domMapIt = DomMap.find(MBB); - if (domMapIt != DomMap.end()) { - for (MachineBasicBlock *UpdateMBB : domMapIt->second) { - SmallVector &userMIs = userBlocks[UpdateMBB]; - updateUsers(userMIs, RegMap); + auto DomMapIt = DomMap.find(MBB); + if (DomMapIt != DomMap.end()) { + for (MachineBasicBlock *UpdateMBB : DomMapIt->second) { + SmallVector &UserMIs = UserBlocks[UpdateMBB]; + updateUsers(UserMIs, RegMap); } } } } -void ApplySubExpCloneNearUserInBlock( +void applySubExpCloneNearUserInBlock( SubExp &Exp, - DenseMap &inBlockHotVInstMap, - DenseMap &inBlockHotSInstMap, - MachineRegisterInfo &MRI, SlotIndexes *slotIndexes, const SIInstrInfo *SIII, + DenseMap &InBlockHotVInstMap, + DenseMap &InBlockHotSInstMap, + MachineRegisterInfo &MRI, SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI) { MachineBasicBlock *MBB = Exp.FromBB; MachineFunction *MF = MBB->getParent(); - MachineInstr *hotVMI = inBlockHotVInstMap[MBB]; - MachineInstr *hotSMI = inBlockHotSInstMap[MBB]; + MachineInstr *HotVMI = InBlockHotVInstMap[MBB]; + MachineInstr *HotSMI = InBlockHotSInstMap[MBB]; // Exp is build with hotVMI or hotSMI, cannot mix. 
- assert(!(hotVMI && hotSMI) && "cannot mix hot MI"); - MachineInstr *hotMI = hotVMI; - if (!hotMI) { - hotMI = hotSMI; + assert(!(HotVMI && HotSMI) && "cannot mix hot MI"); + MachineInstr *HotMI = HotVMI; + if (!HotMI) { + HotMI = HotSMI; } - SlotIndex hotSlot = slotIndexes->getInstructionIndex(*hotMI).getBaseIndex(); + SlotIndex HotSlot = SlotIndexes->getInstructionIndex(*HotMI).getBaseIndex(); const bool IsModifiesScc = Exp.modifiesRegister(AMDGPU::SCC, SIRI); for (unsigned Reg : Exp.BottomRegs) { - SmallVector useMIs; + SmallVector UseMIs; for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { MachineBasicBlock *UserBB = UseMI.getParent(); // Skip current BB. @@ -2479,40 +2463,40 @@ void ApplySubExpCloneNearUserInBlock( // Skip inst in Exp. if (Exp.BottomRoots.find(&UseMI) != Exp.BottomRoots.end()) continue; - SlotIndex useSlot = - slotIndexes->getInstructionIndex(UseMI).getBaseIndex(); + SlotIndex UseSlot = + SlotIndexes->getInstructionIndex(UseMI).getBaseIndex(); // Only clone for use after hot slot. - if (useSlot < hotSlot) + if (UseSlot < HotSlot) continue; // Do not overwrite a live scc. if (IsModifiesScc && llvm::IsSccLiveAt(UserBB, &UseMI)) continue; - useMIs.emplace_back(&UseMI); + UseMIs.emplace_back(&UseMI); } - if (useMIs.empty()) + if (UseMIs.empty()) continue; DenseMap RegMap; - std::sort(useMIs.begin(), useMIs.end(), - [&slotIndexes](const MachineInstr *MIa, const MachineInstr *MIb) { - return slotIndexes->getInstructionIndex(*MIa).getBaseIndex() < - slotIndexes->getInstructionIndex(*MIb).getBaseIndex(); + std::sort(UseMIs.begin(), UseMIs.end(), + [&SlotIndexes](const MachineInstr *MIa, const MachineInstr *MIb) { + return SlotIndexes->getInstructionIndex(*MIa).getBaseIndex() < + SlotIndexes->getInstructionIndex(*MIb).getBaseIndex(); }); - auto insertPtr = useMIs.front()->getIterator(); + auto InsertPtr = UseMIs.front()->getIterator(); - for (auto it = Exp.SUnits.begin(); it != Exp.SUnits.end(); it++) { - MachineInstr *DefMI = *it; + for (auto It = Exp.SUnits.begin(); It != Exp.SUnits.end(); It++) { + MachineInstr *DefMI = *It; auto ClonedMI = - BuildMI(*MBB, insertPtr, DefMI->getDebugLoc(), DefMI->getDesc()); + BuildMI(*MBB, InsertPtr, DefMI->getDebugLoc(), DefMI->getDesc()); for (MachineOperand &Def : DefMI->defs()) { Register Reg = Def.getReg(); if (Reg.isPhysical()) { ClonedMI.addDef(Reg, 0, Def.getSubReg()); } else { - unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg)); + Register NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg)); RegMap[Reg] = NewReg; ClonedMI.addDef(NewReg, 0, Def.getSubReg()); } @@ -2527,11 +2511,11 @@ void ApplySubExpCloneNearUserInBlock( if (Reg.isPhysical()) { ClonedMI.addReg(Reg, 0, MO.getSubReg()); } else { - auto it = RegMap.find(Reg); - if (it == RegMap.end()) { + auto It = RegMap.find(Reg); + if (It == RegMap.end()) { ClonedMI.addReg(Reg, 0, MO.getSubReg()); } else { - ClonedMI.addReg(it->second, 0, MO.getSubReg()); + ClonedMI.addReg(It->second, 0, MO.getSubReg()); } } } else { @@ -2540,55 +2524,55 @@ void ApplySubExpCloneNearUserInBlock( } MachineInstr *NewDef = ClonedMI.getInstr(); - slotIndexes->insertMachineInstrInMaps(*NewDef); + SlotIndexes->insertMachineInstrInMaps(*NewDef); // Set mem operand for (MachineMemOperand *MO : DefMI->memoperands()) { NewDef->addMemOperand(*MF, MO); } } // TODO: only clone to cross hot range. 
- for (MachineInstr *UseMI : useMIs) { + for (MachineInstr *UseMI : UseMIs) { for (MachineOperand &MO : UseMI->uses()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - auto it = RegMap.find(Reg); - if (it == RegMap.end()) + Register Reg = MO.getReg(); + auto It = RegMap.find(Reg); + if (It == RegMap.end()) continue; - unsigned NewReg = it->second; + Register NewReg = It->second; MO.setReg(NewReg); } } } } -bool isInLiveSet(unsigned Reg, LaneBitmask mask, - const GCNRPTracker::LiveRegSet &live) { - auto it = live.find(Reg); - if (it == live.end()) +bool isInLiveSet(unsigned Reg, LaneBitmask Mask, + const GCNRPTracker::LiveRegSet &Live) { + auto It = Live.find(Reg); + if (It == Live.end()) return false; - LaneBitmask liveMask = it->second; - return (liveMask | mask) == liveMask; + LaneBitmask LiveMask = It->second; + return (LiveMask | Mask) == LiveMask; } unsigned getPacifistLevel(unsigned Reg, - DenseMap &pacifistLevels, + DenseMap &PacifistLevels, const MachineRegisterInfo &MRI) { - unsigned level = 0; + unsigned Level = 0; for (MachineInstr &MI : MRI.def_instructions(Reg)) { - auto it = pacifistLevels.find(&MI); - if (it == pacifistLevels.end()) + auto It = PacifistLevels.find(&MI); + if (It == PacifistLevels.end()) continue; - level = it->second; + Level = It->second; } - return level; + return Level; } bool hasInBlockDef(unsigned Reg, MachineBasicBlock *MBB, const MachineRegisterInfo &MRI) { - for (MachineInstr &def : MRI.def_instructions(Reg)) { - if (def.getParent() != MBB) + for (MachineInstr &Def : MRI.def_instructions(Reg)) { + if (Def.getParent() != MBB) continue; return true; } @@ -2596,38 +2580,36 @@ bool hasInBlockDef(unsigned Reg, MachineBasicBlock *MBB, } MachineInstr *getInBlockUniqueDef(unsigned Reg, MachineBasicBlock *MBB, - const GCNRPTracker::LiveRegSet &inputLive, - const GCNRPTracker::LiveRegSet &outputLive, + const GCNRPTracker::LiveRegSet &InputLive, const MachineRegisterInfo &MRI) { MachineInstr *DefMI = nullptr; // If live as input for MBB, cannot be unique def. - if (inputLive.count(Reg)) + if (InputLive.count(Reg)) return DefMI; - for (MachineInstr &def : MRI.def_instructions(Reg)) { - if (def.getParent() != MBB) + for (MachineInstr &Def : MRI.def_instructions(Reg)) { + if (Def.getParent() != MBB) continue; if (DefMI) { // Not unique. DefMI = nullptr; break; } - DefMI = &def; + DefMI = &Def; } return DefMI; } -bool isPassThru(unsigned Reg, const GCNRPTracker::LiveRegSet &inputLive, - const GCNRPTracker::LiveRegSet &outputLive) { - return inputLive.count(Reg) && outputLive.count(Reg); +bool isPassThru(unsigned Reg, const GCNRPTracker::LiveRegSet &InputLive, + const GCNRPTracker::LiveRegSet &OutputLive) { + return InputLive.count(Reg) && OutputLive.count(Reg); } // Instructions which only use imm/passThru reg/output only reg will not kill // any live reg, so name them pacifist here. bool collectPacifist(MachineInstr &MI, - const GCNRPTracker::LiveRegSet &inputLive, - const GCNRPTracker::LiveRegSet &outputLive, - const MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI) { + const GCNRPTracker::LiveRegSet &InputLive, + const GCNRPTracker::LiveRegSet &OutputLive, + const MachineRegisterInfo &MRI) { // If has implicit def, not move. if (MI.getDesc().NumImplicitDefs != 0) return false; @@ -2645,16 +2627,15 @@ bool collectPacifist(MachineInstr &MI, if (Reg.isPhysical()) return false; // The def for reg must be unique def in block or pass thru which not has - // def in block. If not, it is not safe to move. 
- if (!(nullptr != getInBlockUniqueDef(Reg, MI.getParent(), inputLive, - outputLive, MRI) || - (isPassThru(Reg, inputLive, outputLive) && + // def in block. If not, It is not safe to move. + if (!(nullptr != getInBlockUniqueDef(Reg, MI.getParent(), InputLive, MRI) || + (isPassThru(Reg, InputLive, OutputLive) && !hasInBlockDef(Reg, MI.getParent(), MRI)))) return false; - LaneBitmask mask = llvm::getRegMask(MO, MRI); + LaneBitmask Mask = llvm::getRegMask(MO, MRI); - if (isInLiveSet(Reg, mask, outputLive)) + if (isInLiveSet(Reg, Mask, OutputLive)) continue; return false; @@ -2666,13 +2647,12 @@ bool collectPacifist(MachineInstr &MI, if (Reg.isPhysical()) return false; - if (nullptr == - getInBlockUniqueDef(Reg, MI.getParent(), inputLive, outputLive, MRI)) + if (nullptr == getInBlockUniqueDef(Reg, MI.getParent(), InputLive, MRI)) return false; IsHasDef = true; } - // If no def, it will not increase pressure, don't mark it. + // If no def, It will not increase pressure, don't mark It. return IsHasDef; } @@ -2696,103 +2676,102 @@ static MachineInstr *findPacifistInsertPoint(MachineInstr &MI, MachineBasicBlock &MBB, MachineRegisterInfo &MRI, AliasAnalysis *AA, - SlotIndexes *slotIndexes) { + SlotIndexes *SlotIndexes) { - SmallVector users; + SmallVector Users; // We cannot move the pacifist instruction past any memory - // op with which it aliases. Find the first instruction - // that aliases the pacifist MI (if any) and add it to the list + // op with which It aliases. Find the first instruction + // that aliases the pacifist MI (if any) and add It to the list // of users. The sort() below will select the earliest user instruction. if (MachineInstr *AliasMI = findFirstAliasingLoadOrStoreInMBB(MI, MBB, AA)) { - users.push_back(AliasMI); + Users.push_back(AliasMI); } for (MachineOperand &MO : MI.defs()) { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { if (&MBB != UseMI.getParent()) continue; - users.emplace_back(&UseMI); + Users.emplace_back(&UseMI); } } - if (users.empty()) + if (Users.empty()) return nullptr; - std::sort(users.begin(), users.end(), - [&slotIndexes](const MachineInstr *MIa, MachineInstr *MIb) { + std::sort(Users.begin(), Users.end(), + [&SlotIndexes](const MachineInstr *MIa, MachineInstr *MIb) { // Early instr first. return SlotIndex::isEarlierInstr( - slotIndexes->getInstructionIndex(*MIa), - slotIndexes->getInstructionIndex(*MIb)); + SlotIndexes->getInstructionIndex(*MIa), + SlotIndexes->getInstructionIndex(*MIb)); }); - return users.front(); + return Users.front(); } // Pacifist inst will only add pressure since they don't kill. // Try to hold them as late as possible in a MBB to help pressure. 
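// A minimal sketch of the sinking idea just described, on a toy block
// (std::list of ToyInst stand-ins instead of a MachineBasicBlock; the alias
// check via findFirstAliasingLoadOrStoreInMBB and the EXEC/SCC handling of the
// real pass are omitted). A pacifist instruction reads only values that stay
// live anyway, so delaying it until just before its first in-block user
// shortens the live range of its own def without killing anything else.
#include <iterator>
#include <list>
#include <vector>

namespace pacifist_sketch {

struct ToyInst {
  int Id;
  int DefValue;          // value this instruction defines (-1 if none)
  std::vector<int> Uses; // values it reads
};

inline void sinkPacifistToFirstUser(std::list<ToyInst> &Block,
                                    std::list<ToyInst>::iterator Pacifist) {
  // Find the first later instruction that reads the pacifist's def.
  for (auto It = std::next(Pacifist); It != Block.end(); ++It) {
    bool UsesDef = false;
    for (int V : It->Uses)
      if (V == Pacifist->DefValue)
        UsesDef = true;
    if (!UsesDef)
      continue;
    if (It == std::next(Pacifist))
      return; // already adjacent, nothing to gain
    Block.splice(It, Block, Pacifist); // move it right before its first user
    return;
  }
}

} // namespace pacifist_sketch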
bool tryHoldPacifist(MachineBasicBlock &MBB, LiveIntervals *LIS, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - const SIInstrInfo *SIII, AliasAnalysis *AA, - RematStatus &status) { - const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[&MBB]; - const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[&MBB]; + AliasAnalysis *AA, RematStatus &Status) { + const GCNRPTracker::LiveRegSet InputLive = Status.MBBInputLiveMap[&MBB]; + const GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[&MBB]; - SmallVector pacifistList; + SmallVector PacifistList; LLVM_DEBUG(dbgs() << "pacifist begin\n"); for (MachineInstr &MI : MBB) { if (MI.isDebugInstr()) continue; - if (collectPacifist(MI, inputLive, outputLive, MRI, SIRI)) { - pacifistList.emplace_back(&MI); + if (collectPacifist(MI, InputLive, OutputLive, MRI)) { + PacifistList.emplace_back(&MI); LLVM_DEBUG(MI.dump()); } } LLVM_DEBUG(dbgs() << "pacifist end\n"); - SlotIndexes *slotIndexes = LIS->getSlotIndexes(); + SlotIndexes *SlotIndexes = LIS->getSlotIndexes(); bool IsUpdated = false; // Move pacifist to its first user. // for (MachineInstr *MI : pacifistList) { - for (auto it = pacifistList.rbegin(); it != pacifistList.rend(); it++) { - MachineInstr *MI = *it; - MachineInstr *firstUser = - findPacifistInsertPoint(*MI, MBB, MRI, AA, slotIndexes); - if (firstUser == MI) + for (auto It = PacifistList.rbegin(); It != PacifistList.rend(); It++) { + MachineInstr *MI = *It; + MachineInstr *FirstUser = + findPacifistInsertPoint(*MI, MBB, MRI, AA, SlotIndexes); + if (FirstUser == MI) continue; - if (firstUser == MI->getNextNode()) + if (FirstUser == MI->getNextNode()) continue; - auto insertPoint = MBB.getFirstInstrTerminator(); - if (firstUser) { - insertPoint = firstUser->getIterator(); + auto InsertPoint = MBB.getFirstInstrTerminator(); + if (FirstUser) { + InsertPoint = FirstUser->getIterator(); } else { // When there's no terminator. - if (insertPoint == MBB.end()) - insertPoint--; + if (InsertPoint == MBB.end()) + InsertPoint--; else - // BRANCH may have exec update before it. - insertPoint--; - - insertPoint = - llvm::skipDebugInstructionsBackward(insertPoint, MBB.instr_begin()); - - while ((insertPoint->definesRegister(AMDGPU::EXEC, SIRI) || - insertPoint->definesRegister(AMDGPU::EXEC_LO, SIRI)) && - insertPoint != MI->getIterator()) { - insertPoint--; - insertPoint = - llvm::skipDebugInstructionsBackward(insertPoint, MBB.instr_begin()); + // BRANCH may have exec update before It. + InsertPoint--; + + InsertPoint = + llvm::skipDebugInstructionsBackward(InsertPoint, MBB.instr_begin()); + + while ((InsertPoint->definesRegister(AMDGPU::EXEC, SIRI) || + InsertPoint->definesRegister(AMDGPU::EXEC_LO, SIRI)) && + InsertPoint != MI->getIterator()) { + InsertPoint--; + InsertPoint = + llvm::skipDebugInstructionsBackward(InsertPoint, MBB.instr_begin()); } - if (insertPoint == MI->getIterator()) + if (InsertPoint == MI->getIterator()) continue; } // Do not overwrite a live scc. 
- if (WillSmashSccAtLocation(MI, &MBB, insertPoint)) + if (willSmashSccAtLocation(MI, &MBB, InsertPoint)) continue; MI->removeFromParent(); - MBB.insert(insertPoint, MI); + MBB.insert(InsertPoint, MI); LIS->handleMove(*MI); IsUpdated = true; @@ -2813,16 +2792,16 @@ collectUniformVgprs(Remat *Remat, MachineFunction &MF, MachineRegisterInfo &MRI, continue; if (MI.getNumDefs() != 1) continue; - unsigned dstIdx = + unsigned DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst); - if (dstIdx == (unsigned)-1) + if (DstIdx == (unsigned)-1) continue; - MachineOperand &DstMO = MI.getOperand(dstIdx); + MachineOperand &DstMO = MI.getOperand(DstIdx); if (DstMO.getSubReg() != 0) continue; if (DstMO.isTied()) continue; - unsigned Reg = DstMO.getReg(); + Register Reg = DstMO.getReg(); if (MRI.getUniqueVRegDef(Reg) == nullptr) continue; @@ -2839,22 +2818,21 @@ collectUniformVgprs(Remat *Remat, MachineFunction &MF, MachineRegisterInfo &MRI, return UniformMap; } -// Try insert readfirstlane on uniform vgpr to turn it in sgpr and save vgpr +// Try insert readfirstlane on uniform vgpr to turn It in sgpr and save vgpr // pressure. bool collectVToSCrossHotSpot( - MachineBasicBlock &MBB, RematStatus &status, + MachineBasicBlock &MBB, RematStatus &Status, DenseMap &UniformMap, - SmallMapVector &VToSMap, LiveIntervals *LIS) -{ - unsigned VLimit = status.TargetVLimit; - unsigned SLimit = status.TargetSLimit; + SmallMapVector &VToSMap, LiveIntervals *LIS) { + unsigned VLimit = Status.TargetVLimit; + unsigned SLimit = Status.TargetSLimit; auto &ST = MBB.getParent()->getSubtarget(); GCNDownwardRPTracker Tracker(*LIS); bool IsUpdated = false; - const auto inputLive = status.MBBInputLiveMap[&MBB]; - Tracker.reset(*MBB.begin(), &inputLive); + const auto InputLive = Status.MBBInputLiveMap[&MBB]; + Tracker.reset(*MBB.begin(), &InputLive); for (MachineInstr &MI : MBB) { if (MI.isDebugInstr()) { continue; @@ -2876,8 +2854,8 @@ bool collectVToSCrossHotSpot( // Try to make all possible vtos to reduce vpressure. const GCNRPTracker::LiveRegSet &CurLives = Tracker.getLiveRegs(); - for (auto it : CurLives) { - unsigned Reg = it.first; + for (auto It : CurLives) { + unsigned Reg = It.first; auto UniformIt = UniformMap.find(Reg); if (UniformIt == UniformMap.end()) continue; @@ -2889,53 +2867,53 @@ bool collectVToSCrossHotSpot( } // Return true if the user is outside of the def's loop. 
-static bool IsCrossLoopUse(MachineInstr *Def, MachineInstr *User, +static bool isCrossLoopUse(MachineInstr *Def, MachineInstr *User, MachineLoopInfo *MLI) { MachineLoop *L = MLI->getLoopFor(Def->getParent()); return L && !L->contains(User->getParent()); } -bool rematUniformVgprToSgpr( - Remat *Remat, MachineFunction &MF, RematStatus &status, - DenseMap &MBBPressureMap, - std::vector &hotBlocks, LiveIntervals *LIS, - MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - const SIInstrInfo *SIII, MachineLoopInfo *MLI) { +bool rematUniformVgprToSgpr(Remat *Remat, MachineFunction &MF, + RematStatus &Status, + std::vector &HotBlocks, + LiveIntervals *LIS, MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, + MachineLoopInfo *MLI) { DenseMap UniformVgprMap = collectUniformVgprs(Remat, MF, MRI, SIRI); SmallMapVector VToSMap; - for (auto &hotBlock : hotBlocks) { - MachineBasicBlock &MBB = *hotBlock.MBB; - collectVToSCrossHotSpot(MBB, status, UniformVgprMap, VToSMap, LIS); + for (auto &HotBlock : HotBlocks) { + MachineBasicBlock &MBB = *HotBlock.MBB; + collectVToSCrossHotSpot(MBB, Status, UniformVgprMap, VToSMap, LIS); } if (VToSMap.empty()) return false; - SlotIndexes *slotIndexes = LIS->getSlotIndexes(); + SlotIndexes *SlotIndexes = LIS->getSlotIndexes(); const MCInstrDesc &ReadFirstLaneDesc = SIII->get(AMDGPU::V_READFIRSTLANE_B32); - for (auto it : VToSMap) { - unsigned Reg = it.first; - MachineInstr *MI = it.second; + for (auto It : VToSMap) { + unsigned Reg = It.first; + MachineInstr *MI = It.second; auto *VRC = SIRI->getRegClassForReg(MRI, Reg); // TODO: support bigger vgpr to sgpr. if (VRC != &AMDGPU::VGPR_32RegClass) continue; auto *NewRC = SIRI->getEquivalentSGPRClass(VRC); - unsigned newDst = MRI.createVirtualRegister(NewRC); + Register NewDst = MRI.createVirtualRegister(NewRC); auto ReadFirstLane = - BuildMI(MF, MI->getDebugLoc(), ReadFirstLaneDesc, newDst); - SmallVector userMIs; - for (MachineInstr &userMI : MRI.use_nodbg_instructions(Reg)) { + BuildMI(MF, MI->getDebugLoc(), ReadFirstLaneDesc, NewDst); + SmallVector UserMIs; + for (MachineInstr &UserMI : MRI.use_nodbg_instructions(Reg)) { // Do not replace v->s across loops. Even if the value is uniform // branch divergence can cause a uniform value in a loop to be // non-uniform when used outside a loop. 
- if (IsSafeRematCandidateUser(&userMI, SIII) && - !IsCrossLoopUse(MI, &userMI, MLI)) - userMIs.emplace_back(&userMI); + if (isSafeRematCandidateUser(&UserMI, SIII) && + !isCrossLoopUse(MI, &UserMI, MLI)) + UserMIs.emplace_back(&UserMI); } // Finish readfirstlane @@ -2945,32 +2923,32 @@ bool rematUniformVgprToSgpr( Remat->SafeToRemoveInsts.insert(VToSMI); MachineBasicBlock *MBB = MI->getParent(); MBB->insertAfter(MI->getIterator(), VToSMI); - slotIndexes->insertMachineInstrInMaps(*VToSMI); + SlotIndexes->insertMachineInstrInMaps(*VToSMI); - for (MachineInstr *userMI : userMIs) { - const auto &Desc = userMI->getDesc(); + for (MachineInstr *UserMI : UserMIs) { + const auto &Desc = UserMI->getDesc(); bool IsIllegal = false; - for (unsigned i = 0; i < userMI->getNumOperands(); i++) { - MachineOperand &MO = userMI->getOperand(i); + for (unsigned I = 0; I < UserMI->getNumOperands(); I++) { + MachineOperand &MO = UserMI->getOperand(I); if (!MO.isReg()) continue; if (MO.isDef()) continue; if (MO.getReg() != Reg) continue; - if (i >= Desc.getNumOperands()) { + if (I >= Desc.getNumOperands()) { IsIllegal = true; break; } - MO.setReg(newDst); - if (userMI->getDesc().operands()[i].RegClass != -1) { - if (!SIII->isOperandLegal(*userMI, i, &MO)) { - SIII->legalizeOperands(*userMI); + MO.setReg(NewDst); + if (UserMI->getDesc().operands()[I].RegClass != -1) { + if (!SIII->isOperandLegal(*UserMI, I, &MO)) { + SIII->legalizeOperands(*UserMI); // In case legalizeOperands not help, just legalize with mov. - if (userMI->getDesc().operands()[i].RegClass != -1 && - !SIII->isOperandLegal(*userMI, i)) { - SIII->legalizeOpWithMove(*userMI, i); + if (UserMI->getDesc().operands()[I].RegClass != -1 && + !SIII->isOperandLegal(*UserMI, I)) { + SIII->legalizeOpWithMove(*UserMI, I); } } } else { @@ -2980,12 +2958,12 @@ bool rematUniformVgprToSgpr( if (IsIllegal) continue; - auto rit = userMI->getReverseIterator(); - rit++; - auto endIt = userMI->getParent()->rend(); - while (rit != endIt && !rit->isDebugInstr() && - !slotIndexes->hasIndex(*rit)) - slotIndexes->insertMachineInstrInMaps(*(rit++)); + auto RIt = UserMI->getReverseIterator(); + RIt++; + auto EndIt = UserMI->getParent()->rend(); + while (RIt != EndIt && !RIt->isDebugInstr() && + !SlotIndexes->hasIndex(*RIt)) + SlotIndexes->insertMachineInstrInMaps(*(RIt++)); } } @@ -2993,19 +2971,17 @@ bool rematUniformVgprToSgpr( } bool collectRematableHotReg( - MachineInstr &MI, const GCNRPTracker::LiveRegSet &hotLive, - GCNRPTracker::LiveRegSet &pureHotRematSet, - DenseMap &pureHotRematLevels, unsigned &DefReg, - const GCNRPTracker::LiveRegSet &inputLive, - const GCNRPTracker::LiveRegSet &outputLive, const MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI) { + MachineInstr &MI, const GCNRPTracker::LiveRegSet &HotLive, + GCNRPTracker::LiveRegSet &PureHotRematSet, + DenseMap &PureHotRematLevels, unsigned &DefReg, + const GCNRPTracker::LiveRegSet &InputLive, const MachineRegisterInfo &MRI) { // Ignore inst not have def or more than 1 def. if (MI.getDesc().getNumDefs() != 1) return false; DefReg = MI.defs().begin()->getReg(); - unsigned level = 0; + unsigned Level = 0; for (MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; @@ -3016,7 +2992,7 @@ bool collectRematableHotReg( // If user is in same MI like // %4:vgpr_32 = V_MAD_LEGACY_F32 %2:vgpr_32, %3:vgpr_32, %4:vgpr_32 - // remat it will not help. + // remat It will not help. 
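// A simplified model of the use-rewrite step in rematUniformVgprToSgpr above.
// ToyOperand/ToyUser are hypothetical stand-ins: each use of the old uniform
// VGPR is redirected to the new SGPR copy only when that operand slot can take
// an SGPR; the real pass instead checks isOperandLegal and falls back to
// legalizeOperands / legalizeOpWithMove, which this sketch does not model.
#include <vector>

namespace vtos_sketch {

struct ToyOperand {
  int Reg;
  bool AcceptsSgpr;
};

struct ToyUser {
  std::vector<ToyOperand> Operands;
};

// Returns how many operands were redirected to the SGPR copy.
inline unsigned rewriteUniformUses(std::vector<ToyUser> &Users, int OldVgpr,
                                   int NewSgpr) {
  unsigned Rewritten = 0;
  for (ToyUser &U : Users)
    for (ToyOperand &Op : U.Operands) {
      if (Op.Reg != OldVgpr)
        continue;
      if (!Op.AcceptsSgpr)
        continue; // would need extra legalization; skipped in this sketch
      Op.Reg = NewSgpr;
      ++Rewritten;
    }
  return Rewritten;
}

} // namespace vtos_sketch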
if (Reg == DefReg) { return false; } @@ -3026,18 +3002,17 @@ bool collectRematableHotReg( if (Reg.isPhysical()) return false; - if (nullptr == - getInBlockUniqueDef(Reg, MI.getParent(), inputLive, outputLive, MRI)) + if (nullptr == getInBlockUniqueDef(Reg, MI.getParent(), InputLive, MRI)) return false; - LaneBitmask mask = llvm::getRegMask(MO, MRI); + LaneBitmask Mask = llvm::getRegMask(MO, MRI); - if (isInLiveSet(Reg, mask, hotLive)) + if (isInLiveSet(Reg, Mask, HotLive)) continue; - if (isInLiveSet(Reg, mask, pureHotRematSet)) { - unsigned regLevel = getPacifistLevel(Reg, pureHotRematLevels, MRI); - level = std::max(level, regLevel); + if (isInLiveSet(Reg, Mask, PureHotRematSet)) { + unsigned RegLevel = getPacifistLevel(Reg, PureHotRematLevels, MRI); + Level = std::max(Level, RegLevel); continue; } @@ -3050,46 +3025,44 @@ bool collectRematableHotReg( if (Reg.isPhysical()) return false; - if (nullptr == - getInBlockUniqueDef(Reg, MI.getParent(), inputLive, outputLive, MRI)) + if (nullptr == getInBlockUniqueDef(Reg, MI.getParent(), InputLive, MRI)) return false; - LaneBitmask mask = llvm::getRegMask(MO, MRI); - pureHotRematSet[Reg] |= mask; + LaneBitmask Mask = llvm::getRegMask(MO, MRI); + PureHotRematSet[Reg] |= Mask; } - pureHotRematLevels[&MI] = level + 1; - // If no def, it will not increase pressure, don't mark it. + PureHotRematLevels[&MI] = Level + 1; + // If no def, It will not increase pressure, don't mark It. return true; } -bool tryRemat(MachineBasicBlock &MBB, MachineInstr *hotMI, - std::vector &inBlockCloneSubExps, bool IsVGPR, - const GCNRPTracker::LiveRegSet &inputLive, - const GCNRPTracker::LiveRegSet &outputLive, - DenseSet &hotSet, int vDistance, int sDistance, +bool tryRemat(MachineBasicBlock &MBB, MachineInstr *HotMi, + std::vector &InBlockCloneSubExps, bool IsVGPR, + const GCNRPTracker::LiveRegSet &InputLive, + DenseSet &HotSet, int VDistance, int SDistance, unsigned VLimit, unsigned SLimit, const DenseSet &MemWriteMBBSet, LiveIntervals *LIS, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { auto &ST = MBB.getParent()->getSubtarget(); - const auto &SI = LIS->getInstructionIndex(*hotMI).getBaseIndex(); + const auto &SI = LIS->getInstructionIndex(*HotMi).getBaseIndex(); const auto LISLR = llvm::getLiveRegs(SI, *LIS, MRI); - GCNRPTracker::LiveRegSet hotLive = LISLR; + GCNRPTracker::LiveRegSet HotLive = LISLR; - GCNRPTracker::LiveRegSet pureHotRematSet; - std::vector pureHotRematList; - DenseMap pureHotRematLevels; + GCNRPTracker::LiveRegSet PureHotRematSet; + std::vector PureHotRematList; + DenseMap PureHotRematLevels; - GCNRPTracker::LiveRegSet outputSet; + GCNRPTracker::LiveRegSet OutputSet; LLVM_DEBUG(dbgs() << "pure hot remat begin\n"); // Find reg which could remat from other reg in liveSet. - const unsigned kMaxRematLevel = 6; + const unsigned KMaxRematLevel = 6; GCNDownwardRPTracker Tracker(*LIS); - Tracker.reset(*MBB.begin(), &inputLive); - for (auto it = MBB.begin(); it != MBB.end(); it++) { - MachineInstr &MI = *it; + Tracker.reset(*MBB.begin(), &InputLive); + for (auto It = MBB.begin(); It != MBB.end(); It++) { + MachineInstr &MI = *It; const GCNRegPressure &RP = Tracker.getPressure(); if (MI.isDebugInstr()) @@ -3103,31 +3076,31 @@ bool tryRemat(MachineBasicBlock &MBB, MachineInstr *hotMI, } // Stop at hotMI. 
- if (&MI == hotMI) + if (&MI == HotMi) break; Tracker.advance(); unsigned DefReg = 0; - if (collectRematableHotReg(MI, hotLive, pureHotRematSet, pureHotRematLevels, - DefReg, inputLive, outputLive, MRI, SIRI)) { - unsigned level = pureHotRematLevels[&MI]; - if (level >= kMaxRematLevel) + if (collectRematableHotReg(MI, HotLive, PureHotRematSet, PureHotRematLevels, + DefReg, InputLive, MRI)) { + unsigned Level = PureHotRematLevels[&MI]; + if (Level >= KMaxRematLevel) continue; // If the def reg is in hot reg. // Add to output. - if (hotLive.find(DefReg) != hotLive.end()) { + if (HotLive.find(DefReg) != HotLive.end()) { bool IsUserIsHot = false; for (MachineInstr &UseMI : MRI.use_nodbg_instructions(DefReg)) { if (UseMI.getParent() != &MBB) continue; - if (0 == hotSet.count(&UseMI)) + if (0 == HotSet.count(&UseMI)) continue; - const auto &useSI = LIS->getInstructionIndex(UseMI).getBaseIndex(); - // When has a hot user after hotMI, remat it may not help. - if (useSI > SI) { + const auto &UseSI = LIS->getInstructionIndex(UseMI).getBaseIndex(); + // When has a hot user after hotMI, remat It may not help. + if (UseSI > SI) { IsUserIsHot = true; break; } @@ -3135,14 +3108,14 @@ bool tryRemat(MachineBasicBlock &MBB, MachineInstr *hotMI, if (IsUserIsHot) continue; - outputSet[DefReg]; + OutputSet[DefReg]; LLVM_DEBUG(dbgs() << "hotRemat:"); LLVM_DEBUG(MI.getOperand(0).dump()); - // remove it from hotLive to avoid it as input when build dag. - hotLive.erase(DefReg); + // remove It from hotLive to avoid It as input when build dag. + HotLive.erase(DefReg); } - pureHotRematList.emplace_back(&MI); - LLVM_DEBUG(dbgs() << "level:" << level); + PureHotRematList.emplace_back(&MI); + LLVM_DEBUG(dbgs() << "level:" << Level); LLVM_DEBUG(MI.dump()); } } @@ -3154,82 +3127,82 @@ bool tryRemat(MachineBasicBlock &MBB, MachineInstr *hotMI, // Build SubExp with pureHotRematList as Nodes, hotLive as input // rematHot as output. // Not join input when build ExpDag to get small subExps. - ExpDag dag(MRI, SIRI, SIII, /*IsJoinInput*/ false); - dag.build(hotLive, outputSet, pureHotRematList); + ExpDag Dag(MRI, SIRI, SIII, /*IsJoinInput*/ false); + Dag.build(HotLive, OutputSet, PureHotRematList); // Find best subExp add to inBlockCloneSubExps. // Sort by size of subExp. - std::sort(dag.SubExps.begin(), dag.SubExps.end(), + std::sort(Dag.SubExps.begin(), Dag.SubExps.end(), [](const SubExp &A, const SubExp &B) { return A.SUnits.size() < B.SUnits.size(); }); - std::vector cloneSubExps; - int distance = IsVGPR ? vDistance : sDistance; - for (SubExp &subExp : dag.SubExps) { - if (subExp.IsNotSafeToCopy) + std::vector CloneSubExps; + int Distance = IsVGPR ? VDistance : SDistance; + for (SubExp &SubExp : Dag.SubExps) { + if (SubExp.IsNotSafeToCopy) continue; if (IsVGPR) { - if (subExp.vOutputSize == 0) + if (SubExp.vOutputSize == 0) continue; } else { - if (subExp.sOutputSize == 0) + if (SubExp.sOutputSize == 0) continue; } - if (!subExp.isSafeToMove(MRI, /*IsMoveUp*/ false)) + if (!SubExp.isSafeToMove(MRI, /*IsMoveUp*/ false)) continue; // Not clone . - if (subExp.SUnits.size() > 10) + if (SubExp.SUnits.size() > 10) continue; // Do not allow remat in the block when the expression has a memory op and // the block has a write. We could allow this in some cases with better // analysis. 
- if (subExp.IsHasMemInst && MemWriteMBBSet.count(&MBB)) + if (SubExp.IsHasMemInst && MemWriteMBBSet.count(&MBB)) continue; if (IsVGPR) { - distance -= subExp.vOutputSize; + Distance -= SubExp.vOutputSize; } else { - distance -= subExp.sOutputSize; + Distance -= SubExp.sOutputSize; } - cloneSubExps.emplace_back(subExp); - if (distance <= 0) + CloneSubExps.emplace_back(SubExp); + if (Distance <= 0) break; } - if (distance <= 0) { - inBlockCloneSubExps.insert(inBlockCloneSubExps.end(), cloneSubExps.begin(), - cloneSubExps.end()); + if (Distance <= 0) { + InBlockCloneSubExps.insert(InBlockCloneSubExps.end(), CloneSubExps.begin(), + CloneSubExps.end()); } - return distance <= 0; + return Distance <= 0; } // Try to remat live reg in hot spot from other live reg in hot spot. // bool tryRematInHotSpot( - MachineBasicBlock &MBB, RematStatus &status, int vDistance, int sDistance, - int vSaved, int sSaved, std::vector &inBlockCloneSubExps, - DenseMap &inBlockHotVInstMap, - DenseMap &inBlockHotSInstMap, + MachineBasicBlock &MBB, RematStatus &Status, int VDistance, int SDistance, + int VSaved, int SSaved, std::vector &InBlockCloneSubExps, + DenseMap &InBlockHotVInstMap, + DenseMap &InBlockHotSInstMap, LiveIntervals *LIS, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { - unsigned VLimit = status.TargetVLimit; - unsigned SLimit = status.TargetSLimit; + unsigned VLimit = Status.TargetVLimit; + unsigned SLimit = Status.TargetSLimit; auto &ST = MBB.getParent()->getSubtarget(); - const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[&MBB]; + const GCNRPTracker::LiveRegSet InputLive = Status.MBBInputLiveMap[&MBB]; - const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[&MBB]; + const GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[&MBB]; // Collect reg pressure. - unsigned maxLocalVPressure = 0; - unsigned maxLocalSPressure = 0; + unsigned MaxLocalVPressure = 0; + unsigned MaxLocalSPressure = 0; // Build a DAG or only on demand? - MachineInstr *hotVMI = nullptr; - MachineInstr *hotSMI = nullptr; - DenseSet hotSet; + MachineInstr *HotVMI = nullptr; + MachineInstr *HotSMI = nullptr; + DenseSet HotSet; GCNDownwardRPTracker Tracker(*LIS); - Tracker.reset(*MBB.begin(), &inputLive); - for (auto it = MBB.begin(); it != MBB.end(); it++) { - MachineInstr &MI = *it; + Tracker.reset(*MBB.begin(), &InputLive); + for (auto It = MBB.begin(); It != MBB.end(); It++) { + MachineInstr &MI = *It; if (MI.isDebugInstr()) { continue; } @@ -3239,42 +3212,42 @@ bool tryRematInHotSpot( SPressure += RegForVCC; - VPressure -= vSaved; - SPressure -= sSaved; + VPressure -= VSaved; + SPressure -= SSaved; Tracker.advance(); if (VPressure <= VLimit && SPressure <= SLimit) { continue; } - hotSet.insert(&MI); - if (maxLocalVPressure < VPressure) { - maxLocalVPressure = VPressure; - hotVMI = &MI; + HotSet.insert(&MI); + if (MaxLocalVPressure < VPressure) { + MaxLocalVPressure = VPressure; + HotVMI = &MI; } - if (maxLocalSPressure < SPressure) { - maxLocalSPressure = SPressure; - hotSMI = &MI; + if (MaxLocalSPressure < SPressure) { + MaxLocalSPressure = SPressure; + HotSMI = &MI; } } - inBlockHotVInstMap[&MBB] = hotVMI; - inBlockHotSInstMap[&MBB] = hotSMI; - if (vDistance > 0 && hotVMI) { + InBlockHotVInstMap[&MBB] = HotVMI; + InBlockHotSInstMap[&MBB] = HotSMI; + if (VDistance > 0 && HotVMI) { // Use hotVMI when apply. 
-    inBlockHotSInstMap[&MBB] = nullptr;
-    if (tryRemat(MBB, hotVMI, inBlockCloneSubExps, /*IsVGPR*/ true, inputLive,
-                 outputLive, hotSet, vDistance, sDistance, VLimit, SLimit,
-                 status.MemWriteMBBSet, LIS, MRI, SIRI, SIII))
+    InBlockHotSInstMap[&MBB] = nullptr;
+    if (tryRemat(MBB, HotVMI, InBlockCloneSubExps, /*IsVGPR*/ true, InputLive,
+                 HotSet, VDistance, SDistance, VLimit, SLimit,
+                 Status.MemWriteMBBSet, LIS, MRI, SIRI, SIII))
       return true;
   }
-  if (sDistance > 0 && hotSMI) {
+  if (SDistance > 0 && HotSMI) {
     // Use hotSMI when apply.
-    inBlockHotSInstMap[&MBB] = hotSMI;
-    inBlockHotVInstMap[&MBB] = nullptr;
-    return tryRemat(MBB, hotSMI, inBlockCloneSubExps, /*IsVGPR*/ false,
-                    inputLive, outputLive, hotSet, vDistance, sDistance, VLimit,
-                    SLimit, status.MemWriteMBBSet, LIS, MRI, SIRI, SIII);
+    InBlockHotSInstMap[&MBB] = HotSMI;
+    InBlockHotVInstMap[&MBB] = nullptr;
+    return tryRemat(MBB, HotSMI, InBlockCloneSubExps, /*IsVGPR*/ false,
+                    InputLive, HotSet, VDistance, SDistance, VLimit, SLimit,
+                    Status.MemWriteMBBSet, LIS, MRI, SIRI, SIII);
   }
   return false;
 }
@@ -3282,9 +3255,9 @@ bool tryRematInHotSpot(
 // If subExp0 use result of subExp1, subExp0 is deeper than subExp1.
 // When apply subExp1 before subExp0, new clone of subExp0 which use result of
 // subExp1 will have old reg of subExp1. And reg pressure will not be reduced.
-void sortSubExpCandidates(std::vector &subExpCandidates) {
-  MapVector> inputMap;
-  MapVector> outputMap;
+void sortSubExpCandidates(std::vector &SubExpCandidates) {
+  MapVector> InputMap;
+  MapVector> OutputMap;
   struct SortNode {
     SubExp Exp;
     unsigned Depth;
@@ -3295,67 +3268,67 @@ void sortSubExpCandidates(std::vector &subExpCandidates) {
   {
     SmallVector RegSortStorage;
-    for (SubExp &Exp : subExpCandidates) {
+    for (SubExp &Exp : SubExpCandidates) {
       RegSortStorage.assign(Exp.TopRegs.begin(), Exp.TopRegs.end());
       std::sort(RegSortStorage.begin(), RegSortStorage.end());
-      for (auto it : RegSortStorage) {
-        unsigned Reg = it;
-        inputMap[Reg].insert(&Exp);
+      for (auto It : RegSortStorage) {
+        unsigned Reg = It;
+        InputMap[Reg].insert(&Exp);
       }
       RegSortStorage.assign(Exp.BottomRegs.begin(), Exp.BottomRegs.end());
       std::sort(RegSortStorage.begin(), RegSortStorage.end());
-      for (auto it : RegSortStorage) {
-        unsigned Reg = it;
-        outputMap[Reg].insert(&Exp);
+      for (auto It : RegSortStorage) {
+        unsigned Reg = It;
+        OutputMap[Reg].insert(&Exp);
       }
     }
   }
-  MapVector sortMap;
-  for (auto it : inputMap) {
-    unsigned Reg = it.first;
-    auto outIt = outputMap.find(Reg);
-    if (outIt == outputMap.end())
+  MapVector SortMap;
+  for (auto It : InputMap) {
+    unsigned Reg = It.first;
+    auto OutIt = OutputMap.find(Reg);
+    if (OutIt == OutputMap.end())
       continue;
-    auto &inExps = it.second;
-    auto &outExps = outIt->second;
-    for (SubExp *inExp : inExps) {
-      for (SubExp *outExp : outExps) {
-        if (inExp->IsHoist != outExp->IsHoist) {
+    auto &InExps = It.second;
+    auto &OutExps = OutIt->second;
+    for (SubExp *InExp : InExps) {
+      for (SubExp *OutExp : OutExps) {
+        if (InExp->IsHoist != OutExp->IsHoist) {
           // Different direction.
           // If output (def) move up, input(use) move down, nothing happens.
-          if (outExp->IsHoist)
+          if (OutExp->IsHoist)
             continue;
           // Canot input(use) move up, output(def) move down.
          // Choose the exp which save more.
- int inExpGain = inExp->vOutputSize - inExp->vInputSize; - int outExpGain = outExp->vInputSize - inExp->vOutputSize; - if (inExpGain >= outExpGain) { - outExp->SUnits.clear(); + int InExpGain = InExp->vOutputSize - InExp->vInputSize; + int OutExpGain = OutExp->vInputSize - InExp->vOutputSize; + if (InExpGain >= OutExpGain) { + OutExp->SUnits.clear(); } else { - inExp->SUnits.clear(); + InExp->SUnits.clear(); } continue; } // Link outExp to inExp. - if (inExp->IsHoist) { - sortMap[outExp].Preds.insert(inExp); - sortMap[inExp].Succs.insert(outExp); + if (InExp->IsHoist) { + SortMap[OutExp].Preds.insert(InExp); + SortMap[InExp].Succs.insert(OutExp); } else { - sortMap[inExp].Preds.insert(outExp); - sortMap[outExp].Succs.insert(inExp); + SortMap[InExp].Preds.insert(OutExp); + SortMap[OutExp].Succs.insert(InExp); } } } } - if (sortMap.empty()) + if (SortMap.empty()) return; SmallVector WorkList; - for (SubExp &Exp : subExpCandidates) { - SortNode &Node = sortMap[&Exp]; + for (SubExp &Exp : SubExpCandidates) { + SortNode &Node = SortMap[&Exp]; Node.Depth = 0; Node.Exp = Exp; Node.IsDepthDirty = !Node.Preds.empty(); @@ -3365,13 +3338,13 @@ void sortSubExpCandidates(std::vector &subExpCandidates) { // Calc depth. while (!WorkList.empty()) { SubExp *Exp = WorkList.pop_back_val(); - SortNode &Node = sortMap[Exp]; + SortNode &Node = SortMap[Exp]; for (SubExp *Succ : Node.Succs) { - SortNode &SuccNode = sortMap[Succ]; + SortNode &SuccNode = SortMap[Succ]; SuccNode.Depth = std::max(SuccNode.Depth, Node.Depth + 1); bool IsAllPrevClean = true; for (SubExp *Prev : SuccNode.Preds) { - SortNode &PrevNode = sortMap[Prev]; + SortNode &PrevNode = SortMap[Prev]; if (PrevNode.IsDepthDirty) { IsAllPrevClean = false; break; @@ -3384,35 +3357,35 @@ void sortSubExpCandidates(std::vector &subExpCandidates) { } } - std::vector nodes; - for (auto &it : sortMap) { - SortNode &node = it.second; - nodes.emplace_back(&node); + std::vector Nodes; + for (auto &It : SortMap) { + SortNode &Node = It.second; + Nodes.emplace_back(&Node); } - struct sorter { - bool operator()(const SortNode *a, const SortNode *b) { - return a->Depth > b->Depth; + struct Sorter { + bool operator()(const SortNode *A, const SortNode *B) { + return A->Depth > B->Depth; } }; // subExp deeper should be apply first. - std::sort(nodes.begin(), nodes.end(), sorter()); + std::sort(Nodes.begin(), Nodes.end(), Sorter()); - subExpCandidates.clear(); - for (auto &node : nodes) { - subExpCandidates.emplace_back(node->Exp); + SubExpCandidates.clear(); + for (auto &Node : Nodes) { + SubExpCandidates.emplace_back(Node->Exp); } } // Compare pressure, return ture if maxV0/maxS0 pressure is higher than // maxV1/maxS1. -bool pressureHigher(unsigned maxV0, unsigned maxS0, unsigned maxV1, - unsigned maxS1, const GCNSubtarget *ST) { - unsigned VTgtOcc0 = ST->getOccupancyWithNumVGPRs(maxV0); - unsigned VTgtOcc1 = ST->getOccupancyWithNumVGPRs(maxV1); - unsigned STgtOcc0 = ST->getOccupancyWithNumSGPRs(maxS0); - unsigned STgtOcc1 = ST->getOccupancyWithNumSGPRs(maxS1); +bool pressureHigher(unsigned MaxV0, unsigned MaxS0, unsigned MaxV1, + unsigned MaxS1, const GCNSubtarget *ST) { + unsigned VTgtOcc0 = ST->getOccupancyWithNumVGPRs(MaxV0); + unsigned VTgtOcc1 = ST->getOccupancyWithNumVGPRs(MaxV1); + unsigned STgtOcc0 = ST->getOccupancyWithNumSGPRs(MaxS0); + unsigned STgtOcc1 = ST->getOccupancyWithNumSGPRs(MaxS1); unsigned Occ0 = std::min(VTgtOcc0, STgtOcc0); unsigned Occ1 = std::min(VTgtOcc1, STgtOcc1); // is low pressure. 
@@ -3422,146 +3395,146 @@ bool pressureHigher(unsigned maxV0, unsigned maxS0, unsigned maxV1, return true; // When sgpr bound, is high pressure. if (VTgtOcc0 > STgtOcc0 && VTgtOcc1 > STgtOcc1) { - return maxS0 > maxS1; + return MaxS0 > MaxS1; } // When vgpr bound or mix, vgpr higher is higher pressure. - return maxV0 > maxV1; + return MaxV0 > MaxV1; } // Return true if the subExp can help pressure for passThrus. -bool canHelpPressureWhenSink( - SubExp &subExp, const GCNRPTracker::LiveRegSet &passThrus, - const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - const SIInstrInfo *SIII, const MachineLoopInfo *MLI, - MachineDominatorTree *DT, bool IsCanClone, bool IsSgprBound) { - LLVM_DEBUG(subExp.dump(MRI, SIRI)); - if (!subExp.isSafeToMove(MRI, /*IsMoveUp*/ false)) +bool canHelpPressureWhenSink(SubExp &SubExp, + const GCNRPTracker::LiveRegSet &PassThrus, + const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, + const MachineLoopInfo *MLI, + MachineDominatorTree *DT, bool IsCanClone, + bool IsSgprBound) { + LLVM_DEBUG(SubExp.dump(MRI, SIRI)); + if (!SubExp.isSafeToMove(MRI, /*IsMoveUp*/ false)) return false; // Update input size to ignore lives in which already in // passThrus. - for (auto it : subExp.inputLive) { - unsigned Reg = it.first; - if (passThrus.count(Reg) == 0) + for (auto It : SubExp.inputLive) { + unsigned Reg = It.first; + if (PassThrus.count(Reg) == 0) continue; - unsigned Size = getRegSize(Reg, it.second, MRI, SIRI); + unsigned Size = getRegSize(Reg, It.second, MRI, SIRI); if (SIRI->isVGPR(MRI, Reg)) { - subExp.vInputSize -= Size; + SubExp.vInputSize -= Size; } else { - subExp.sInputSize -= Size; + SubExp.sInputSize -= Size; } } - if (subExp.vInputSize > subExp.vOutputSize) + if (SubExp.vInputSize > SubExp.vOutputSize) return false; - if (subExp.sInputSize > subExp.sOutputSize && IsSgprBound) + if (SubExp.sInputSize > SubExp.sOutputSize && IsSgprBound) return false; - if (subExp.sInputSize >= subExp.sOutputSize && - subExp.vInputSize == subExp.vOutputSize) + if (SubExp.sInputSize >= SubExp.sOutputSize && + SubExp.vInputSize == SubExp.vOutputSize) return false; // Try to find a Insert Block. // Skip multi def output sub exp. // Collect user blocks, find common dom. - BlockSet userBlocks; - for (unsigned Reg : subExp.BottomRegs) { + BlockSet UserBlocks; + for (unsigned Reg : SubExp.BottomRegs) { for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { MachineBasicBlock *UserBB = UseMI.getParent(); // Skip current BB. - if (UserBB != subExp.FromBB) - userBlocks.insert(UserBB); + if (UserBB != SubExp.FromBB) + UserBlocks.insert(UserBB); } } - if (userBlocks.empty()) + if (UserBlocks.empty()) return false; - MachineBasicBlock *userBlock = NearestCommonDominator(DT, userBlocks); - if (!DT->dominates(subExp.FromBB, userBlock)) { + MachineBasicBlock *UserBlock = nearestCommonDominator(DT, UserBlocks); + if (!DT->dominates(SubExp.FromBB, UserBlock)) { return false; } - if (userBlock == subExp.FromBB && + if (UserBlock == SubExp.FromBB && // When allow clone, could go clone path if cannot move subExp. 
!IsCanClone) return false; - subExp.ToBB = userBlock; - if (auto *toLoop = MLI->getLoopFor(userBlock)) { - auto *fromLoop = MLI->getLoopFor(subExp.FromBB); - if (!fromLoop || fromLoop->getLoopDepth() < toLoop->getLoopDepth()) - subExp.IsMoveIntoLoop = true; - } else if (auto *fromLoop = MLI->getLoopFor(subExp.FromBB)) { - auto *toLoop = MLI->getLoopFor(userBlock); + SubExp.ToBB = UserBlock; + if (auto *ToLoop = MLI->getLoopFor(UserBlock)) { + auto *FromLoop = MLI->getLoopFor(SubExp.FromBB); + if (!FromLoop || FromLoop->getLoopDepth() < ToLoop->getLoopDepth()) + SubExp.IsMoveIntoLoop = true; + } else if (auto *FromLoop = MLI->getLoopFor(SubExp.FromBB)) { + auto *ToLoop = MLI->getLoopFor(UserBlock); // not safe to move out of loop. - if (!toLoop || fromLoop->getLoopDepth() > toLoop->getLoopDepth() || - toLoop != fromLoop) + if (!ToLoop || FromLoop->getLoopDepth() > ToLoop->getLoopDepth() || + ToLoop != FromLoop) return false; } return true; } -bool canHelpPressureWhenHoist(SubExp &subExp, const MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI, - const SIInstrInfo *SIII, +bool canHelpPressureWhenHoist(SubExp &SubExp, const MachineRegisterInfo &MRI, const MachineLoopInfo *MLI, bool IsSgprBound) { - if (!subExp.isSafeToMove(MRI, /*IsMoveUp*/ true)) + if (!SubExp.isSafeToMove(MRI, /*IsMoveUp*/ true)) return false; - if (subExp.vInputSize < subExp.vOutputSize) + if (SubExp.vInputSize < SubExp.vOutputSize) return false; - if (subExp.sInputSize < subExp.sOutputSize && IsSgprBound) + if (SubExp.sInputSize < SubExp.sOutputSize && IsSgprBound) return false; - if (subExp.sInputSize <= subExp.sOutputSize && - subExp.vInputSize == subExp.vOutputSize) + if (SubExp.sInputSize <= SubExp.sOutputSize && + SubExp.vInputSize == SubExp.vOutputSize) return false; // Try to find a Insert Block. // Skip multi def output sub exp. // Collect user blocks, find common dom. - BlockSet defBlocks; - for (unsigned Reg : subExp.TopRegs) { + BlockSet DefBlocks; + for (unsigned Reg : SubExp.TopRegs) { MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); if (!DefMI) continue; - defBlocks.insert(DefMI->getParent()); + DefBlocks.insert(DefMI->getParent()); } - if (defBlocks.size() != 1) + if (DefBlocks.size() != 1) return false; - MachineBasicBlock *defBlock = *defBlocks.begin(); - subExp.ToBB = defBlock; + MachineBasicBlock *DefBlock = *DefBlocks.begin(); + SubExp.ToBB = DefBlock; // Not do same block hoist. - if (subExp.ToBB == subExp.FromBB) + if (SubExp.ToBB == SubExp.FromBB) return false; - if (auto *toLoop = MLI->getLoopFor(defBlock)) { - auto *fromLoop = MLI->getLoopFor(subExp.FromBB); + if (auto *ToLoop = MLI->getLoopFor(DefBlock)) { + auto *FromLoop = MLI->getLoopFor(SubExp.FromBB); // TODO: enable move into loop when hoist. - if (!fromLoop || fromLoop->getLoopDepth() < toLoop->getLoopDepth()) + if (!FromLoop || FromLoop->getLoopDepth() < ToLoop->getLoopDepth()) return false; - } else if (auto *fromLoop = MLI->getLoopFor(subExp.FromBB)) { - auto *toLoop = MLI->getLoopFor(defBlock); + } else if (auto *FromLoop = MLI->getLoopFor(SubExp.FromBB)) { + auto *ToLoop = MLI->getLoopFor(DefBlock); // not safe to move out of loop. 
- if (!toLoop || fromLoop->getLoopDepth() > toLoop->getLoopDepth() || - toLoop != fromLoop) + if (!ToLoop || FromLoop->getLoopDepth() > ToLoop->getLoopDepth() || + ToLoop != FromLoop) return false; } return true; } SmallVector> -groupPassThruByDefBlock(Remat *Remat, const GCNRPTracker::LiveRegSet &passThrus, - GCNRPTracker::LiveRegSet &usedPassThrus, +groupPassThruByDefBlock(Remat *Remat, const GCNRPTracker::LiveRegSet &PassThrus, + GCNRPTracker::LiveRegSet &UsedPassThrus, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { MapVector Candidates; // Group safe candidates by define block. - for (auto it : passThrus) { - unsigned Reg = it.first; - // Skip used pass thru reg to avoid count it twice for different hot block. - if (usedPassThrus.count(Reg)) + for (auto It : PassThrus) { + Register Reg = It.first; + // Skip used pass thru reg to avoid count It twice for different hot block. + if (UsedPassThrus.count(Reg)) continue; - LLVM_DEBUG(print_vreg(Reg, MRI)); + LLVM_DEBUG(printVreg(Reg, MRI)); LLVM_DEBUG(if (SIRI->isSGPRReg(MRI, Reg)) dbgs() << " sgpr "; else dbgs() << " vgpr ";); if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*IsSink*/ true)) { @@ -3573,61 +3546,60 @@ groupPassThruByDefBlock(Remat *Remat, const GCNRPTracker::LiveRegSet &passThrus, MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); GCNRPTracker::LiveRegSet &DefInMBB = Candidates[DefMI->getParent()]; - DefInMBB[Reg] = it.second; + DefInMBB[Reg] = It.second; } llvm::SmallVector> - result = Candidates.takeVector(); + Result = Candidates.takeVector(); - LLVM_DEBUG(llvm::dbgs() << "Before sort candidates\n"; for (auto it - : result) { - MachineBasicBlock *MBB = it.first; - auto &defInMBB = it.second; + LLVM_DEBUG(llvm::dbgs() << "Before sort candidates\n"; for (auto It + : Result) { + MachineBasicBlock *MBB = It.first; + auto &defInMBB = It.second; MBB->dump(); llvm::dumpLiveSet(defInMBB, SIRI); } llvm::dbgs() << "end of candidates\n";); - std::sort(result.begin(), result.end(), - [](std::pair &it0, - std::pair &it1) { - return it0.first->getNumber() < it1.first->getNumber(); + std::sort(Result.begin(), Result.end(), + [](std::pair &It0, + std::pair &It1) { + return It0.first->getNumber() < It1.first->getNumber(); }); - LLVM_DEBUG(llvm::dbgs() << "After sort candidates\n"; for (auto it - : result) { - MachineBasicBlock *MBB = it.first; - auto &defInMBB = it.second; + LLVM_DEBUG(llvm::dbgs() << "After sort candidates\n"; for (auto It + : Result) { + MachineBasicBlock *MBB = It.first; + auto &defInMBB = It.second; MBB->dump(); llvm::dumpLiveSet(defInMBB, SIRI); } llvm::dbgs() << "end of candidates\n";); - return result; + return Result; } // collect pass thru regs of MBB. GCNRPTracker::LiveRegSet collectPassThrus(MachineBasicBlock *MBB, - const GCNRPTracker::LiveRegSet &inputLive, - const GCNRPTracker::LiveRegSet &outputLive, - const GCNRPTracker::LiveRegSet &usedPassThrus, - const GCNRPTracker::LiveRegSet &liveRegCandidates, + const GCNRPTracker::LiveRegSet &InputLive, + const GCNRPTracker::LiveRegSet &OutputLive, + const GCNRPTracker::LiveRegSet &LiveRegCandidates, MachineRegisterInfo &MRI, bool IsCanClone) { - GCNRPTracker::LiveRegSet passThrus; - llvm::mergeLiveRegSet(passThrus, inputLive); - llvm::andLiveRegSet(passThrus, outputLive); + GCNRPTracker::LiveRegSet PassThrus; + llvm::mergeLiveRegSet(PassThrus, InputLive); + llvm::andLiveRegSet(PassThrus, OutputLive); // Remove reg which not in liveRegCandidates. 
- GCNRPTracker::LiveRegSet tmpPassThrus = passThrus; - for (auto it : tmpPassThrus) { - unsigned Reg = it.first; - if (!liveRegCandidates.count(Reg)) { - passThrus.erase(Reg); + GCNRPTracker::LiveRegSet TmpPassThrus = PassThrus; + for (auto It : TmpPassThrus) { + unsigned Reg = It.first; + if (!LiveRegCandidates.count(Reg)) { + PassThrus.erase(Reg); } } - tmpPassThrus = passThrus; + TmpPassThrus = PassThrus; // Remove reg which has read/write in MBB. - for (auto it : tmpPassThrus) { - unsigned Reg = it.first; + for (auto It : TmpPassThrus) { + unsigned Reg = It.first; DenseSet DefMBBs; for (MachineInstr &DefMI : MRI.def_instructions(Reg)) { MachineBasicBlock *MBB = DefMI.getParent(); @@ -3646,45 +3618,45 @@ collectPassThrus(MachineBasicBlock *MBB, bool IsPassThru = !IsW && !IsR; if (!IsPassThru) - passThrus.erase(Reg); + PassThrus.erase(Reg); } - return passThrus; + return PassThrus; } // Try to build a free subExp which all input is passThrus. -SubExp buildFreeSubExp(Remat *Remat, SubExp &subExp, - GCNRPTracker::LiveRegSet &passThrus, +SubExp buildFreeSubExp(SubExp &Exp, + GCNRPTracker::LiveRegSet &PassThrus, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) { - SubExp freeExp; + SubExp FreeExp; // Try to split the subExp to find a help case. // Scan all inst in subExp, propagate free inst which input is from // passThrus. - SmallDenseSet freeRegs; - SmallDenseSet freeInstUseRegs; - SmallVector freeInsts; - for (MachineInstr *MI : subExp.SUnits) { + SmallDenseSet FreeRegs; + SmallDenseSet FreeInstUseRegs; + SmallVector FreeInsts; + for (MachineInstr *MI : Exp.SUnits) { bool IsFree = true; // Check all use regs are free. for (MachineOperand &MO : MI->uses()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (MO.isImplicit() && Reg == AMDGPU::EXEC) continue; if (MRI.getUniqueVRegDef(Reg) == nullptr) { IsFree = false; break; } - // Skip local pass thrus unless it is free. - if (passThrus.count(Reg) && subExp.TopRegs.count(Reg)) + // Skip local pass thrus unless It is free. + if (PassThrus.count(Reg) && Exp.TopRegs.count(Reg)) continue; - if (freeRegs.count(Reg)) + if (FreeRegs.count(Reg)) continue; IsFree = false; break; } // Check def is unique. for (MachineOperand &MO : MI->defs()) { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (MRI.getUniqueVRegDef(Reg) == nullptr) { IsFree = false; break; @@ -3693,104 +3665,103 @@ SubExp buildFreeSubExp(Remat *Remat, SubExp &subExp, if (!IsFree) continue; // Save inst as free inst. - freeInsts.emplace_back(MI); + FreeInsts.emplace_back(MI); // Save def as free reg. for (MachineOperand &MO : MI->defs()) { - unsigned Reg = MO.getReg(); - freeRegs.insert(Reg); + Register Reg = MO.getReg(); + FreeRegs.insert(Reg); } // Save use regs as free use reg. for (MachineOperand &MO : MI->uses()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); - freeInstUseRegs.insert(Reg); + FreeInstUseRegs.insert(Reg); } } // Then remove local inst has no output use. - for (MachineInstr *MI : freeInsts) { + for (MachineInstr *MI : FreeInsts) { bool IsFreeUsed = false; for (MachineOperand &MO : MI->defs()) { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); // Used as freeInst or output. 
- IsFreeUsed |= - freeInstUseRegs.count(Reg) > 0 || subExp.BottomRegs.count(Reg); + IsFreeUsed |= FreeInstUseRegs.count(Reg) > 0 || Exp.BottomRegs.count(Reg); } if (!IsFreeUsed) continue; - freeExp.SUnits.emplace_back(MI); + FreeExp.SUnits.emplace_back(MI); } - if (freeExp.SUnits.empty()) { - // mark has terminator to make it unsafe. - freeExp.IsHasTerminatorInst = true; - return freeExp; + if (FreeExp.SUnits.empty()) { + // mark has terminator to make It unsafe. + FreeExp.IsHasTerminatorInst = true; + return FreeExp; } // Build BottomRegs and TopRegs for freeExp. // BottomRegs is freeRegs in subExp.BottomRegs. - for (unsigned freeReg : freeRegs) { - if (subExp.BottomRegs.count(freeReg)) - freeExp.BottomRegs.insert(freeReg); + for (Register FreeReg : FreeRegs) { + if (Exp.BottomRegs.count(FreeReg)) + FreeExp.BottomRegs.insert(FreeReg); } // TopRegs is freeInstUseRegs in subExp.TopRegs. - for (unsigned freeInstUseReg : freeInstUseRegs) { - if (subExp.TopRegs.count(freeInstUseReg)) - freeExp.TopRegs.insert(freeInstUseReg); + for (Register FreeInstUseReg : FreeInstUseRegs) { + if (Exp.TopRegs.count(FreeInstUseReg)) + FreeExp.TopRegs.insert(FreeInstUseReg); } - freeExp.FromBB = subExp.FromBB; - freeExp.ToBB = subExp.ToBB; + FreeExp.FromBB = Exp.FromBB; + FreeExp.ToBB = Exp.ToBB; // must be clone since is partial of subExp. - freeExp.IsCloneOnly = true; + FreeExp.IsCloneOnly = true; // Calc reg for freeExp. - for (unsigned Reg : freeExp.TopRegs) { - freeExp.inputLive[Reg]; + for (unsigned Reg : FreeExp.TopRegs) { + FreeExp.inputLive[Reg]; } - for (unsigned Reg : freeExp.BottomRegs) { - freeExp.outputLive[Reg]; + for (unsigned Reg : FreeExp.BottomRegs) { + FreeExp.outputLive[Reg]; } - CollectLiveSetPressure(freeExp.inputLive, MRI, SIRI, freeExp.vInputSize, - freeExp.sInputSize); - CollectLiveSetPressure(freeExp.outputLive, MRI, SIRI, freeExp.vOutputSize, - freeExp.sOutputSize); - return freeExp; + CollectLiveSetPressure(FreeExp.inputLive, MRI, SIRI, FreeExp.vInputSize, + FreeExp.sInputSize); + CollectLiveSetPressure(FreeExp.outputLive, MRI, SIRI, FreeExp.vOutputSize, + FreeExp.sOutputSize); + return FreeExp; } std::vector buildSubExpCandidates( Remat *Remat, SmallVector> &Candidates, - GCNRPTracker::LiveRegSet &passThrus, MachineRegisterInfo &MRI, + GCNRPTracker::LiveRegSet &PassThrus, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, - const MachineLoopInfo *MLI, SlotIndexes *slotIndexes, + const MachineLoopInfo *MLI, SlotIndexes *SlotIndexes, MachineDominatorTree *DT, bool IsCanClone, bool IsSgprBound, - GCNRPTracker::LiveRegSet &unUsedPassThrus, + GCNRPTracker::LiveRegSet &UnusedPassThrus, DenseSet &MemWriteMBBSet, bool AllowPartialUseInSubExp) { - std::vector subExpCandidates; + std::vector SubExpCandidates; // Build exp dag on define blocks. // Save profit candidates into list. - for (auto &it : Candidates) { - MachineBasicBlock *DefMBB = it.first; + for (auto &It : Candidates) { + MachineBasicBlock *DefMBB = It.first; // Try to remove out reg def sub exp from DefMBB. - GCNRPTracker::LiveRegSet &DefInMBB = it.second; + GCNRPTracker::LiveRegSet &DefInMBB = It.second; // Go up on the dag until reach share node. 
- auto subExps = buildSubExpFromCandidates( - Remat, DefInMBB, DefMBB, SIRI, SIII, MRI, slotIndexes, unUsedPassThrus, + auto SubExps = buildSubExpFromCandidates( + Remat, DefInMBB, DefMBB, SIRI, SIII, MRI, SlotIndexes, UnusedPassThrus, AllowPartialUseInSubExp); - for (SubExp &subExp : subExps) { - if (subExp.IsHasMemInst) { + for (SubExp &Exp : SubExps) { + if (Exp.IsHasMemInst) { // Skip when memory ld/st inst need to cross MBB which write memory. // TODO: check all MBBs in between FromBB and ToBB not write memory. // Currently just skip when any memory write exist. if (!MemWriteMBBSet.empty()) { - MachineBasicBlock *FromBB = subExp.FromBB; - MachineBasicBlock *ToBB = subExp.ToBB; - if (subExp.IsHoist) { - FromBB = subExp.ToBB; - ToBB = subExp.FromBB; + MachineBasicBlock *FromBB = Exp.FromBB; + MachineBasicBlock *ToBB = Exp.ToBB; + if (Exp.IsHoist) { + FromBB = Exp.ToBB; + ToBB = Exp.FromBB; } bool IsCrossMemWriteMBB = false; for (MachineBasicBlock *MemMBB : MemWriteMBBSet) { @@ -3805,37 +3776,36 @@ std::vector buildSubExpCandidates( continue; } } - if (!canHelpPressureWhenSink(subExp, passThrus, MRI, SIRI, SIII, MLI, DT, + if (!canHelpPressureWhenSink(Exp, PassThrus, MRI, SIRI, MLI, DT, IsCanClone, IsSgprBound)) { if (AllowPartialUseInSubExp && - subExp.isSafeToMove(MRI, /*IsMoveUp*/ false)) { - SubExp freeSubExp = - buildFreeSubExp(Remat, subExp, passThrus, MRI, SIRI); - if (canHelpPressureWhenSink(freeSubExp, passThrus, MRI, SIRI, SIII, - MLI, DT, IsCanClone, IsSgprBound)) { - subExpCandidates.emplace_back(freeSubExp); + Exp.isSafeToMove(MRI, /*IsMoveUp*/ false)) { + SubExp FreeSubExp = buildFreeSubExp(Exp, PassThrus, MRI, SIRI); + if (canHelpPressureWhenSink(FreeSubExp, PassThrus, MRI, SIRI, MLI, DT, + IsCanClone, IsSgprBound)) { + SubExpCandidates.emplace_back(FreeSubExp); } } continue; } - subExpCandidates.emplace_back(subExp); + SubExpCandidates.emplace_back(Exp); } } - return subExpCandidates; + return SubExpCandidates; } std::pair -calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates, - GCNRPTracker::LiveRegSet &inputLive, - GCNRPTracker::LiveRegSet &outputLive, bool IsVOutBound, +calculateSaving(HotBlock &HotBb, std::vector &SubExpCandidates, + GCNRPTracker::LiveRegSet &InputLive, + GCNRPTracker::LiveRegSet &OutputLive, bool IsVOutBound, bool IsSOutBound, bool IsCanClone, MachineDominatorTree *DT, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) { - int vgpr = 0; - int sgpr = 0; - MachineBasicBlock *MBB = hotBB.MBB; + int Vgpr = 0; + int Sgpr = 0; + MachineBasicBlock *MBB = HotBb.MBB; // Sink saving. - for (SubExp &Exp : subExpCandidates) { + for (SubExp &Exp : SubExpCandidates) { if (Exp.IsHoist) { // ToMBB -> MBB -> FromMBB. // If ToMBB not dom hot block, reg will not live in MBB. @@ -3851,28 +3821,28 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates, continue; if (IsSOutBound && Exp.sOutputSize < Exp.sInputSize) continue; - vgpr += Exp.vInputSize; - vgpr -= Exp.vOutputSize; - sgpr += Exp.sInputSize; - sgpr -= Exp.sOutputSize; + Vgpr += Exp.vInputSize; + Vgpr -= Exp.vOutputSize; + Sgpr += Exp.sInputSize; + Sgpr -= Exp.sOutputSize; continue; } } - int vgprDiff = 0; - int sgprDiff = 0; + int VgprDiff = 0; + int SgprDiff = 0; MachineBasicBlock *ToMBB = Exp.ToBB; - // If subExp is to hotBB, it is crossing output instead of input. - GCNRPTracker::LiveRegSet &crossLive = MBB == ToMBB ? outputLive : inputLive; + // If subExp is to hotBB, It is crossing output instead of input. + GCNRPTracker::LiveRegSet &CrossLive = MBB == ToMBB ? 
OutputLive : InputLive; bool IsClone = false; - GCNRPTracker::LiveRegSet newInput; + GCNRPTracker::LiveRegSet NewInput; if (!Exp.IsMoveIntoLoop) { if (Exp.IsHoist) { - // If FromBB dom hot block, it will not change live for MBB. + // If FromBB dom hot block, It will not change live for MBB. if (Exp.FromBB != MBB && DT->dominates(Exp.FromBB, MBB)) continue; } else { - // If ToBB dom hot block, it will not change live for MBB. + // If ToBB dom hot block, It will not change live for MBB. if (ToMBB != MBB && DT->dominates(ToMBB, MBB)) { if (IsCanClone && !Exp.IsNotSafeToCopy) { IsClone = true; @@ -3882,19 +3852,19 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates, } } - for (auto outIt : Exp.outputLive) { - unsigned Reg = outIt.first; - LaneBitmask outMask = outIt.second; + for (auto OutIt : Exp.outputLive) { + unsigned Reg = OutIt.first; + LaneBitmask OutMask = OutIt.second; LaneBitmask MBBBeginMask; - if (crossLive.find(Reg) != crossLive.end()) - MBBBeginMask = crossLive[Reg]; + if (CrossLive.find(Reg) != CrossLive.end()) + MBBBeginMask = CrossLive[Reg]; // Check mask which live in both BeginSlot and exp output when sink to // kill the output. Check mask which not live in BeginSlot in // exp output when hoist to live the output. - LaneBitmask profitMask = Exp.IsHoist ? (outMask & (~MBBBeginMask)) - : (outMask & MBBBeginMask); + LaneBitmask ProfitMask = Exp.IsHoist ? (OutMask & (~MBBBeginMask)) + : (OutMask & MBBBeginMask); if (MBBBeginMask.any()) { - unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI); + unsigned Size = getRegSize(Reg, ProfitMask, MRI, SIRI); LLVM_DEBUG(std::string movStr = Exp.IsHoist ? "output hoist:" : "output sink:"; dbgs() @@ -3904,36 +3874,36 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates, if (SIRI->isVGPR(MRI, Reg)) { LLVM_DEBUG(dbgs() << "v\n"); if (Exp.IsHoist) - vgprDiff += Size; + VgprDiff += Size; else - vgprDiff -= Size; + VgprDiff -= Size; } else { LLVM_DEBUG(dbgs() << "s\n"); if (Exp.IsHoist) - sgprDiff += Size; + SgprDiff += Size; else - sgprDiff -= Size; + SgprDiff -= Size; } } } - for (auto inIt : Exp.inputLive) { - unsigned Reg = inIt.first; - LaneBitmask inMask = inIt.second; + for (auto InIt : Exp.inputLive) { + unsigned Reg = InIt.first; + LaneBitmask InMask = InIt.second; LaneBitmask MBBBeginMask; - if (crossLive.find(Reg) != crossLive.end()) - MBBBeginMask = crossLive[Reg]; + if (CrossLive.find(Reg) != CrossLive.end()) + MBBBeginMask = CrossLive[Reg]; // Check mask which not live in BeginSlot in exp input when // sink to live the input. Check mask which live in both BeginSlot and // exp output when hoist to kill the input. - LaneBitmask profitMask = - Exp.IsHoist ? (inMask & MBBBeginMask) : (inMask & (~MBBBeginMask)); - if (profitMask.any()) { + LaneBitmask ProfitMask = + Exp.IsHoist ? (InMask & MBBBeginMask) : (InMask & (~MBBBeginMask)); + if (ProfitMask.any()) { // Update input live to avoid count same input more than once. - newInput[Reg] |= inMask; + NewInput[Reg] |= InMask; // Exp in not live at block input. // It will increase live for MBB. - unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI); + unsigned Size = getRegSize(Reg, ProfitMask, MRI, SIRI); LLVM_DEBUG( std::string movStr = Exp.IsHoist ? 
"input hoist:" : "input sink:"; @@ -3941,26 +3911,26 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates, if (SIRI->isVGPR(MRI, Reg)) { LLVM_DEBUG(dbgs() << "v\n"); if (Exp.IsHoist) - vgprDiff -= Size; + VgprDiff -= Size; else - vgprDiff += Size; + VgprDiff += Size; } else { LLVM_DEBUG(dbgs() << "s\n"); if (Exp.IsHoist) - sgprDiff -= Size; + SgprDiff -= Size; else - sgprDiff += Size; + SgprDiff += Size; } } } } else { // When sink into loop, the input will live for every block inside loop. // The output will only lived between to blocks and the use blocks. - // If MBB dominate any user of output live reg, it will still live in + // If MBB dominate any user of output live reg, It will still live in // MBB. So cannot count that output live reg as profit. // Hoist into loop is not supported now. - for (auto outIt : Exp.outputLive) { - unsigned Reg = outIt.first; + for (auto OutIt : Exp.outputLive) { + unsigned Reg = OutIt.first; bool IsDomUser = false; for (MachineInstr &MI : MRI.use_nodbg_instructions(Reg)) { MachineBasicBlock *UserMBB = MI.getParent(); @@ -3972,142 +3942,142 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates, if (IsDomUser) continue; - LaneBitmask outMask = outIt.second; + LaneBitmask OutMask = OutIt.second; LaneBitmask MBBBeginMask; - if (inputLive.find(Reg) != inputLive.end()) - MBBBeginMask = inputLive[Reg]; - LaneBitmask profitMask = outMask & MBBBeginMask; + if (InputLive.find(Reg) != InputLive.end()) + MBBBeginMask = InputLive[Reg]; + LaneBitmask ProfitMask = OutMask & MBBBeginMask; if (MBBBeginMask.any()) { - unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI); + unsigned Size = getRegSize(Reg, ProfitMask, MRI, SIRI); LLVM_DEBUG(dbgs() << "move:" << Register::virtReg2Index(Reg) << " " << Size); // Exp out live at block input. // It will descrease live for MBB. if (SIRI->isVGPR(MRI, Reg)) { LLVM_DEBUG(dbgs() << "v\n"); - vgprDiff -= Size; + VgprDiff -= Size; } else { LLVM_DEBUG(dbgs() << "s\n"); - sgprDiff -= Size; + SgprDiff -= Size; } } } - for (auto inIt : Exp.inputLive) { - unsigned Reg = inIt.first; - LaneBitmask inMask = inIt.second; + for (auto InIt : Exp.inputLive) { + unsigned Reg = InIt.first; + LaneBitmask InMask = InIt.second; LaneBitmask MBBBeginMask; - if (inputLive.find(Reg) != inputLive.end()) - MBBBeginMask = inputLive[Reg]; + if (InputLive.find(Reg) != InputLive.end()) + MBBBeginMask = InputLive[Reg]; // Check mask which not live in BeginSlot in exp input. - LaneBitmask profitMask = inMask & (~MBBBeginMask); - if (profitMask.any()) { + LaneBitmask ProfitMask = InMask & (~MBBBeginMask); + if (ProfitMask.any()) { // Update input live to avoid count same input more than once. - newInput[Reg] |= inMask; + NewInput[Reg] |= InMask; // Exp in not live at block input. // It will increase live for MBB. 
- unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI); + unsigned Size = getRegSize(Reg, ProfitMask, MRI, SIRI); LLVM_DEBUG(dbgs() << "add:" << Register::virtReg2Index(Reg) << " " << Size); if (SIRI->isVGPR(MRI, Reg)) { LLVM_DEBUG(dbgs() << "v\n"); - vgprDiff += Size; + VgprDiff += Size; } else { LLVM_DEBUG(dbgs() << "s\n"); - sgprDiff += Size; + SgprDiff += Size; } } } } - if (IsVOutBound && vgprDiff > 0) + if (IsVOutBound && VgprDiff > 0) continue; - if (IsSOutBound && sgprDiff > 0) + if (IsSOutBound && SgprDiff > 0) continue; - llvm::mergeLiveRegSet(crossLive, newInput); - vgpr += vgprDiff; - sgpr += sgprDiff; + llvm::mergeLiveRegSet(CrossLive, NewInput); + Vgpr += VgprDiff; + Sgpr += SgprDiff; if (IsClone) Exp.IsCloneOnly = true; } - return std::make_pair(vgpr, sgpr); + return std::make_pair(Vgpr, Sgpr); } -void addExpCandidates(std::vector &subExpCandidates, - std::vector &subExps, - GCNRPTracker::LiveRegSet &usedRegs) { - subExpCandidates.insert(subExpCandidates.end(), subExps.begin(), - subExps.end()); - for (auto &Exp : subExps) { +void addExpCandidates(std::vector &SubExpCandidates, + std::vector &SubExps, + GCNRPTracker::LiveRegSet &UsedRegs) { + SubExpCandidates.insert(SubExpCandidates.end(), SubExps.begin(), + SubExps.end()); + for (auto &Exp : SubExps) { if (Exp.IsHoist) { for (auto &Reg : Exp.TopRegs) { - usedRegs[Reg]; + UsedRegs[Reg]; } } else { for (auto &Reg : Exp.BottomRegs) { - usedRegs[Reg]; + UsedRegs[Reg]; } } } } bool tryToAddSubExps( - Remat *Remat, HotBlock &hotBB, RematStatus &status, - std::vector &subExpCandidates, - std::vector &inBlockCloneSubExps, - DenseMap &inBlockHotVInstMap, - DenseMap &inBlockHotSInstMap, + Remat *Remat, HotBlock &HotBB, RematStatus &Status, + std::vector &SubExpCandidates, + std::vector &InBlockCloneSubExps, + DenseMap &InBlockHotVInstMap, + DenseMap &InBlockHotSInstMap, SmallVector> Candidates, - int vgpr, int sgpr, const GCNRPTracker::LiveRegSet &savingInputLive, - const GCNRPTracker::LiveRegSet &savingOutputLive, - GCNRPTracker::LiveRegSet &passThrus, GCNRPTracker::LiveRegSet &usedRegs, + int Vgpr, int Sgpr, const GCNRPTracker::LiveRegSet &SavingInputLive, + const GCNRPTracker::LiveRegSet &SavingOutputLive, + GCNRPTracker::LiveRegSet &PassThrus, GCNRPTracker::LiveRegSet &UsedRegs, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - const SIInstrInfo *SIII, const MachineLoopInfo *MLI, - SlotIndexes *slotIndexes, LiveIntervals *LIS, MachineDominatorTree *DT, - bool IsCanClone, bool IsVOutBound, bool IsSOutBound, - GCNRPTracker::LiveRegSet &unUsedPassThrus, bool AllowPartialUseInSubExp) { - std::vector partialSubExps = buildSubExpCandidates( - Remat, Candidates, passThrus, MRI, SIRI, SIII, MLI, slotIndexes, DT, - IsCanClone, IsSOutBound, unUsedPassThrus, status.MemWriteMBBSet, - AllowPartialUseInSubExp); - - GCNRPTracker::LiveRegSet tmpSavingInputLive = savingInputLive; - GCNRPTracker::LiveRegSet tmpSavingOutputLive = savingOutputLive; - std::pair curSaving = calculateSaving( - hotBB, partialSubExps, tmpSavingInputLive, tmpSavingOutputLive, + const SIInstrInfo *SIII, const MachineLoopInfo *MLI, SlotIndexes *SI, + LiveIntervals *LIS, MachineDominatorTree *DT, bool IsCanClone, + bool IsVOutBound, bool IsSOutBound, + GCNRPTracker::LiveRegSet &UnusedPassThrus, bool AllowPartialUseInSubExp) { + std::vector PartialSubExps = + buildSubExpCandidates(Remat, Candidates, PassThrus, MRI, SIRI, SIII, MLI, + SI, DT, IsCanClone, IsSOutBound, UnusedPassThrus, + Status.MemWriteMBBSet, AllowPartialUseInSubExp); + + GCNRPTracker::LiveRegSet 
TmpSavingInputLive = SavingInputLive; + GCNRPTracker::LiveRegSet TmpSavingOutputLive = SavingOutputLive; + std::pair CurSaving = calculateSaving( + HotBB, PartialSubExps, TmpSavingInputLive, TmpSavingOutputLive, IsVOutBound, IsSOutBound, IsCanClone, DT, MRI, SIRI); - const int VLimit = status.TargetVLimit; - const int SLimit = status.TargetSLimit; + const int VLimit = Status.TargetVLimit; + const int SLimit = Status.TargetSLimit; - vgpr += curSaving.first; - sgpr += curSaving.second; + Vgpr += CurSaving.first; + Sgpr += CurSaving.second; - if (vgpr <= VLimit && sgpr <= SLimit) { - // nrmSubExps can help reach target occupancy, add it to + if (Vgpr <= VLimit && Sgpr <= SLimit) { + // nrmSubExps can help reach target occupancy, add It to // subExpCandidates. - addExpCandidates(subExpCandidates, partialSubExps, usedRegs); + addExpCandidates(SubExpCandidates, PartialSubExps, UsedRegs); return true; } if (EnableSubExpAggressive) { // Build candidates from passThrus used in partialSubExps. - GCNRPTracker::LiveRegSet sinkUsedRegs; - for (auto &Exp : partialSubExps) { + GCNRPTracker::LiveRegSet SinkUsedRegs; + for (auto &Exp : PartialSubExps) { for (auto &Reg : Exp.BottomRegs) { - sinkUsedRegs[Reg]; + SinkUsedRegs[Reg]; } } MapVector HoistCandidates; - for (auto &it : hotBB.inputLive) { - unsigned Reg = it.first; + for (auto &It : HotBB.InputLive) { + unsigned Reg = It.first; // Skip reg which already used for sink exp. - if (sinkUsedRegs.count(Reg)) + if (SinkUsedRegs.count(Reg)) continue; - if (usedRegs.count(Reg)) + if (UsedRegs.count(Reg)) continue; // Skip unsafe reg. if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*IsSink*/ false)) { @@ -4133,42 +4103,40 @@ bool tryToAddSubExps( UseInMBB[Reg] = getRegMask(DefMI->getOperand(0), MRI); } - SlotIndexes *slotIndexes = LIS->getSlotIndexes(); // Build exp dag on define blocks. - std::vector hoistSubExpCandidates; + std::vector HoistSubExpCandidates; // Save profit candidates into list. - for (auto it : HoistCandidates) { - MachineBasicBlock *UseMBB = it.first; + for (auto It : HoistCandidates) { + MachineBasicBlock *UseMBB = It.first; // Try to remove out reg def sub exp from DefMBB. - GCNRPTracker::LiveRegSet &UseInMBB = it.second; + GCNRPTracker::LiveRegSet &UseInMBB = It.second; // Go up on the dag until reach share node. - auto subExps = buildSubExpFromCandidatesTopBottom( - Remat, UseInMBB, UseMBB, SIRI, SIII, MRI, slotIndexes); - for (SubExp &subExp : subExps) { - if (!canHelpPressureWhenHoist(subExp, MRI, SIRI, SIII, MLI, - IsSOutBound)) + auto SubExps = buildSubExpFromCandidatesTopBottom(Remat, UseInMBB, UseMBB, + SIRI, SIII, MRI); + for (SubExp &SubExp : SubExps) { + if (!canHelpPressureWhenHoist(SubExp, MRI, MLI, IsSOutBound)) continue; - subExp.IsHoist = true; - hoistSubExpCandidates.emplace_back(subExp); + SubExp.IsHoist = true; + HoistSubExpCandidates.emplace_back(SubExp); } } - std::pair hoistSaving = calculateSaving( - hotBB, hoistSubExpCandidates, tmpSavingInputLive, tmpSavingOutputLive, + std::pair HoistSaving = calculateSaving( + HotBB, HoistSubExpCandidates, TmpSavingInputLive, TmpSavingOutputLive, IsVOutBound, IsSOutBound, IsCanClone, DT, MRI, SIRI); - int hoistVgpr = vgpr + hoistSaving.first; - int hoistSgpr = sgpr + hoistSaving.second; + int HoistVgpr = Vgpr + HoistSaving.first; + int HoistSgpr = Sgpr + HoistSaving.second; - if ((hoistVgpr <= VLimit && hoistSgpr <= SLimit) || + if ((HoistVgpr <= VLimit && HoistSgpr <= SLimit) || // If status not balance, do the remat even cannot reach target. 
// TODO: check the result not help even one occupancy. - (!hoistSubExpCandidates.empty() && !status.NotBalance && + (!HoistSubExpCandidates.empty() && !Status.NotBalance && TargetOccupancy != 0)) { - // nrmSubExps can help reach target occupancy, add it to + // nrmSubExps can help reach target occupancy, add It to // subExpCandidates. - addExpCandidates(subExpCandidates, partialSubExps, usedRegs); - addExpCandidates(subExpCandidates, hoistSubExpCandidates, usedRegs); + addExpCandidates(SubExpCandidates, PartialSubExps, UsedRegs); + addExpCandidates(SubExpCandidates, HoistSubExpCandidates, UsedRegs); return true; } @@ -4179,132 +4147,131 @@ bool tryToAddSubExps( // If not, AllowPartialUseInSubExp will no chance to be true. (AllowPartialUseInSubExp || !EnableSubExpAggressive)) { // Assume vmemLdSize could be optimized by not parallel. - if (((vgpr - hotBB.vmemLdInputSize) <= VLimit || - (vgpr - hotBB.vmemLdOutputSize) <= VLimit) && - sgpr <= SLimit) { - // nrmSubExps can help reach target occupancy, add it to + if (((Vgpr - HotBB.VmemLdInputSize) <= VLimit || + (Vgpr - HotBB.VmemLdOutputSize) <= VLimit) && + Sgpr <= SLimit) { + // nrmSubExps can help reach target occupancy, add It to // subExpCandidates. - addExpCandidates(subExpCandidates, partialSubExps, usedRegs); + addExpCandidates(SubExpCandidates, PartialSubExps, UsedRegs); return true; } } - int vDistance = vgpr - (int)VLimit; - int sDistance = status.TargetOcc > 4 ? (sgpr - (int)SLimit) : 0; - int vSaved = hotBB.maxPressures.first - vgpr; - int sSaved = hotBB.maxPressures.second - sgpr; + int VDistance = Vgpr - (int)VLimit; + int SDistance = Status.TargetOcc > 4 ? (Sgpr - (int)SLimit) : 0; + int VSaved = HotBB.MaxPressures.first - Vgpr; + int SSaved = HotBB.MaxPressures.second - Sgpr; // Try to add inBlockCloneSubExps. - if (!tryRematInHotSpot(*hotBB.MBB, status, vDistance, sDistance, vSaved, - sSaved, inBlockCloneSubExps, inBlockHotVInstMap, - inBlockHotSInstMap, LIS, MRI, SIRI, SIII)) { - // return false always when not allow partialUseInSubExp, it will try again + if (!tryRematInHotSpot(*HotBB.MBB, Status, VDistance, SDistance, VSaved, + SSaved, InBlockCloneSubExps, InBlockHotVInstMap, + InBlockHotSInstMap, LIS, MRI, SIRI, SIII)) { + // return false always when not allow partialUseInSubExp, It will try again // with partialUseInSubExp enabled. if (!AllowPartialUseInSubExp) return false; // If status not balance, do the remat even cannot reach target. // TODO: check the result not help even one occupancy. - if (!status.NotBalance && TargetOccupancy == 0) + if (!Status.NotBalance && TargetOccupancy == 0) return false; } - // nrmSubExps can help reach target occupancy, add it to + // nrmSubExps can help reach target occupancy, add It to // subExpCandidates. - addExpCandidates(subExpCandidates, partialSubExps, usedRegs); + addExpCandidates(SubExpCandidates, PartialSubExps, UsedRegs); return true; } // Remat passthru regs per hot block. -// Reason to do it per block is to make sure passthru reuse is precise. +// Reason to do It per block is to make sure passthru reuse is precise. // If try remat on all hot blocks together, the passthru might be on one block, // reuse in on another block which the reg is not passthru there. 
-bool perBlockPassthruRemat(Remat *Remat, std::vector &hotBlocks, - RematStatus &status, - GCNRPTracker::LiveRegSet &liveRegCandidates, +bool perBlockPassthruRemat(Remat *Remat, std::vector &HotBlocks, + RematStatus &Status, + GCNRPTracker::LiveRegSet &LiveRegCandidates, const GCNSubtarget *ST, LiveIntervals *LIS, - const MachineLoopInfo *MLI, - MachineDominatorTree *DT, MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI, + const MachineLoopInfo *MLI, MachineDominatorTree *DT, + MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { bool IsUpdated = false; bool IsCanClone = EnableSubExpClone || EnableSubExpAggressive; - SlotIndexes *slotIndexes = LIS->getSlotIndexes(); + SlotIndexes *SlotIndexes = LIS->getSlotIndexes(); // Sort hot blocks by pressure first. // The hot block with higher pressure is easier to fail. - // If fail, fail fast. It it works, save the subExpCandidates. The + // If fail, fail fast. It It works, save the subExpCandidates. The // subExpCandidates may help other hotblocks. - std::sort(hotBlocks.begin(), hotBlocks.end(), - [&ST](const HotBlock &a, const HotBlock &b) { - return pressureHigher(a.maxPressures.first, a.maxPressures.second, - b.maxPressures.first, b.maxPressures.second, + std::sort(HotBlocks.begin(), HotBlocks.end(), + [&ST](const HotBlock &A, const HotBlock &B) { + return pressureHigher(A.MaxPressures.first, A.MaxPressures.second, + B.MaxPressures.first, B.MaxPressures.second, ST); }); - std::vector subExpCandidates; + std::vector SubExpCandidates; // For inBlock remat clone. - std::vector inBlockCloneSubExps; - DenseMap inBlockHotVInstMap; - DenseMap inBlockHotSInstMap; + std::vector InBlockCloneSubExps; + DenseMap InBlockHotVInstMap; + DenseMap InBlockHotSInstMap; // Save used passThrus to avoid use same reg on different MBB. - GCNRPTracker::LiveRegSet usedPassThrus; + GCNRPTracker::LiveRegSet UsedPassThrus; // Save moved regs to avoid use same reg hoist and sink. - GCNRPTracker::LiveRegSet usedRegs; + GCNRPTracker::LiveRegSet UsedRegs; - const int VLimit = status.TargetVLimit; - const int SLimit = status.TargetSLimit; + const int VLimit = Status.TargetVLimit; + const int SLimit = Status.TargetSLimit; // Collect passthru for hot block. - // Try remat on it. - for (auto &it : hotBlocks) { - MachineBasicBlock *MBB = it.MBB; + // Try remat on It. + for (auto &It : HotBlocks) { + MachineBasicBlock *MBB = It.MBB; - const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[MBB]; - const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[MBB]; + const GCNRPTracker::LiveRegSet InputLive = Status.MBBInputLiveMap[MBB]; + const GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[MBB]; - it.inputLive = inputLive; + It.InputLive = InputLive; // Add pressure by 1 to consider spill to vgpr. const int PressureDelta = -1; - int vgpr = it.maxPressures.first - PressureDelta; - int sgpr = it.maxPressures.second; - bool IsVOutBound = vgpr > VLimit; - bool IsSOutBound = sgpr > SLimit; + int Vgpr = It.MaxPressures.first - PressureDelta; + int Sgpr = It.MaxPressures.second; + bool IsVOutBound = Vgpr > VLimit; + bool IsSOutBound = Sgpr > SLimit; // savingInputLive is used to calculate saving which will be modified to // avoid count same input multiple times. 
- GCNRPTracker::LiveRegSet savingInputLive = inputLive; - GCNRPTracker::LiveRegSet savingOutputLive = outputLive; - std::pair curSaving = - calculateSaving(it, subExpCandidates, savingInputLive, savingOutputLive, + GCNRPTracker::LiveRegSet SavingInputLive = InputLive; + GCNRPTracker::LiveRegSet SavingOutputLive = OutputLive; + std::pair CurSaving = + calculateSaving(It, SubExpCandidates, SavingInputLive, SavingOutputLive, IsVOutBound, IsSOutBound, IsCanClone, DT, MRI, SIRI); - vgpr += curSaving.first; - sgpr += curSaving.second; + Vgpr += CurSaving.first; + Sgpr += CurSaving.second; - if (vgpr <= VLimit && sgpr <= SLimit) + if (Vgpr <= VLimit && Sgpr <= SLimit) continue; // Collect pass thru regs. - GCNRPTracker::LiveRegSet passThrus = - collectPassThrus(MBB, inputLive, outputLive, usedPassThrus, - liveRegCandidates, MRI, IsCanClone); + GCNRPTracker::LiveRegSet PassThrus = + collectPassThrus(MBB, InputLive, OutputLive, + LiveRegCandidates, MRI, IsCanClone); // Group pass thru regs by def MBB. SmallVector> - Candidates = groupPassThruByDefBlock(Remat, passThrus, usedPassThrus, + Candidates = groupPassThruByDefBlock(Remat, PassThrus, UsedPassThrus, MRI, SIRI, SIII); // unUsedPassThrus used to collect passThru which is skipped when build // subExp. - GCNRPTracker::LiveRegSet unusedPassThrus; + GCNRPTracker::LiveRegSet UnusedPassThrus; // Build exp dag on define blocks. bool AllowPartialUseInSubExp = false; if (tryToAddSubExps( - Remat, it, status, subExpCandidates, inBlockCloneSubExps, - inBlockHotVInstMap, inBlockHotSInstMap, Candidates, vgpr, sgpr, - savingInputLive, savingOutputLive, passThrus, usedRegs, MRI, SIRI, - SIII, MLI, slotIndexes, LIS, DT, IsCanClone, IsVOutBound, - IsSOutBound, unusedPassThrus, AllowPartialUseInSubExp)) { + Remat, It, Status, SubExpCandidates, InBlockCloneSubExps, + InBlockHotVInstMap, InBlockHotSInstMap, Candidates, Vgpr, Sgpr, + SavingInputLive, SavingOutputLive, PassThrus, UsedRegs, MRI, SIRI, + SIII, MLI, SlotIndexes, LIS, DT, IsCanClone, IsVOutBound, + IsSOutBound, UnusedPassThrus, AllowPartialUseInSubExp)) { // Remove unusedPassThrus from passThrus first. - llvm::andNotLiveRegSet(passThrus, unusedPassThrus); - llvm::mergeLiveRegSet(usedPassThrus, passThrus); + llvm::andNotLiveRegSet(PassThrus, UnusedPassThrus); + llvm::mergeLiveRegSet(UsedPassThrus, PassThrus); continue; } // If cannot clone, don't need to try partialUseInSubExp which must clone. @@ -4312,54 +4279,53 @@ bool perBlockPassthruRemat(Remat *Remat, std::vector &hotBlocks, return false; // Partial use subExp may result count caused by clone. - // Only try it when enable aggressive remat. + // Only try It when enable aggressive remat. if (!EnableSubExpAggressive) return false; AllowPartialUseInSubExp = true; if (!tryToAddSubExps( - Remat, it, status, subExpCandidates, inBlockCloneSubExps, - inBlockHotVInstMap, inBlockHotSInstMap, Candidates, vgpr, sgpr, - savingInputLive, savingOutputLive, passThrus, usedRegs, MRI, SIRI, - SIII, MLI, slotIndexes, LIS, DT, IsCanClone, IsVOutBound, - IsSOutBound, unusedPassThrus, AllowPartialUseInSubExp)) { + Remat, It, Status, SubExpCandidates, InBlockCloneSubExps, + InBlockHotVInstMap, InBlockHotSInstMap, Candidates, Vgpr, Sgpr, + SavingInputLive, SavingOutputLive, PassThrus, UsedRegs, MRI, SIRI, + SIII, MLI, SlotIndexes, LIS, DT, IsCanClone, IsVOutBound, + IsSOutBound, UnusedPassThrus, AllowPartialUseInSubExp)) { return false; } // Just merge all passThrus after tryToAddSubExps allow partialUseInSubExp. 
- llvm::mergeLiveRegSet(usedPassThrus, passThrus); + llvm::mergeLiveRegSet(UsedPassThrus, PassThrus); } // Apply changes. { // sort subExpCandidates to make sure input use apply before output use if a // reg is input and output of subExps. - LLVM_DEBUG(for (SubExp &Exp : subExpCandidates) { Exp.dump(MRI, SIRI); }); - sortSubExpCandidates(subExpCandidates); + LLVM_DEBUG(for (SubExp &Exp : SubExpCandidates) { Exp.dump(MRI, SIRI); }); + sortSubExpCandidates(SubExpCandidates); - for (SubExp &Exp : subExpCandidates) { + for (SubExp &Exp : SubExpCandidates) { // Skip exp which is cleared in sort for hoist sink conflict. if (Exp.SUnits.empty()) continue; LLVM_DEBUG(Exp.dump(MRI, SIRI)); if (Exp.IsHoist) { - ApplySubExpMoveNearDefine(Exp, MRI, DT, slotIndexes, SIII, SIRI); + applySubExpMoveNearDefine(Exp, MRI, SlotIndexes, SIII, SIRI); } else { if (Exp.IsCloneOnly) - ApplySubExpCloneNearUser(Exp, hotBlocks, DT, MRI, slotIndexes, SIII, + applySubExpCloneNearUser(Exp, HotBlocks, DT, MRI, SlotIndexes, SIII, SIRI); else - ApplySubExpMoveNearUser(Exp, MRI, DT, slotIndexes, SIII, SIRI); + applySubExpMoveNearUser(Exp, MRI, DT, SlotIndexes); } } - for (SubExp &Exp : inBlockCloneSubExps) { - ApplySubExpCloneNearUserInBlock(Exp, inBlockHotVInstMap, - inBlockHotSInstMap, MRI, slotIndexes, - SIII, SIRI); + for (SubExp &Exp : InBlockCloneSubExps) { + applySubExpCloneNearUserInBlock( + Exp, InBlockHotVInstMap, InBlockHotSInstMap, MRI, SlotIndexes, SIRI); } // Try to see possible occupancy could reach, then dicide a target. // Apply remat. - IsUpdated = subExpCandidates.size(); + IsUpdated = SubExpCandidates.size(); } return IsUpdated; @@ -4367,7 +4333,7 @@ bool perBlockPassthruRemat(Remat *Remat, std::vector &hotBlocks, int getVMemLdSize(MachineBasicBlock &MBB, const SIInstrInfo *SIII, const SIRegisterInfo *SIRI, const MachineRegisterInfo &MRI) { - int vmemLdSize = 0; + int VmemLdSize = 0; // Collect vmemLd when enable split. for (MachineInstr &MI : MBB) { bool IsHighLatency = SIII->isHighLatencyInstruction(MI); @@ -4379,16 +4345,16 @@ int getVMemLdSize(MachineBasicBlock &MBB, const SIInstrInfo *SIII, continue; // a vmem ld. 
MachineOperand &Dst = MI.getOperand(0); - LaneBitmask mask = llvm::getRegMask(Dst, MRI); - unsigned size = llvm::getRegSize(Dst.getReg(), mask, MRI, SIRI); - vmemLdSize += size; + LaneBitmask Mask = llvm::getRegMask(Dst, MRI); + unsigned Size = llvm::getRegSize(Dst.getReg(), Mask, MRI, SIRI); + VmemLdSize += Size; } - return vmemLdSize; + return VmemLdSize; } } // namespace -bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, +bool groupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS, MachineDominatorTree *DT, MachinePostDominatorTree *PDT, AliasAnalysis *AA) { if (MF.size() < 2) @@ -4400,95 +4366,95 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, auto &MRI = MF.getRegInfo(); - RematStatus status = getRematStatus(MF, MLI, LIS, MRI, ST); + RematStatus Status = getRematStatus(MF, MLI, LIS, MRI, ST); const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second; - if (status.TargetOcc >= MaxOcc) + if (Status.TargetOcc >= MaxOcc) return false; - unsigned VLimit = status.TargetVLimit; - unsigned SLimit = status.TargetSLimit; + unsigned VLimit = Status.TargetVLimit; + unsigned SLimit = Status.TargetSLimit; - int rematVCnt = status.MaxVPressure - VLimit; - int rematSCnt = status.MaxSPressure - SLimit; + int RematVCnt = Status.MaxVPressure - VLimit; + int RematSCnt = Status.MaxSPressure - SLimit; bool IsSGPRSpill = false; - if (rematSCnt > 0) { - IsSGPRSpill = nearSgprSpill(status.MaxSPressure, ST, MF); + if (RematSCnt > 0) { + IsSGPRSpill = nearSgprSpill(Status.MaxSPressure, ST, MF); } // If bound by lds, skip. - if ((status.TargetOcc + 1) > ST->getOccupancyWithWorkGroupSizes(MF).second && + if ((Status.TargetOcc + 1) > ST->getOccupancyWithWorkGroupSizes(MF).second && !IsSGPRSpill) return false; - bool IsBothOutLimit = rematVCnt > 0 && rematSCnt > 0; + bool IsBothOutLimit = RematVCnt > 0 && RematSCnt > 0; // TODO: use check wqm and support vreg remat. bool IsCheckWQM = MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS; - rematVCnt = IsCheckWQM & false; + RematVCnt = IsCheckWQM & false; // Remat on every hot block. // Collect all hot blocks. - std::vector hotBlocks; + std::vector HotBlocks; for (MachineBasicBlock &MBB : MF) { // Collect reg pressure. - auto &RP = status.MBBPressureMap[&MBB]; - unsigned maxLocalVPressure = RP.getVGPRNum(ST->hasGFX90AInsts()); - unsigned maxLocalSPressure = RP.getMaxSGPR(); + auto &RP = Status.MBBPressureMap[&MBB]; + unsigned MaxLocalVPressure = RP.getVGPRNum(ST->hasGFX90AInsts()); + unsigned MaxLocalSPressure = RP.getMaxSGPR(); - maxLocalSPressure += RegForVCC; + MaxLocalSPressure += RegForVCC; if (!EnableInBlockRemat) { - if (maxLocalVPressure <= VLimit && maxLocalSPressure <= SLimit) + if (MaxLocalVPressure <= VLimit && MaxLocalSPressure <= SLimit) continue; } // Move inst which input is imm/pass thru reg/out reg to help pressure. 
- if (tryHoldPacifist(MBB, LIS, MRI, SIRI, SIII, AA, status)) { - maxLocalVPressure = 0; - maxLocalSPressure = 0; - collectMBBPressure(MBB, LIS, ST, maxLocalVPressure, maxLocalSPressure, - status); + if (tryHoldPacifist(MBB, LIS, MRI, SIRI, AA, Status)) { + MaxLocalVPressure = 0; + MaxLocalSPressure = 0; + collectMBBPressure(MBB, LIS, ST, MaxLocalVPressure, MaxLocalSPressure, + Status); - maxLocalSPressure += RegForVCC; + MaxLocalSPressure += RegForVCC; } - if (maxLocalVPressure <= VLimit && maxLocalSPressure <= SLimit) + if (MaxLocalVPressure <= VLimit && MaxLocalSPressure <= SLimit) continue; // When both vgpr sgpr out limit, only help vgpr. - if (IsBothOutLimit && maxLocalVPressure <= VLimit) + if (IsBothOutLimit && MaxLocalVPressure <= VLimit) continue; - GCNRPTracker::LiveRegSet liveSet; - hotBlocks.push_back({&MBB, liveSet, - std::make_pair(maxLocalVPressure, maxLocalSPressure), + GCNRPTracker::LiveRegSet LiveSet; + HotBlocks.push_back({&MBB, LiveSet, + std::make_pair(MaxLocalVPressure, MaxLocalSPressure), 0, 0}); } // Collect vmemLdInput/OutputSize. if (EnableVmemDegree) { - DenseMap outputVMemLdSizeMap; - for (auto it : hotBlocks) { - MachineBasicBlock *MBB = it.MBB; + DenseMap OutputVMemLdSizeMap; + for (auto It : HotBlocks) { + MachineBasicBlock *MBB = It.MBB; // Collect vmemLd when enable split. - int vmemLdSize = getVMemLdSize(*MBB, SIII, SIRI, MRI); - if (vmemLdSize) { - outputVMemLdSizeMap[MBB] = vmemLdSize; + int VmemLdSize = getVMemLdSize(*MBB, SIII, SIRI, MRI); + if (VmemLdSize) { + OutputVMemLdSizeMap[MBB] = VmemLdSize; } } - for (auto &it : hotBlocks) { - MachineBasicBlock *MBB = it.MBB; + for (auto &It : HotBlocks) { + MachineBasicBlock *MBB = It.MBB; - auto oit = outputVMemLdSizeMap.find(MBB); - if (oit != outputVMemLdSizeMap.end()) - it.vmemLdOutputSize = oit->second; + auto OIt = OutputVMemLdSizeMap.find(MBB); + if (OIt != OutputVMemLdSizeMap.end()) + It.VmemLdOutputSize = OIt->second; if (MBB->pred_size() != 1) continue; MachineBasicBlock *Pred = *MBB->pred_begin(); - oit = outputVMemLdSizeMap.find(Pred); - if (oit != outputVMemLdSizeMap.end()) { - it.vmemLdInputSize = oit->second; + OIt = OutputVMemLdSizeMap.find(Pred); + if (OIt != OutputVMemLdSizeMap.end()) { + It.VmemLdInputSize = OIt->second; } else { if (Pred->getFirstTerminator() != Pred->end()) continue; @@ -4497,60 +4463,60 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, bool IsHighLatency = SIII->isHighLatencyInstruction(Pred->back()); if (!IsHighLatency) continue; - int vmemLdSize = getVMemLdSize(*Pred, SIII, SIRI, MRI); - it.vmemLdInputSize = vmemLdSize; + int VmemLdSize = getVMemLdSize(*Pred, SIII, SIRI, MRI); + It.VmemLdInputSize = VmemLdSize; } } } if (EnableUniformVectorToScalar) { - if (rematUniformVgprToSgpr(Remat, MF, status, status.MBBPressureMap, - hotBlocks, LIS, MRI, SIRI, SIII, MLI)) { + if (rematUniformVgprToSgpr(Remat, MF, Status, HotBlocks, LIS, MRI, SIRI, + SIII, MLI)) { // Rebuild LIS. LIS->reanalyze(MF); - status = getRematStatus(MF, MLI, LIS, MRI, ST); - bool IsSgprSpilled = nearSgprSpill(status.MaxSPressure, ST, MF); + Status = getRematStatus(MF, MLI, LIS, MRI, ST); + bool IsSgprSpilled = nearSgprSpill(Status.MaxSPressure, ST, MF); if (IsSgprSpilled) { bool IsNearTarget = false; hotBlockRemat(Remat, MF, MLI, LIS, DT, PDT, IsNearTarget); // Rebuild LIS. 
LIS->reanalyze(MF); - status = getRematStatus(MF, MLI, LIS, MRI, ST); + Status = getRematStatus(MF, MLI, LIS, MRI, ST); } - for (auto &it : hotBlocks) { - MachineBasicBlock *MBB = it.MBB; + for (auto &It : HotBlocks) { + MachineBasicBlock *MBB = It.MBB; // Update pressure. - auto &RP = status.MBBPressureMap[MBB]; - unsigned maxLocalVPressure = RP.getVGPRNum(ST->hasGFX90AInsts()); - unsigned maxLocalSPressure = RP.getMaxSGPR(); + auto &RP = Status.MBBPressureMap[MBB]; + unsigned MaxLocalVPressure = RP.getVGPRNum(ST->hasGFX90AInsts()); + unsigned MaxLocalSPressure = RP.getMaxSGPR(); - maxLocalSPressure += RegForVCC; - it.maxPressures.first = maxLocalVPressure; - it.maxPressures.second = maxLocalSPressure; + MaxLocalSPressure += RegForVCC; + It.MaxPressures.first = MaxLocalVPressure; + It.MaxPressures.second = MaxLocalSPressure; } } } // Collect all live reg which cross hot blocks. - GCNRPTracker::LiveRegSet liveRegCandidates; - for (auto it : hotBlocks) { - MachineBasicBlock *MBB = it.MBB; + GCNRPTracker::LiveRegSet LiveRegCandidates; + for (auto It : HotBlocks) { + MachineBasicBlock *MBB = It.MBB; - const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[MBB]; + const GCNRPTracker::LiveRegSet InputLive = Status.MBBInputLiveMap[MBB]; - const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[MBB]; + const GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[MBB]; - llvm::mergeLiveRegSet(liveRegCandidates, inputLive); - llvm::mergeLiveRegSet(liveRegCandidates, outputLive); + llvm::mergeLiveRegSet(LiveRegCandidates, InputLive); + llvm::mergeLiveRegSet(LiveRegCandidates, OutputLive); } // Check min VGPR bound. BlockSet PressureUnderLimitSet; if (EnableSubExpMinReg) { - for (auto &it : hotBlocks) { - MachineBasicBlock *MBB = it.MBB; + for (auto &It : HotBlocks) { + MachineBasicBlock *MBB = It.MBB; unsigned MaxLocalVGPR = 0; unsigned MaxLocalSGPR = 0; llvm::getRegBound(MBB, MRI, SIRI, SIII, LIS, MaxLocalVGPR, MaxLocalSGPR); @@ -4558,17 +4524,17 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, if (MaxLocalVGPR < VLimit && MaxLocalSGPR < SLimit) { PressureUnderLimitSet.insert(MBB); } else { - if (MaxLocalVGPR < it.maxPressures.first) - it.maxPressures = - std::make_pair(MaxLocalVGPR, it.maxPressures.second); - if (MaxLocalSGPR < it.maxPressures.second) - it.maxPressures = std::make_pair(it.maxPressures.first, MaxLocalSGPR); + if (MaxLocalVGPR < It.MaxPressures.first) + It.MaxPressures = + std::make_pair(MaxLocalVGPR, It.MaxPressures.second); + if (MaxLocalSGPR < It.MaxPressures.second) + It.MaxPressures = std::make_pair(It.MaxPressures.first, MaxLocalSGPR); } } } bool IsUpdated = - perBlockPassthruRemat(Remat, hotBlocks, status, liveRegCandidates, ST, + perBlockPassthruRemat(Remat, HotBlocks, Status, LiveRegCandidates, ST, LIS, MLI, DT, MRI, SIRI, SIII); return IsUpdated; @@ -4614,7 +4580,7 @@ bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) { LIS->reanalyze(MF); } - IsUpdated = GroupRemat(this, MF, MLI, LIS, DT, PDT, AA); + IsUpdated = groupRemat(this, MF, MLI, LIS, DT, PDT, AA); IsFinalUpdated |= IsUpdated; } From d8b6711de941bfa483a82f41bc47eff7e23ac16d Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Fri, 14 Mar 2025 12:34:53 -0700 Subject: [PATCH 14/25] More cleanup --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 74 +++++----- llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h | 39 ++--- .../AMDGPUOccupancyAndLatencyHelper.cpp | 139 +++++++++--------- 
.../AMDGPU/AMDGPUOccupancyAndLatencyHelper.h | 18 +-- llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp | 56 ++++--- llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h | 20 +-- 7 files changed, 175 insertions(+), 173 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index 4c46cee69a038..46d182ffd9e29 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -2308,7 +2308,7 @@ void applySubExpCloneNearUser(SubExp &Exp, std::vector &HotBlocks, } // Build dag for SubExp to help remove unused inst when clone. ExpDag Dag(MRI, SIRI, SIII, /*IsJoinInput*/ true); - Dag.build(Exp.inputLive, Exp.outputLive, Exp.SUnits); + Dag.build(Exp.InputLive, Exp.OutputLive, Exp.SUnits); DenseSet DagBottoms; for (SUnit &SU : Dag.SUnits) { if (!SU.isInstr()) @@ -3141,10 +3141,10 @@ bool tryRemat(MachineBasicBlock &MBB, MachineInstr *HotMi, if (SubExp.IsNotSafeToCopy) continue; if (IsVGPR) { - if (SubExp.vOutputSize == 0) + if (SubExp.VOutputSize == 0) continue; } else { - if (SubExp.sOutputSize == 0) + if (SubExp.SOutputSize == 0) continue; } if (!SubExp.isSafeToMove(MRI, /*IsMoveUp*/ false)) @@ -3158,9 +3158,9 @@ bool tryRemat(MachineBasicBlock &MBB, MachineInstr *HotMi, if (SubExp.IsHasMemInst && MemWriteMBBSet.count(&MBB)) continue; if (IsVGPR) { - Distance -= SubExp.vOutputSize; + Distance -= SubExp.VOutputSize; } else { - Distance -= SubExp.sOutputSize; + Distance -= SubExp.SOutputSize; } CloneSubExps.emplace_back(SubExp); if (Distance <= 0) @@ -3256,8 +3256,8 @@ bool tryRematInHotSpot( // When apply subExp1 before subExp0, new clone of subExp0 which use result of // subExp1 will have old reg of subExp1. And reg pressure will not be reduced. void sortSubExpCandidates(std::vector &SubExpCandidates) { - MapVector> InputMap; - MapVector> OutputMap; + MapVector> InputMap; + MapVector> OutputMap; struct SortNode { SubExp Exp; unsigned Depth; @@ -3288,7 +3288,7 @@ void sortSubExpCandidates(std::vector &SubExpCandidates) { MapVector SortMap; for (auto It : InputMap) { unsigned Reg = It.first; - auto OutIt = OutputMap.find(Reg); + MapVector>::iterator OutIt = OutputMap.find(Reg); if (OutIt == OutputMap.end()) continue; auto &InExps = It.second; @@ -3302,8 +3302,8 @@ void sortSubExpCandidates(std::vector &SubExpCandidates) { continue; // Canot input(use) move up, output(def) move down. // Choose the exp which save more. - int InExpGain = InExp->vOutputSize - InExp->vInputSize; - int OutExpGain = OutExp->vInputSize - InExp->vOutputSize; + int InExpGain = InExp->VOutputSize - InExp->VInputSize; + int OutExpGain = OutExp->VInputSize - InExp->VOutputSize; if (InExpGain >= OutExpGain) { OutExp->SUnits.clear(); } else { @@ -3415,26 +3415,26 @@ bool canHelpPressureWhenSink(SubExp &SubExp, // Update input size to ignore lives in which already in // passThrus. 
- for (auto It : SubExp.inputLive) { + for (auto It : SubExp.InputLive) { unsigned Reg = It.first; if (PassThrus.count(Reg) == 0) continue; unsigned Size = getRegSize(Reg, It.second, MRI, SIRI); if (SIRI->isVGPR(MRI, Reg)) { - SubExp.vInputSize -= Size; + SubExp.VInputSize -= Size; } else { - SubExp.sInputSize -= Size; + SubExp.SInputSize -= Size; } } - if (SubExp.vInputSize > SubExp.vOutputSize) + if (SubExp.VInputSize > SubExp.VOutputSize) return false; - if (SubExp.sInputSize > SubExp.sOutputSize && IsSgprBound) + if (SubExp.SInputSize > SubExp.SOutputSize && IsSgprBound) return false; - if (SubExp.sInputSize >= SubExp.sOutputSize && - SubExp.vInputSize == SubExp.vOutputSize) + if (SubExp.SInputSize >= SubExp.SOutputSize && + SubExp.VInputSize == SubExp.VOutputSize) return false; // Try to find a Insert Block. @@ -3479,13 +3479,13 @@ bool canHelpPressureWhenHoist(SubExp &SubExp, const MachineRegisterInfo &MRI, const MachineLoopInfo *MLI, bool IsSgprBound) { if (!SubExp.isSafeToMove(MRI, /*IsMoveUp*/ true)) return false; - if (SubExp.vInputSize < SubExp.vOutputSize) + if (SubExp.VInputSize < SubExp.VOutputSize) return false; - if (SubExp.sInputSize < SubExp.sOutputSize && IsSgprBound) + if (SubExp.SInputSize < SubExp.SOutputSize && IsSgprBound) return false; - if (SubExp.sInputSize <= SubExp.sOutputSize && - SubExp.vInputSize == SubExp.vOutputSize) + if (SubExp.SInputSize <= SubExp.SOutputSize && + SubExp.VInputSize == SubExp.VOutputSize) return false; // Try to find a Insert Block. @@ -3715,17 +3715,17 @@ SubExp buildFreeSubExp(SubExp &Exp, // Calc reg for freeExp. for (unsigned Reg : FreeExp.TopRegs) { - FreeExp.inputLive[Reg]; + FreeExp.InputLive[Reg]; } for (unsigned Reg : FreeExp.BottomRegs) { - FreeExp.outputLive[Reg]; + FreeExp.OutputLive[Reg]; } - CollectLiveSetPressure(FreeExp.inputLive, MRI, SIRI, FreeExp.vInputSize, - FreeExp.sInputSize); - CollectLiveSetPressure(FreeExp.outputLive, MRI, SIRI, FreeExp.vOutputSize, - FreeExp.sOutputSize); + CollectLiveSetPressure(FreeExp.InputLive, MRI, SIRI, FreeExp.VInputSize, + FreeExp.SInputSize); + CollectLiveSetPressure(FreeExp.OutputLive, MRI, SIRI, FreeExp.VOutputSize, + FreeExp.SOutputSize); return FreeExp; } @@ -3817,14 +3817,14 @@ calculateSaving(HotBlock &HotBb, std::vector &SubExpCandidates, continue; // When subExp is from hotBB, check output instead of input. if (Exp.FromBB == MBB) { - if (IsVOutBound && Exp.vOutputSize < Exp.vInputSize) + if (IsVOutBound && Exp.VOutputSize < Exp.VInputSize) continue; - if (IsSOutBound && Exp.sOutputSize < Exp.sInputSize) + if (IsSOutBound && Exp.SOutputSize < Exp.SInputSize) continue; - Vgpr += Exp.vInputSize; - Vgpr -= Exp.vOutputSize; - Sgpr += Exp.sInputSize; - Sgpr -= Exp.sOutputSize; + Vgpr += Exp.VInputSize; + Vgpr -= Exp.VOutputSize; + Sgpr += Exp.SInputSize; + Sgpr -= Exp.SOutputSize; continue; } } @@ -3852,7 +3852,7 @@ calculateSaving(HotBlock &HotBb, std::vector &SubExpCandidates, } } - for (auto OutIt : Exp.outputLive) { + for (auto OutIt : Exp.OutputLive) { unsigned Reg = OutIt.first; LaneBitmask OutMask = OutIt.second; LaneBitmask MBBBeginMask; @@ -3887,7 +3887,7 @@ calculateSaving(HotBlock &HotBb, std::vector &SubExpCandidates, } } - for (auto InIt : Exp.inputLive) { + for (auto InIt : Exp.InputLive) { unsigned Reg = InIt.first; LaneBitmask InMask = InIt.second; LaneBitmask MBBBeginMask; @@ -3929,7 +3929,7 @@ calculateSaving(HotBlock &HotBb, std::vector &SubExpCandidates, // If MBB dominate any user of output live reg, It will still live in // MBB. 
So cannot count that output live reg as profit. // Hoist into loop is not supported now. - for (auto OutIt : Exp.outputLive) { + for (auto OutIt : Exp.OutputLive) { unsigned Reg = OutIt.first; bool IsDomUser = false; for (MachineInstr &MI : MRI.use_nodbg_instructions(Reg)) { @@ -3963,7 +3963,7 @@ calculateSaving(HotBlock &HotBb, std::vector &SubExpCandidates, } } - for (auto InIt : Exp.inputLive) { + for (auto InIt : Exp.InputLive) { unsigned Reg = InIt.first; LaneBitmask InMask = InIt.second; LaneBitmask MBBBeginMask; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp index 365fb058bf6b3..63651ab82fcdb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp @@ -453,7 +453,7 @@ bool reduceChannel(unsigned offset, MachineInstr &MI, const MCInstrDesc &desc, .addImm(offset * LaneSize); MachineInstr *OffsetAddMI = OffsetAdd.getInstr(); MachineBasicBlock::iterator InsertPoint = - llvm::FindOrCreateInsertionPointForSccDef(MI.getParent(), MI, SIRI, + llvm::findOrCreateInsertionPointForSccDef(MI.getParent(), MI, SIRI, SIII, &MRI); MI.getParent()->insert(InsertPoint, OffsetAddMI); SIII->legalizeOperands(*OffsetAddMI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h index 1e9f0bad12d19..04b4b74fbd726 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h @@ -1,3 +1,6 @@ +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMIRUTILS_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMIRUTILS_H + #pragma once #include "llvm/ADT/DenseMap.h" @@ -37,14 +40,14 @@ using LiveSet = llvm::DenseMap; unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask, const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI); -void CollectLiveSetPressure(const LiveSet &liveSet, +void collectLiveSetPressure(const LiveSet &liveSet, const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI, unsigned &VPressure, unsigned &SPressure); bool isExecUpdateForControlFlow(llvm::MachineInstr &MI); -bool IsSub0Sub1SingleDef(unsigned Reg, const llvm::MachineRegisterInfo &MRI); +bool isSub0Sub1SingleDef(unsigned Reg, const llvm::MachineRegisterInfo &MRI); llvm::LaneBitmask getRegMask(const llvm::MachineOperand &MO, const llvm::MachineRegisterInfo &MRI); @@ -68,40 +71,40 @@ bool reach_block(llvm::MachineBasicBlock *FromBB, void viewCFGWithPhi(llvm::MachineFunction &MF); void write_contribution_list(llvm::MachineFunction &MF, const char *Filename); -llvm::MachineBasicBlock *CreateNullExportBlock(llvm::MachineFunction &MF, +llvm::MachineBasicBlock *createNullExportBlock(llvm::MachineFunction &MF, const llvm::SIInstrInfo *TII); -bool GetNonDebugMBBEnd(llvm::MachineBasicBlock::reverse_iterator &BBEnd, +bool getNonDebugMBBEnd(llvm::MachineBasicBlock::reverse_iterator &BBEnd, llvm::MachineBasicBlock &MBB); -void UpdatePhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB, +void updatePhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB, const llvm::MachineRegisterInfo *MRI); -void BuildPhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB, +void buildPhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB, llvm::SmallDenseSet &LiveOutSet, const llvm::MachineRegisterInfo *MRI); -MachineReg CreateVirtualRegForOperand(MachineOpcode Opcode, unsigned Operand, +MachineReg createVirtualRegForOperand(MachineOpcode Opcode, unsigned Operand, llvm::MachineFunction &MF); -MachineReg CreateVirtualDstReg(MachineOpcode Opcode, llvm::MachineFunction &MF); +MachineReg 
createVirtualDstReg(MachineOpcode Opcode, llvm::MachineFunction &MF); -bool IsExecCopy(const llvm::MachineInstr &MI, MachineReg Exec, +bool isExecCopy(const llvm::MachineInstr &MI, MachineReg Exec, MachineReg *pDst); struct MachineRegWithSubReg { MachineReg Reg = /*NoRegister*/ 0; unsigned SubReg = /*NoSubRegister*/ 0; }; -MachineRegWithSubReg GetWqmEntryActiveMask(llvm::MachineFunction &MF); -llvm::MachineInstr *GetWqmEntryActiveMaskInst(llvm::MachineFunction &MF); +MachineRegWithSubReg getWqmEntryActiveMask(llvm::MachineFunction &MF); +llvm::MachineInstr *getWqmEntryActiveMaskInst(llvm::MachineFunction &MF); // Return true if this machine instruction represents a call to the fetch // shader. We curently have two mechanisims for calling fetch shader: // 1. The AMDGPU_CALL_FETCH_SHADER pseudo-instruction // 2. A CALL instruction with the `FetchShaderCall` flag set to true. -bool IsFetchShaderCall(const llvm::MachineInstr *MI); +bool isFetchShaderCall(const llvm::MachineInstr *MI); -bool IsSccLiveAt(llvm::MachineBasicBlock *MBB, +bool isSccLiveAt(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator MI); // An enum used to pass additional constraints to @@ -126,7 +129,7 @@ enum SccDefInsertPointConstraintFlags { // scc around BeforeInst. This way BeforeInst can safely be used // as the new insert location. // -llvm::MachineBasicBlock::iterator FindOrCreateInsertionPointForSccDef( +llvm::MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef( llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator BeforeInst, const llvm::TargetRegisterInfo *TRI, const llvm::SIInstrInfo *TII, llvm::MachineRegisterInfo *MRI, @@ -149,9 +152,9 @@ void buildEndLiveMap( void dumpLiveSet(const LiveSet &LiveSet, const llvm::SIRegisterInfo *SIRI); -unsigned GetCurrentVGPRCount(llvm::MachineFunction &MF, +unsigned getCurrentVGPRCount(llvm::MachineFunction &MF, const llvm::SIRegisterInfo *SIRI); -unsigned GetCurrentSGPRCount(llvm::MachineFunction &MF, +unsigned getCurrentSGPRCount(llvm::MachineFunction &MF, const llvm::SIRegisterInfo *SIRI); bool isFastMathInst(llvm::MachineInstr &MI); @@ -169,7 +172,7 @@ void write_pressure(llvm::MachineFunction &MF, llvm::LiveIntervals *LIS, // Look for the successor `Succ` of the given `MBB`. // Returns MBB->succ_end() if `Succ` is not a successor of MBB. llvm::MachineBasicBlock::succ_iterator -FindSuccessor(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock *Succ); +findSuccessor(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock *Succ); // The enum and helper function for v_perm selection mask. // @@ -210,3 +213,5 @@ constexpr int buildVPermSelectMask(V_PERM_IN_BYTE_POS Sel_0, (int)Sel_0); } } // namespace llvm + +#endif diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp index 2e48ec44f979c..a8eef88ac2af8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp @@ -32,40 +32,39 @@ float SchedScore::computeScore() const { LatencyHide; } float SchedScore::computeScore2() const { - float cycles = 0; - cycles = (MixAlu * Occupancy + MemLatency); - cycles /= Occupancy; - return cycles; + float Cycles = 0; + Cycles = (MixAlu * Occupancy + MemLatency); + Cycles /= Occupancy; + return Cycles; } -void SchedScore::sum(const SchedScore &s, unsigned loopDepth) { - unsigned loopCount = loopDepth > 0 ? 
std::pow(3, loopDepth) : 1; - LatencyHide += loopCount * s.LatencyHide; - MemLatency += loopCount * s.MemLatency; - MixAlu += loopCount * s.MixAlu; - Alu += loopCount * s.Alu; - Lds += loopCount * s.Lds; - SgprSpill |= s.SgprSpill; +void SchedScore::sum(const SchedScore &S, unsigned LoopDepth) { + unsigned LoopCount = LoopDepth > 0 ? std::pow(3, LoopDepth) : 1; + LatencyHide += LoopCount * S.LatencyHide; + MemLatency += LoopCount * S.MemLatency; + MixAlu += LoopCount * S.MixAlu; + Alu += LoopCount * S.Alu; + Lds += LoopCount * S.Lds; + SgprSpill |= S.SgprSpill; } -bool SchedScore::isBetter(const SchedScore &s) const { - float score = computeScore(); - float newScore = s.computeScore(); - bool spillBetter = !SgprSpill && s.SgprSpill; - return spillBetter ? true : newScore >= score; +bool SchedScore::isBetter(const SchedScore &S) const { + float Score = computeScore(); + float NewScore = S.computeScore(); + bool SpillBetter = !SgprSpill && S.SgprSpill; + return SpillBetter ? true : NewScore >= Score; } // Does more occupancy give more perf. bool SchedScore::isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc) const { - unsigned gain = latencyGain(TargetOccupancy, ExtraOcc); + unsigned Gain = latencyGain(TargetOccupancy, ExtraOcc); // 10% is good enough. - if ((10 * gain) >= Alu) + if ((10 * Gain) >= Alu) return true; - else - return false; + return false; } unsigned SchedScore::latencyGain(unsigned TgtOcc, unsigned ExtraOcc) const { - unsigned latency = MemLatency; - return (latency / (TgtOcc)) - (latency / (TgtOcc + ExtraOcc)); + unsigned Latency = MemLatency; + return (Latency / (TgtOcc)) - (Latency / (TgtOcc + ExtraOcc)); } // AMDGPULatencyTracker @@ -75,113 +74,113 @@ AMDGPULatencyTracker::AMDGPULatencyTracker(const GCNSubtarget &ST) void AMDGPULatencyTracker::scan(const MachineInstr &MI) { if (MI.isDebugInstr()) return; - int latency = SIII->getInstrLatency(ItinerayData, MI); + int Latency = SIII->getInstrLatency(ItinerayData, MI); // If inside latency hide. if (!LatencyMIs.empty()) { - bool bWaitCnt = false; + bool IsWaitCnt = false; for (auto &MO : MI.operands()) { if (MO.isReg()) { - unsigned reg = MO.getReg(); - auto it = LatencyMIs.find(reg); - if (it != LatencyMIs.end()) { - bWaitCnt = true; + Register Reg = MO.getReg(); + auto It = LatencyMIs.find(Reg); + if (It != LatencyMIs.end()) { + IsWaitCnt = true; // If MI use mem result, update latency to mem latency. - int cycle = it->second; - if (cycle > latency) - latency = cycle; + int Cycle = It->second; + if (Cycle > Latency) + Latency = Cycle; } } } // Update latency for each mem latency inst. - for (auto it = LatencyMIs.begin(); it != LatencyMIs.end();) { - auto prev = it; - auto l = (it++); - int cycle = l->second; - if (cycle <= latency) { + for (auto It = LatencyMIs.begin(); It != LatencyMIs.end();) { + auto Prev = It; + auto L = (It++); + int Cycle = L->second; + if (Cycle <= Latency) { // Only left cycles. // Remove the reg. - LatencyMIs.erase(prev); - if (bWaitCnt && cycle == latency) { - score.MemLatency += cycle; + LatencyMIs.erase(Prev); + if (IsWaitCnt && Cycle == Latency) { + Score.MemLatency += Cycle; // Only count memLatency once, the rest is hide. - bWaitCnt = false; + IsWaitCnt = false; } else { // Hide cycle or count mem latency? - score.LatencyHide += cycle; + Score.LatencyHide += Cycle; } } else { - l->second -= latency; + L->second -= Latency; // Hide latency. - score.LatencyHide += latency; + Score.LatencyHide += Latency; } } } else { // TODO: check branch/lds? // TODO: check prevVAlu? 
- auto getAluStatus = [](const MachineInstr &MI, + auto GetAluStatus = [](const MachineInstr &MI, const llvm::SIInstrInfo *SIII) { - AluStatus status = AluStatus::Nothing; + AluStatus Status = AluStatus::Nothing; if (SIII->isVALU(MI.getOpcode())) { - status = AluStatus::Vector; + Status = AluStatus::Vector; } else if (SIII->isSALU(MI.getOpcode())) { - status = AluStatus::Scalar; + Status = AluStatus::Scalar; } - return status; + return Status; }; - AluStatus status = getAluStatus(MI, SIII); + AluStatus Status = GetAluStatus(MI, SIII); - switch (prevStatus) { + switch (PrevStatus) { case AluStatus::Nothing: { - score.Alu += latency; - score.MixAlu += latency; - prevStatus = status; + Score.Alu += Latency; + Score.MixAlu += Latency; + PrevStatus = Status; } break; case AluStatus::Vector: case AluStatus::Scalar: { - score.Alu += latency; + Score.Alu += Latency; // Ignore mix alu. - if (prevStatus != status) { - prevStatus = AluStatus::Nothing; + if (PrevStatus != Status) { + PrevStatus = AluStatus::Nothing; } else { - score.MixAlu += latency; + Score.MixAlu += Latency; } } break; } } // Update latency inst. if (SIII->isHighLatencyInstruction(MI) && MI.mayLoad()) { - unsigned reg = MI.getOperand(0).getReg(); + Register Reg = MI.getOperand(0).getReg(); // TODO: get correct latency. // SIII->getInstrLatency(ItinerayData, MI); constexpr unsigned kHighLetency = 180; - LatencyMIs[reg] = kHighLetency; + LatencyMIs[Reg] = kHighLetency; } else if (SIII->isLowLatencyInstruction(MI) && MI.mayLoad()) { - unsigned reg = MI.getOperand(0).getReg(); + Register Reg = MI.getOperand(0).getReg(); // TODO: get correct latency. // SIII->getInstrLatency(ItinerayData, MI); constexpr unsigned kLowLetency = 35; - LatencyMIs[reg] = kLowLetency; + LatencyMIs[Reg] = kLowLetency; } } -SchedScore CollectLatency(MachineFunction &MF, const llvm::GCNSubtarget &ST, +SchedScore collectLatency(MachineFunction &MF, const llvm::GCNSubtarget &ST, const llvm::MachineLoopInfo *MLI) { - SchedScore totalScore; + SchedScore TotalScore; for (auto &MFI : MF) { MachineBasicBlock &MBB = MFI; MachineBasicBlock::iterator Next; - AMDGPULatencyTracker latencyTracker(ST); + AMDGPULatencyTracker LatencyTracker(ST); for (auto &MI : MBB) { - latencyTracker.scan(MI); + LatencyTracker.scan(MI); } - unsigned loopDepth = 0; + unsigned LoopDepth = 0; if (MLI) { - loopDepth = MLI->getLoopDepth(&MBB); + LoopDepth = MLI->getLoopDepth(&MBB); } - totalScore.sum(latencyTracker.score, loopDepth); + TotalScore.sum(LatencyTracker.Score, LoopDepth); } - return totalScore; + return TotalScore; } } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h index a9a15f7538a58..c04afe61c9809 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h @@ -40,8 +40,8 @@ struct SchedScore { float computeScore() const; float computeScore2() const; - void sum(const SchedScore &s, unsigned loopDepth = 0); - bool isBetter(const SchedScore &s) const; + void sum(const SchedScore &S, unsigned LoopDepth = 0); + bool isBetter(const SchedScore &S) const; bool isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc = 1) const; // More latency can be hiden with ExtraOcc. unsigned latencyGain(unsigned TargetOccupancy, unsigned ExtraOcc) const; @@ -53,23 +53,23 @@ struct AMDGPULatencyTracker { const llvm::InstrItineraryData *ItinerayData; // Latency MI dst reg to cycle map. 
llvm::DenseMap LatencyMIs; - SchedScore score; + SchedScore Score; // Low latency MI not wait. - unsigned hideLatency = 0; - unsigned memLatency = 0; + unsigned HideLatency = 0; + unsigned MemLatency = 0; // For simple, only consider mixture as one valu one salu. // Not group now. - unsigned prevSAlu = 0; - unsigned prevVAlu = 0; + unsigned PrevSAlu = 0; + unsigned PrevVAlu = 0; enum class AluStatus { Nothing, Vector, Scalar, - } prevStatus = AluStatus::Nothing; + } PrevStatus = AluStatus::Nothing; void scan(const llvm::MachineInstr &MI); }; -SchedScore CollectLatency(llvm::MachineFunction &MF, +SchedScore collectLatency(llvm::MachineFunction &MF, const llvm::GCNSubtarget &ST, const llvm::MachineLoopInfo *MLI = nullptr); } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp index be24bfce2851c..fec8ac9546a4a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp @@ -31,12 +31,12 @@ void SubExp::dump(const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) const { dbgs() << "\nSubExp:\n"; dbgs() << "input regs:\n"; - for (auto &input : inputLive) { + for (auto &input : InputLive) { pressure::print_reg(input.first, MRI, SIRI, llvm::dbgs()); dbgs() << "\n"; } dbgs() << "output regs:\n"; - for (auto &output : outputLive) { + for (auto &output : OutputLive) { pressure::print_reg(output.first, MRI, SIRI, llvm::dbgs()); dbgs() << "\n"; } @@ -60,8 +60,8 @@ bool SubExp::modifiesRegister(unsigned Reg, const SIRegisterInfo *SIRI) const { void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) { - sMaxSize = std::max(sInputSize, sOutputSize); - vMaxSize = std::max(vInputSize, vOutputSize); + SMaxSize = std::max(SInputSize, SOutputSize); + VMaxSize = std::max(VInputSize, VOutputSize); DenseMap LiveRegs; GCNRegPressure CurPressure; @@ -125,10 +125,10 @@ void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI, unsigned sSize = CurPressure.getSGPRNum(); unsigned vSize = CurPressure.getVGPRNum(ST->hasGFX90AInsts()); - if (sSize > sMaxSize) - sMaxSize = sSize; - if (vSize > vMaxSize) - vMaxSize = vSize; + if (sSize > SMaxSize) + SMaxSize = sSize; + if (vSize > VMaxSize) + VMaxSize = vSize; } } @@ -185,8 +185,8 @@ template void ExpDag::initNodes>( template void ExpDag::build(const LiveSet &InputLiveReg, const LiveSet &OutputLiveReg, - T &insts) { - initNodes(InputLiveReg, insts); + T &Insts) { + initNodes(InputLiveReg, Insts); addDataDep(SIRI); addCtrlDep(); buildSubExp(InputLiveReg, OutputLiveReg, SIRI, SIII); @@ -336,7 +336,7 @@ void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, auto it = StartLiveReg.find(Reg); assert(it != StartLiveReg.end() && "cannot find input reg in block start live"); - Exp.inputLive[Reg] |= it->second; + Exp.InputLive[Reg] |= it->second; } for (unsigned Reg : Exp.BottomRegs) { @@ -349,13 +349,13 @@ void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, // outputLive which will affect profit count. 
continue; } - Exp.outputLive[Reg] |= it->second; + Exp.OutputLive[Reg] |= it->second; } - CollectLiveSetPressure(Exp.inputLive, MRI, SIRI, Exp.vInputSize, - Exp.sInputSize); - CollectLiveSetPressure(Exp.outputLive, MRI, SIRI, Exp.vOutputSize, - Exp.sOutputSize); + CollectLiveSetPressure(Exp.InputLive, MRI, SIRI, Exp.VInputSize, + Exp.SInputSize); + CollectLiveSetPressure(Exp.OutputLive, MRI, SIRI, Exp.VOutputSize, + Exp.SOutputSize); } } @@ -415,8 +415,8 @@ void ExpDag::addDataDep(const SIRegisterInfo *SIRI) { auto curDefIt = curDefMI.find(Reg); // Check def inst first. if (curDefIt != curDefMI.end()) { - MachineInstr *curDef = curDefIt->second; - DefSU = MISUnitMap[curDef]; + MachineInstr *CurDef = curDefIt->second; + DefSU = MISUnitMap[CurDef]; // Add link between different defs. SU.addPred(SDep(DefSU, SDep::Data, Reg)); } @@ -445,12 +445,12 @@ void BlockExpDag::build() { const auto EndIdx = SlotIndexes->getMBBEndIdx(MBB); const auto EndLiveReg = llvm::getLiveRegs(EndIdx, *LIS, MRI); - std::vector insts; + std::vector Insts; for (MachineInstr &MI : *MBB) { - insts.emplace_back(&MI); + Insts.emplace_back(&MI); } - ExpDag::build(StartLiveReg, EndLiveReg, insts); + ExpDag::build(StartLiveReg, EndLiveReg, Insts); } void BlockExpDag::buildWithPressure() { @@ -461,17 +461,17 @@ void BlockExpDag::buildWithPressure() { const auto EndIdx = SlotIndexes->getMBBEndIdx(MBB); const auto EndLiveReg = llvm::getLiveRegs(EndIdx, *LIS, MRI); - std::vector insts; + std::vector Insts; for (MachineInstr &MI : *MBB) { - insts.emplace_back(&MI); + Insts.emplace_back(&MI); } - ExpDag::build(StartLiveReg, EndLiveReg, insts); + ExpDag::build(StartLiveReg, EndLiveReg, Insts); // Build pressure. buildPressure(StartLiveReg, EndLiveReg); } -void BlockExpDag::buildAvail(const LiveSet &passThruSet, +void BlockExpDag::buildAvail(const LiveSet &PassThruSet, DenseMap &DagAvailRegMap) { DenseSet Processed; @@ -485,10 +485,10 @@ void BlockExpDag::buildAvail(const LiveSet &passThruSet, for (SUnit &SU : SUnits) { if (SU.NumPredsLeft == 0) { GCNDownwardRPTracker RP(*LIS); - RP.reset(BeginMI, &passThruSet); + RP.reset(BeginMI, &PassThruSet); MachineInstr *MI = SU.getInstr(); if (MI) { - RP.reset(*MI, &passThruSet); + RP.reset(*MI, &PassThruSet); RP.advance(); } DagAvailRegMap[&SU] = RP.getLiveRegs(); @@ -503,7 +503,6 @@ void BlockExpDag::buildAvail(const LiveSet &passThruSet, } } while (!WorkList.empty()) { - bool IsUpdated = false; SmallVector ReadyNodes; for (SUnit *SU : WorkList) { if (SU->NumPredsLeft > 0) @@ -511,7 +510,6 @@ void BlockExpDag::buildAvail(const LiveSet &passThruSet, ReadyNodes.emplace_back(SU); // Ready, move it to Processed. Processed.insert(SU); - IsUpdated = true; // Only update 1 node once. // Order of schedle here should not affect pressure. 
break; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h index 952126798b1de..c447750e17f1d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h @@ -41,14 +41,14 @@ struct SubExp { bool IsTouchSCC = false; llvm::MachineBasicBlock *FromBB; llvm::MachineBasicBlock *ToBB; - unsigned sInputSize; - unsigned vInputSize; - unsigned sOutputSize; - unsigned vOutputSize; - unsigned sMaxSize; - unsigned vMaxSize; - LiveSet inputLive; - LiveSet outputLive; + unsigned SInputSize; + unsigned VInputSize; + unsigned SOutputSize; + unsigned VOutputSize; + unsigned SMaxSize; + unsigned VMaxSize; + LiveSet InputLive; + LiveSet OutputLive; bool isSafeToMove(const llvm::MachineRegisterInfo &MRI, bool IsMoveUp) const; void calcMaxPressure(const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI); @@ -73,7 +73,7 @@ struct ExpDag { std::vector SubExps; template void build(const LiveSet &InputLiveReg, const LiveSet &OutputLiveReg, - T &insts); + T &Insts); void dump(); void viewGraph(const llvm::Twine &Name, const llvm::Twine &Title) const; /// Returns a label for an SUnit node in a visualization of the ScheduleDAG. @@ -104,7 +104,7 @@ struct BlockExpDag : public ExpDag { void buildWithPressure(); private: - void buildAvail(const LiveSet &passThruSet, + void buildAvail(const LiveSet &PassThruSet, llvm::DenseMap &DagAvailRegMap); void buildPressure(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg); }; From f8eb7fb3a7f0d25b6773ec6d0598cd325138cbb8 Mon Sep 17 00:00:00 2001 From: Adam Yang <31109344+adam-yang@users.noreply.github.com> Date: Mon, 17 Mar 2025 16:10:34 -0700 Subject: [PATCH 15/25] More cleanups --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 40 +- llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp | 405 +++++---- llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp | 771 +++++++++--------- llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h | 29 +- 4 files changed, 619 insertions(+), 626 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index 46d182ffd9e29..853a212ac5bf3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -226,7 +226,7 @@ bool isSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) { if (!Op.isReg()) continue; if (!MRI.getUniqueVRegDef(Op.getReg()) && - !llvm::IsSub0Sub1SingleDef(Op.getReg(), MRI)) { + !llvm::isSub0Sub1SingleDef(Op.getReg(), MRI)) { return false; } } @@ -280,7 +280,7 @@ unsigned collectMBBPressure(MachineBasicBlock &MBB, LiveIntervals *LIS, GCNUpwardRPTracker RPTracker(*LIS); // R.End doesn't point to the boundary instruction. // Skip Debug instr. - if (!llvm::GetNonDebugMBBEnd(BBEnd, MBB)) + if (!llvm::getNonDebugMBBEnd(BBEnd, MBB)) return ST->getOccupancyWithNumVGPRs(0); GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[&MBB]; @@ -327,7 +327,7 @@ unsigned collectFnPressure(MachineFunction &MF, LiveIntervals *LIS, // R.End doesn't point to the boundary instruction. // Skip Debug instr. 
- if (llvm::GetNonDebugMBBEnd(BBEnd, MBB)) { + if (llvm::getNonDebugMBBEnd(BBEnd, MBB)) { auto SI = SlotIndexes->getInstructionIndex(*BBEnd); MBBOutputSlotMap[&MBB] = SI; } @@ -417,7 +417,7 @@ RematStatus getRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, unsigned STgtOcc = ST->getOccupancyWithNumSGPRs(MaxSPressure); unsigned VTgtOcc = ST->getOccupancyWithNumVGPRs(MaxVPressure); - llvm::SchedScore TotalScore = llvm::CollectLatency(MF, *ST, MLI); + llvm::SchedScore TotalScore = llvm::collectLatency(MF, *ST, MLI); bool MemBound = TotalScore.isMemBound(TgtOcc, std::max(STgtOcc, VTgtOcc) - TgtOcc); @@ -702,7 +702,7 @@ int rematGain(MachineInstr *DefMI, unsigned Reg, const MachineRegisterInfo &MRI, } bool IsSingleDef = MRI.hasOneDef(Reg); if (!IsSingleDef) { - IsSingleDef = llvm::IsSub0Sub1SingleDef(Reg, MRI); + IsSingleDef = llvm::isSub0Sub1SingleDef(Reg, MRI); } if (IsSingleDef) { @@ -1066,7 +1066,7 @@ static MachineBasicBlock::iterator adjustInsertPointToAvoidSccSmash( const bool WillSmashScc = InstructionToMove->modifiesRegister(AMDGPU::SCC, SIRI); if (WillSmashScc) { - CurrentInsertPoint = llvm::FindOrCreateInsertionPointForSccDef( + CurrentInsertPoint = llvm::findOrCreateInsertionPointForSccDef( MBB, CurrentInsertPoint, SIRI, SIII, &MRI); } @@ -1081,7 +1081,7 @@ static MachineBasicBlock::iterator adjustInsertPointForSubExpToAvoidSccSmash( const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { const bool WillSmashScc = SubExpToMove.modifiesRegister(AMDGPU::SCC, SIRI); if (WillSmashScc) { - CurrentInsertPoint = llvm::FindOrCreateInsertionPointForSccDef( + CurrentInsertPoint = llvm::findOrCreateInsertionPointForSccDef( MBB, CurrentInsertPoint, SIRI, SIII, &MRI); } @@ -1094,7 +1094,7 @@ static bool willSmashSccAtLocation(MachineInstr *MI, MachineBasicBlock *MBB, // It is ok to pass nullptr to `modifiesRegister` for TRI here since // SCC has no subreg/suprereg relationships. 
return MI->modifiesRegister(AMDGPU::SCC, nullptr) && - llvm::IsSccLiveAt(MBB, Location); + llvm::isSccLiveAt(MBB, Location); } void applyCloneRemat(Remat *Remat, RematNode &Node, @@ -1374,7 +1374,7 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, const GCNRPTracker::LiveRegSet &LiveSet = LISLR; unsigned VPressure = 0; unsigned SPressure = 0; - CollectLiveSetPressure(LiveSet, MRI, SIRI, VPressure, SPressure); + collectLiveSetPressure(LiveSet, MRI, SIRI, VPressure, SPressure); if (MaxVPressure < VPressure) MaxVPressure = VPressure; if (MaxSPressure < SPressure) @@ -1635,7 +1635,7 @@ bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI, if (OpReg.isPhysical()) return false; if (!MRI.getUniqueVRegDef(OpReg) && - !llvm::IsSub0Sub1SingleDef(OpReg, MRI)) { + !llvm::isSub0Sub1SingleDef(OpReg, MRI)) { return false; } } @@ -1794,7 +1794,7 @@ std::vector buildSubExpFromCandidates( continue; MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); - assert((DefMI || llvm::IsSub0Sub1SingleDef(Reg, MRI)) && + assert((DefMI || llvm::isSub0Sub1SingleDef(Reg, MRI)) && "UseMI should be safe to move"); if (DefMI && CandidateDefs.count(DefMI) > 0) continue; @@ -1982,7 +1982,7 @@ std::vector buildSubExpFromCandidatesTopBottom( if (Candidates.count(Reg) == 0 && LocalCandidates.count(Reg) != 0) continue; } - assert((DefMI || llvm::IsSub0Sub1SingleDef(Reg, MRI)) && + assert((DefMI || llvm::isSub0Sub1SingleDef(Reg, MRI)) && "UseMI should be safe to move"); if (DefMI && CandidateDefs.count(DefMI) > 0) continue; @@ -2361,7 +2361,7 @@ void applySubExpCloneNearUser(SubExp &Exp, std::vector &HotBlocks, DenseMap RegMap; auto InsertPtr = MBB->getFirstNonPHI(); // If Exp has scc read/write, make sure MBB not have scc in liveins. - if (IsModifiesScc && llvm::IsSccLiveAt(MBB, InsertPtr)) + if (IsModifiesScc && llvm::isSccLiveAt(MBB, InsertPtr)) continue; MachineFunction *MF = MBB->getParent(); for (auto It = Exp.SUnits.begin(); It != Exp.SUnits.end(); It++) { @@ -2470,7 +2470,7 @@ void applySubExpCloneNearUserInBlock( continue; // Do not overwrite a live scc. - if (IsModifiesScc && llvm::IsSccLiveAt(UserBB, &UseMI)) + if (IsModifiesScc && llvm::isSccLiveAt(UserBB, &UseMI)) continue; UseMIs.emplace_back(&UseMI); @@ -3147,7 +3147,7 @@ bool tryRemat(MachineBasicBlock &MBB, MachineInstr *HotMi, if (SubExp.SOutputSize == 0) continue; } - if (!SubExp.isSafeToMove(MRI, /*IsMoveUp*/ false)) + if (!SubExp.isSafeToMove(MRI)) continue; // Not clone . 
if (SubExp.SUnits.size() > 10) @@ -3410,7 +3410,7 @@ bool canHelpPressureWhenSink(SubExp &SubExp, MachineDominatorTree *DT, bool IsCanClone, bool IsSgprBound) { LLVM_DEBUG(SubExp.dump(MRI, SIRI)); - if (!SubExp.isSafeToMove(MRI, /*IsMoveUp*/ false)) + if (!SubExp.isSafeToMove(MRI)) return false; // Update input size to ignore lives in which already in @@ -3477,7 +3477,7 @@ bool canHelpPressureWhenSink(SubExp &SubExp, bool canHelpPressureWhenHoist(SubExp &SubExp, const MachineRegisterInfo &MRI, const MachineLoopInfo *MLI, bool IsSgprBound) { - if (!SubExp.isSafeToMove(MRI, /*IsMoveUp*/ true)) + if (!SubExp.isSafeToMove(MRI)) return false; if (SubExp.VInputSize < SubExp.VOutputSize) return false; @@ -3722,9 +3722,9 @@ SubExp buildFreeSubExp(SubExp &Exp, FreeExp.OutputLive[Reg]; } - CollectLiveSetPressure(FreeExp.InputLive, MRI, SIRI, FreeExp.VInputSize, + collectLiveSetPressure(FreeExp.InputLive, MRI, SIRI, FreeExp.VInputSize, FreeExp.SInputSize); - CollectLiveSetPressure(FreeExp.OutputLive, MRI, SIRI, FreeExp.VOutputSize, + collectLiveSetPressure(FreeExp.OutputLive, MRI, SIRI, FreeExp.VOutputSize, FreeExp.SOutputSize); return FreeExp; } @@ -3779,7 +3779,7 @@ std::vector buildSubExpCandidates( if (!canHelpPressureWhenSink(Exp, PassThrus, MRI, SIRI, MLI, DT, IsCanClone, IsSgprBound)) { if (AllowPartialUseInSubExp && - Exp.isSafeToMove(MRI, /*IsMoveUp*/ false)) { + Exp.isSafeToMove(MRI)) { SubExp FreeSubExp = buildFreeSubExp(Exp, PassThrus, MRI, SIRI); if (canHelpPressureWhenSink(FreeSubExp, PassThrus, MRI, SIRI, MLI, DT, IsCanClone, IsSgprBound)) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp index 63651ab82fcdb..d207b3aa3d4f3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp @@ -5,7 +5,6 @@ #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/SlotIndexes.h" -// #include "dxc/DXIL/DxilMetadataHelper.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/raw_ostream.h" @@ -30,12 +29,12 @@ class CFGWithPhi { MachineRegisterInfo &MRI = F.getRegInfo(); for (MachineBasicBlock &BB : F) { - auto &phiInsts = blockToPhiInstsMap[&BB]; + auto &PhiInsts = BlockToPhiInstsMap[&BB]; for (MachineInstr &I : BB) { if (!I.isPHI()) break; - phiInsts.insert(&I); - unsigned Reg = I.getOperand(0).getReg(); + PhiInsts.insert(&I); + Register Reg = I.getOperand(0).getReg(); // Add incoming values. for (unsigned i = 1; i < I.getNumOperands(); i += 2) { MachineOperand &MO = I.getOperand(i); @@ -44,11 +43,11 @@ class CFGWithPhi { MachineInstr *DefMI = MRI.getUniqueVRegDef(MO.getReg()); if (!DefMI) continue; - blockToPhiInstsMap[DefMI->getParent()].insert(DefMI); + BlockToPhiInstsMap[DefMI->getParent()].insert(DefMI); } // Add users. 
for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { - blockToPhiInstsMap[UseMI.getParent()].insert(&UseMI); + BlockToPhiInstsMap[UseMI.getParent()].insert(&UseMI); } } } @@ -56,7 +55,7 @@ class CFGWithPhi { void addCustomGraphFeatures(llvm::GraphWriter &) const {} MachineFunction &F; DenseMap> - blockToPhiInstsMap; + BlockToPhiInstsMap; void dump(); }; @@ -64,13 +63,13 @@ void CFGWithPhi::dump() { #ifdef DBG for (MachineBasicBlock &BB : F) { dbgs() << BB.getName() << "\n"; - auto &phiInsts = blockToPhiInstsMap[&BB]; - for (MachineInstr *I : phiInsts) { + auto &PhiInsts = blockToPhiInstsMap[&BB]; + for (MachineInstr *I : PhiInsts) { if (!I->isPHI()) continue; I->dump(); } - for (MachineInstr *I : phiInsts) { + for (MachineInstr *I : PhiInsts) { if (I->isPHI()) continue; I->dump(); @@ -86,14 +85,14 @@ namespace llvm { template <> struct DOTGraphTraits : public DefaultDOTGraphTraits { - DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} + DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} - static std::string getGraphName(const CFGWithPhi *G) { + static std::string getGraphName(const CFGWithPhi *) { return "CFG with Phi graph"; } static std::string getNodeIdentifierLabel(const MachineBasicBlock *Node, - const CFGWithPhi *Graph) { + const CFGWithPhi *) { std::string R; raw_string_ostream OS(R); OS << static_cast(Node); @@ -107,17 +106,17 @@ template <> struct DOTGraphTraits : public DefaultDOTGraphTraits { raw_string_ostream OS(Str); OS << "BB:" << BB->getName(); - auto it = G->blockToPhiInstsMap.find(BB); - if (it != G->blockToPhiInstsMap.end()) { + auto It = G->BlockToPhiInstsMap.find(BB); + if (It != G->BlockToPhiInstsMap.end()) { - auto &phiInsts = it->second; - for (MachineInstr *I : phiInsts) { + auto &PhiInsts = It->second; + for (MachineInstr *I : PhiInsts) { if (!I->isPHI()) continue; I->print(OS); OS << "\n"; } - for (MachineInstr *I : phiInsts) { + for (MachineInstr *I : PhiInsts) { if (I->isPHI()) continue; I->print(OS); @@ -157,7 +156,7 @@ template <> struct DOTGraphTraits : public DefaultDOTGraphTraits { return OutStr; } static std::string getNodeDescription(const MachineBasicBlock *SU, - const CFGWithPhi *G) { + const CFGWithPhi *) { return SU->getName().str(); } @@ -200,25 +199,24 @@ unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask, const llvm::SIRegisterInfo *SIRI) { unsigned Size = SIRI->getRegSizeInBits(*MRI.getRegClass(Reg)); Size >>= 5; - LaneBitmask mask = Mask; - if (mask.any()) { - if (unsigned maskSize = mask.getNumLanes()) { - if (maskSize < Size) - Size = maskSize; + if (Mask.any()) { + if (unsigned MaskSize = Mask.getNumLanes()) { + if (MaskSize < Size) + Size = MaskSize; } } return Size; } -void CollectLiveSetPressure(const LiveSet &liveSet, +void collectLiveSetPressure(const LiveSet &LiveSet, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, unsigned &VPressure, unsigned &SPressure) { VPressure = 0; SPressure = 0; - for (auto liveIt : liveSet) { - unsigned Reg = liveIt.first; - unsigned Size = getRegSize(Reg, liveIt.second, MRI, SIRI); + for (auto LiveIt : LiveSet) { + unsigned Reg = LiveIt.first; + unsigned Size = getRegSize(Reg, LiveIt.second, MRI, SIRI); if (SIRI->isVGPR(MRI, Reg)) { VPressure += Size; } else { @@ -228,58 +226,58 @@ void CollectLiveSetPressure(const LiveSet &liveSet, } bool isExecUpdateForControlFlow(llvm::MachineInstr &MI) { - bool isExecUpdate = false; - unsigned opcode = MI.getOpcode(); - if (opcode == AMDGPU::S_MOV_B64 || opcode == AMDGPU::S_MOV_B32 || - opcode == 
AMDGPU::S_OR_B64_term || opcode == AMDGPU::S_OR_B32_term || - opcode == AMDGPU::S_OR_SAVEEXEC_B64 || - opcode == AMDGPU::S_OR_SAVEEXEC_B32 || opcode == AMDGPU::S_AND_B64 || - opcode == AMDGPU::S_AND_B32 || opcode == AMDGPU::S_ANDN2_B64 || - opcode == AMDGPU::S_ANDN2_B32) { + bool IsExecUpdate = false; + unsigned Opcode = MI.getOpcode(); + if (Opcode == AMDGPU::S_MOV_B64 || Opcode == AMDGPU::S_MOV_B32 || + Opcode == AMDGPU::S_OR_B64_term || Opcode == AMDGPU::S_OR_B32_term || + Opcode == AMDGPU::S_OR_SAVEEXEC_B64 || + Opcode == AMDGPU::S_OR_SAVEEXEC_B32 || Opcode == AMDGPU::S_AND_B64 || + Opcode == AMDGPU::S_AND_B32 || Opcode == AMDGPU::S_ANDN2_B64 || + Opcode == AMDGPU::S_ANDN2_B32) { MachineOperand &Dst = MI.getOperand(0); if (Dst.getReg() == AMDGPU::EXEC || Dst.getReg() == AMDGPU::EXEC_LO) { - isExecUpdate = true; + IsExecUpdate = true; } } - return isExecUpdate; + return IsExecUpdate; } -bool IsSub0Sub1SingleDef(unsigned Reg, const MachineRegisterInfo &MRI) { +bool isSub0Sub1SingleDef(unsigned Reg, const MachineRegisterInfo &MRI) { // Support multi def for pattern of pointer: // undef %808.sub0:sgpr_64 = COPY killed %795:sgpr_32 // %808.sub1:sgpr_64 = S_MOV_B32 0 - bool bHasSub0 = false; - bool bHasSub1 = false; + bool HasSub0 = false; + bool HasSub1 = false; for (MachineOperand &UserDefMO : MRI.def_operands(Reg)) { if (unsigned SubReg = UserDefMO.getSubReg()) { - bool bSingleSubReg = false; + bool IsSingleSubReg = false; switch (SubReg) { default: break; case AMDGPU::sub0: - if (!bHasSub0) { - bHasSub0 = true; - bSingleSubReg = true; + if (!HasSub0) { + HasSub0 = true; + IsSingleSubReg = true; } break; case AMDGPU::sub1: - if (!bHasSub1) { - bHasSub1 = true; - bSingleSubReg = true; + if (!HasSub1) { + HasSub1 = true; + IsSingleSubReg = true; } break; } - if (!bSingleSubReg) { - bHasSub0 = false; + if (!IsSingleSubReg) { + HasSub0 = false; break; } } else { - bHasSub0 = false; + HasSub0 = false; break; } } - return (bHasSub0 && bHasSub1); + return (HasSub0 && HasSub1); } LaneBitmask getRegMask(const MachineOperand &MO, @@ -293,46 +291,46 @@ LaneBitmask getRegMask(const MachineOperand &MO, MO.getSubReg()); } -void mergeLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet) { - for (auto Reg : inputSet) { - unsigned reg = Reg.first; - LaneBitmask mask = Reg.second; - auto targetReg = targetSet.find(reg); - if (targetReg != targetSet.end()) { - LaneBitmask targetMask = targetReg->second; - mask |= targetMask; +void mergeLiveRegSet(LiveSet &TargetSet, const LiveSet &InputSet) { + for (auto It : InputSet) { + Register Reg = It.first; + LaneBitmask Mask = It.second; + auto TargetReg = TargetSet.find(Reg); + if (TargetReg != TargetSet.end()) { + LaneBitmask TargetMask = TargetReg->second; + Mask |= TargetMask; } - targetSet[reg] = mask; + TargetSet[Reg] = Mask; } } -void andLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet) { +void andLiveRegSet(LiveSet &TargetSet, const LiveSet &InputSet) { GCNRPTracker::LiveRegSet AndSet; - for (auto Reg : inputSet) { - unsigned reg = Reg.first; - LaneBitmask mask = Reg.second; - auto targetReg = targetSet.find(reg); - if (targetReg != targetSet.end()) { - LaneBitmask targetMask = targetReg->second; - mask &= targetMask; - AndSet[reg] = mask; + for (auto It : InputSet) { + Register Reg = It.first; + LaneBitmask Mask = It.second; + auto TargetReg = TargetSet.find(Reg); + if (TargetReg != TargetSet.end()) { + LaneBitmask TargetMask = TargetReg->second; + Mask &= TargetMask; + AndSet[Reg] = Mask; } } - targetSet = AndSet; + TargetSet = AndSet; } -void 
andNotLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet) { - for (auto Reg : inputSet) { - unsigned reg = Reg.first; - LaneBitmask mask = Reg.second; - auto targetReg = targetSet.find(reg); - if (targetReg != targetSet.end()) { - LaneBitmask targetMask = targetReg->second; - if ((targetMask | mask) == mask) - targetSet.erase(reg); +void andNotLiveRegSet(LiveSet &TargetSet, const LiveSet &InputSet) { + for (auto It : InputSet) { + unsigned Reg = It.first; + LaneBitmask Mask = It.second; + auto TargetReg = TargetSet.find(Reg); + if (TargetReg != TargetSet.end()) { + LaneBitmask TargetMask = TargetReg->second; + if ((TargetMask | Mask) == Mask) + TargetSet.erase(Reg); else - targetSet[reg] = targetMask & (~mask); + TargetSet[Reg] = TargetMask & (~Mask); } } } @@ -356,56 +354,55 @@ MachineBasicBlock *split(MachineInstr *Inst) { struct Piece { unsigned Reg; - unsigned offset; - unsigned size; - static SmallVector split(std::bitset<32> mask) { + unsigned Offset; + unsigned Size; + static SmallVector split(std::bitset<32> Mask) { - SmallVector pieces; - Piece piece = {0, 0, 0}; + SmallVector Pieces; + Piece Piece = {0, 0, 0}; for (unsigned i = 0; i < 32; i++) { - if (mask.test(i)) { - if (piece.size == 0) - piece.offset = i; + if (Mask.test(i)) { + if (Piece.Size == 0) + Piece.Offset = i; - piece.size++; + Piece.Size++; // Make sure no piece bigger than 8. - if (piece.size == 8) { - pieces.emplace_back(piece); - piece.size = 0; + if (Piece.Size == 8) { + Pieces.emplace_back(Piece); + Piece.Size = 0; } } else { - if (piece.size == 0) { + if (Piece.Size == 0) { continue; } - pieces.emplace_back(piece); - piece.size = 0; + Pieces.emplace_back(Piece); + Piece.Size = 0; } } - return pieces; + return Pieces; } }; -void updateSubReg(MachineOperand &UseMO, const llvm::TargetRegisterClass *NewRC, - unsigned offset, const SIRegisterInfo *SIRI, - const SIInstrInfo *SIII) { - unsigned size = NewRC->getLaneMask().getNumLanes(); - if (size == 1) { +static void updateSubReg(MachineOperand &UseMO, const llvm::TargetRegisterClass *NewRC, + unsigned Offset, const SIRegisterInfo *SIRI) { + unsigned Size = NewRC->getLaneMask().getNumLanes(); + if (Size == 1) { UseMO.setSubReg(0); } else { const uint32_t SubReg = UseMO.getSubReg(); - LaneBitmask Mask = SIRI->getSubRegIndexLaneMask(SubReg); + LaneBitmask LaneMask = SIRI->getSubRegIndexLaneMask(SubReg); - unsigned mask = Mask.getAsInteger() >> offset; + unsigned Mask = LaneMask.getAsInteger() >> Offset; unsigned NewSubReg = SIRI->getMinimalSpanningSubRegIdxSetForLaneMask( - NewRC, LaneBitmask(mask)) + NewRC, LaneBitmask(Mask)) .front(); UseMO.setSubReg(NewSubReg); } } -bool reduceChannel(unsigned offset, MachineInstr &MI, const MCInstrDesc &desc, +bool reduceChannel(unsigned Offset, MachineInstr &MI, const MCInstrDesc &Desc, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, SlotIndexes *SlotIndexes) { MachineOperand &DstMO = MI.getOperand(0); @@ -413,7 +410,7 @@ bool reduceChannel(unsigned offset, MachineInstr &MI, const MCInstrDesc &desc, if (DstMO.getSubReg()) { return false; } - unsigned Reg = DstMO.getReg(); + Register Reg = DstMO.getReg(); SmallVector UseMOs; for (MachineOperand &UseMO : MRI.use_nodbg_operands(Reg)) { @@ -421,9 +418,9 @@ bool reduceChannel(unsigned offset, MachineInstr &MI, const MCInstrDesc &desc, } const llvm::TargetRegisterClass *NewRC = - SIRI->getRegClass(desc.operands().front().RegClass); - unsigned size = NewRC->getLaneMask().getNumLanes(); - if (offset > 0) { + 
SIRI->getRegClass(Desc.operands().front().RegClass); + unsigned Size = NewRC->getLaneMask().getNumLanes(); + if (Offset > 0) { // Update offset operand in MI. MachineOperand *OffsetOp = SIII->getNamedOperand(MI, AMDGPU::OpName::offset); @@ -433,7 +430,7 @@ bool reduceChannel(unsigned offset, MachineInstr &MI, const MCInstrDesc &desc, if (OffsetOp->isImm()) { assert(OffsetOp != nullptr); int64_t Offset = OffsetOp->getImm(); - Offset += offset * LaneSize; + Offset += Offset * LaneSize; if (!SIII->isLegalMUBUFImmOffset(Offset)) { return false; } @@ -444,13 +441,13 @@ bool reduceChannel(unsigned offset, MachineInstr &MI, const MCInstrDesc &desc, } else { OffsetOp = SIII->getNamedOperand(MI, AMDGPU::OpName::soffset); if (OffsetOp) { - unsigned NewOffsetReg = + Register NewOffsetReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); auto OffsetAdd = BuildMI(*MI.getParent()->getParent(), MI.getDebugLoc(), SIII->get(AMDGPU::S_ADD_U32)) .addDef(NewOffsetReg) .add(*OffsetOp) - .addImm(offset * LaneSize); + .addImm(Offset * LaneSize); MachineInstr *OffsetAddMI = OffsetAdd.getInstr(); MachineBasicBlock::iterator InsertPoint = llvm::findOrCreateInsertionPointForSccDef(MI.getParent(), MI, SIRI, @@ -467,16 +464,16 @@ bool reduceChannel(unsigned offset, MachineInstr &MI, const MCInstrDesc &desc, } // Update subReg for users. for (MachineOperand *UseMO : UseMOs) { - updateSubReg(*UseMO, NewRC, offset, SIRI, SIII); + updateSubReg(*UseMO, NewRC, Offset, SIRI); } - } else if (size == 1) { + } else if (Size == 1) { // Clear subReg when size is 1. for (MachineOperand *UseMO : UseMOs) { UseMO->setSubReg(0); } } - MI.setDesc(desc); + MI.setDesc(Desc); // Mutate reg class of Reg. MRI.setRegClass(Reg, NewRC); return true; @@ -485,7 +482,7 @@ bool reduceChannel(unsigned offset, MachineInstr &MI, const MCInstrDesc &desc, bool removeUnusedLanes(llvm::MachineInstr &MI, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, SlotIndexes *SlotIndexes) { - bool bImm = false; + bool IsImm = false; switch (MI.getOpcode()) { default: break; @@ -493,67 +490,70 @@ bool removeUnusedLanes(llvm::MachineInstr &MI, MachineRegisterInfo &MRI, case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX16_IMM: - bImm = true; + IsImm = true; + LLVM_FALLTHROUGH; case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR: case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR: case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR: case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: { - unsigned Reg = MI.getOperand(0).getReg(); + Register Reg = MI.getOperand(0).getReg(); if (!MRI.getUniqueVRegDef(Reg)) return false; - LaneBitmask dstMask = getRegMask(MI.getOperand(0), MRI); + LaneBitmask DstMask = getRegMask(MI.getOperand(0), MRI); LaneBitmask UseMask; for (MachineOperand &MO : MRI.use_operands(Reg)) { UseMask |= llvm::getRegMask(MO, MRI); } - const unsigned fullMask = dstMask.getAsInteger(); - unsigned mask = UseMask.getAsInteger(); - if (mask == fullMask) + const unsigned FullMask = DstMask.getAsInteger(); + unsigned Mask = UseMask.getAsInteger(); + if (Mask == FullMask) return false; // Split mask when there's gap. Then group mask to 2/4/8. - auto pieces = Piece::split(std::bitset<32>(mask)); + auto Pieces = Piece::split(std::bitset<32>(Mask)); // Now only support 1 piece. - if (pieces.size() != 1) + if (Pieces.size() != 1) return false; - auto piece = pieces[0]; - if (piece.size > 8) + auto Piece = Pieces[0]; + if (Piece.Size > 8) return false; - // TODO: enable offset support when bImm is true. 
+ // TODO: enable offset support when IsImm is true. // Now if break different test when mul LaneSize or not mul for the offset. - if (bImm && piece.offset != 0) + if (IsImm && Piece.Offset != 0) return false; - switch (piece.size) { + switch (Piece.Size) { default: return false; case 1: - return reduceChannel(piece.offset, MI, - SIII->get(bImm ? AMDGPU::S_BUFFER_LOAD_DWORD_IMM + return reduceChannel(Piece.Offset, MI, + SIII->get(IsImm ? AMDGPU::S_BUFFER_LOAD_DWORD_IMM : AMDGPU::S_BUFFER_LOAD_DWORD_SGPR), MRI, SIRI, SIII, SlotIndexes); case 2: - return reduceChannel(piece.offset, MI, - SIII->get(bImm ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM + return reduceChannel(Piece.Offset, MI, + SIII->get(IsImm ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR), MRI, SIRI, SIII, SlotIndexes); case 3: - if (fullMask == 0xf) + if (FullMask == 0xf) return false; + LLVM_FALLTHROUGH; case 4: - return reduceChannel(piece.offset, MI, - SIII->get(bImm ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM + return reduceChannel(Piece.Offset, MI, + SIII->get(IsImm ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR), MRI, SIRI, SIII, SlotIndexes); case 5: case 6: case 7: - if (fullMask == 0xff) + if (FullMask == 0xff) return false; + LLVM_FALLTHROUGH; case 8: - return reduceChannel(piece.offset, MI, - SIII->get(bImm ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM + return reduceChannel(Piece.Offset, MI, + SIII->get(IsImm ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR), MRI, SIRI, SIII, SlotIndexes); } @@ -610,15 +610,15 @@ bool reach_block(MachineBasicBlock *FromBB, MachineDominatorTree *DT, // If BB can reach hotMBBs. bool reach_blocks(MachineBasicBlock *BB, MachineDominatorTree *DT, MachinePostDominatorTree *PDT, MachineLoopInfo *LI, - DenseSet &hotMBBs) { - bool bCross = false; - for (MachineBasicBlock *hotBB : hotMBBs) { - if (reach_block(BB, DT, PDT, LI, hotBB)) { - bCross = true; + DenseSet &HotMBBs) { + bool Cross = false; + for (MachineBasicBlock *HotBB : HotMBBs) { + if (reach_block(BB, DT, PDT, LI, HotBB)) { + Cross = true; break; } } - return bCross; + return Cross; } } // namespace llvm @@ -634,7 +634,7 @@ void viewCFGWithPhi(llvm::MachineFunction &F) { } // namespace llvm namespace llvm { -bool GetNonDebugMBBEnd(MachineBasicBlock::reverse_iterator &BBEnd, +bool getNonDebugMBBEnd(MachineBasicBlock::reverse_iterator &BBEnd, MachineBasicBlock &MBB) { // R.End doesn't point to the boundary instruction. // Skip Debug instr. @@ -951,13 +951,13 @@ void write_define(MachineOperand &MO, const SlotIndexes *SlotIndexes, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, raw_ostream &os) { // Split subReg? MO.getSubReg(); - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); unsigned SubReg = MO.getSubReg(); MachineInstr *MI = MO.getParent(); SlotIndex Slot = SlotIndexes->getInstructionIndex(*MI); if (SubReg == 0) { - unsigned size = get_reg_size(Reg, MRI, SIRI); - for (unsigned i = 0; i < size; i++) { + unsigned Size = get_reg_size(Reg, MRI, SIRI); + for (unsigned i = 0; i < Size; i++) { write_define(Slot, Reg, i, MRI, SIRI, os); } } else { @@ -1744,13 +1744,13 @@ void write_contribution_list(llvm::MachineFunction &MF, const char *Filename) { } } // namespace llvm -static bool IsPhysReg(const MachineOperand &Op) { +static bool isPhysReg(const MachineOperand &Op) { return Op.isReg() && Op.getReg().isPhysical(); } // Sometimes split bb uses physical registers defined in BB, have to add them to // live-in or the ir is malformed. 
-void llvm::UpdatePhysRegLiveInForBlock(MachineBasicBlock *NewBB, +void llvm::updatePhysRegLiveInForBlock(MachineBasicBlock *NewBB, const MachineRegisterInfo *MRI) { // Initialize with current set of liveins. For new blocks this will be empty. SmallDenseSet DefSet; @@ -1762,11 +1762,11 @@ void llvm::UpdatePhysRegLiveInForBlock(MachineBasicBlock *NewBB, // Add all undefined physical registers to the live in set. for (MachineOperand &Use : MI.operands()) { // Only process physreg uses. - if (!IsPhysReg(Use) || !Use.isUse()) + if (!isPhysReg(Use) || !Use.isUse()) continue; // Reserved regs do not need to be tracked through live-in sets. - unsigned Reg = Use.getReg(); + Register Reg = Use.getReg(); if (Use.isImplicit() && MRI && MRI->isReserved(Reg)) continue; @@ -1778,14 +1778,14 @@ void llvm::UpdatePhysRegLiveInForBlock(MachineBasicBlock *NewBB, // set. for (MachineOperand &Def : MI.operands()) { // Only process physreg defs. - if (!IsPhysReg(Def) || !Def.isDef()) + if (!isPhysReg(Def) || !Def.isDef()) continue; DefSet.insert(Def.getReg()); } } } -void llvm::BuildPhysRegLiveInForBlock(MachineBasicBlock *NewBB, +void llvm::buildPhysRegLiveInForBlock(MachineBasicBlock *NewBB, SmallDenseSet &LiveOutSet, const MachineRegisterInfo *MRI) { for (auto rit = NewBB->rbegin(); rit != NewBB->rend(); rit++) { @@ -1794,14 +1794,14 @@ void llvm::BuildPhysRegLiveInForBlock(MachineBasicBlock *NewBB, // set. for (MachineOperand &Def : MI.operands()) { // Only process physreg defs. - if (!IsPhysReg(Def) || !Def.isDef()) + if (!isPhysReg(Def) || !Def.isDef()) continue; LiveOutSet.erase(Def.getReg()); } // Add all undefined physical registers to the live in set. for (MachineOperand &Use : MI.operands()) { // Only process physreg uses. - if (!IsPhysReg(Use) || !Use.isUse()) + if (!isPhysReg(Use) || !Use.isUse()) continue; // Reserved regs do not need to be tracked through live-in sets. @@ -1818,7 +1818,7 @@ void llvm::BuildPhysRegLiveInForBlock(MachineBasicBlock *NewBB, } } -MachineReg llvm::CreateVirtualRegForOperand(MachineOpcode Opcode, +MachineReg llvm::createVirtualRegForOperand(MachineOpcode Opcode, unsigned OpNum, MachineFunction &MF) { const TargetSubtargetInfo &ST = MF.getSubtarget(); @@ -1835,14 +1835,14 @@ MachineReg llvm::CreateVirtualRegForOperand(MachineOpcode Opcode, return MRI.createVirtualRegister(RC); } -MachineReg llvm::CreateVirtualDstReg(MachineOpcode Opcode, +MachineReg llvm::createVirtualDstReg(MachineOpcode Opcode, MachineFunction &MF) { - return llvm::CreateVirtualRegForOperand(Opcode, 0, MF); + return llvm::createVirtualRegForOperand(Opcode, 0, MF); } // Return true if the MI is a copy of exec. // If true then sets pDst to the destination register. 
-bool llvm::IsExecCopy(const MachineInstr &MI, MachineReg Exec, +bool llvm::isExecCopy(const MachineInstr &MI, MachineReg Exec, MachineReg *pDst) { enum { DST = 0, SRC = 1 }; bool FoundCopy = false; @@ -1868,10 +1868,10 @@ bool llvm::IsExecCopy(const MachineInstr &MI, MachineReg Exec, return FoundCopy; } -llvm::MachineRegWithSubReg llvm::GetWqmEntryActiveMask(MachineFunction &MF) { +llvm::MachineRegWithSubReg llvm::getWqmEntryActiveMask(MachineFunction &MF) { llvm::MachineRegWithSubReg LiveLaneMask = {AMDGPU::NoRegister, AMDGPU::NoSubRegister}; - if (MachineInstr *MI = GetWqmEntryActiveMaskInst(MF)) { + if (MachineInstr *MI = getWqmEntryActiveMaskInst(MF)) { LiveLaneMask.Reg = MI->getOperand(0).getReg(); LiveLaneMask.SubReg = MI->getOperand(0).getSubReg(); } @@ -1879,7 +1879,7 @@ llvm::MachineRegWithSubReg llvm::GetWqmEntryActiveMask(MachineFunction &MF) { return LiveLaneMask; } -MachineInstr *llvm::GetWqmEntryActiveMaskInst(MachineFunction &MF) { +MachineInstr *llvm::getWqmEntryActiveMaskInst(MachineFunction &MF) { #if 0 // TODO: Get rid of this // Look forward in the entry block for the SET_LIVE_LANE_MASK instruction. // This instruction is added by the SIWholeQuadMode pass. @@ -1897,7 +1897,7 @@ MachineInstr *llvm::GetWqmEntryActiveMaskInst(MachineFunction &MF) { return nullptr; } -bool llvm::IsFetchShaderCall(const MachineInstr *MI) { +bool llvm::isFetchShaderCall(const MachineInstr *MI) { #if 0 // TODO: Get rid of this. return MI->getOpcode() == AMDGPU::AMDGPU_CALL_FETCH_SHADER || @@ -1907,12 +1907,12 @@ bool llvm::IsFetchShaderCall(const MachineInstr *MI) { #endif } -bool llvm::IsSccLiveAt(llvm::MachineBasicBlock *MBB, +bool llvm::isSccLiveAt(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator MI) { const TargetRegisterInfo *TRI = MBB->getParent()->getRegInfo().getTargetRegisterInfo(); - for (auto it = MI; it != MBB->end(); ++it) { - const MachineInstr &CurMI = *it; + for (auto It = MI; It != MBB->end(); ++It) { + const MachineInstr &CurMI = *It; // Hit use of scc, it is live. if (CurMI.readsRegister(AMDGPU::SCC, TRI)) return true; @@ -1939,12 +1939,12 @@ bool llvm::IsSccLiveAt(llvm::MachineBasicBlock *MBB, // scc around BeforeInst. This way BeforeInst can safely be used // as the new insert location. // -MachineBasicBlock::iterator llvm::FindOrCreateInsertionPointForSccDef( +MachineBasicBlock::iterator llvm::findOrCreateInsertionPointForSccDef( MachineBasicBlock *MBB, MachineBasicBlock::iterator MI, const TargetRegisterInfo *TRI, const SIInstrInfo *TII, MachineRegisterInfo *MRI, SccDefInsertPointConstraintFlags Constraints) { // If SCC is dead at MI when we can use MI as the insert point. 
- if (!llvm::IsSccLiveAt(MBB, MI)) { + if (!llvm::isSccLiveAt(MBB, MI)) { return MI; } @@ -1990,7 +1990,7 @@ MachineBasicBlock::iterator llvm::FindOrCreateInsertionPointForSccDef( // MI // S_CMP_LG_U32 %SavedSCC, 0 # Restore SCC // - unsigned int TmpScc = + Register TmpScc = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); DebugLoc DL = MI->getDebugLoc(); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), TmpScc) @@ -2006,39 +2006,39 @@ MachineBasicBlock::iterator llvm::FindOrCreateInsertionPointForSccDef( namespace { bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes, - SmallDenseSet &touchedMBBSet) { - MachineInstr *startMI = Indexes->getInstructionFromIndex(Seg->start); - MachineInstr *endMI = Indexes->getInstructionFromIndex(Seg->end); + SmallDenseSet &TouchedMBBSet) { + MachineInstr *StartMI = Indexes->getInstructionFromIndex(Seg->start); + MachineInstr *EndMI = Indexes->getInstructionFromIndex(Seg->end); // Treat non inst as not local. - if (!startMI || !endMI) + if (!StartMI || !EndMI) return false; // is local when parent MBB the same. - bool bSameMBB = startMI->getParent() == endMI->getParent(); - if (!bSameMBB) + bool IsSameMBB = StartMI->getParent() == EndMI->getParent(); + if (!IsSameMBB) return false; // Collect touched MBB. - MachineBasicBlock *MBB = startMI->getParent(); - touchedMBBSet.insert(MBB); + MachineBasicBlock *MBB = StartMI->getParent(); + TouchedMBBSet.insert(MBB); return true; } bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes, - SmallDenseSet &touchedMBBSet) { + SmallDenseSet &TouchedMBBSet) { for (const LiveRange::Segment &Seg : Range->segments) { - if (!isLocalSegment(&Seg, Indexes, touchedMBBSet)) + if (!isLocalSegment(&Seg, Indexes, TouchedMBBSet)) return false; } return true; } bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes) { - MachineInstr *startMI = Indexes->getInstructionFromIndex(Seg->start); - MachineInstr *endMI = Indexes->getInstructionFromIndex(Seg->end); + MachineInstr *StartMI = Indexes->getInstructionFromIndex(Seg->start); + MachineInstr *EndMI = Indexes->getInstructionFromIndex(Seg->end); // Treat non inst as not local. - if (!startMI || !endMI) + if (!StartMI || !EndMI) return false; // is local when parent MBB the same. - return startMI->getParent() == endMI->getParent(); + return StartMI->getParent() == EndMI->getParent(); } bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes) { @@ -2053,19 +2053,19 @@ bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes) { // In case like float4 v, v.x used and defined in one block, v.y used and define // in another block, one live interval could touch more than one MBB. -// touchedMBBSet is used for scheduling where local live interval could cross +// TouchedMBBSet is used for scheduling where local live interval could cross // multiple regions, need to calculate livereg for each region inside touched // MBB. 
bool llvm::isLocalLiveInterval( const LiveInterval &LI, SlotIndexes *Indexes, - SmallDenseSet &touchedMBBSet) { + SmallDenseSet &TouchedMBBSet) { if (LI.hasSubRanges()) { for (const auto &S : LI.subranges()) { - if (!isLocalLiveRange(&S, Indexes, touchedMBBSet)) + if (!isLocalLiveRange(&S, Indexes, TouchedMBBSet)) return false; } } - return isLocalLiveRange(&LI, Indexes, touchedMBBSet); + return isLocalLiveRange(&LI, Indexes, TouchedMBBSet); } bool llvm::isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes) { @@ -2096,7 +2096,7 @@ void llvm::buildEndLiveMap( // R.End doesn't point to the boundary instruction. // Skip Debug instr. - if (llvm::GetNonDebugMBBEnd(BBEnd, MBB)) { + if (llvm::getNonDebugMBBEnd(BBEnd, MBB)) { auto SI = SlotIndexes->getInstructionIndex(*BBEnd); MBBOutputSlotMap[&MBB] = After ? SI.getDeadSlot() : SI.getBaseIndex(); } @@ -2107,16 +2107,15 @@ void llvm::buildEndLiveMap( if (!LIS->hasInterval(Reg)) continue; - LaneBitmask LiveMask; const auto &LI = LIS->getInterval(Reg); // Skip local live interval to make live input/ouput faster. if (llvm::isLocalLiveInterval(LI, SlotIndexes)) continue; - for (auto outputIt : MBBOutputSlotMap) { - MachineBasicBlock *MBB = outputIt.first; - auto SI = outputIt.second; + for (auto OutputIt : MBBOutputSlotMap) { + MachineBasicBlock *MBB = OutputIt.first; + auto SI = OutputIt.second; auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI); if (LiveMask.any()) @@ -2125,7 +2124,7 @@ void llvm::buildEndLiveMap( } } -unsigned llvm::GetCurrentVGPRCount(llvm::MachineFunction &MF, +unsigned llvm::getCurrentVGPRCount(llvm::MachineFunction &MF, const SIRegisterInfo *SIRI) { auto &MRI = MF.getRegInfo(); for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) { @@ -2136,10 +2135,10 @@ unsigned llvm::GetCurrentVGPRCount(llvm::MachineFunction &MF, return 0; } -unsigned llvm::GetCurrentSGPRCount(llvm::MachineFunction &MF, +unsigned llvm::getCurrentSGPRCount(llvm::MachineFunction &MF, const SIRegisterInfo *SIRI) { const SIMachineFunctionInfo *MFI = MF.getInfo(); - unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); + Register ScratchRSrcReg = MFI->getScratchRSrcReg(); MachineRegisterInfo &MRI = MF.getRegInfo(); unsigned MaxSGPR = 0; for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { @@ -2160,11 +2159,11 @@ unsigned llvm::GetCurrentSGPRCount(llvm::MachineFunction &MF, void llvm::dumpLiveSet(const LiveSet &LiveSet, const SIRegisterInfo *SIRI) { dbgs() << "\n live set: \n"; - for (auto it : LiveSet) { - int Reg = it.first; + for (auto It : LiveSet) { + int Reg = It.first; dbgs() << printReg(Reg, SIRI); - if (it.second.any()) { - dbgs() << " mask:" << it.second.getAsInteger(); + if (It.second.any()) { + dbgs() << " mask:" << It.second.getAsInteger(); } dbgs() << "\n"; } @@ -2197,7 +2196,7 @@ bool llvm::IsLdsSpillSupportedForHwStage(xmd::HwStage Stage) #endif MachineBasicBlock::succ_iterator -llvm::FindSuccessor(llvm::MachineBasicBlock *MBB, +llvm::findSuccessor(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock *Succ) { for (MachineBasicBlock::succ_iterator It = MBB->succ_begin(), End = MBB->succ_end(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp index fec8ac9546a4a..94d78fb676f9a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp @@ -31,13 +31,13 @@ void SubExp::dump(const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) const { dbgs() << "\nSubExp:\n"; dbgs() << "input regs:\n"; - for (auto &input 
: InputLive) { - pressure::print_reg(input.first, MRI, SIRI, llvm::dbgs()); + for (auto &Input : InputLive) { + pressure::print_reg(Input.first, MRI, SIRI, llvm::dbgs()); dbgs() << "\n"; } dbgs() << "output regs:\n"; - for (auto &output : OutputLive) { - pressure::print_reg(output.first, MRI, SIRI, llvm::dbgs()); + for (auto &Output : OutputLive) { + pressure::print_reg(Output.first, MRI, SIRI, llvm::dbgs()); dbgs() << "\n"; } @@ -58,8 +58,7 @@ bool SubExp::modifiesRegister(unsigned Reg, const SIRegisterInfo *SIRI) const { return false; } -void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI) { +void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI) { SMaxSize = std::max(SInputSize, SOutputSize); VMaxSize = std::max(VInputSize, VOutputSize); @@ -76,23 +75,23 @@ void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI, Register Reg = MO.getReg(); if (!Reg.isVirtual()) continue; - LaneBitmask mask = getRegMask(MO, MRI); - auto it = LiveRegs.find(Reg); - if (it != LiveRegs.end()) { - LiveRegs[Reg] = mask | it->second; + LaneBitmask Mask = getRegMask(MO, MRI); + auto It = LiveRegs.find(Reg); + if (It != LiveRegs.end()) { + LiveRegs[Reg] = Mask | It->second; } else { - LiveRegs[Reg] = mask; + LiveRegs[Reg] = Mask; } } } - for (auto it : LiveRegs) { - LaneBitmask emptyMask; - CurPressure.inc(it.first, emptyMask, it.second, MRI); + for (auto It : LiveRegs) { + LaneBitmask EmptyMask; + CurPressure.inc(It.first, EmptyMask, It.second, MRI); } - for (auto it = SUnits.rbegin(); it != SUnits.rend(); it++) { - MachineInstr *MI = *it; + for (auto It = SUnits.rbegin(); It != SUnits.rend(); It++) { + MachineInstr *MI = *It; auto *ST = &MI->getMF() ->getSubtarget(); // TODO: Better way to get this. @@ -108,9 +107,9 @@ void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI, LaneBitmask LiveMask = getRegMask(MO, MRI); LaneBitmask PrevMask; - auto liveIt = LiveRegs.find(Reg); - if (liveIt != LiveRegs.end()) { - PrevMask = liveIt->second; + auto LiveIt = LiveRegs.find(Reg); + if (LiveIt != LiveRegs.end()) { + PrevMask = LiveIt->second; } if (MO.isDef()) { @@ -123,16 +122,16 @@ void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI, LiveRegs[Reg] = LiveMask; } - unsigned sSize = CurPressure.getSGPRNum(); - unsigned vSize = CurPressure.getVGPRNum(ST->hasGFX90AInsts()); - if (sSize > SMaxSize) - SMaxSize = sSize; - if (vSize > VMaxSize) - VMaxSize = vSize; + unsigned SSize = CurPressure.getSGPRNum(); + unsigned VSize = CurPressure.getVGPRNum(ST->hasGFX90AInsts()); + if (SSize > SMaxSize) + SMaxSize = SSize; + if (VSize > VMaxSize) + VMaxSize = VSize; } } -bool SubExp::isSafeToMove(const MachineRegisterInfo &MRI, bool IsMoveUp) const { +bool SubExp::isSafeToMove(const MachineRegisterInfo &MRI) const { if (IsMultiDefOutput) return false; if (IsHasTerminatorInst) @@ -142,7 +141,7 @@ bool SubExp::isSafeToMove(const MachineRegisterInfo &MRI, bool IsMoveUp) const { // Input should be single def. 
for (unsigned Reg : TopRegs) { - if (!MRI.hasOneDef(Reg) && !llvm::IsSub0Sub1SingleDef(Reg, MRI)) + if (!MRI.hasOneDef(Reg) && !llvm::isSub0Sub1SingleDef(Reg, MRI)) return false; } return true; @@ -154,11 +153,11 @@ ExpDag::ExpDag(const llvm::MachineRegisterInfo &MRI, : MRI(MRI), SIRI(SIRI), SIII(SIII), IsJoinInputToSubExp(IsJoinInput) {} template -void ExpDag::initNodes(const LiveSet &InputLiveReg, T &insts) { - unsigned NodeSize = InputLiveReg.size() + insts.size(); +void ExpDag::initNodes(const LiveSet &InputLiveReg, T &Insts) { + unsigned NodeSize = InputLiveReg.size() + Insts.size(); SUnits.reserve(NodeSize); - for (MachineInstr *MI : insts) { + for (MachineInstr *MI : Insts) { if (MI->isDebugInstr()) continue; SUnits.emplace_back(MI, SUnits.size()); @@ -167,8 +166,8 @@ void ExpDag::initNodes(const LiveSet &InputLiveReg, T &insts) { MISUnitMap[MI] = SU; } - for (auto it : InputLiveReg) { - unsigned Reg = it.first; + for (auto It : InputLiveReg) { + unsigned Reg = It.first; SUnits.emplace_back(); SUnit *SU = &SUnits.back(); SU->NodeNum = SUnits.size() - 1; @@ -187,7 +186,7 @@ template void ExpDag::build(const LiveSet &InputLiveReg, const LiveSet &OutputLiveReg, T &Insts) { initNodes(InputLiveReg, Insts); - addDataDep(SIRI); + addDataDep(); addCtrlDep(); buildSubExp(InputLiveReg, OutputLiveReg, SIRI, SIII); } @@ -203,10 +202,10 @@ template void ExpDag::build>( void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { IntEqClasses SubtreeClasses(SUnits.size()); - std::vector passThruInputs; + std::vector PassThruInputs; for (SUnit &SU : SUnits) { if (SU.NumPredsLeft == 0 && SU.NumSuccsLeft == 0) { - passThruInputs.emplace_back(SU.NodeNum); + PassThruInputs.emplace_back(SU.NodeNum); continue; } if (!IsJoinInputToSubExp && !SU.isInstr()) @@ -227,9 +226,9 @@ void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, SubtreeClasses.compress(); unsigned NumSubExps = SubtreeClasses.getNumClasses(); - // Not count passThruInputs for subExps since they're exp with only 1 SU. + // Not count PassThruInputs for subExps since they're exp with only 1 SU. // SubExpIndexMap is used to pack SubIdx within updated NumSubExps. - NumSubExps -= passThruInputs.size(); + NumSubExps -= PassThruInputs.size(); SubExps.resize(NumSubExps); DenseMap SubExpIndexMap; @@ -242,18 +241,18 @@ void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, unsigned OriginSubIdx = SubIdx; // Pack subidx. if (SubExpIndexMap.count(SubIdx) == 0) { - unsigned count = SubExpIndexMap.size(); - SubExpIndexMap.insert(std::make_pair(SubIdx, count)); + unsigned Count = SubExpIndexMap.size(); + SubExpIndexMap.insert(std::make_pair(SubIdx, Count)); } SubIdx = SubExpIndexMap[SubIdx]; // Use NodeQueueId as SubIdx. We don't do schedule on ExpDag. SU.NodeQueueId = SubIdx; SubExp &Exp = SubExps[SubIdx]; - auto it = SUnitInputMap.find(&SU); - if (it != SUnitInputMap.end()) { + auto It = SUnitInputMap.find(&SU); + if (It != SUnitInputMap.end()) { // Input. 
- unsigned Reg = it->second; + Register Reg = It->second; Exp.TopRegs.insert(Reg); } else { MachineInstr *MI = SU.getInstr(); @@ -264,7 +263,7 @@ void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, continue; if (!MO.isUse()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (MRI.getLiveInPhysReg(Reg) || MRI.getLiveInVirtReg(Reg)) { Exp.IsUseIncomingReg = true; } @@ -301,13 +300,13 @@ void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, IsUsedInOtherBlk = true; break; } - auto suIt = MISUnitMap.find(&UserMI); + auto SuIt = MISUnitMap.find(&UserMI); // When UserMI is not in dag, treat it as other block. - if (suIt == MISUnitMap.end()) { + if (SuIt == MISUnitMap.end()) { IsUsedInOtherBlk = true; break; } - SUnit *UseSU = suIt->second; + SUnit *UseSU = SuIt->second; // UserMI should always be in same subExp. unsigned UseSubIdx = SubtreeClasses[UseSU->NodeNum]; if (UseSubIdx != OriginSubIdx) { @@ -333,34 +332,34 @@ void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, // Only reg will miss live mask. for (SubExp &Exp : SubExps) { for (unsigned Reg : Exp.TopRegs) { - auto it = StartLiveReg.find(Reg); - assert(it != StartLiveReg.end() && + auto It = StartLiveReg.find(Reg); + assert(It != StartLiveReg.end() && "cannot find input reg in block start live"); - Exp.InputLive[Reg] |= it->second; + Exp.InputLive[Reg] |= It->second; } for (unsigned Reg : Exp.BottomRegs) { - auto it = EndLiveReg.find(Reg); - if (it == EndLiveReg.end()) { + auto It = EndLiveReg.find(Reg); + if (It == EndLiveReg.end()) { //"cannot find output reg in block end live"); // Bottom reg is killed inside current block, did not get out of the // block. // Or the bottom reg is not treat as output in this dag, not save to - // outputLive which will affect profit count. + // OutputLive which will affect profit count. continue; } - Exp.OutputLive[Reg] |= it->second; + Exp.OutputLive[Reg] |= It->second; } - CollectLiveSetPressure(Exp.InputLive, MRI, SIRI, Exp.VInputSize, + collectLiveSetPressure(Exp.InputLive, MRI, SIRI, Exp.VInputSize, Exp.SInputSize); - CollectLiveSetPressure(Exp.OutputLive, MRI, SIRI, Exp.VOutputSize, + collectLiveSetPressure(Exp.OutputLive, MRI, SIRI, Exp.VOutputSize, Exp.SOutputSize); } } -void ExpDag::addDataDep(const SIRegisterInfo *SIRI) { - DenseMap curDefMI; +void ExpDag::addDataDep() { + DenseMap CurDefMI; for (SUnit &SU : SUnits) { if (!SU.isInstr()) @@ -377,11 +376,11 @@ void ExpDag::addDataDep(const SIRegisterInfo *SIRI) { Register Reg = MO.getReg(); SUnit *DefSU = nullptr; - auto curDefIt = curDefMI.find(Reg); + auto CurDefIt = CurDefMI.find(Reg); // Check def inst first. - if (curDefIt != curDefMI.end()) { - MachineInstr *curDef = curDefIt->second; - DefSU = MISUnitMap[curDef]; + if (CurDefIt != CurDefMI.end()) { + MachineInstr *CurDef = CurDefIt->second; + DefSU = MISUnitMap[CurDef]; } else { // physical reg is not in live reg. if (!Reg.isVirtual()) @@ -404,7 +403,7 @@ void ExpDag::addDataDep(const SIRegisterInfo *SIRI) { continue; if (!MO.isDef()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); // For case like: // undef %808.sub0:sgpr_64 = COPY killed %795:sgpr_32 @@ -412,17 +411,17 @@ void ExpDag::addDataDep(const SIRegisterInfo *SIRI) { // When partially write, link MI to previous def. if (MO.getSubReg() != 0) { SUnit *DefSU = nullptr; - auto curDefIt = curDefMI.find(Reg); + auto CurDefIt = CurDefMI.find(Reg); // Check def inst first. 
- if (curDefIt != curDefMI.end()) { - MachineInstr *CurDef = curDefIt->second; + if (CurDefIt != CurDefMI.end()) { + MachineInstr *CurDef = CurDefIt->second; DefSU = MISUnitMap[CurDef]; // Add link between different defs. SU.addPred(SDep(DefSU, SDep::Data, Reg)); } } - curDefMI[Reg] = MI; + CurDefMI[Reg] = MI; } } } @@ -521,7 +520,7 @@ void BlockExpDag::buildAvail(const LiveSet &PassThruSet, MachineInstr *MI = SU->getInstr(); // Calc pressure based on pred nodes. - GCNRPTracker::LiveRegSet dagLive; + GCNRPTracker::LiveRegSet DagLive; for (auto &Pred : SU->Preds) { SUnit *PredSU = Pred.getSUnit(); GCNRPTracker::LiveRegSet PredLive = DagAvailRegMap[PredSU]; @@ -533,9 +532,9 @@ void BlockExpDag::buildAvail(const LiveSet &PassThruSet, // Update PredLive based on MI. RP.advance(); } - llvm::mergeLiveRegSet(dagLive, RP.getLiveRegs()); + llvm::mergeLiveRegSet(DagLive, RP.getLiveRegs()); } - DagAvailRegMap[SU] = dagLive; + DagAvailRegMap[SU] = DagLive; // Add succ to work list. for (auto &Succ : SU->Succs) { @@ -561,23 +560,23 @@ void BlockExpDag::buildPressure(const LiveSet &StartLiveReg, if (MBB->empty()) return; DenseMap DagAvailRegMap; - GCNRPTracker::LiveRegSet passThruSet; - for (auto Reg : StartLiveReg) { - unsigned reg = Reg.first; - auto EndReg = EndLiveReg.find(reg); + GCNRPTracker::LiveRegSet PassThruSet; + for (auto It : StartLiveReg) { + Register Reg = It.first; + auto EndReg = EndLiveReg.find(Reg); if (EndReg == EndLiveReg.end()) continue; - LaneBitmask mask = Reg.second; - LaneBitmask endMask = EndReg->second; - mask &= endMask; - if (mask.getAsInteger() == 0) + LaneBitmask Mask = It.second; + LaneBitmask EndMask = EndReg->second; + Mask &= EndMask; + if (Mask.getAsInteger() == 0) continue; - passThruSet[reg] = mask; + PassThruSet[Reg] = Mask; } // Build avial for each nodes. - buildAvail(passThruSet, DagAvailRegMap); + buildAvail(PassThruSet, DagAvailRegMap); // Calc avaialbe for each node, live is avail & sum(input of success). // If a reg is avaiable from the node, then success node can use it from this @@ -594,10 +593,10 @@ void BlockExpDag::buildPressure(const LiveSet &StartLiveReg, // Using pass thru as base because output of current SU should not // affect other output SUs. GCNUpwardRPTracker RP(*LIS); - RP.reset(BeginMI, &passThruSet, /*After*/ true); + RP.reset(BeginMI, &PassThruSet, /*After*/ true); MachineInstr *MI = SU.getInstr(); if (MI) { - RP.reset(*MI, &passThruSet, /*After*/ true); + RP.reset(*MI, &PassThruSet, /*After*/ true); RP.recede(*MI); } DagPressureMap[&SU] = RP.getLiveRegs(); @@ -611,7 +610,6 @@ void BlockExpDag::buildPressure(const LiveSet &StartLiveReg, } while (!WorkList.empty()) { - bool IsUpdated = false; SmallVector ReadyNodes; for (SUnit *SU : WorkList) { if (SU->NumSuccsLeft > 0) @@ -619,7 +617,6 @@ void BlockExpDag::buildPressure(const LiveSet &StartLiveReg, ReadyNodes.emplace_back(SU); // Ready, move it to Processed. Processed.insert(SU); - IsUpdated = true; // Only update 1 node once. // Order of schedle here should not affect pressure. break; @@ -631,7 +628,7 @@ void BlockExpDag::buildPressure(const LiveSet &StartLiveReg, MachineInstr *MI = SU->getInstr(); // Calc pressure based on succ nodes. - GCNRPTracker::LiveRegSet dagLive; + GCNRPTracker::LiveRegSet DagLive; for (auto &Succ : SU->Succs) { SUnit *SuccSU = Succ.getSUnit(); GCNRPTracker::LiveRegSet SuccLive = DagPressureMap[SuccSU]; @@ -643,12 +640,12 @@ void BlockExpDag::buildPressure(const LiveSet &StartLiveReg, // Update SuccLive based on MI. 
RP.recede(*MI); } - llvm::mergeLiveRegSet(dagLive, RP.getLiveRegs()); + llvm::mergeLiveRegSet(DagLive, RP.getLiveRegs()); } // Remove live which not avail in SU. - GCNRPTracker::LiveRegSet availLive = DagAvailRegMap[SU]; - llvm::andLiveRegSet(dagLive, availLive); - DagPressureMap[SU] = dagLive; + GCNRPTracker::LiveRegSet AvailLive = DagAvailRegMap[SU]; + llvm::andLiveRegSet(DagLive, AvailLive); + DagPressureMap[SU] = DagLive; // Add pred to work list. for (auto &Pred : SU->Preds) { @@ -669,16 +666,16 @@ void BlockExpDag::buildPressure(const LiveSet &StartLiveReg, // dump functions. std::string ExpDag::getGraphNodeLabel(const SUnit *SU) const { - std::string s; - raw_string_ostream oss(s); - auto it = SUnitInputMap.find(SU); - if (it != SUnitInputMap.end()) { - oss << "second) << ">"; + std::string S; + raw_string_ostream OSS(S); + auto It = SUnitInputMap.find(SU); + if (It != SUnitInputMap.end()) { + OSS << "second) << ">"; } else { - SU->getInstr()->print(oss, /*SkipOpers=*/true); + SU->getInstr()->print(OSS, /*SkipOpers=*/true); } - return oss.str(); + return OSS.str(); } /// Return the label. @@ -688,7 +685,6 @@ std::string ExpDag::getDAGName() const { return "dag.exp"; } /// rendered using 'dot'. /// void ExpDag::viewGraph(const Twine &Name, const Twine &Title) const { -#if 0 // TODO: Re-enable this // This code is only for debugging! #ifndef NDEBUG ViewGraph(const_cast(this), Name, false, Title); @@ -696,7 +692,6 @@ void ExpDag::viewGraph(const Twine &Name, const Twine &Title) const { errs() << "BlockExpDag::viewGraph is only available in debug builds on " << "systems with Graphviz or gv!\n"; #endif // NDEBUG -#endif } void ExpDag::dump() { @@ -713,9 +708,9 @@ static DenseSet ViewNodes; template <> struct DOTGraphTraits : public DefaultDOTGraphTraits { - DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} + DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} - static std::string getGraphName(const llvm::ExpDag *G) { + static std::string getGraphName(const llvm::ExpDag *) { return "ExpDag graph"; } @@ -729,7 +724,7 @@ struct DOTGraphTraits : public DefaultDOTGraphTraits { } static std::string getNodeIdentifierLabel(const SUnit *Node, - const llvm::ExpDag *Graph) { + const llvm::ExpDag *) { std::string R; raw_string_ostream OS(R); OS << static_cast(Node); @@ -738,8 +733,8 @@ struct DOTGraphTraits : public DefaultDOTGraphTraits { /// If you want to override the dot attributes printed for a particular /// edge, override this method. 
- static std::string getEdgeAttributes(const SUnit *Node, SUnitIterator EI, - const llvm::ExpDag *Graph) { + static std::string getEdgeAttributes(const SUnit *, SUnitIterator EI, + const llvm::ExpDag *) { if (EI.isArtificialDep()) return "color=cyan,style=dashed"; if (EI.isCtrlDep()) @@ -747,7 +742,7 @@ struct DOTGraphTraits : public DefaultDOTGraphTraits { return ""; } - static std::string getNodeLabel(const SUnit *SU, const llvm::ExpDag *Graph) { + static std::string getNodeLabel(const SUnit *SU, const llvm::ExpDag *) { std::string Str; raw_string_ostream SS(Str); SS << "SU:" << SU->NodeNum; @@ -758,7 +753,7 @@ struct DOTGraphTraits : public DefaultDOTGraphTraits { return G->getGraphNodeLabel(SU); } static std::string getNodeAttributes(const SUnit *N, - const llvm::ExpDag *Graph) { + const llvm::ExpDag *) { std::string Str("shape=Mrecord"); Str += ",style=filled,fillcolor=\"#"; @@ -798,42 +793,42 @@ void getRegBound(llvm::MachineBasicBlock *MBB, MaxSGPR = AMDGPU::SGPR104 - AMDGPU::SGPR0; const auto &EndSlot = LIS->getMBBEndIdx(MBB); - const GCNRPTracker::LiveRegSet outputLive = + const GCNRPTracker::LiveRegSet OutputLive = llvm::getLiveRegs(EndSlot, *LIS, MRI); auto *ST = &MBB->getParent() ->getSubtarget(); // TODO: Better way to get this. if (MBB->empty()) { - GCNRegPressure MaxPressure = getRegPressure(MRI, outputLive); + GCNRegPressure MaxPressure = getRegPressure(MRI, OutputLive); MaxSGPR = MaxPressure.getSGPRNum(); MaxVGPR = MaxPressure.getVGPRNum(ST->hasGFX90AInsts()); return; } - BlockExpDag dag(MBB, LIS, MRI, SIRI, SIII); - dag.build(); + BlockExpDag Dag(MBB, LIS, MRI, SIRI, SIII); + Dag.build(); - std::vector &SUnits = dag.SUnits; + std::vector &SUnits = Dag.SUnits; // Remove input nodes. for (SUnit &SU : SUnits) { if (!SU.isInstr()) continue; - std::vector inputDeps; + std::vector InputDeps; for (SDep &Dep : SU.Preds) { SUnit *Pred = Dep.getSUnit(); if (Pred->isInstr()) continue; - inputDeps.emplace_back(Dep); + InputDeps.emplace_back(Dep); } - for (SDep &Dep : inputDeps) { + for (SDep &Dep : InputDeps) { SU.removePred(Dep); } } - unsigned inputSize = dag.InputSUnitMap.size(); - unsigned instNodeSize = SUnits.size() - inputSize; - SUnits.erase(SUnits.begin() + instNodeSize, SUnits.end()); + const unsigned InputSize = Dag.InputSUnitMap.size(); + const unsigned InstNodeSize = SUnits.size() - InputSize; + SUnits.erase(SUnits.begin() + InstNodeSize, SUnits.end()); std::vector BotRoots; for (SUnit &SU : SUnits) { @@ -844,9 +839,9 @@ void getRegBound(llvm::MachineBasicBlock *MBB, auto SchedResult = hrbSched(SUnits, BotRoots, MRI, SIRI); GCNUpwardRPTracker RPTracker(*LIS); - RPTracker.reset(MBB->front(), &outputLive, /*After*/ true); - for (auto it = SchedResult.rbegin(); it != SchedResult.rend(); it++) { - const SUnit *SU = *it; + RPTracker.reset(MBB->front(), &OutputLive, /*After*/ true); + for (auto It = SchedResult.rbegin(); It != SchedResult.rend(); It++) { + const SUnit *SU = *It; if (!SU->isInstr()) continue; MachineInstr *MI = SU->getInstr(); @@ -863,32 +858,32 @@ void getRegBound(llvm::MachineBasicBlock *MBB, namespace { std::vector buildWorkList(std::vector &SUnits) { - std::vector resultList; - resultList.reserve(SUnits.size()); + std::vector ResultList; + ResultList.reserve(SUnits.size()); for (SUnit &SU : SUnits) { - resultList.emplace_back(&SU); + ResultList.emplace_back(&SU); } - return resultList; + return ResultList; } -void sortByHeight(std::vector &workList) { - std::sort(workList.begin(), workList.end(), - [](const SUnit *a, const SUnit *b) { +void 
sortByHeight(std::vector &WorkList) { + std::sort(WorkList.begin(), WorkList.end(), + [](const SUnit *A, const SUnit *B) { // Lowest height first. - if (a->getHeight() < b->getHeight()) + if (A->getHeight() < B->getHeight()) return true; // If height the same, NodeNum big first. - if (a->getHeight() == b->getHeight()) - return a->NodeNum > b->NodeNum; + if (A->getHeight() == B->getHeight()) + return A->NodeNum > B->NodeNum; return false; }); } -void sortByInChain(std::vector &workList, DenseSet &Chained) { +void sortByInChain(std::vector &WorkList, DenseSet &Chained) { // In chain nodes at end. - std::sort(workList.begin(), workList.end(), - [&Chained](const SUnit *a, const SUnit *b) { - return Chained.count(a) < Chained.count(b); + std::sort(WorkList.begin(), WorkList.end(), + [&Chained](const SUnit *A, const SUnit *B) { + return Chained.count(A) < Chained.count(B); }); } @@ -905,7 +900,7 @@ const TargetRegisterClass *getRegClass(SUnit *SU, MachineOperand *MO = MI->defs().begin(); if (!MO->isReg()) return nullptr; - unsigned Reg = MO->getReg(); + Register Reg = MO->getReg(); return SIRI->getRegClassForReg(MRI, Reg); } @@ -926,12 +921,12 @@ unsigned getSGPRSize(const TargetRegisterClass *RC, return RC->getLaneMask().getNumLanes(); } -void collectSameHeightBackNodes(SUnit *SU, SmallDenseSet &backNodes, +void collectSameHeightBackNodes(SUnit *SU, SmallDenseSet &BackNodes, unsigned NodeNum, - SmallDenseSet &visitedNodes) { - if (visitedNodes.count(SU)) + SmallDenseSet &VisitedNodes) { + if (VisitedNodes.count(SU)) return; - visitedNodes.insert(SU); + VisitedNodes.insert(SU); for (SDep &Dep : SU->Succs) { if (Dep.isWeak()) @@ -943,8 +938,8 @@ void collectSameHeightBackNodes(SUnit *SU, SmallDenseSet &backNodes, if (Succ->NodeNum >= NodeNum) continue;*/ - backNodes.insert(Succ); - collectSameHeightBackNodes(Succ, backNodes, NodeNum, visitedNodes); + BackNodes.insert(Succ); + collectSameHeightBackNodes(Succ, BackNodes, NodeNum, VisitedNodes); } } @@ -963,60 +958,60 @@ SUnit *HRB::Lineage::getTail() const { return Nodes.back(); } void HRB::buildLinear(std::vector &SUnits) { // Working list from TopRoots. - std::vector workList = buildWorkList(SUnits); + std::vector WorkList = buildWorkList(SUnits); IntEqClasses EqClasses(SUnits.size()); - while (!workList.empty()) { - sortByHeight(workList); + while (!WorkList.empty()) { + sortByHeight(WorkList); // Highest SU. - SUnit *SU = workList.back(); - workList.pop_back(); + SUnit *SU = WorkList.back(); + WorkList.pop_back(); if (!SU->isInstr()) continue; if (ChainedNodes.count(SU) > 0) continue; IsRecomputeHeight = false; - Lineage lineage = buildChain(SU, SUnits); + Lineage Lineage = buildChain(SU, SUnits); // Remove chained nodes from worklist. - sortByInChain(workList, ChainedNodes); - while (!workList.empty()) { - SUnit *back = workList.back(); - if (ChainedNodes.count(back)) - workList.pop_back(); + sortByInChain(WorkList, ChainedNodes); + while (!WorkList.empty()) { + SUnit *Back = WorkList.back(); + if (ChainedNodes.count(Back)) + WorkList.pop_back(); else break; } - Lineages.emplace_back(lineage); + Lineages.emplace_back(Lineage); if (IsRecomputeHeight) { // Update height from tail. 
- SUnit *tail = lineage.Nodes.back(); - tail->setDepthDirty(); - tail->getHeight(); + SUnit *Tail = Lineage.Nodes.back(); + Tail->setDepthDirty(); + Tail->getHeight(); } } - DenseSet tailSet; + DenseSet TailSet; for (Lineage &L : Lineages) { if (L.Nodes.size() < 2) continue; - auto it = L.Nodes.rbegin(); - it++; - SUnit *tail = L.Nodes.back(); - // If already as tail for other lineage, start from next. - if (tailSet.count(tail) > 0) { - tail = *it; - it++; + auto It = L.Nodes.rbegin(); + It++; + SUnit *Tail = L.Nodes.back(); + // If already as tail for other Lineage, start from next. + if (TailSet.count(Tail) > 0) { + Tail = *It; + It++; } else { - tailSet.insert(tail); + TailSet.insert(Tail); } - for (; it != L.Nodes.rend(); it++) { - SUnit *SU = *it; - if (tail->NodeNum == -1) + for (; It != L.Nodes.rend(); It++) { + SUnit *SU = *It; + if (Tail->NodeNum == (unsigned)-1) continue; - EqClasses.join(SU->NodeNum, tail->NodeNum); + EqClasses.join(SU->NodeNum, Tail->NodeNum); } } @@ -1024,7 +1019,7 @@ void HRB::buildLinear(std::vector &SUnits) { // TODO: assign sub class to node. for (Lineage &L : Lineages) { for (SUnit *SU : L.Nodes) { - if (SU->NodeNum == -1) + if (SU->NodeNum == (unsigned)-1) continue; unsigned SubIdx = EqClasses[SU->NodeNum]; //// Pack subidx. @@ -1040,7 +1035,7 @@ void HRB::buildLinear(std::vector &SUnits) { dbgs() << "Chained Nodes:"; for (SUnit *SU : ChainedNodes) { dbgs() << " " << SU->NodeNum << "\n"; - } for (int i = 0; i < Lineages.size(); i++) { + } for (unsigned i = 0; i < Lineages.size(); i++) { dbgs() << "Lineage" << i << ":"; Lineage &L = Lineages[i]; for (SUnit *SU : L.Nodes) { @@ -1078,7 +1073,7 @@ SUnit *HRB::findHeir(SUnit *SU, std::vector &SUnits) { } // Make sure choose lowest dependence between SameHeightCandidate. if (SameHeightCandidate.size() > 1) { - for (int i = 1; i < SameHeightCandidate.size(); i++) { + for (size_t i = 1; i < SameHeightCandidate.size(); i++) { SUnit *SU = SameHeightCandidate[i]; // If Heir is pred of SU, use SU. if (canReach(SU, Heir)) @@ -1116,8 +1111,8 @@ SUnit *HRB::findHeir(SUnit *SU, std::vector &SUnits) { } HRB::Lineage HRB::buildChain(SUnit *Node, std::vector &SUnits) { - HRB::Lineage chain; - chain.addNode(Node); + HRB::Lineage Chain; + Chain.addNode(Node); ChainedNodes.insert(Node); LLVM_DEBUG(dbgs() << "start chain " << Node->NodeNum << "(" << Node->getHeight() << ")\n"); @@ -1125,7 +1120,7 @@ HRB::Lineage HRB::buildChain(SUnit *Node, std::vector &SUnits) { SUnit *Heir = findHeir(Node, SUnits); if (!Heir) break; - chain.addNode(Heir); + Chain.addNode(Heir); LLVM_DEBUG(dbgs() << "add node to chain " << Heir->NodeNum << "\n"); if (ChainedNodes.count(Heir) > 0) @@ -1137,38 +1132,38 @@ HRB::Lineage HRB::buildChain(SUnit *Node, std::vector &SUnits) { // Find biggest vgpr RC for the chain. // TODO: Build conflict and allocate on each edge of the chain. const TargetRegisterClass *RC = nullptr; - unsigned maxRCSize = 0; - for (SUnit *SU : chain.Nodes) { + unsigned MaxRCSize = 0; + for (SUnit *SU : Chain.Nodes) { const TargetRegisterClass *SuRC = getRegClass(SU, MRI, SIRI); unsigned RCSize = getVGPRSize(SuRC, SIRI); - if (RCSize > maxRCSize) { - maxRCSize = RCSize; + if (RCSize > MaxRCSize) { + MaxRCSize = RCSize; RC = SuRC; } } if (!RC) { // TODO: Find biggest sgpr RC. 
- unsigned maxRCSize = 0; - for (SUnit *SU : chain.Nodes) { + unsigned MaxRCSize = 0; + for (SUnit *SU : Chain.Nodes) { const TargetRegisterClass *SuRC = getRegClass(SU, MRI, SIRI); unsigned RCSize = getSGPRSize(SuRC, SIRI); - if (RCSize > maxRCSize) { - maxRCSize = RCSize; + if (RCSize > MaxRCSize) { + MaxRCSize = RCSize; RC = SuRC; } } } - chain.RC = RC; - return chain; + Chain.RC = RC; + return Chain; } void HRB::buildConflict() { for (unsigned i = 0; i < Lineages.size(); i++) { - Lineage &a = Lineages[i]; + Lineage &A = Lineages[i]; for (unsigned j = i + 1; j < Lineages.size(); j++) { - Lineage &b = Lineages[j]; - if (isConflict(a, b)) { + Lineage &B = Lineages[j]; + if (isConflict(A, B)) { Color.Conflicts[i].insert(j); Color.Conflicts[j].insert(i); LLVM_DEBUG(dbgs() << i << " conflict" << j << "\n"); @@ -1179,24 +1174,24 @@ void HRB::buildConflict() { } } -bool HRB::canReach(llvm::SUnit *a, llvm::SUnit *b) { - auto it = ReachMap.find(a); +bool HRB::canReach(llvm::SUnit *A, llvm::SUnit *B) { + auto It = ReachMap.find(A); // If no reach info, treat as reach. - if (it == ReachMap.end()) + if (It == ReachMap.end()) return true; - DenseSet &CurReach = it->second; - return CurReach.find(b) != CurReach.end(); + DenseSet &CurReach = It->second; + return CurReach.find(B) != CurReach.end(); } -void HRB::updateReachForEdge(llvm::SUnit *a, llvm::SUnit *b, +void HRB::updateReachForEdge(llvm::SUnit *A, llvm::SUnit *B, std::vector &SUnits) { - DenseSet &ReachA = ReachMap[a]; - ReachA.insert(b); - DenseSet &ReachB = ReachMap[b]; + DenseSet &ReachA = ReachMap[A]; + ReachA.insert(B); + DenseSet &ReachB = ReachMap[B]; ReachA.insert(ReachB.begin(), ReachB.end()); for (SUnit &SU : SUnits) { - if (!canReach(&SU, a)) + if (!canReach(&SU, A)) continue; DenseSet &CurReach = ReachMap[&SU]; @@ -1252,91 +1247,91 @@ void HRB::buildReachRelation(ArrayRef BotRoots) { }); } -bool HRB::isConflict(const Lineage &a, const Lineage &b) { +bool HRB::isConflict(const Lineage &A, const Lineage &B) { // Make conflict between sgpr and vgpr to help group lineages when share // colors. Keep the conflict will group lineages in avoid mix use color in // different sub exp. - SUnit *head0 = a.getHead(); - SUnit *tail0 = a.getTail(); - SUnit *head1 = b.getHead(); - SUnit *tail1 = b.getTail(); - DenseSet &Reach0 = ReachMap[head0]; - DenseSet &Reach1 = ReachMap[head1]; - bool r01 = Reach0.count(tail1) != 0; - bool r10 = Reach1.count(tail0) != 0; - return r01 && r10; + SUnit *Head0 = A.getHead(); + SUnit *Tail0 = A.getTail(); + SUnit *Head1 = B.getHead(); + SUnit *Tail1 = B.getTail(); + DenseSet &Reach0 = ReachMap[Head0]; + DenseSet &Reach1 = ReachMap[Head1]; + bool R01 = Reach0.count(Tail1) != 0; + bool R10 = Reach1.count(Tail0) != 0; + return R01 && R10; } -bool HRB::canFuse(const Lineage &a, const Lineage &b) { - if (a.RC != b.RC) { +bool HRB::canFuse(const Lineage &A, const Lineage &B) { + if (A.RC != B.RC) { // no RC will not conflict with other nodes. - if (!a.RC) + if (!A.RC) return false; - if (!b.RC) + if (!B.RC) return false; // SGRP and VGPR not conflict. - if (SIRI->isSGPRClass(a.RC) != SIRI->isSGPRClass(b.RC)) + if (SIRI->isSGPRClass(A.RC) != SIRI->isSGPRClass(B.RC)) return false; } // Can Fuse if a.head reach b.tail but b.head not reach a.tail and vice versa. 
- SUnit *head0 = a.getHead(); - SUnit *tail0 = a.getTail(); - SUnit *head1 = b.getHead(); - SUnit *tail1 = b.getTail(); - DenseSet &Reach0 = ReachMap[head0]; - DenseSet &Reach1 = ReachMap[head1]; - bool r01 = Reach0.count(tail1) != 0; - bool r10 = Reach1.count(tail0) != 0; - return r01 != r10; + SUnit *Head0 = A.getHead(); + SUnit *Tail0 = A.getTail(); + SUnit *Head1 = B.getHead(); + SUnit *Tail1 = B.getTail(); + DenseSet &Reach0 = ReachMap[Head0]; + DenseSet &Reach1 = ReachMap[Head1]; + bool R01 = Reach0.count(Tail1) != 0; + bool R10 = Reach1.count(Tail0) != 0; + return R01 != R10; } -bool HRB::tryFuse(Lineage &a, Lineage &b, std::vector &SUnits) { +bool HRB::tryFuse(Lineage &A, Lineage &B, std::vector &SUnits) { // Can Fuse if a.head reach b.tail but b.head not reach a.tail and vice versa. - SUnit *head0 = a.getHead(); - SUnit *tail0 = a.getTail(); - SUnit *head1 = b.getHead(); - SUnit *tail1 = b.getTail(); - DenseSet &Reach0 = ReachMap[head0]; - DenseSet &Reach1 = ReachMap[head1]; - bool r01 = Reach0.count(tail1) != 0; - bool r10 = Reach1.count(tail0) != 0; - if (r01 == r10) + SUnit *Head0 = A.getHead(); + SUnit *Tail0 = A.getTail(); + SUnit *Head1 = B.getHead(); + SUnit *Tail1 = B.getTail(); + DenseSet &Reach0 = ReachMap[Head0]; + DenseSet &Reach1 = ReachMap[Head1]; + bool R01 = Reach0.count(Tail1) != 0; + bool R10 = Reach1.count(Tail0) != 0; + if (R01 == R10) return false; - Lineage *newHead = &a; - Lineage *newTail = &b; - if (r01) { + Lineage *NewHead = &A; + Lineage *NewTail = &B; + if (R01) { // a reach b, b cannot reach a. // link a.tail->b.head. - newHead = &a; - newTail = &b; + NewHead = &A; + NewTail = &B; } else { // b reach a, a cannot reach b. // link b.tail->a.head. - newHead = &b; - newTail = &a; + NewHead = &B; + NewTail = &A; } // Merge reg class. - const TargetRegisterClass *RC0 = newHead->RC; - const TargetRegisterClass *RC1 = newTail->RC; + const TargetRegisterClass *RC0 = NewHead->RC; + const TargetRegisterClass *RC1 = NewTail->RC; unsigned RC0Size = getVGPRSize(RC0, SIRI); unsigned RC1Size = getVGPRSize(RC1, SIRI); if (RC1Size > RC0Size) - newHead->RC = RC1; + NewHead->RC = RC1; // Merge chain. - SUnit *fuseTail = newHead->getTail(); - SUnit *fuseHead = newTail->getHead(); - assert(ReachMap[fuseHead].count(fuseTail) == 0 && ""); - fuseHead->addPred(SDep(fuseTail, SDep::Artificial)); - LLVM_DEBUG(dbgs() << "fuse " << fuseTail->NodeNum << "->" << fuseHead->NodeNum + SUnit *FuseTail = NewHead->getTail(); + SUnit *FuseHead = NewTail->getHead(); + assert(ReachMap[FuseHead].count(FuseTail) == 0 && ""); + FuseHead->addPred(SDep(FuseTail, SDep::Artificial)); + LLVM_DEBUG(dbgs() << "fuse " << FuseTail->NodeNum << "->" << FuseHead->NodeNum << "\n"); // Update reach map. - updateReachForEdge(fuseTail, fuseHead, SUnits); + updateReachForEdge(FuseTail, FuseHead, SUnits); // Merge Nodes. - newHead->Nodes.append(newTail->Nodes.begin(), newTail->Nodes.end()); + NewHead->Nodes.append(NewTail->Nodes.begin(), NewTail->Nodes.end()); // Clear newTail. 
- newTail->Nodes.clear(); - newTail->RC = nullptr; + NewTail->Nodes.clear(); + NewTail->RC = nullptr; return true; } @@ -1346,27 +1341,27 @@ void HRB::fusionLineages(std::vector &SUnits) { bool IsUpdated = true; while (IsUpdated) { IsUpdated = false; - int size = Lineages.size(); - for (int i = 0; i < size; i++) { - Lineage &a = Lineages[i]; - if (a.length() == 0) + int Size = Lineages.size(); + for (int i = 0; i < Size; i++) { + Lineage &A = Lineages[i]; + if (A.length() == 0) continue; - for (int j = i + 1; j < size; j++) { - Lineage &b = Lineages[j]; - if (b.length() == 0) + for (int j = i + 1; j < Size; j++) { + Lineage &B = Lineages[j]; + if (B.length() == 0) continue; - if (tryFuse(a, b, SUnits)) { + if (tryFuse(A, B, SUnits)) { IsUpdated = true; - if (a.length() == 0) + if (A.length() == 0) break; } } } // Remove empty lineages. std::sort(Lineages.begin(), Lineages.end(), - [](const Lineage &a, const Lineage &b) { - return a.length() > b.length(); + [](const Lineage &A, const Lineage &B) { + return A.length() > B.length(); }); while (Lineages.back().length() == 0) { Lineages.pop_back(); @@ -1379,63 +1374,63 @@ void HRB::fusionLineages(std::vector &SUnits) { } } -unsigned HRB::colorLineages(std::vector &lineages, +unsigned HRB::colorLineages(std::vector &InLineages, DenseMap &AllocMap, const unsigned Limit) { // allocate long Lineage first. How about size of RC? - std::sort(lineages.begin(), lineages.end(), + std::sort(InLineages.begin(), InLineages.end(), [](const Lineage *a, const Lineage *b) { // Make sure root allocate first. return a->length() > b->length(); }); - unsigned maxColor = 0; + unsigned MaxColor = 0; const unsigned VGPR_LIMIT = 256 * 4; - for (Lineage *L : lineages) { + for (Lineage *L : InLineages) { unsigned ID = L->ID; auto &Conflict = Color.Conflicts[ID]; - std::bitset colors; + std::bitset Colors; for (unsigned j : Conflict) { - Lineage *C = &Lineages[j]; - if (AllocMap.count(C) == 0) + Lineage *LineageC = &Lineages[j]; + if (AllocMap.count(LineageC) == 0) continue; - unsigned c = AllocMap[C]; - unsigned s = C->getSize(); - for (unsigned i = 0; i < s; i++) { - unsigned pos = c + i; - colors.set(pos); + unsigned C = AllocMap[LineageC]; + unsigned S = LineageC->getSize(); + for (unsigned i = 0; i < S; i++) { + unsigned Pos = C + i; + Colors.set(Pos); } } - unsigned color = Limit; - unsigned size = L->getSize(); - for (unsigned i = 0; i < Limit - size;) { - unsigned oldI = i; - for (unsigned j = 0; j < size; j++) { - unsigned pos = i + size - 1 - j; - if (colors.test(pos)) { - i = pos + 1; + unsigned Color = Limit; + unsigned Size = L->getSize(); + for (unsigned i = 0; i < Limit - Size;) { + unsigned OldI = i; + for (unsigned j = 0; j < Size; j++) { + unsigned Pos = i + Size - 1 - j; + if (Colors.test(Pos)) { + i = Pos + 1; break; } } - if (i != oldI) + if (i != OldI) continue; - color = i; + Color = i; break; } - AllocMap[L] = color; - color += size; - if (color > maxColor) - maxColor = color; + AllocMap[L] = Color; + Color += Size; + if (Color > MaxColor) + MaxColor = Color; } - return maxColor; + return MaxColor; } -void HRB::ColorResult::colorSU(SUnit *SU, unsigned color) { - ColorMap[SU] = color; +void HRB::ColorResult::colorSU(SUnit *SU, unsigned Color) { + ColorMap[SU] = Color; } unsigned HRB::ColorResult::getLineage(SUnit *SU) const { @@ -1454,53 +1449,53 @@ bool HRB::ColorResult::isTail(SUnit *SU) const { return TailSet.count(SU); } const SUnit *HRB::ColorResult::getTail(SUnit *SU) const { if (!isHead(SU)) return nullptr; - auto it = HeadTailMap.find(SU); - 
return it->second; + auto It = HeadTailMap.find(SU); + return It->second; } unsigned HRB::ColorResult::getColor(const llvm::SUnit *SU) const { - auto it = ColorMap.find(SU); - return it->second; + auto It = ColorMap.find(SU); + return It->second; } unsigned HRB::ColorResult::getSize(const llvm::SUnit *SU) const { - auto it = SizeMap.find(SU); - return it->second; + auto It = SizeMap.find(SU); + return It->second; } HRB::ColorResult &HRB::coloring() { // Collect VGPR lineages. - std::vector vgprLineages; + std::vector VgprLineages; for (Lineage &L : Lineages) { - auto RC = L.RC; + const auto *RC = L.RC; if (!RC) continue; if (SIRI->isSGPRClass(RC)) continue; - vgprLineages.emplace_back(&L); + VgprLineages.emplace_back(&L); } const unsigned VGPR_LIMIT = 256 * 4; DenseMap VAllocMap; - const unsigned maxVGPR = colorLineages(vgprLineages, VAllocMap, VGPR_LIMIT); + const unsigned MaxVGPR = colorLineages(VgprLineages, VAllocMap, VGPR_LIMIT); // Collect SGPR lineages. - std::vector sgprLineages; + std::vector SgprLineages; for (Lineage &L : Lineages) { - auto RC = L.RC; + const auto *RC = L.RC; if (!RC) continue; if (!SIRI->isSGPRClass(RC)) continue; - sgprLineages.emplace_back(&L); + SgprLineages.emplace_back(&L); } const unsigned SGPR_LIMIT = 104; DenseMap SAllocMap; - const unsigned maxSGPR = colorLineages(sgprLineages, SAllocMap, SGPR_LIMIT); + const unsigned MaxSGPR = colorLineages(SgprLineages, SAllocMap, SGPR_LIMIT); // +1 for each type of lineages(SGPR, VGPR, no reg). - const unsigned maxReg = maxSGPR + 1 + maxVGPR + 1 + 1; - const unsigned sgprBase = maxVGPR + 1; + const unsigned MaxReg = MaxSGPR + 1 + MaxVGPR + 1 + 1; + const unsigned SgprBase = MaxVGPR + 1; for (Lineage &L : Lineages) { // Collect HeadSet. @@ -1508,41 +1503,41 @@ HRB::ColorResult &HRB::coloring() { Color.TailSet.insert(L.getTail()); Color.HeadTailMap[L.getHead()] = L.getTail(); // Save color. - auto RC = L.RC; + const auto *RC = L.RC; // All no reg lineage goes to maxReg. 
- unsigned color = maxReg; + unsigned RegColor = MaxReg; if (!RC) { } else if (SIRI->isSGPRClass(RC)) { - color = SAllocMap[&L] + sgprBase; + RegColor = SAllocMap[&L] + SgprBase; } else { - color = VAllocMap[&L]; + RegColor = VAllocMap[&L]; } - unsigned size = L.getSize(); + unsigned Size = L.getSize(); for (SUnit *SU : L.Nodes) { - Color.colorSU(SU, color); - Color.SizeMap[SU] = size; + Color.colorSU(SU, RegColor); + Color.SizeMap[SU] = Size; Color.LineageMap[SU] = L.ID; } } - Color.maxReg = maxReg; - Color.maxSGPR = maxSGPR; - Color.maxVGPR = maxVGPR; + Color.MaxReg = MaxReg; + Color.MaxSGPR = MaxSGPR; + Color.MaxVGPR = MaxVGPR; for (unsigned i = 0; i < Lineages.size(); i++) { - Lineage &a = Lineages[i]; - SUnit *headA = a.getHead(); - unsigned colorA = Color.getColor(headA); - unsigned sizeA = Color.getSize(headA); + Lineage &A = Lineages[i]; + SUnit *HeadA = A.getHead(); + unsigned ColorA = Color.getColor(HeadA); + unsigned SizeA = Color.getSize(HeadA); for (unsigned j = i + 1; j < Lineages.size(); j++) { - Lineage &b = Lineages[j]; + Lineage &B = Lineages[j]; - SUnit *headB = b.getHead(); - unsigned colorB = Color.getColor(headB); - unsigned sizeB = Color.getSize(headB); + SUnit *HeadB = B.getHead(); + unsigned ColorB = Color.getColor(HeadB); + unsigned SizeB = Color.getSize(HeadB); - if (colorB >= (colorA + sizeA)) + if (ColorB >= (ColorA + SizeA)) continue; - if (colorA >= (colorB + sizeB)) + if (ColorA >= (ColorB + SizeB)) continue; Color.ShareColorLineages.insert(i); Color.ShareColorLineages.insert(j); @@ -1553,7 +1548,7 @@ HRB::ColorResult &HRB::coloring() { } void HRB::dump() { - for (int i = 0; i < Lineages.size(); i++) { + for (unsigned i = 0; i < Lineages.size(); i++) { dbgs() << "Lineage" << i << ":"; Lineage &L = Lineages[i]; for (SUnit *SU : L.Nodes) { @@ -1566,7 +1561,7 @@ void HRB::dump() { } if (!ReachMap.empty()) { dbgs() << "conflict:"; - for (int j = 0; j < Lineages.size(); j++) { + for (unsigned j = 0; j < Lineages.size(); j++) { if (i == j) continue; if (isConflict(L, Lineages[j])) { @@ -1581,9 +1576,9 @@ void HRB::dump() { void HRB::dumpReachMap() { if (!ReachMap.empty()) { dbgs() << "reachMap:"; - for (auto it : ReachMap) { - SUnit *SU = it.first; - auto &Reach = it.second; + for (auto It : ReachMap) { + SUnit *SU = It.first; + auto &Reach = It.second; if (SU->isInstr()) { MachineInstr *MI = SU->getInstr(); MI->print(dbgs()); @@ -1604,24 +1599,24 @@ std::vector hrbSched(std::vector &SUnits, std::vector &BRoots, const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI) { - HRB hrb(MRI, SIRI); + HRB Hrb(MRI, SIRI); // build reach info to avoid dead loop when build linear. - hrb.buildReachRelation(BRoots); - hrb.buildLinear(SUnits); + Hrb.buildReachRelation(BRoots); + Hrb.buildLinear(SUnits); - std::sort(BRoots.begin(), BRoots.end(), [](const SUnit *a, const SUnit *b) { - return a->NumSuccsLeft < b->NumSuccsLeft; + std::sort(BRoots.begin(), BRoots.end(), [](const SUnit *A, const SUnit *B) { + return A->NumSuccsLeft < B->NumSuccsLeft; }); while (!BRoots.empty() && BRoots.back()->NumSuccsLeft > 0) { BRoots.pop_back(); } - hrb.buildReachRelation(BRoots); - hrb.fusionLineages(SUnits); - hrb.buildConflict(); - const HRB::ColorResult &Color = hrb.coloring(); + Hrb.buildReachRelation(BRoots); + Hrb.fusionLineages(SUnits); + Hrb.buildConflict(); + const HRB::ColorResult &ColorRes = Hrb.coloring(); - LLVM_DEBUG(hrb.dump()); + LLVM_DEBUG(Hrb.dump()); // All lineage head which don't has Pred is TopRoots. // Put top roots in worklist. 
@@ -1638,30 +1633,30 @@ std::vector hrbSched(std::vector &SUnits, // When there're more than one sub exp in the DAG, make sure not mix different // sub exp or it will dead loop for color goes different subexp. - std::bitset<512 * 2> colors; - auto isColorAvail = [&colors](unsigned color, unsigned size) -> bool { - for (unsigned i = 0; i < size; i++) { - unsigned pos = color + i; - if (colors.test(pos)) + std::bitset<512 * 2> Colors; + auto IsColorAvail = [&Colors](unsigned Color, unsigned Size) -> bool { + for (unsigned i = 0; i < Size; i++) { + unsigned Pos = Color + i; + if (Colors.test(Pos)) return false; } return true; }; - auto allocColor = [&colors](unsigned color, unsigned size) { - for (unsigned i = 0; i < size; i++) { - unsigned pos = color + i; - assert(!colors.test(pos) && "color already allocated"); - LLVM_DEBUG(dbgs() << pos << "is allocated\n"); - colors.set(pos); + auto AllocColor = [&Colors](unsigned Color, unsigned Size) { + for (unsigned i = 0; i < Size; i++) { + unsigned Pos = Color + i; + assert(!Colors.test(Pos) && "color already allocated"); + LLVM_DEBUG(dbgs() << Pos << "is allocated\n"); + Colors.set(Pos); } }; - auto freeColor = [&colors](unsigned color, unsigned size) { - for (unsigned i = 0; i < size; i++) { - unsigned pos = color + i; - assert(colors.test(pos) && "color has not been allocated"); - LLVM_DEBUG(dbgs() << pos << "is free\n"); - colors.reset(pos); + auto FreeColor = [&Colors](unsigned Color, unsigned Size) { + for (unsigned i = 0; i < Size; i++) { + unsigned Pos = Color + i; + assert(Colors.test(Pos) && "color has not been allocated"); + LLVM_DEBUG(dbgs() << Pos << "is free\n"); + Colors.reset(Pos); } }; @@ -1680,25 +1675,25 @@ std::vector hrbSched(std::vector &SUnits, // ShareColorLineages will mark lineages which share color with other // lineages. When sched, choose new lineages which has more conflict with // ShareColorLineages. - const DenseSet &ShareColorLineages = Color.ShareColorLineages; + const DenseSet &ShareColorLineages = ColorRes.ShareColorLineages; std::vector Schedule; DenseSet UnfinishedLineages; while (!ReadyList.empty()) { // Make sure node conflict with predLineage first. std::sort(ReadyList.begin(), ReadyList.end(), - [&UnfinishedLineages, &Color](const SUnit *a, const SUnit *b) { - unsigned confA = 0; + [&UnfinishedLineages, &ColorRes](const SUnit *A, const SUnit *B) { + unsigned ConfA = 0; for (unsigned L : UnfinishedLineages) { - if (Color.isConflict(a, L)) - confA++; + if (ColorRes.isConflict(A, L)) + ConfA++; } - unsigned confB = 0; + unsigned ConfB = 0; for (unsigned L : UnfinishedLineages) { - if (Color.isConflict(b, L)) - confB++; + if (ColorRes.isConflict(B, L)) + ConfB++; } - return confA > confB; + return ConfA > ConfB; }); LLVM_DEBUG(dbgs() << "ReadyList:\n"; for (SUnit *SU @@ -1706,33 +1701,33 @@ std::vector hrbSched(std::vector &SUnits, dbgs() << " " << SU->NodeNum; } dbgs() << "\n";); SUnit *Candidate = nullptr; - for (auto it = ReadyList.begin(); it != ReadyList.end(); it++) { - SUnit *SU = *it; - unsigned color = Color.getColor(SU); - unsigned size = Color.getSize(SU); + for (auto It = ReadyList.begin(); It != ReadyList.end(); It++) { + SUnit *SU = *It; + unsigned Color = ColorRes.getColor(SU); + unsigned Size = ColorRes.getSize(SU); // If SU is not head or color is available, SU is the candidate. - if (Color.isHead(SU)) { - if (!isColorAvail(color, size)) + if (ColorRes.isHead(SU)) { + if (!IsColorAvail(Color, Size)) continue; // alloc color. 
- allocColor(color, size); + AllocColor(Color, Size); // save tail color. - const SUnit *Tail = Color.getTail(SU); - unsigned ID = Color.getLineage(SU); - SmallVector, 2> &tailColors = + const SUnit *Tail = ColorRes.getTail(SU); + unsigned ID = ColorRes.getLineage(SU); + SmallVector, 2> &TailColors = TailMap[Tail]; - tailColors.emplace_back(std::make_tuple(color, size, ID)); + TailColors.emplace_back(std::make_tuple(Color, Size, ID)); if (ShareColorLineages.count(ID)) UnfinishedLineages.insert(ID); } // free color for working lineage which end with SU. - if (Color.isTail(SU)) { - auto &tailColors = TailMap[SU]; - for (auto &tailTuple : tailColors) { - unsigned lineageColor, lineageSize, ID; - std::tie(lineageColor, lineageSize, ID) = tailTuple; - freeColor(lineageColor, lineageSize); + if (ColorRes.isTail(SU)) { + auto &TailColors = TailMap[SU]; + for (auto &TailTuple : TailColors) { + unsigned LineageColor, LineageSize, ID; + std::tie(LineageColor, LineageSize, ID) = TailTuple; + FreeColor(LineageColor, LineageSize); if (ShareColorLineages.count(ID)) UnfinishedLineages.insert(ID); } @@ -1742,21 +1737,21 @@ std::vector hrbSched(std::vector &SUnits, Candidate = SU; // Remove Candidate from ReadyList. - ReadyList.erase(it); + ReadyList.erase(It); break; } if (!Candidate) { // In case failed to find candidate, start a lineage if there is one. - for (auto it = ReadyList.begin(); it != ReadyList.end(); it++) { - SUnit *SU = *it; + for (auto It = ReadyList.begin(); It != ReadyList.end(); It++) { + SUnit *SU = *It; - if (!Color.isHead(SU)) { + if (!ColorRes.isHead(SU)) { continue; } Candidate = SU; // Remove Candidate from ReadyList. - ReadyList.erase(it); + ReadyList.erase(It); break; } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h index c447750e17f1d..c19190c6afe24 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h @@ -49,9 +49,8 @@ struct SubExp { unsigned VMaxSize; LiveSet InputLive; LiveSet OutputLive; - bool isSafeToMove(const llvm::MachineRegisterInfo &MRI, bool IsMoveUp) const; - void calcMaxPressure(const llvm::MachineRegisterInfo &MRI, - const llvm::SIRegisterInfo *SIRI); + bool isSafeToMove(const llvm::MachineRegisterInfo &MRI) const; + void calcMaxPressure(const llvm::MachineRegisterInfo &MRI); void dump(const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI) const; bool modifiesRegister(unsigned Reg, const llvm::SIRegisterInfo *SIRI) const; @@ -83,8 +82,8 @@ struct ExpDag { void addCustomGraphFeatures(llvm::GraphWriter &) const {} private: - template void initNodes(const LiveSet &InputLiveReg, T &insts); - void addDataDep(const llvm::SIRegisterInfo *SIRI); + template void initNodes(const LiveSet &InputLiveReg, T &Insts); + void addDataDep(); void addCtrlDep(); void buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, const llvm::SIRegisterInfo *SIRI, @@ -140,10 +139,10 @@ class HRB { llvm::DenseSet HeadSet; llvm::DenseSet TailSet; llvm::DenseMap HeadTailMap; - unsigned maxReg = 0; - unsigned maxVGPR = 0; - unsigned maxSGPR = 0; - void colorSU(llvm::SUnit *SU, unsigned color); + unsigned MaxReg = 0; + unsigned MaxVGPR = 0; + unsigned MaxSGPR = 0; + void colorSU(llvm::SUnit *SU, unsigned Color); unsigned getLineage(llvm::SUnit *SU) const; bool isConflict(const llvm::SUnit *SU0, unsigned Lineage) const; bool isHead(llvm::SUnit *SU) const; @@ -161,8 +160,8 @@ class HRB { llvm::DenseMap> &getReachMap() { return ReachMap; } - bool canReach(llvm::SUnit *a, 
llvm::SUnit *b); - void updateReachForEdge(llvm::SUnit *a, llvm::SUnit *b, + bool canReach(llvm::SUnit *a, llvm::SUnit *B); + void updateReachForEdge(llvm::SUnit *A, llvm::SUnit *B, std::vector &SUnits); void fusionLineages(std::vector &SUnits); ColorResult &coloring(); @@ -172,10 +171,10 @@ class HRB { private: Lineage buildChain(llvm::SUnit *Node, std::vector &SUnits); llvm::SUnit *findHeir(llvm::SUnit *SU, std::vector &SUnits); - bool isConflict(const Lineage &a, const Lineage &b); - bool canFuse(const Lineage &a, const Lineage &b); - bool tryFuse(Lineage &a, Lineage &b, std::vector &SUnits); - unsigned colorLineages(std::vector &lineages, + bool isConflict(const Lineage &A, const Lineage &B); + bool canFuse(const Lineage &A, const Lineage &B); + bool tryFuse(Lineage &A, Lineage &B, std::vector &SUnits); + unsigned colorLineages(std::vector &Lineages, llvm::DenseMap &AllocMap, const unsigned Limit); From 0600e2fd23c5a6b7992c6cfcc1944e255fbae7d9 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Mon, 17 Mar 2025 16:57:56 -0700 Subject: [PATCH 16/25] Possibly the last batch of cleanup --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 1 - llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp | 928 +++++++++--------- llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h | 14 +- llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp | 24 +- llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h | 99 -- 5 files changed, 440 insertions(+), 626 deletions(-) delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index 853a212ac5bf3..a6ce3426a7b93 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -17,7 +17,6 @@ #include "AMDGPUOccupancyAndLatencyHelper.h" #include "AMDGPUSubExpDag.h" #include "AMDGPUSubtarget.h" -#include "AMDGPUVMemDegreeDAG.h" #include "GCNRegPressure.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp index d207b3aa3d4f3..990718cd7525f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp @@ -60,10 +60,10 @@ class CFGWithPhi { }; void CFGWithPhi::dump() { -#ifdef DBG +#ifndef NDEBUG for (MachineBasicBlock &BB : F) { dbgs() << BB.getName() << "\n"; - auto &PhiInsts = blockToPhiInstsMap[&BB]; + auto &PhiInsts = BlockToPhiInstsMap[&BB]; for (MachineInstr *I : PhiInsts) { if (!I->isPHI()) continue; @@ -644,31 +644,31 @@ bool getNonDebugMBBEnd(MachineBasicBlock::reverse_iterator &BBEnd, } } // namespace llvm -// Helper functions to write jason. +// Helper functions to Write jason. 
namespace { -void json_name(StringRef Val, raw_ostream &os) { os << "\"" << Val << "\":"; } +void json_name(StringRef Val, raw_ostream &OS) { OS << "\"" << Val << "\":"; } template -void json_pair(StringRef Val, write_fn &fn, raw_ostream &os) { - json_name(Val, os); - os << "\""; - fn(); - os << "\""; +void json_pair(StringRef Val, write_fn &Fn, raw_ostream &OS) { + json_name(Val, OS); + OS << "\""; + Fn(); + OS << "\""; } template -void json_obj_pair(StringRef Val, write_fn &fn, raw_ostream &os) { - json_name(Val, os); +void json_obj_pair(StringRef Val, write_fn &Fn, raw_ostream &OS) { + json_name(Val, OS); - fn(); + Fn(); } template -void json_array(StringRef Val, write_fn &fn, raw_ostream &os) { - json_name(Val, os); - os << "["; - fn(); - os << "]"; +void json_array(StringRef Val, write_fn &Fn, raw_ostream &OS) { + json_name(Val, OS); + OS << "["; + Fn(); + OS << "]"; } } // namespace @@ -676,71 +676,71 @@ namespace llvm { namespace pressure { void write_inst(MachineInstr &MI, const SlotIndexes *SlotIndexes, - const SIInstrInfo *SIII, raw_ostream &os) { - os << "{"; + const SIInstrInfo *SIII, raw_ostream &OS) { + OS << "{"; SlotIndex Slot = SlotIndexes->getInstructionIndex(MI); - auto writeSlot = [&Slot, &os]() { Slot.print(os); }; + auto WriteSlot = [&Slot, &OS]() { Slot.print(OS); }; - json_pair("slot_index", writeSlot, os); + json_pair("slot_index", WriteSlot, OS); - os << ","; + OS << ","; - auto writeOpcode = [&MI, &SIII, &os]() { - os << SIII->getName(MI.getOpcode()); + auto WriteOpcode = [&MI, &SIII, &OS]() { + OS << SIII->getName(MI.getOpcode()); }; - json_pair("opcode", writeOpcode, os); + json_pair("opcode", WriteOpcode, OS); - os << ","; + OS << ","; - auto writeAsm = [&MI, &SIII, &os]() { - MI.print(os, /*IsStandalone*/ true, /*SkipOpers*/ false, + auto WriteAsm = [&MI, &SIII, &OS]() { + MI.print(OS, /*IsStandalone*/ true, /*SkipOpers*/ false, /*SkipDebugLoc*/ true, /*AddNewLine*/ false, SIII); }; - json_pair("asm", writeAsm, os); + json_pair("asm", WriteAsm, OS); - os << "}"; + OS << "}"; } void print_reg(Register Reg, const MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI, raw_ostream &os) { + const SIRegisterInfo *SIRI, raw_ostream &OS) { if (Reg.isVirtual()) { StringRef Name = MRI.getVRegName(Reg); if (Name != "") { - os << '%' << Name; + OS << '%' << Name; } else { - os << '%' << Register::virtReg2Index(Reg); + OS << '%' << Register::virtReg2Index(Reg); } } else if (Reg < SIRI->getNumRegs()) { - os << '$'; - printLowerCase(SIRI->getName(Reg), os); + OS << '$'; + printLowerCase(SIRI->getName(Reg), OS); } else { llvm_unreachable("invalid reg"); } } void write_reg(unsigned Reg, unsigned SubReg, const MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI, raw_ostream &os) { - os << "{"; + const SIRegisterInfo *SIRI, raw_ostream &OS) { + OS << "{"; - auto writeReg = [&MRI, &SIRI, &Reg, &os]() { print_reg(Reg, MRI, SIRI, os); }; - json_pair("reg", writeReg, os); + auto WriteReg = [&MRI, &SIRI, &Reg, &OS]() { print_reg(Reg, MRI, SIRI, OS); }; + json_pair("reg", WriteReg, OS); - os << ","; + OS << ","; - auto writeSubReg = [&SubReg, &os]() { os << SubReg; }; + auto WriteSubReg = [&SubReg, &OS]() { OS << SubReg; }; - json_pair("sub_reg", writeSubReg, os); + json_pair("sub_reg", WriteSubReg, OS); - os << ","; - auto writeIsSgpr = [&Reg, &MRI, &SIRI, &os]() { + OS << ","; + auto WriteIsSgpr = [&Reg, &MRI, &SIRI, &OS]() { if (SIRI->isSGPRReg(MRI, Reg)) - os << "true"; + OS << "true"; else - os << "false"; + OS << "false"; }; - json_obj_pair("is_sgpr", writeIsSgpr, os); - os 
<< "}"; + json_obj_pair("is_sgpr", WriteIsSgpr, OS); + OS << "}"; } unsigned get_reg_size(unsigned Reg, const MachineRegisterInfo &MRI, @@ -749,7 +749,7 @@ unsigned get_reg_size(unsigned Reg, const MachineRegisterInfo &MRI, } void write_live(unsigned Reg, LaneBitmask Mask, const MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI, raw_ostream &os) { + const SIRegisterInfo *SIRI, raw_ostream &OS) { if (Mask.none()) { unsigned size = get_reg_size(Reg, MRI, SIRI); Mask = LaneBitmask((1 << size) - 1); @@ -757,199 +757,199 @@ void write_live(unsigned Reg, LaneBitmask Mask, const MachineRegisterInfo &MRI, unsigned mask = Mask.getAsInteger(); for (unsigned i = 0; i <= Mask.getHighestLane(); i++) { if (mask & (1 << i)) { - write_reg(Reg, i, MRI, SIRI, os); - os << ",\n"; + write_reg(Reg, i, MRI, SIRI, OS); + OS << ",\n"; } } } void write_dag_input_node(unsigned ID, unsigned reg, unsigned mask, const MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI, raw_ostream &os) { - os << "{"; - auto writeID = [&ID, &os]() { os << ID; }; + const SIRegisterInfo *SIRI, raw_ostream &OS) { + OS << "{"; + auto WriteID = [&ID, &OS]() { OS << ID; }; - json_pair("ID", writeID, os); + json_pair("ID", WriteID, OS); - os << ","; + OS << ","; - auto writeReg = [®, &MRI, &SIRI, &os]() { print_reg(reg, MRI, SIRI, os); }; + auto WriteReg = [®, &MRI, &SIRI, &OS]() { print_reg(reg, MRI, SIRI, OS); }; - json_pair("reg", writeReg, os); + json_pair("reg", WriteReg, OS); - os << ","; + OS << ","; - auto writeMask = [&mask, &os]() { os << mask; }; + auto WriteMask = [&mask, &OS]() { OS << mask; }; - json_pair("mask", writeMask, os); + json_pair("mask", WriteMask, OS); - os << "},\n"; + OS << "},\n"; } void write_dag_inst_node(unsigned ID, SlotIndex Slot, GCNRPTracker::LiveRegSet LiveReg, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, SUnit *SU, - raw_ostream &os) { - os << "{"; - auto writeID = [&ID, &os]() { os << ID; }; + raw_ostream &OS) { + OS << "{"; + auto WriteID = [&ID, &OS]() { OS << ID; }; - json_pair("ID", writeID, os); + json_pair("ID", WriteID, OS); - os << ","; + OS << ","; - auto writeSlot = [&Slot, &os]() { Slot.print(os); }; + auto WriteSlot = [&Slot, &OS]() { Slot.print(OS); }; - json_pair("slot_index", writeSlot, os); + json_pair("slot_index", WriteSlot, OS); - os << ","; + OS << ","; - auto writeRegs = [&LiveReg, &MRI, &SIRI, &os]() { - for (auto it : LiveReg) { - unsigned Reg = it.first; - LaneBitmask Mask = it.second; - write_live(Reg, Mask, MRI, SIRI, os); + auto WriteRegs = [&LiveReg, &MRI, &SIRI, &OS]() { + for (auto It : LiveReg) { + unsigned Reg = It.first; + LaneBitmask Mask = It.second; + write_live(Reg, Mask, MRI, SIRI, OS); } }; - json_array("regs", writeRegs, os); + json_array("regs", WriteRegs, OS); - os << ","; + OS << ","; - auto writePreds = [&SU, &os]() { + auto WritePreds = [&SU, &OS]() { for (auto &Pred : SU->Preds) { - os << Pred.getSUnit()->NodeNum << ","; + OS << Pred.getSUnit()->NodeNum << ","; } }; - json_array("preds", writePreds, os); + json_array("preds", WritePreds, OS); - os << "},\n"; + OS << "},\n"; } void write_block(MachineBasicBlock &Blk, LiveIntervals *LIS, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - const SIInstrInfo *SIII, raw_ostream &os) { - os << "{\n"; - auto writeName = [&Blk, &os]() { os << Blk.getName(); }; - json_pair("name", writeName, os); + const SIInstrInfo *SIII, raw_ostream &OS) { + OS << "{\n"; + auto WriteName = [&Blk, &OS]() { OS << Blk.getName(); }; + json_pair("name", WriteName, OS); - os << ","; + OS << ","; - auto 
writeIndex = [&Blk, &os]() { os << Blk.getNumber(); }; - json_pair("id", writeIndex, os); + auto WriteIndex = [&Blk, &OS]() { OS << Blk.getNumber(); }; + json_pair("id", WriteIndex, OS); - os << ","; + OS << ","; const SlotIndexes *SlotIndexes = LIS->getSlotIndexes(); SlotIndex BeginSlot = SlotIndexes->getMBBStartIdx(&Blk); - auto writeSlot = [&BeginSlot, &os]() { BeginSlot.print(os); }; - json_pair("begin_slot", writeSlot, os); + auto WriteSlot = [&BeginSlot, &OS]() { BeginSlot.print(OS); }; + json_pair("begin_slot", WriteSlot, OS); - os << ","; + OS << ","; SlotIndex EndSlot = SlotIndexes->getMBBEndIdx(&Blk); - auto writeEndSlot = [&EndSlot, &os]() { EndSlot.print(os); }; - json_pair("end_slot", writeEndSlot, os); + auto WriteEndSlot = [&EndSlot, &OS]() { EndSlot.print(OS); }; + json_pair("end_slot", WriteEndSlot, OS); - os << ","; + OS << ","; - auto writeInsts = [&Blk, &SlotIndexes, &SIII, &os]() { + auto WriteInsts = [&Blk, &SlotIndexes, &SIII, &OS]() { for (MachineInstr &MI : Blk) { if (MI.isDebugInstr()) continue; - write_inst(MI, SlotIndexes, SIII, os); - os << ",\n"; + write_inst(MI, SlotIndexes, SIII, OS); + OS << ",\n"; } }; - json_array("instructions", writeInsts, os); + json_array("instructions", WriteInsts, OS); - os << ","; + OS << ","; - BlockExpDag dag(&Blk, LIS, MRI, SIRI, SIII); - dag.buildWithPressure(); + BlockExpDag Dag(&Blk, LIS, MRI, SIRI, SIII); + Dag.buildWithPressure(); - const auto StartLiveReg = llvm::getLiveRegs(BeginSlot, *dag.LIS, dag.MRI); - auto writeInputs = [&StartLiveReg, &dag, &os]() { - for (auto it : StartLiveReg) { - unsigned Reg = it.first; - LaneBitmask mask = it.second; - SUnit *SU = dag.InputSUnitMap[Reg]; + const auto StartLiveReg = llvm::getLiveRegs(BeginSlot, *Dag.LIS, Dag.MRI); + auto WriteInputs = [&StartLiveReg, &Dag, &OS]() { + for (auto It : StartLiveReg) { + unsigned Reg = It.first; + LaneBitmask Mask = It.second; + SUnit *SU = Dag.InputSUnitMap[Reg]; // Write Reg and mask to the nodes. - write_dag_input_node(SU->NodeNum, Reg, mask.getAsInteger(), dag.MRI, - dag.SIRI, os); + write_dag_input_node(SU->NodeNum, Reg, Mask.getAsInteger(), Dag.MRI, + Dag.SIRI, OS); } }; - json_array("input_nodes", writeInputs, os); + json_array("input_nodes", WriteInputs, OS); - os << ","; + OS << ","; - auto writeNodes = [&SlotIndexes, &dag, &os]() { - for (auto it : dag.MISUnitMap) { - MachineInstr *MI = it.first; - SUnit *SU = it.second; + auto WriteNodes = [&SlotIndexes, &Dag, &OS]() { + for (auto It : Dag.MISUnitMap) { + MachineInstr *MI = It.first; + SUnit *SU = It.second; // Use SlotIndex of MI. SlotIndex SlotIndex; if (!MI->isDebugInstr()) SlotIndex = SlotIndexes->getInstructionIndex(*MI); - GCNRPTracker::LiveRegSet LiveReg = dag.DagPressureMap[SU]; + GCNRPTracker::LiveRegSet LiveReg = Dag.DagPressureMap[SU]; // Write slot, live to the nodes. 
- write_dag_inst_node(SU->NodeNum, SlotIndex, LiveReg, dag.MRI, dag.SIRI, - SU, os); + write_dag_inst_node(SU->NodeNum, SlotIndex, LiveReg, Dag.MRI, Dag.SIRI, + SU, OS); } }; - json_array("inst_nodes", writeNodes, os); + json_array("inst_nodes", WriteNodes, OS); - os << ","; + OS << ","; - auto writePreds = [&Blk, &os]() { + auto WritePreds = [&Blk, &OS]() { for (MachineBasicBlock *Pred : Blk.predecessors()) { - os << Pred->getNumber() << ","; + OS << Pred->getNumber() << ","; } }; - json_array("preds", writePreds, os); + json_array("preds", WritePreds, OS); - os << ","; + OS << ","; - auto writeSuccs = [&Blk, &os]() { + auto WriteSuccs = [&Blk, &OS]() { for (MachineBasicBlock *Succ : Blk.successors()) { - os << Succ->getNumber() << ","; + OS << Succ->getNumber() << ","; } }; - json_array("succs", writeSuccs, os); + json_array("succs", WriteSuccs, OS); - os << "}"; + OS << "}"; } void write_define(SlotIndex &Slot, unsigned Reg, unsigned SubReg, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - raw_ostream &os) { - os << "{"; - auto writeSlot = [&Slot, &os]() { Slot.print(os); }; + raw_ostream &OS) { + OS << "{"; + auto WriteSlot = [&Slot, &OS]() { Slot.print(OS); }; - json_pair("slot_index", writeSlot, os); + json_pair("slot_index", WriteSlot, OS); - os << ","; + OS << ","; - auto writeReg = [&MRI, &SIRI, &Reg, &SubReg, &os]() { - write_reg(Reg, SubReg, MRI, SIRI, os); + auto WriteReg = [&MRI, &SIRI, &Reg, &SubReg, &OS]() { + write_reg(Reg, SubReg, MRI, SIRI, OS); }; - json_obj_pair("reg", writeReg, os); + json_obj_pair("reg", WriteReg, OS); - os << "}\n"; + OS << "}\n"; - os << ","; + OS << ","; } void write_define(MachineOperand &MO, const SlotIndexes *SlotIndexes, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - raw_ostream &os) { + raw_ostream &OS) { // Split subReg? 
MO.getSubReg(); Register Reg = MO.getReg(); unsigned SubReg = MO.getSubReg(); @@ -958,104 +958,104 @@ void write_define(MachineOperand &MO, const SlotIndexes *SlotIndexes, if (SubReg == 0) { unsigned Size = get_reg_size(Reg, MRI, SIRI); for (unsigned i = 0; i < Size; i++) { - write_define(Slot, Reg, i, MRI, SIRI, os); + write_define(Slot, Reg, i, MRI, SIRI, OS); } } else { switch (SubReg) { default: assert(0 && "SubReg not supported yet."); - write_define(Slot, Reg, SubReg, MRI, SIRI, os); + write_define(Slot, Reg, SubReg, MRI, SIRI, OS); break; case AMDGPU::sub0: - write_define(Slot, Reg, 0, MRI, SIRI, os); + write_define(Slot, Reg, 0, MRI, SIRI, OS); break; case AMDGPU::sub1: - write_define(Slot, Reg, 1, MRI, SIRI, os); + write_define(Slot, Reg, 1, MRI, SIRI, OS); break; case AMDGPU::sub2: - write_define(Slot, Reg, 2, MRI, SIRI, os); + write_define(Slot, Reg, 2, MRI, SIRI, OS); break; case AMDGPU::sub3: - write_define(Slot, Reg, 3, MRI, SIRI, os); + write_define(Slot, Reg, 3, MRI, SIRI, OS); break; case AMDGPU::sub4: - write_define(Slot, Reg, 4, MRI, SIRI, os); + write_define(Slot, Reg, 4, MRI, SIRI, OS); break; case AMDGPU::sub5: - write_define(Slot, Reg, 5, MRI, SIRI, os); + write_define(Slot, Reg, 5, MRI, SIRI, OS); break; case AMDGPU::sub6: - write_define(Slot, Reg, 6, MRI, SIRI, os); + write_define(Slot, Reg, 6, MRI, SIRI, OS); break; case AMDGPU::sub7: - write_define(Slot, Reg, 7, MRI, SIRI, os); + write_define(Slot, Reg, 7, MRI, SIRI, OS); break; case AMDGPU::sub8: - write_define(Slot, Reg, 8, MRI, SIRI, os); + write_define(Slot, Reg, 8, MRI, SIRI, OS); break; case AMDGPU::sub9: - write_define(Slot, Reg, 9, MRI, SIRI, os); + write_define(Slot, Reg, 9, MRI, SIRI, OS); break; case AMDGPU::sub10: - write_define(Slot, Reg, 10, MRI, SIRI, os); + write_define(Slot, Reg, 10, MRI, SIRI, OS); break; case AMDGPU::sub11: - write_define(Slot, Reg, 11, MRI, SIRI, os); + write_define(Slot, Reg, 11, MRI, SIRI, OS); break; case AMDGPU::sub12: - write_define(Slot, Reg, 12, MRI, SIRI, os); + write_define(Slot, Reg, 12, MRI, SIRI, OS); break; case AMDGPU::sub13: - write_define(Slot, Reg, 13, MRI, SIRI, os); + write_define(Slot, Reg, 13, MRI, SIRI, OS); break; case AMDGPU::sub14: - write_define(Slot, Reg, 14, MRI, SIRI, os); + write_define(Slot, Reg, 14, MRI, SIRI, OS); break; case AMDGPU::sub15: - write_define(Slot, Reg, 15, MRI, SIRI, os); + write_define(Slot, Reg, 15, MRI, SIRI, OS); break; case AMDGPU::sub0_sub1: - write_define(Slot, Reg, 0, MRI, SIRI, os); - write_define(Slot, Reg, 1, MRI, SIRI, os); + write_define(Slot, Reg, 0, MRI, SIRI, OS); + write_define(Slot, Reg, 1, MRI, SIRI, OS); break; case AMDGPU::sub2_sub3: - write_define(Slot, Reg, 2, MRI, SIRI, os); - write_define(Slot, Reg, 3, MRI, SIRI, os); + write_define(Slot, Reg, 2, MRI, SIRI, OS); + write_define(Slot, Reg, 3, MRI, SIRI, OS); break; case AMDGPU::sub4_sub5: - write_define(Slot, Reg, 4, MRI, SIRI, os); - write_define(Slot, Reg, 5, MRI, SIRI, os); + write_define(Slot, Reg, 4, MRI, SIRI, OS); + write_define(Slot, Reg, 5, MRI, SIRI, OS); break; case AMDGPU::sub1_sub2: - write_define(Slot, Reg, 1, MRI, SIRI, os); - write_define(Slot, Reg, 2, MRI, SIRI, os); + write_define(Slot, Reg, 1, MRI, SIRI, OS); + write_define(Slot, Reg, 2, MRI, SIRI, OS); break; case AMDGPU::sub0_sub1_sub2: - write_define(Slot, Reg, 0, MRI, SIRI, os); - write_define(Slot, Reg, 1, MRI, SIRI, os); - write_define(Slot, Reg, 2, MRI, SIRI, os); + write_define(Slot, Reg, 0, MRI, SIRI, OS); + write_define(Slot, Reg, 1, MRI, SIRI, OS); + write_define(Slot, Reg, 2, 
MRI, SIRI, OS); break; case AMDGPU::sub0_sub1_sub2_sub3: - write_define(Slot, Reg, 0, MRI, SIRI, os); - write_define(Slot, Reg, 1, MRI, SIRI, os); - write_define(Slot, Reg, 2, MRI, SIRI, os); - write_define(Slot, Reg, 3, MRI, SIRI, os); + write_define(Slot, Reg, 0, MRI, SIRI, OS); + write_define(Slot, Reg, 1, MRI, SIRI, OS); + write_define(Slot, Reg, 2, MRI, SIRI, OS); + write_define(Slot, Reg, 3, MRI, SIRI, OS); break; case AMDGPU::sub2_sub3_sub4_sub5: - write_define(Slot, Reg, 2, MRI, SIRI, os); - write_define(Slot, Reg, 3, MRI, SIRI, os); - write_define(Slot, Reg, 4, MRI, SIRI, os); - write_define(Slot, Reg, 5, MRI, SIRI, os); + write_define(Slot, Reg, 2, MRI, SIRI, OS); + write_define(Slot, Reg, 3, MRI, SIRI, OS); + write_define(Slot, Reg, 4, MRI, SIRI, OS); + write_define(Slot, Reg, 5, MRI, SIRI, OS); break; case AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7: - write_define(Slot, Reg, 0, MRI, SIRI, os); - write_define(Slot, Reg, 1, MRI, SIRI, os); - write_define(Slot, Reg, 2, MRI, SIRI, os); - write_define(Slot, Reg, 3, MRI, SIRI, os); - write_define(Slot, Reg, 4, MRI, SIRI, os); - write_define(Slot, Reg, 5, MRI, SIRI, os); - write_define(Slot, Reg, 6, MRI, SIRI, os); - write_define(Slot, Reg, 7, MRI, SIRI, os); + write_define(Slot, Reg, 0, MRI, SIRI, OS); + write_define(Slot, Reg, 1, MRI, SIRI, OS); + write_define(Slot, Reg, 2, MRI, SIRI, OS); + write_define(Slot, Reg, 3, MRI, SIRI, OS); + write_define(Slot, Reg, 4, MRI, SIRI, OS); + write_define(Slot, Reg, 5, MRI, SIRI, OS); + write_define(Slot, Reg, 6, MRI, SIRI, OS); + write_define(Slot, Reg, 7, MRI, SIRI, OS); break; } } @@ -1063,13 +1063,13 @@ void write_define(MachineOperand &MO, const SlotIndexes *SlotIndexes, void write_defines(MachineFunction &MF, const SlotIndexes *SlotIndexes, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - raw_ostream &os) { + raw_ostream &OS) { for (unsigned i = 0; i < MRI.getNumVirtRegs(); i++) { auto Reg = Register::index2VirtReg(i); for (MachineOperand &MO : MRI.def_operands(Reg)) { - write_define(MO, SlotIndexes, MRI, SIRI, os); + write_define(MO, SlotIndexes, MRI, SIRI, OS); } } } @@ -1077,288 +1077,288 @@ void write_defines(MachineFunction &MF, const SlotIndexes *SlotIndexes, void write_uses(MachineFunction &MF, const SlotIndexes *SlotIndexes, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - raw_ostream &os) { + raw_ostream &OS) { for (unsigned i = 0; i < MRI.getNumVirtRegs(); i++) { auto Reg = Register::index2VirtReg(i); for (MachineOperand &MO : MRI.use_nodbg_operands(Reg)) { // TODO: create write_use if use has more info. 
- write_define(MO, SlotIndexes, MRI, SIRI, os); + write_define(MO, SlotIndexes, MRI, SIRI, OS); } } } void write_liveness(SlotIndex Slot, GCNRPTracker::LiveRegSet &LiveSet, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - raw_ostream &os) { - os << "{"; - auto writeSlot = [&Slot, &os]() { Slot.print(os); }; + raw_ostream &OS) { + OS << "{"; + auto WriteSlot = [&Slot, &OS]() { Slot.print(OS); }; - json_pair("slot_index", writeSlot, os); + json_pair("slot_index", WriteSlot, OS); - os << ","; + OS << ","; - auto writeRegs = [&LiveSet, &MRI, &SIRI, &os]() { + auto WriteRegs = [&LiveSet, &MRI, &SIRI, &OS]() { for (auto it : LiveSet) { unsigned Reg = it.first; LaneBitmask Mask = it.second; - write_live(Reg, Mask, MRI, SIRI, os); + write_live(Reg, Mask, MRI, SIRI, OS); } }; - json_array("regs", writeRegs, os); - os << "\n},\n"; + json_array("regs", WriteRegs, OS); + OS << "\n},\n"; } -void write_segment(const LiveInterval::Segment &S, raw_ostream &os) { - os << "{"; - auto writeBegin = [&S, &os]() { S.start.print(os); }; +void write_segment(const LiveInterval::Segment &S, raw_ostream &OS) { + OS << "{"; + auto WriteBegin = [&S, &OS]() { S.start.print(OS); }; - json_pair("begin", writeBegin, os); + json_pair("begin", WriteBegin, OS); - os << ","; + OS << ","; - auto writeEnd = [&S, &os]() { S.end.print(os); }; + auto WriteEnd = [&S, &OS]() { S.end.print(OS); }; - json_pair("end", writeEnd, os); + json_pair("end", WriteEnd, OS); - os << ","; + OS << ","; - auto writeValNum = [&S, &os]() { + auto WriteValNum = [&S, &OS]() { if (S.valno) - os << S.valno->id; + OS << S.valno->id; else - os << 0xFFFFFFFF; + OS << 0xFFFFFFFF; }; - json_pair("val_num", writeValNum, os); + json_pair("val_num", WriteValNum, OS); - os << "},\n"; + OS << "},\n"; } -void write_subrange(const LiveInterval::SubRange &SR, raw_ostream &os) { - os << "{\n"; - auto writeMask = [&SR, &os]() { os << SR.LaneMask.getAsInteger(); }; +void write_subrange(const LiveInterval::SubRange &SR, raw_ostream &OS) { + OS << "{\n"; + auto WriteMask = [&SR, &OS]() { OS << SR.LaneMask.getAsInteger(); }; - json_pair("mask", writeMask, os); + json_pair("mask", WriteMask, OS); - os << ","; + OS << ","; // Segments. 
- auto writeSegments = [&SR, &os]() { + auto WriteSegments = [&SR, &OS]() { for (auto &S : SR.segments) { - write_segment(S, os); + write_segment(S, OS); } }; - json_array("segments", writeSegments, os); + json_array("segments", WriteSegments, OS); - os << "\n},\n"; + OS << "\n},\n"; } void write_live_interval(LiveInterval &LI, const MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI, raw_ostream &os) { - os << "{\n"; + const SIRegisterInfo *SIRI, raw_ostream &OS) { + OS << "{\n"; - auto writeReg = [&LI, &MRI, &SIRI, &os]() { - write_reg(LI.reg(), 0, MRI, SIRI, os); + auto WriteReg = [&LI, &MRI, &SIRI, &OS]() { + write_reg(LI.reg(), 0, MRI, SIRI, OS); }; - json_obj_pair("reg", writeReg, os); + json_obj_pair("reg", WriteReg, OS); - os << ","; + OS << ","; - auto writeSegments = [&LI, &os]() { + auto WriteSegments = [&LI, &OS]() { for (auto &S : LI.segments) { - write_segment(S, os); + write_segment(S, OS); } }; - json_array("segments", writeSegments, os); + json_array("segments", WriteSegments, OS); - os << ","; + OS << ","; - auto writeSubRanges = [&LI, &os]() { + auto WriteSubRanges = [&LI, &OS]() { for (auto &SR : LI.subranges()) { - write_subrange(SR, os); + write_subrange(SR, OS); } }; - json_array("subranges", writeSubRanges, os); + json_array("subranges", WriteSubRanges, OS); - os << "},\n"; + OS << "},\n"; } std::string get_legal_str(const MDString *MDStr) { - std::string str; - raw_string_ostream Stream(str); + std::string Str; + raw_string_ostream Stream(Str); MDStr->print(Stream); Stream.flush(); // Remove !. - str = str.substr(1); + Str = Str.substr(1); // Remove "" - str = str.substr(1); - str.pop_back(); - std::replace(str.begin(), str.end(), '\\', '#'); - return str; + Str = Str.substr(1); + Str.pop_back(); + std::replace(Str.begin(), Str.end(), '\\', '#'); + return Str; } -void write_file(const MDNode *FileNode, raw_ostream &os) { +void write_file(const MDNode *FileNode, raw_ostream &OS) { const MDString *FileName = cast(FileNode->getOperand(0).get()); - StringRef fileNameStr = FileName->getString(); - if (fileNameStr.find("__AMDGPU_GPUMAP_") == 0) + StringRef FileNameStr = FileName->getString(); + if (FileNameStr.find("__AMDGPU_GPUMAP_") == 0) return; - if (fileNameStr.find("__AMDGPU_DWARF_") == 0) + if (FileNameStr.find("__AMDGPU_DWARF_") == 0) return; - os << "{"; + OS << "{"; - std::string str0 = get_legal_str(FileName); - auto writeName = [&str0, &os]() { os << str0; }; - json_pair("filename", writeName, os); + std::string Str0 = get_legal_str(FileName); + auto WriteName = [&Str0, &OS]() { OS << Str0; }; + json_pair("filename", WriteName, OS); - os << ",\n"; + OS << ",\n"; const MDString *Content = cast(FileNode->getOperand(1).get()); std::string str = get_legal_str(Content); - auto writeContent = [&str, &os]() { os << str; }; - json_pair("content", writeContent, os); - os << "\n},\n"; + auto WriteContent = [&str, &OS]() { OS << str; }; + json_pair("content", WriteContent, OS); + OS << "\n},\n"; } -void write_DIFile(const DIFile *File, raw_ostream &os) { +void write_DIFile(const DIFile *File, raw_ostream &OS) { if (File) { - std::string name = get_legal_str(File->getRawFilename()); - std::string dir = ""; + std::string Name = get_legal_str(File->getRawFilename()); + std::string Dir = ""; if (MDString *MDDir = File->getRawDirectory()) - dir = get_legal_str(MDDir); - os << dir << name; + Dir = get_legal_str(MDDir); + OS << Dir << Name; } else { - os << "ArtificialFile"; + OS << "ArtificialFile"; } } -void write_line_mapping(SlotIndex Slot, DebugLoc DL, raw_ostream 
&os) { - os << "{"; +void write_line_mapping(SlotIndex Slot, DebugLoc DL, raw_ostream &OS) { + OS << "{"; - auto writeSlot = [&Slot, &os]() { Slot.print(os); }; + auto WriteSlot = [&Slot, &OS]() { Slot.print(OS); }; - json_pair("slot_index", writeSlot, os); + json_pair("slot_index", WriteSlot, OS); - os << ",\n"; + OS << ",\n"; MDNode *Scope = DL.getScope(); - unsigned line = DL.getLine(); - unsigned col = DL.getCol(); + unsigned Line = DL.getLine(); + unsigned Col = DL.getCol(); - auto writeLine = [&line, &os]() { os << line; }; - json_pair("line", writeLine, os); + auto WriteLine = [&Line, &OS]() { OS << Line; }; + json_pair("line", WriteLine, OS); - os << ",\n"; + OS << ",\n"; - auto writeCol = [&col, &os]() { os << col; }; - json_pair("col", writeCol, os); + auto WriteCol = [&Col, &OS]() { OS << Col; }; + json_pair("col", WriteCol, OS); - os << ",\n"; + OS << ",\n"; - auto writeFile = [&Scope, &os]() { + auto WriteFile = [&Scope, &OS]() { const DIFile *File = cast(Scope)->getFile(); - write_DIFile(File, os); + write_DIFile(File, OS); }; - json_pair("file", writeFile, os); + json_pair("file", WriteFile, OS); - if (DILocation *inlineDL = DL.getInlinedAt()) { - os << ",\n"; - unsigned inlineLine = inlineDL->getLine(); - auto writeLine = [&inlineLine, &os]() { os << inlineLine; }; - json_pair("inline_line", writeLine, os); + if (DILocation *InlineDL = DL.getInlinedAt()) { + OS << ",\n"; + unsigned InlineLine = InlineDL->getLine(); + auto WriteLine = [&InlineLine, &OS]() { OS << InlineLine; }; + json_pair("inline_line", WriteLine, OS); - os << ",\n"; + OS << ",\n"; - unsigned inlineCol = inlineDL->getColumn(); - auto writeCol = [&inlineCol, &os]() { os << inlineCol; }; - json_pair("inline_col", writeCol, os); + unsigned InlineCol = InlineDL->getColumn(); + auto WriteCol = [&InlineCol, &OS]() { OS << InlineCol; }; + json_pair("inline_col", WriteCol, OS); - os << ",\n"; + OS << ",\n"; const MDNode *InlineScope = DL.getInlinedAtScope(); - auto writeFile = [&InlineScope, &os]() { + auto WriteFile = [&InlineScope, &OS]() { const DIFile *File = cast(InlineScope)->getFile(); - write_DIFile(File, os); + write_DIFile(File, OS); }; - json_pair("inline_file", writeFile, os); + json_pair("inline_file", WriteFile, OS); } - os << "\n},\n"; + OS << "\n},\n"; } void write_dbg_val(unsigned Reg, const DIVariable *V, const DIExpression *Exp, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - raw_ostream &os) { - os << "{"; + raw_ostream &OS) { + OS << "{"; - auto writeReg = [&MRI, &SIRI, &Reg, &os]() { + auto WriteReg = [&MRI, &SIRI, &Reg, &OS]() { const unsigned SubReg = 0; - write_reg(Reg, SubReg, MRI, SIRI, os); + write_reg(Reg, SubReg, MRI, SIRI, OS); }; - json_obj_pair("reg", writeReg, os); + json_obj_pair("reg", WriteReg, OS); - os << ",\n"; + OS << ",\n"; if (V) { - auto writeName = [&V, &os]() { os << V->getName(); }; - json_pair("debug_val_name", writeName, os); - os << ",\n"; + auto WriteName = [&V, &OS]() { OS << V->getName(); }; + json_pair("debug_val_name", WriteName, OS); + OS << ",\n"; - auto writeFile = [&V, &os]() { + auto WriteFile = [&V, &OS]() { const DIFile *File = V->getFile(); - write_DIFile(File, os); + write_DIFile(File, OS); }; - json_pair("debug_val_file", writeFile, os); - os << ",\n"; + json_pair("debug_val_file", WriteFile, OS); + OS << ",\n"; - auto writeLine = [&V, &os]() { os << V->getLine(); }; - json_pair("debug_val_line", writeLine, os); + auto WriteLine = [&V, &OS]() { OS << V->getLine(); }; + json_pair("debug_val_line", WriteLine, OS); } if (Exp->isValid() && 
Exp->getNumElements()) { - os << ",\n"; - auto writeV = [&Exp, &os]() { - os << '['; + OS << ",\n"; + auto WriteV = [&Exp, &OS]() { + OS << '['; bool NeedSep = false; for (auto Op : Exp->expr_ops()) { if (NeedSep) - os << ", "; + OS << ", "; else NeedSep = true; - os << dwarf::OperationEncodingString(Op.getOp()); + OS << dwarf::OperationEncodingString(Op.getOp()); for (unsigned I = 0; I < Op.getNumArgs(); ++I) - os << ' ' << Op.getArg(I); + OS << ' ' << Op.getArg(I); } - os << "] "; + OS << "] "; }; - json_pair("debug_exp", writeV, os); + json_pair("debug_exp", WriteV, OS); } - os << "\n},\n"; + OS << "\n},\n"; } void write_dbg_info(MachineFunction &MF, LiveIntervals *LIS, const MachineRegisterInfo &MRI, const SIInstrInfo *SIII, const SIRegisterInfo *SIRI, const SlotIndexes *SlotIndexes, - const NamedMDNode *SourceMD, raw_ostream &os) { - os << ",\n"; + const NamedMDNode *SourceMD, raw_ostream &OS) { + OS << ",\n"; - auto writeFiles = [&SourceMD, &os]() { + auto WriteFiles = [&SourceMD, &OS]() { for (const MDNode *FileNode : SourceMD->operands()) { - write_file(FileNode, os); + write_file(FileNode, OS); } }; - json_array("files", writeFiles, os); + json_array("files", WriteFiles, OS); - os << ",\n"; + OS << ",\n"; - auto writeLineMapping = [&MF, &SlotIndexes, &os]() { + auto WriteLineMapping = [&MF, &SlotIndexes, &OS]() { for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { if (MI.isDebugInstr()) { @@ -1368,16 +1368,16 @@ void write_dbg_info(MachineFunction &MF, LiveIntervals *LIS, if (!DL) continue; SlotIndex Slot = SlotIndexes->getInstructionIndex(MI); - write_line_mapping(Slot, DL, os); + write_line_mapping(Slot, DL, OS); } } }; - json_array("line_mapping", writeLineMapping, os); + json_array("line_mapping", WriteLineMapping, OS); - os << ",\n"; + OS << ",\n"; - auto writeDebugVals = [&MF, &MRI, &SIRI, &os]() { + auto WriteDebugVals = [&MF, &MRI, &SIRI, &OS]() { for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { if (!MI.isDebugValue()) @@ -1392,91 +1392,89 @@ void write_dbg_info(MachineFunction &MF, LiveIntervals *LIS, const DIVariable *V = MI.getDebugVariable(); const DIExpression *Exp = MI.getDebugExpression(); - write_dbg_val(Reg.getReg(), V, Exp, MRI, SIRI, os); + write_dbg_val(Reg.getReg(), V, Exp, MRI, SIRI, OS); } } }; - json_array("debug_vals", writeDebugVals, os); + json_array("debug_vals", WriteDebugVals, OS); } void write_function(MachineFunction &MF, LiveIntervals *LIS, const MachineRegisterInfo &MRI, const SIInstrInfo *SIII, - const SIRegisterInfo *SIRI, raw_ostream &os) { + const SIRegisterInfo *SIRI, raw_ostream &OS) { const SlotIndexes *SlotIndexes = LIS->getSlotIndexes(); - os << "{\n"; - auto writeName = [&MF, &os]() { os << MF.getName(); }; - json_pair("name", writeName, os); + OS << "{\n"; + auto WriteName = [&MF, &OS]() { OS << MF.getName(); }; + json_pair("name", WriteName, OS); - os << ",\n"; + OS << ",\n"; - auto writeBlocks = [&MF, &SlotIndexes, &LIS, &MRI, &SIRI, &SIII, &os]() { + auto WriteBlocks = [&MF, &LIS, &MRI, &SIRI, &SIII, &OS]() { for (MachineBasicBlock &MBB : MF) { - write_block(MBB, LIS, MRI, SIRI, SIII, os); - os << ",\n"; + write_block(MBB, LIS, MRI, SIRI, SIII, OS); + OS << ",\n"; } }; - json_array("blocks", writeBlocks, os); + json_array("blocks", WriteBlocks, OS); - os << ",\n"; + OS << ",\n"; - auto writeDefines = [&MF, &SlotIndexes, &MRI, &SIRI, &os]() { - write_defines(MF, SlotIndexes, MRI, SIRI, os); + auto WriteDefines = [&MF, &SlotIndexes, &MRI, &SIRI, &OS]() { + write_defines(MF, SlotIndexes, MRI, SIRI, OS); 
}; - json_array("defines", writeDefines, os); + json_array("defines", WriteDefines, OS); - os << ",\n"; + OS << ",\n"; - auto writeUses = [&MF, &SlotIndexes, &MRI, &SIRI, &os]() { - write_uses(MF, SlotIndexes, MRI, SIRI, os); + auto WriteUses = [&MF, &SlotIndexes, &MRI, &SIRI, &OS]() { + write_uses(MF, SlotIndexes, MRI, SIRI, OS); }; - json_array("uses", writeUses, os); + json_array("uses", WriteUses, OS); - os << ",\n"; + OS << ",\n"; - auto writeLiveness = [&MF, &LIS, &MRI, &SIRI, &os]() { + auto WriteLiveness = [&MF, &LIS, &MRI, &SIRI, &OS]() { for (MachineBasicBlock &MBB : MF) for (MachineInstr &MI : MBB) { if (MI.isDebugInstr()) continue; const SlotIndex &SI = LIS->getInstructionIndex(MI).getBaseIndex(); GCNRPTracker::LiveRegSet LISLR = llvm::getLiveRegs(SI, *LIS, MRI); - write_liveness(SI, LISLR, MRI, SIRI, os); + write_liveness(SI, LISLR, MRI, SIRI, OS); } }; - json_array("liveness", writeLiveness, os); + json_array("liveness", WriteLiveness, OS); - os << ",\n"; + OS << ",\n"; - auto writeLiveIntervals = [&MRI, &SIRI, &LIS, &os]() { + auto WriteLiveIntervals = [&MRI, &SIRI, &LIS, &OS]() { for (unsigned i = 0; i < MRI.getNumVirtRegs(); i++) { auto Reg = Register::index2VirtReg(i); if (!LIS->hasInterval(Reg)) continue; auto &LI = LIS->getInterval(Reg); - write_live_interval(LI, MRI, SIRI, os); + write_live_interval(LI, MRI, SIRI, OS); } }; - json_array("live_intervals", writeLiveIntervals, os); + json_array("live_intervals", WriteLiveIntervals, OS); -#if 0 // TODO: Do we need this? // Check debug info. const Function &F = MF.getFunction(); const Module *M = F.getParent(); const NamedMDNode *SourceMD = - M->getNamedMetadata(hlsl::DxilMDHelper::kDxilSourceContentsMDName); + M->getNamedMetadata("dx.source.contents"); if (SourceMD) { - write_dbg_info(MF, LIS, MRI, SIII, SIRI, SlotIndexes, SourceMD, os); + write_dbg_info(MF, LIS, MRI, SIII, SIRI, SlotIndexes, SourceMD, OS); } -#endif - os << "\n}"; + OS << "\n}"; } void write_pressure(MachineFunction &MF, LiveIntervals *LIS, @@ -1500,13 +1498,13 @@ void write_pressure(MachineFunction &MF, LiveIntervals *LIS, O.close(); } -void write_pressure(MachineFunction &MF, LiveIntervals *LIS, raw_ostream &os) { +void write_pressure(MachineFunction &MF, LiveIntervals *LIS, raw_ostream &OS) { const GCNSubtarget *ST = &MF.getSubtarget(); const auto *SIII = ST->getInstrInfo(); const auto *SIRI = ST->getRegisterInfo(); auto &MRI = MF.getRegInfo(); - write_function(MF, LIS, MRI, SIII, SIRI, os); - os.flush(); + write_function(MF, LIS, MRI, SIII, SIRI, OS); + OS.flush(); } } // namespace pressure @@ -1524,16 +1522,15 @@ class ContributionList { DenseMap> MIContributorMap; // Set of inst which been contributed by the key MachineInstr. 
DenseMap> MIContributedToMap; - void writeInst(MachineInstr &MI, const SIInstrInfo *SIII, raw_ostream &os); + void writeInst(MachineInstr &MI, const SIInstrInfo *SIII, raw_ostream &OS); void writeBlock(MachineBasicBlock &MBB, const SIInstrInfo *SIII, - raw_ostream &os); - void write(raw_ostream &os); + raw_ostream &OS); + void write(raw_ostream &OS); }; void buildMIContribution(MachineInstr &MI, DenseSet &ContributorSet, - DenseSet &ContributedSet, - const SIRegisterInfo &SIRI, MachineRegisterInfo &MRI) { + DenseSet &ContributedSet, MachineRegisterInfo &MRI) { for (MachineOperand &UseMO : MI.uses()) { if (!UseMO.isReg()) continue; @@ -1565,134 +1562,132 @@ void buildMIContribution(MachineInstr &MI, } bool ContributionList::propagateContribution() { - bool bUpdated = false; + bool IsUpdated = false; ReversePostOrderTraversal RPOT(&MF); for (auto *MBB : RPOT) { for (auto &MI : *MBB) { - auto &contributors = MIContributorMap[&MI]; - unsigned size = contributors.size(); - DenseSet parentContributors; - for (auto *CMI : contributors) { - auto &pContributors = MIContributorMap[CMI]; - parentContributors.insert(pContributors.begin(), pContributors.end()); + auto &Contributors = MIContributorMap[&MI]; + unsigned Size = Contributors.size(); + DenseSet ParentContributors; + for (auto *CMI : Contributors) { + auto &Contributors = MIContributorMap[CMI]; + ParentContributors.insert(Contributors.begin(), Contributors.end()); } - contributors.insert(parentContributors.begin(), parentContributors.end()); - bUpdated |= size < contributors.size(); + Contributors.insert(ParentContributors.begin(), ParentContributors.end()); + IsUpdated |= Size < Contributors.size(); } } - return bUpdated; + return IsUpdated; } void ContributionList::build() { // Build contribution. auto &MRI = MF.getRegInfo(); - const GCNSubtarget *ST = &MF.getSubtarget(); - const auto *SIRI = ST->getRegisterInfo(); for (auto &MBB : MF) { for (auto &MI : MBB) { - auto &contributors = MIContributorMap[&MI]; - auto &contributed = MIContributedToMap[&MI]; - buildMIContribution(MI, contributors, contributed, *SIRI, MRI); + auto &Contributors = MIContributorMap[&MI]; + auto &Contributed = MIContributedToMap[&MI]; + buildMIContribution(MI, Contributors, Contributed, MRI); } } // propagate contribution. 
- bool bUpdated = true; - while (bUpdated) { - bUpdated = propagateContribution(); + bool IsUpdated = true; + while (IsUpdated) { + IsUpdated = propagateContribution(); } } void ContributionList::writeInst(MachineInstr &MI, const SIInstrInfo *SIII, - raw_ostream &os) { - os << "\n{\n"; + raw_ostream &OS) { + OS << "\n{\n"; unsigned ID = MIIndexMap[&MI]; - auto writeSlot = [&ID, &os]() { os << ID; }; + auto WriteSlot = [&ID, &OS]() { OS << ID; }; - json_pair("ID", writeSlot, os); + json_pair("ID", WriteSlot, OS); - os << ","; + OS << ","; - auto writeAsm = [&MI, &SIII, &os]() { - MI.print(os, /*IsStandalone*/ true, /*SkipOpers*/ false, + auto WriteAsm = [&MI, &SIII, &OS]() { + MI.print(OS, /*IsStandalone*/ true, /*SkipOpers*/ false, /*SkipDebugLoc*/ true, /*AddNewLine*/ false, SIII); }; - json_pair("asm", writeAsm, os); + json_pair("asm", WriteAsm, OS); - os << ",\n"; + OS << ",\n"; - auto &contributors = MIContributorMap[&MI]; - auto writeContributor = [&contributors, this, &os]() { - for (auto *MI : contributors) { + auto &Contributors = MIContributorMap[&MI]; + auto WriteContributor = [&Contributors, this, &OS]() { + for (auto *MI : Contributors) { unsigned ID = MIIndexMap[MI]; - os << ID << ","; + OS << ID << ","; } }; - json_array("contributors", writeContributor, os); - os << ",\n"; + json_array("contributors", WriteContributor, OS); + OS << ",\n"; - auto &contributeds = MIContributedToMap[&MI]; - auto writeContributed = [&contributeds, this, &os]() { - for (auto *MI : contributeds) { + auto &Contributeds = MIContributedToMap[&MI]; + auto WriteContributed = [&Contributeds, this, &OS]() { + for (auto *MI : Contributeds) { unsigned ID = MIIndexMap[MI]; - os << ID << ","; + OS << ID << ","; } }; - json_array("contributed", writeContributed, os); - os << "\n}\n"; + json_array("contributed", WriteContributed, OS); + OS << "\n}\n"; } void ContributionList::writeBlock(MachineBasicBlock &MBB, - const SIInstrInfo *SIII, raw_ostream &os) { - os << "{\n"; - auto writeName = [&MBB, &os]() { os << MBB.getName(); }; - json_pair("name", writeName, os); + const SIInstrInfo *SIII, raw_ostream &OS) { + OS << "{\n"; + auto WriteName = [&MBB, &OS]() { OS << MBB.getName(); }; + json_pair("name", WriteName, OS); - os << ","; + OS << ","; - auto writeIndex = [&MBB, &os]() { os << MBB.getNumber(); }; - json_pair("id", writeIndex, os); + auto WriteIndex = [&MBB, &OS]() { OS << MBB.getNumber(); }; + json_pair("id", WriteIndex, OS); - os << ",\n"; + OS << ",\n"; - auto writeInsts = [this, &MBB, &SIII, &os]() { + auto WriteInsts = [this, &MBB, &SIII, &OS]() { for (MachineInstr &MI : MBB) { if (MI.isDebugInstr()) continue; - writeInst(MI, SIII, os); - os << ",\n"; + writeInst(MI, SIII, OS); + OS << ",\n"; } }; - json_array("instructions", writeInsts, os); + json_array("instructions", WriteInsts, OS); - os << ",\n"; + OS << ",\n"; - auto writePreds = [&MBB, &os]() { + auto WritePreds = [&MBB, &OS]() { for (MachineBasicBlock *Pred : MBB.predecessors()) { - os << Pred->getNumber() << ","; + OS << Pred->getNumber() << ","; } }; - json_array("preds", writePreds, os); + json_array("preds", WritePreds, OS); - os << ","; + OS << ","; - auto writeSuccs = [&MBB, &os]() { + auto WriteSuccs = [&MBB, &OS]() { for (MachineBasicBlock *Succ : MBB.successors()) { - os << Succ->getNumber() << ","; + OS << Succ->getNumber() << ","; } }; - json_array("succs", writeSuccs, os); + json_array("succs", WriteSuccs, OS); - os << "}"; + OS << "}"; } -void ContributionList::write(raw_ostream &os) { +void ContributionList::write(raw_ostream 
&OS) { unsigned ID = 0; - // Build ID for write. + // Build ID for Write. ReversePostOrderTraversal RPOT(&MF); for (auto *MBB : RPOT) { for (auto &MI : *MBB) { @@ -1703,22 +1698,22 @@ void ContributionList::write(raw_ostream &os) { const GCNSubtarget *ST = &MF.getSubtarget(); const auto *SIII = ST->getInstrInfo(); - os << "{\n"; - auto writeName = [this, &os]() { os << MF.getName(); }; - json_pair("name", writeName, os); + OS << "{\n"; + auto WriteName = [this, &OS]() { OS << MF.getName(); }; + json_pair("name", WriteName, OS); - os << ",\n"; + OS << ",\n"; - auto writeBlocks = [this, &SIII, &RPOT, &os]() { + auto WriteBlocks = [this, &SIII, &RPOT, &OS]() { for (auto *MBB : RPOT) { - writeBlock(*MBB, SIII, os); - os << ",\n"; + writeBlock(*MBB, SIII, OS); + OS << ",\n"; } }; - json_array("blocks", writeBlocks, os); + json_array("blocks", WriteBlocks, OS); - os << "\n}"; + OS << "\n}"; } } // namespace @@ -1788,8 +1783,8 @@ void llvm::updatePhysRegLiveInForBlock(MachineBasicBlock *NewBB, void llvm::buildPhysRegLiveInForBlock(MachineBasicBlock *NewBB, SmallDenseSet &LiveOutSet, const MachineRegisterInfo *MRI) { - for (auto rit = NewBB->rbegin(); rit != NewBB->rend(); rit++) { - auto &MI = *rit; + for (auto RIt = NewBB->rbegin(); RIt != NewBB->rend(); RIt++) { + auto &MI = *RIt; // Add all physical register defs (exlicit+implicit) to the def register // set. for (MachineOperand &Def : MI.operands()) { @@ -1805,7 +1800,7 @@ void llvm::buildPhysRegLiveInForBlock(MachineBasicBlock *NewBB, continue; // Reserved regs do not need to be tracked through live-in sets. - unsigned Reg = Use.getReg(); + Register Reg = Use.getReg(); if (Use.isImplicit() && MRI && MRI->isReserved(Reg)) continue; @@ -1843,7 +1838,7 @@ MachineReg llvm::createVirtualDstReg(MachineOpcode Opcode, // Return true if the MI is a copy of exec. // If true then sets pDst to the destination register. bool llvm::isExecCopy(const MachineInstr &MI, MachineReg Exec, - MachineReg *pDst) { + MachineReg *OutDst) { enum { DST = 0, SRC = 1 }; bool FoundCopy = false; if (MI.getOpcode() == AMDGPU::COPY || MI.getOpcode() == AMDGPU::S_MOV_B32 || @@ -1853,60 +1848,13 @@ bool llvm::isExecCopy(const MachineInstr &MI, MachineReg Exec, FoundCopy = true; } } -#if 0 // TODO: Delete this. - else if (MI.getOpcode() == AMDGPU::AMDGPU_GET_ENTRY_ACTIVE_MASK_PSEUDO || - MI.getOpcode() == AMDGPU::AMDGPU_GET_ENTRY_ACTIVE_MASK_PSEUDO_32) - { - FoundCopy = true; - } -#endif - if (FoundCopy) { - *pDst = MI.getOperand(DST).getReg(); + *OutDst = MI.getOperand(DST).getReg(); } return FoundCopy; } -llvm::MachineRegWithSubReg llvm::getWqmEntryActiveMask(MachineFunction &MF) { - llvm::MachineRegWithSubReg LiveLaneMask = {AMDGPU::NoRegister, - AMDGPU::NoSubRegister}; - if (MachineInstr *MI = getWqmEntryActiveMaskInst(MF)) { - LiveLaneMask.Reg = MI->getOperand(0).getReg(); - LiveLaneMask.SubReg = MI->getOperand(0).getSubReg(); - } - - return LiveLaneMask; -} - -MachineInstr *llvm::getWqmEntryActiveMaskInst(MachineFunction &MF) { -#if 0 // TODO: Get rid of this - // Look forward in the entry block for the SET_LIVE_LANE_MASK instruction. - // This instruction is added by the SIWholeQuadMode pass. - MachineBasicBlock &MBB = MF.front(); - for (MachineInstr &MI : MBB) - { - if (MI.getOpcode() == AMDGPU::AMDGPU_SET_LIVE_LANE_MASK || - MI.getOpcode() == AMDGPU::AMDGPU_SET_LIVE_LANE_MASK_32) - { - return &MI; - } - } -#endif - - return nullptr; -} - -bool llvm::isFetchShaderCall(const MachineInstr *MI) { -#if 0 // TODO: Get rid of this. 
- return - MI->getOpcode() == AMDGPU::AMDGPU_CALL_FETCH_SHADER || - MI->getAMDGPUFlag(MachineInstr::AMDGPUMIFlag::FetchShaderCall); -#else - return false; -#endif -} - bool llvm::isSccLiveAt(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator MI) { const TargetRegisterInfo *TRI = @@ -1967,7 +1915,7 @@ MachineBasicBlock::iterator llvm::findOrCreateInsertionPointForSccDef( // If the instruction modifies exec then we cannot use it as // an insertion point (if that is a constraint from the caller). // The check for EXEC works for both wave64 and wave32 because - // it will also catch writes to the subregisters (e.g. exec_lo). + // it will also catch Writes to the subregisters (e.g. exec_lo). if (CheckForExecWrite && It->modifiesRegister(AMDGPU::EXEC, TRI)) { break; } @@ -1979,8 +1927,8 @@ MachineBasicBlock::iterator llvm::findOrCreateInsertionPointForSccDef( } // If no safe location can be found in the block we can save and restore - // SCC around MI. There is no way to directly read or write SCC so we use - // s_cselect to read the current value of SCC and s_cmp to write the saved + // SCC around MI. There is no way to directly read or Write SCC so we use + // s_cselect to read the current value of SCC and s_cmp to Write the saved // value back to SCC. // // The generated code will look like this; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h index 04b4b74fbd726..e4b8a28dda6e6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h @@ -90,19 +90,7 @@ MachineReg createVirtualRegForOperand(MachineOpcode Opcode, unsigned Operand, MachineReg createVirtualDstReg(MachineOpcode Opcode, llvm::MachineFunction &MF); bool isExecCopy(const llvm::MachineInstr &MI, MachineReg Exec, - MachineReg *pDst); -struct MachineRegWithSubReg { - MachineReg Reg = /*NoRegister*/ 0; - unsigned SubReg = /*NoSubRegister*/ 0; -}; -MachineRegWithSubReg getWqmEntryActiveMask(llvm::MachineFunction &MF); -llvm::MachineInstr *getWqmEntryActiveMaskInst(llvm::MachineFunction &MF); - -// Return true if this machine instruction represents a call to the fetch -// shader. We curently have two mechanisims for calling fetch shader: -// 1. The AMDGPU_CALL_FETCH_SHADER pseudo-instruction -// 2. A CALL instruction with the `FetchShaderCall` flag set to true. 
-bool isFetchShaderCall(const llvm::MachineInstr *MI); + MachineReg *OutDst); bool isSccLiveAt(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator MI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp index 94d78fb676f9a..95066743b59bd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp @@ -716,7 +716,7 @@ struct DOTGraphTraits : public DefaultDOTGraphTraits { static bool renderGraphFromBottomUp() { return true; } - static bool isNodeHidden(const SUnit *Node) { + static bool isNodeHidden(const SUnit *Node, const llvm::ExpDag *) { if (ViewNodes.empty()) return false; @@ -921,28 +921,6 @@ unsigned getSGPRSize(const TargetRegisterClass *RC, return RC->getLaneMask().getNumLanes(); } -void collectSameHeightBackNodes(SUnit *SU, SmallDenseSet &BackNodes, - unsigned NodeNum, - SmallDenseSet &VisitedNodes) { - if (VisitedNodes.count(SU)) - return; - VisitedNodes.insert(SU); - - for (SDep &Dep : SU->Succs) { - if (Dep.isWeak()) - continue; - if (Dep.getLatency() > 0) - continue; - - SUnit *Succ = Dep.getSUnit(); /* - if (Succ->NodeNum >= NodeNum) - continue;*/ - - BackNodes.insert(Succ); - collectSameHeightBackNodes(Succ, BackNodes, NodeNum, VisitedNodes); - } -} - } // namespace namespace llvm { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h deleted file mode 100644 index c49590a7d8f7f..0000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h +++ /dev/null @@ -1,99 +0,0 @@ -//===-- AMDGPUVMemDegreeDAG.h - Build degree about VMem on DAG ------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Build degree about VMem to help balance latency and pressure inside a -/// block. -// -//===----------------------------------------------------------------------===// -#pragma once - -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/ScheduleDAG.h" // For SUnit. -#include - -namespace llvm { -class MachineBasicBlock; -class SUnit; -class SIInstrInfo; -class MachineInstr; - -class SimpleDAG { -public: - SimpleDAG(llvm::MachineBasicBlock &MBB, const llvm::SIInstrInfo *TII) - : SIII(TII), MBB(MBB) {} - std::vector SUnits; - // InstrInfo. - const llvm::SIInstrInfo *SIII; - llvm::DenseMap MISUnitMap; - llvm::DenseMap SUnitMIMap; - llvm::MachineBasicBlock &MBB; - void build(); - -private: - void initNodes(); - void addDependence(); - void addCtrlDep(); -}; - -// Collect height/depth for high latency mem ld, which only update height/depth -// when cross high latency mem ld. Call the height/depth as VMem degree here. -// The rule is sample and its user should has different degree. -// For example -// a = sample // a has depth 0, height 3 -// b = sample a // b has depth 1, height 2 -// c = sample c // c has depth 2, height 1 -// user of c // user of c has depth 2, height 0 -// -// For the purpose of in block reorder/remat, nothing will move/clone cross the -// block. So do this after cross blk remat? In the middle of cross block remat -// to help reach target when only move things cross blk cannot reach the target. -// Reorder at the beginning? No pressure at that time? After get pressure, might -// need to update max pressure. 
- -class VMemDegreeDAG { -public: - VMemDegreeDAG(std::vector &Units, const llvm::SIInstrInfo *TII) - : SUnits(Units), SIII(TII) {} - std::vector &SUnits; - // InstrInfo. - const llvm::SIInstrInfo *SIII; - void build(); - - bool isHighLatency(const llvm::SUnit *SU) const; - bool isHighLatency(const llvm::MachineInstr *MI) const; - // height/depth based on Long latency inst. - std::vector VMemDataHeight; - std::vector VMemDataDepth; - // Full height/depth count non-data dependent too. - std::vector VMemFullHeight; - std::vector VMemFullDepth; - llvm::SmallVector VMemSUs; - llvm::SmallVector, 16> GroupedVMemSUs; - llvm::SmallVector, 16> - GroupedVMemSUsByDepth; - - void dump(); - -private: - static constexpr unsigned kNoReg = -1; - - std::pair - buildVMemDepthHeight(std::vector &VMemHeight, - std::vector &VMemDepth, bool bDataOnly); - // Compute vmem height/depth. - void buildVMemDepthHeight(); - void buildVMemDataDepthHeight(); - void groupVmemSUnits(); -}; - -// Split block based on vmem depth. -void buildVMemDepth(llvm::MachineBasicBlock &MBB, llvm::VMemDegreeDAG &dag); - -} // namespace llvm From 84d8dd8df6bb481d1974201cf3c25bb4a5db8d37 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Mon, 17 Mar 2025 17:09:04 -0700 Subject: [PATCH 17/25] Additional cleanup + format --- .../include/llvm/CodeGen/TargetRegisterInfo.h | 2 +- llvm/lib/CodeGen/TargetRegisterInfo.cpp | 34 +++++----- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 14 ++--- llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp | 63 ++++++++++++------- llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h | 32 ++++++---- llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp | 19 +++++- 6 files changed, 100 insertions(+), 64 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index 974cd8a5f36b4..7b61d21e6e20e 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -436,7 +436,7 @@ class TargetRegisterInfo : public MCRegisterInfo { /// \returns an empty set if there is no set of covering sub registers. std::vector getMinimalSpanningSubRegIdxSetForLaneMask(const TargetRegisterClass *RC, - LaneBitmask mask) const; + LaneBitmask Mask) const; /// The lane masks returned by getSubRegIndexLaneMask() above can only be /// used to determine if sub-registers overlap - they can't be used to diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp index d458648fd8bd8..e8f0c526fcd33 100644 --- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -730,14 +730,14 @@ void TargetRegisterInfo::dumpReg(Register Reg, unsigned SubRegIndex, std::vector TargetRegisterInfo::getMinimalSpanningSubRegIdxSetForLaneMask( - const TargetRegisterClass *RC, LaneBitmask mask) const { + const TargetRegisterClass *RC, LaneBitmask Mask) const { // TODO: this could replace the code it was copied from in SplitKit.cpp // First pass: Try to find a perfectly matching subregister index. // If none exists find the one covering the most lanemask bits. SmallVector PossibleIndexes; unsigned BestIdx = 0; - const LaneBitmask avoid = ~mask; + const LaneBitmask Avoid = ~Mask; { unsigned BestCover = 0; for (unsigned Idx = 1, E = getNumSubRegIndices(); Idx < E; ++Idx) { @@ -746,13 +746,13 @@ TargetRegisterInfo::getMinimalSpanningSubRegIdxSetForLaneMask( continue; LaneBitmask SubRegMask = getSubRegIndexLaneMask(Idx); // Early exit if we found a perfect match. 
- if (SubRegMask == mask) { + if (SubRegMask == Mask) { BestIdx = Idx; break; } // The index must not cover any lanes outside - if ((SubRegMask & avoid).any()) + if ((SubRegMask & Avoid).any()) continue; unsigned PopCount = SubRegMask.getNumLanes(); @@ -767,36 +767,36 @@ TargetRegisterInfo::getMinimalSpanningSubRegIdxSetForLaneMask( // Abort if we cannot possibly implement the COPY with the given indexes. if (BestIdx == 0) { LLVM_DEBUG(dbgs() << "Unable to find minimal spanning sub register(s) for " - << getRegClassName(RC) << " mask " << PrintLaneMask(mask) + << getRegClassName(RC) << " mask " << PrintLaneMask(Mask) << '\n'); assert(false && "Impossible to span reg class"); return std::vector(); } - std::vector result; - result.push_back(BestIdx); + std::vector Result; + Result.push_back(BestIdx); // Greedy heuristic: Keep iterating keeping the best covering subreg index // each time. - mask &= ~(getSubRegIndexLaneMask(BestIdx)); - while (mask.any()) { + Mask &= ~(getSubRegIndexLaneMask(BestIdx)); + while (Mask.any()) { BestIdx = 0; int BestCover = std::numeric_limits::min(); for (unsigned Idx : PossibleIndexes) { LaneBitmask SubRegMask = getSubRegIndexLaneMask(Idx); // Early exit if we found a perfect match. - if (SubRegMask == mask) { + if (SubRegMask == Mask) { BestIdx = Idx; break; } // Guaranteed above - assert((SubRegMask & avoid).none()); + assert((SubRegMask & Avoid).none()); // Try to cover as much of the remaining lanes as possible but as few of // the already covered lanes as possible. - int Cover = (SubRegMask & mask).getNumLanes() - - (SubRegMask & ~mask).getNumLanes(); + int Cover = (SubRegMask & Mask).getNumLanes() - + (SubRegMask & ~Mask).getNumLanes(); if (Cover > BestCover) { BestCover = Cover; BestIdx = Idx; @@ -805,16 +805,16 @@ TargetRegisterInfo::getMinimalSpanningSubRegIdxSetForLaneMask( if (BestIdx == 0) { LLVM_DEBUG(dbgs() << "Unable to find minimal spanning sub register(s) for " - << getRegClassName(RC) << " mask " << PrintLaneMask(mask) + << getRegClassName(RC) << " mask " << PrintLaneMask(Mask) << '\n'); assert(false && "Impossible to span reg class"); return std::vector(); } - result.push_back(BestIdx); - mask &= ~getSubRegIndexLaneMask(BestIdx); + Result.push_back(BestIdx); + Mask &= ~getSubRegIndexLaneMask(BestIdx); } - return result; + return Result; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index a6ce3426a7b93..012ab0c91b257 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -3287,7 +3287,8 @@ void sortSubExpCandidates(std::vector &SubExpCandidates) { MapVector SortMap; for (auto It : InputMap) { unsigned Reg = It.first; - MapVector>::iterator OutIt = OutputMap.find(Reg); + MapVector>::iterator OutIt = + OutputMap.find(Reg); if (OutIt == OutputMap.end()) continue; auto &InExps = It.second; @@ -3622,8 +3623,7 @@ collectPassThrus(MachineBasicBlock *MBB, return PassThrus; } // Try to build a free subExp which all input is passThrus. -SubExp buildFreeSubExp(SubExp &Exp, - GCNRPTracker::LiveRegSet &PassThrus, +SubExp buildFreeSubExp(SubExp &Exp, GCNRPTracker::LiveRegSet &PassThrus, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) { SubExp FreeExp; // Try to split the subExp to find a help case. 
@@ -3777,8 +3777,7 @@ std::vector buildSubExpCandidates( } if (!canHelpPressureWhenSink(Exp, PassThrus, MRI, SIRI, MLI, DT, IsCanClone, IsSgprBound)) { - if (AllowPartialUseInSubExp && - Exp.isSafeToMove(MRI)) { + if (AllowPartialUseInSubExp && Exp.isSafeToMove(MRI)) { SubExp FreeSubExp = buildFreeSubExp(Exp, PassThrus, MRI, SIRI); if (canHelpPressureWhenSink(FreeSubExp, PassThrus, MRI, SIRI, MLI, DT, IsCanClone, IsSgprBound)) { @@ -4249,9 +4248,8 @@ bool perBlockPassthruRemat(Remat *Remat, std::vector &HotBlocks, continue; // Collect pass thru regs. - GCNRPTracker::LiveRegSet PassThrus = - collectPassThrus(MBB, InputLive, OutputLive, - LiveRegCandidates, MRI, IsCanClone); + GCNRPTracker::LiveRegSet PassThrus = collectPassThrus( + MBB, InputLive, OutputLive, LiveRegCandidates, MRI, IsCanClone); // Group pass thru regs by def MBB. SmallVector> diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp index 990718cd7525f..01336b84c6786 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp @@ -1,3 +1,17 @@ +//===------- AMDGPUMIRUtils.cpp - Helpers for MIR passes ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Helper functions for MIR passes. +// +//===----------------------------------------------------------------------===// + #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" @@ -383,8 +397,9 @@ struct Piece { } }; -static void updateSubReg(MachineOperand &UseMO, const llvm::TargetRegisterClass *NewRC, - unsigned Offset, const SIRegisterInfo *SIRI) { +static void updateSubReg(MachineOperand &UseMO, + const llvm::TargetRegisterClass *NewRC, + unsigned Offset, const SIRegisterInfo *SIRI) { unsigned Size = NewRC->getLaneMask().getNumLanes(); if (Size == 1) { UseMO.setSubReg(0); @@ -529,12 +544,13 @@ bool removeUnusedLanes(llvm::MachineInstr &MI, MachineRegisterInfo &MRI, case 1: return reduceChannel(Piece.Offset, MI, SIII->get(IsImm ? AMDGPU::S_BUFFER_LOAD_DWORD_IMM - : AMDGPU::S_BUFFER_LOAD_DWORD_SGPR), + : AMDGPU::S_BUFFER_LOAD_DWORD_SGPR), MRI, SIRI, SIII, SlotIndexes); case 2: return reduceChannel(Piece.Offset, MI, - SIII->get(IsImm ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM - : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR), + SIII->get(IsImm + ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM + : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR), MRI, SIRI, SIII, SlotIndexes); case 3: if (FullMask == 0xf) @@ -542,8 +558,9 @@ bool removeUnusedLanes(llvm::MachineInstr &MI, MachineRegisterInfo &MRI, LLVM_FALLTHROUGH; case 4: return reduceChannel(Piece.Offset, MI, - SIII->get(IsImm ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM - : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR), + SIII->get(IsImm + ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM + : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR), MRI, SIRI, SIII, SlotIndexes); case 5: case 6: @@ -553,8 +570,9 @@ bool removeUnusedLanes(llvm::MachineInstr &MI, MachineRegisterInfo &MRI, LLVM_FALLTHROUGH; case 8: return reduceChannel(Piece.Offset, MI, - SIII->get(IsImm ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM - : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR), + SIII->get(IsImm + ? 
AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM + : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR), MRI, SIRI, SIII, SlotIndexes); } @@ -751,19 +769,19 @@ unsigned get_reg_size(unsigned Reg, const MachineRegisterInfo &MRI, void write_live(unsigned Reg, LaneBitmask Mask, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, raw_ostream &OS) { if (Mask.none()) { - unsigned size = get_reg_size(Reg, MRI, SIRI); - Mask = LaneBitmask((1 << size) - 1); + unsigned Size = get_reg_size(Reg, MRI, SIRI); + Mask = LaneBitmask((1 << Size) - 1); } - unsigned mask = Mask.getAsInteger(); + unsigned IntMask = Mask.getAsInteger(); for (unsigned i = 0; i <= Mask.getHighestLane(); i++) { - if (mask & (1 << i)) { + if (IntMask & (1 << i)) { write_reg(Reg, i, MRI, SIRI, OS); OS << ",\n"; } } } -void write_dag_input_node(unsigned ID, unsigned reg, unsigned mask, +void write_dag_input_node(unsigned ID, unsigned Reg, unsigned Mask, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, raw_ostream &OS) { OS << "{"; @@ -773,13 +791,13 @@ void write_dag_input_node(unsigned ID, unsigned reg, unsigned mask, OS << ","; - auto WriteReg = [®, &MRI, &SIRI, &OS]() { print_reg(reg, MRI, SIRI, OS); }; + auto WriteReg = [&Reg, &MRI, &SIRI, &OS]() { print_reg(Reg, MRI, SIRI, OS); }; json_pair("reg", WriteReg, OS); OS << ","; - auto WriteMask = [&mask, &OS]() { OS << mask; }; + auto WriteMask = [&Mask, &OS]() { OS << Mask; }; json_pair("mask", WriteMask, OS); @@ -1220,8 +1238,8 @@ void write_file(const MDNode *FileNode, raw_ostream &OS) { OS << ",\n"; const MDString *Content = cast(FileNode->getOperand(1).get()); - std::string str = get_legal_str(Content); - auto WriteContent = [&str, &OS]() { OS << str; }; + std::string Str = get_legal_str(Content); + auto WriteContent = [&Str, &OS]() { OS << Str; }; json_pair("content", WriteContent, OS); OS << "\n},\n"; } @@ -1468,8 +1486,7 @@ void write_function(MachineFunction &MF, LiveIntervals *LIS, // Check debug info. const Function &F = MF.getFunction(); const Module *M = F.getParent(); - const NamedMDNode *SourceMD = - M->getNamedMetadata("dx.source.contents"); + const NamedMDNode *SourceMD = M->getNamedMetadata("dx.source.contents"); if (SourceMD) { write_dbg_info(MF, LIS, MRI, SIII, SIRI, SlotIndexes, SourceMD, OS); } @@ -1530,7 +1547,8 @@ class ContributionList { void buildMIContribution(MachineInstr &MI, DenseSet &ContributorSet, - DenseSet &ContributedSet, MachineRegisterInfo &MRI) { + DenseSet &ContributedSet, + MachineRegisterInfo &MRI) { for (MachineOperand &UseMO : MI.uses()) { if (!UseMO.isReg()) continue; @@ -1938,8 +1956,7 @@ MachineBasicBlock::iterator llvm::findOrCreateInsertionPointForSccDef( // MI // S_CMP_LG_U32 %SavedSCC, 0 # Restore SCC // - Register TmpScc = - MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register TmpScc = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); DebugLoc DL = MI->getDebugLoc(); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), TmpScc) .addImm(-1) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h index e4b8a28dda6e6..52fa19a82b773 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h @@ -1,8 +1,20 @@ +//===------- AMDGPUMIRUtils.h - Helpers for MIR passes --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Helper functions for MIR passes. +// +//===----------------------------------------------------------------------===// + #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMIRUTILS_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUMIRUTILS_H -#pragma once - #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -51,9 +63,9 @@ bool isSub0Sub1SingleDef(unsigned Reg, const llvm::MachineRegisterInfo &MRI); llvm::LaneBitmask getRegMask(const llvm::MachineOperand &MO, const llvm::MachineRegisterInfo &MRI); -void andLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet); -void andNotLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet); -void mergeLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet); +void andLiveRegSet(LiveSet &TargetSet, const LiveSet &InputSet); +void andNotLiveRegSet(LiveSet &TargetSet, const LiveSet &InputSet); +void mergeLiveRegSet(LiveSet &TargetSet, const LiveSet &InputSet); llvm::MachineBasicBlock *split(llvm::MachineInstr *I); // For inst like S_BUFFER_LOAD_DWORDX16, change to S_BUFFER_LOAD_DWORDX4 if only @@ -71,9 +83,6 @@ bool reach_block(llvm::MachineBasicBlock *FromBB, void viewCFGWithPhi(llvm::MachineFunction &MF); void write_contribution_list(llvm::MachineFunction &MF, const char *Filename); -llvm::MachineBasicBlock *createNullExportBlock(llvm::MachineFunction &MF, - const llvm::SIInstrInfo *TII); - bool getNonDebugMBBEnd(llvm::MachineBasicBlock::reverse_iterator &BBEnd, llvm::MachineBasicBlock &MBB); @@ -128,7 +137,7 @@ llvm::MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef( // local. bool isLocalLiveInterval( const llvm::LiveInterval &LI, llvm::SlotIndexes *Indexes, - llvm::SmallDenseSet &touchedMBBSet); + llvm::SmallDenseSet &TouchedMBBSet); bool isLocalLiveInterval(const llvm::LiveInterval &LI, llvm::SlotIndexes *Indexes); @@ -149,13 +158,12 @@ bool isFastMathInst(llvm::MachineInstr &MI); namespace pressure { void print_reg(llvm::Register Reg, const llvm::MachineRegisterInfo &MRI, - const llvm::SIRegisterInfo *SIRI, llvm::raw_ostream &os); + const llvm::SIRegisterInfo *SIRI, llvm::raw_ostream &OS); void write_pressure(llvm::MachineFunction &MF, llvm::LiveIntervals *LIS, const char *Filename); void write_pressure(llvm::MachineFunction &MF, llvm::LiveIntervals *LIS, - llvm::raw_ostream &os); + llvm::raw_ostream &OS); } // namespace pressure -// bool IsLdsSpillSupportedForHwStage(xmd::HwStage Stage); // Look for the successor `Succ` of the given `MBB`. // Returns MBB->succ_end() if `Succ` is not a successor of MBB. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp index 95066743b59bd..19a63b7900645 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp @@ -1,9 +1,23 @@ +//===----------- AMDGPUSubExpDag.cpp - AMDGPU Sub Expression DAG ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU Sub Expression DAG. Helper for building a dag based on sub +/// expressions. 
+// +//===----------------------------------------------------------------------===// + #include "SIInstrInfo.h" #include "SIRegisterInfo.h" #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/SlotIndexes.h" -// #include "dxc/DXIL/DxilMetadataHelper.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/raw_ostream.h" @@ -752,8 +766,7 @@ struct DOTGraphTraits : public DefaultDOTGraphTraits { const llvm::ExpDag *G) { return G->getGraphNodeLabel(SU); } - static std::string getNodeAttributes(const SUnit *N, - const llvm::ExpDag *) { + static std::string getNodeAttributes(const SUnit *N, const llvm::ExpDag *) { std::string Str("shape=Mrecord"); Str += ",style=filled,fillcolor=\"#"; From 303a4015bed7f6cbe670f0a9c7ae98117fbfff8b Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Mon, 17 Mar 2025 19:59:51 -0700 Subject: [PATCH 18/25] Added cmath --- llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h index c04afe61c9809..7c9a4e5fc297f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h @@ -15,6 +15,8 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/MC/MCInstrItineraries.h" +#include + namespace llvm { class MachineFunction; From 971e5568d97d056023c5af6189b6ed54e6a36555 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Mon, 17 Mar 2025 20:19:27 -0700 Subject: [PATCH 19/25] Wrong place for std header --- llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp | 2 ++ llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp index a8eef88ac2af8..e313c1f264a92 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp @@ -20,6 +20,8 @@ #include "llvm/CodeGen/MachineLoopInfo.h" +#include + namespace llvm { // Other info which can help compare schedule result. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h index 7c9a4e5fc297f..c04afe61c9809 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h @@ -15,8 +15,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/MC/MCInstrItineraries.h" -#include - namespace llvm { class MachineFunction; From be03462ab1a3660d77c9c8ba113158176b59303b Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Mon, 17 Mar 2025 20:35:19 -0700 Subject: [PATCH 20/25] Made getMinimalSpanningSubRegIdxSetForLaneMask local --- .../include/llvm/CodeGen/TargetRegisterInfo.h | 8 -- llvm/lib/CodeGen/TargetRegisterInfo.cpp | 91 ------------------ llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp | 96 ++++++++++++++++++- 3 files changed, 94 insertions(+), 101 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index 7b61d21e6e20e..e4fad8f9ec869 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -430,14 +430,6 @@ class TargetRegisterInfo : public MCRegisterInfo { LaneBitmask LaneMask, SmallVectorImpl &Indexes) const; - /// Return the set of sub register indexes that minimally cover the given - /// lane mask for the given register class. - /// - /// \returns an empty set if there is no set of covering sub registers. - std::vector - getMinimalSpanningSubRegIdxSetForLaneMask(const TargetRegisterClass *RC, - LaneBitmask Mask) const; - /// The lane masks returned by getSubRegIndexLaneMask() above can only be /// used to determine if sub-registers overlap - they can't be used to /// determine if a set of sub-registers completely cover another diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp index e8f0c526fcd33..701a9f8d72a65 100644 --- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -727,94 +727,3 @@ void TargetRegisterInfo::dumpReg(Register Reg, unsigned SubRegIndex, dbgs() << printReg(Reg, TRI, SubRegIndex) << "\n"; } #endif - -std::vector -TargetRegisterInfo::getMinimalSpanningSubRegIdxSetForLaneMask( - const TargetRegisterClass *RC, LaneBitmask Mask) const { - // TODO: this could replace the code it was copied from in SplitKit.cpp - - // First pass: Try to find a perfectly matching subregister index. - // If none exists find the one covering the most lanemask bits. - SmallVector PossibleIndexes; - unsigned BestIdx = 0; - const LaneBitmask Avoid = ~Mask; - { - unsigned BestCover = 0; - for (unsigned Idx = 1, E = getNumSubRegIndices(); Idx < E; ++Idx) { - // Is this index even compatible with the given class? - if (getSubClassWithSubReg(RC, Idx) != RC) - continue; - LaneBitmask SubRegMask = getSubRegIndexLaneMask(Idx); - // Early exit if we found a perfect match. - if (SubRegMask == Mask) { - BestIdx = Idx; - break; - } - - // The index must not cover any lanes outside - if ((SubRegMask & Avoid).any()) - continue; - - unsigned PopCount = SubRegMask.getNumLanes(); - PossibleIndexes.push_back(Idx); - if (PopCount > BestCover) { - BestCover = PopCount; - BestIdx = Idx; - } - } - } - - // Abort if we cannot possibly implement the COPY with the given indexes. 
- if (BestIdx == 0) { - LLVM_DEBUG(dbgs() << "Unable to find minimal spanning sub register(s) for " - << getRegClassName(RC) << " mask " << PrintLaneMask(Mask) - << '\n'); - assert(false && "Impossible to span reg class"); - return std::vector(); - } - - std::vector Result; - Result.push_back(BestIdx); - - // Greedy heuristic: Keep iterating keeping the best covering subreg index - // each time. - Mask &= ~(getSubRegIndexLaneMask(BestIdx)); - while (Mask.any()) { - BestIdx = 0; - int BestCover = std::numeric_limits::min(); - for (unsigned Idx : PossibleIndexes) { - LaneBitmask SubRegMask = getSubRegIndexLaneMask(Idx); - // Early exit if we found a perfect match. - if (SubRegMask == Mask) { - BestIdx = Idx; - break; - } - - // Guaranteed above - assert((SubRegMask & Avoid).none()); - - // Try to cover as much of the remaining lanes as possible but as few of - // the already covered lanes as possible. - int Cover = (SubRegMask & Mask).getNumLanes() - - (SubRegMask & ~Mask).getNumLanes(); - if (Cover > BestCover) { - BestCover = Cover; - BestIdx = Idx; - } - } - - if (BestIdx == 0) { - LLVM_DEBUG(dbgs() << "Unable to find minimal spanning sub register(s) for " - << getRegClassName(RC) << " mask " << PrintLaneMask(Mask) - << '\n'); - assert(false && "Impossible to span reg class"); - return std::vector(); - } - - Result.push_back(BestIdx); - Mask &= ~getSubRegIndexLaneMask(BestIdx); - } - - return Result; -} - diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp index 01336b84c6786..73904c308b1f6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp @@ -397,6 +397,98 @@ struct Piece { } }; +static std::vector +getMinimalSpanningSubRegIdxSetForLaneMask(const TargetRegisterInfo *TRI, + const TargetRegisterClass *RC, + LaneBitmask Mask) { + // TODO: this could replace the code it was copied from in SplitKit.cpp + + // First pass: Try to find a perfectly matching subregister index. + // If none exists find the one covering the most lanemask bits. + SmallVector PossibleIndexes; + unsigned BestIdx = 0; + const LaneBitmask Avoid = ~Mask; + { + unsigned BestCover = 0; + for (unsigned Idx = 1, E = TRI->getNumSubRegIndices(); Idx < E; ++Idx) { + // Is this index even compatible with the given class? + if (TRI->getSubClassWithSubReg(RC, Idx) != RC) + continue; + LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx); + // Early exit if we found a perfect match. + if (SubRegMask == Mask) { + BestIdx = Idx; + break; + } + + // The index must not cover any lanes outside + if ((SubRegMask & Avoid).any()) + continue; + + unsigned PopCount = SubRegMask.getNumLanes(); + PossibleIndexes.push_back(Idx); + if (PopCount > BestCover) { + BestCover = PopCount; + BestIdx = Idx; + } + } + } + + // Abort if we cannot possibly implement the COPY with the given indexes. + if (BestIdx == 0) { + LLVM_DEBUG(dbgs() << "Unable to find minimal spanning sub register(s) for " + << TRI->getRegClassName(RC) << " mask " + << PrintLaneMask(Mask) << '\n'); + assert(false && "Impossible to span reg class"); + return std::vector(); + } + + std::vector Result; + Result.push_back(BestIdx); + + // Greedy heuristic: Keep iterating keeping the best covering subreg index + // each time. 
+ Mask &= ~(TRI->getSubRegIndexLaneMask(BestIdx)); + while (Mask.any()) { + BestIdx = 0; + int BestCover = std::numeric_limits::min(); + for (unsigned Idx : PossibleIndexes) { + LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx); + // Early exit if we found a perfect match. + if (SubRegMask == Mask) { + BestIdx = Idx; + break; + } + + // Guaranteed above + assert((SubRegMask & Avoid).none()); + + // Try to cover as much of the remaining lanes as possible but as few of + // the already covered lanes as possible. + int Cover = (SubRegMask & Mask).getNumLanes() - + (SubRegMask & ~Mask).getNumLanes(); + if (Cover > BestCover) { + BestCover = Cover; + BestIdx = Idx; + } + } + + if (BestIdx == 0) { + LLVM_DEBUG( + dbgs() << "Unable to find minimal spanning sub register(s) for " + << TRI->getRegClassName(RC) << " mask " << PrintLaneMask(Mask) + << '\n'); + assert(false && "Impossible to span reg class"); + return std::vector(); + } + + Result.push_back(BestIdx); + Mask &= ~TRI->getSubRegIndexLaneMask(BestIdx); + } + + return Result; +} + static void updateSubReg(MachineOperand &UseMO, const llvm::TargetRegisterClass *NewRC, unsigned Offset, const SIRegisterInfo *SIRI) { @@ -409,8 +501,8 @@ static void updateSubReg(MachineOperand &UseMO, unsigned Mask = LaneMask.getAsInteger() >> Offset; - unsigned NewSubReg = SIRI->getMinimalSpanningSubRegIdxSetForLaneMask( - NewRC, LaneBitmask(Mask)) + unsigned NewSubReg = getMinimalSpanningSubRegIdxSetForLaneMask( + SIRI, NewRC, LaneBitmask(Mask)) .front(); UseMO.setSubReg(NewSubReg); From 436058b1b06d01ecbdac0cfc2967e8ed2a451e22 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Mon, 17 Mar 2025 22:04:37 -0700 Subject: [PATCH 21/25] Fixed build break after rebase --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 27 +++++++++---------- llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h | 2 +- 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index 012ab0c91b257..3e691239ab2f1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -2023,7 +2023,7 @@ void printVreg(Register Reg, const MachineRegisterInfo &MRI) { if (Name != "") { dbgs() << '%' << Name; } else { - dbgs() << '%' << Register::virtReg2Index(Reg); + dbgs() << '%' << Reg.virtRegIndex(); } } } @@ -3851,7 +3851,7 @@ calculateSaving(HotBlock &HotBb, std::vector &SubExpCandidates, } for (auto OutIt : Exp.OutputLive) { - unsigned Reg = OutIt.first; + Register Reg = OutIt.first; LaneBitmask OutMask = OutIt.second; LaneBitmask MBBBeginMask; if (CrossLive.find(Reg) != CrossLive.end()) @@ -3863,10 +3863,9 @@ calculateSaving(HotBlock &HotBb, std::vector &SubExpCandidates, : (OutMask & MBBBeginMask); if (MBBBeginMask.any()) { unsigned Size = getRegSize(Reg, ProfitMask, MRI, SIRI); - LLVM_DEBUG(std::string movStr = + LLVM_DEBUG(std::string MovStr = Exp.IsHoist ? "output hoist:" : "output sink:"; - dbgs() - << movStr << Register::virtReg2Index(Reg) << " " << Size); + dbgs() << MovStr << Reg.virtRegIndex() << " " << Size); // Exp out live at block input. // It will descrease live for MBB when sink and increase when hoist. 
if (SIRI->isVGPR(MRI, Reg)) { @@ -3886,7 +3885,7 @@ calculateSaving(HotBlock &HotBb, std::vector &SubExpCandidates, } for (auto InIt : Exp.InputLive) { - unsigned Reg = InIt.first; + Register Reg = InIt.first; LaneBitmask InMask = InIt.second; LaneBitmask MBBBeginMask; if (CrossLive.find(Reg) != CrossLive.end()) @@ -3903,9 +3902,9 @@ calculateSaving(HotBlock &HotBb, std::vector &SubExpCandidates, // It will increase live for MBB. unsigned Size = getRegSize(Reg, ProfitMask, MRI, SIRI); - LLVM_DEBUG( - std::string movStr = Exp.IsHoist ? "input hoist:" : "input sink:"; - dbgs() << movStr << Register::virtReg2Index(Reg) << " " << Size); + LLVM_DEBUG(std::string MovStr = + Exp.IsHoist ? "input hoist:" : "input sink:"; + dbgs() << MovStr << Reg.virtRegIndex() << " " << Size); if (SIRI->isVGPR(MRI, Reg)) { LLVM_DEBUG(dbgs() << "v\n"); if (Exp.IsHoist) @@ -3928,7 +3927,7 @@ calculateSaving(HotBlock &HotBb, std::vector &SubExpCandidates, // MBB. So cannot count that output live reg as profit. // Hoist into loop is not supported now. for (auto OutIt : Exp.OutputLive) { - unsigned Reg = OutIt.first; + Register Reg = OutIt.first; bool IsDomUser = false; for (MachineInstr &MI : MRI.use_nodbg_instructions(Reg)) { MachineBasicBlock *UserMBB = MI.getParent(); @@ -3947,8 +3946,7 @@ calculateSaving(HotBlock &HotBb, std::vector &SubExpCandidates, LaneBitmask ProfitMask = OutMask & MBBBeginMask; if (MBBBeginMask.any()) { unsigned Size = getRegSize(Reg, ProfitMask, MRI, SIRI); - LLVM_DEBUG(dbgs() - << "move:" << Register::virtReg2Index(Reg) << " " << Size); + LLVM_DEBUG(dbgs() << "move:" << Reg.virtRegIndex() << " " << Size); // Exp out live at block input. // It will descrease live for MBB. if (SIRI->isVGPR(MRI, Reg)) { @@ -3962,7 +3960,7 @@ calculateSaving(HotBlock &HotBb, std::vector &SubExpCandidates, } for (auto InIt : Exp.InputLive) { - unsigned Reg = InIt.first; + Register Reg = InIt.first; LaneBitmask InMask = InIt.second; LaneBitmask MBBBeginMask; if (InputLive.find(Reg) != InputLive.end()) @@ -3976,8 +3974,7 @@ calculateSaving(HotBlock &HotBb, std::vector &SubExpCandidates, // It will increase live for MBB. 
unsigned Size = getRegSize(Reg, ProfitMask, MRI, SIRI); - LLVM_DEBUG(dbgs() - << "add:" << Register::virtReg2Index(Reg) << " " << Size); + LLVM_DEBUG(dbgs() << "add:" << Reg.virtRegIndex() << " " << Size); if (SIRI->isVGPR(MRI, Reg)) { LLVM_DEBUG(dbgs() << "v\n"); VgprDiff += Size; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp index 73904c308b1f6..3feaa2f0f508f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp @@ -819,7 +819,7 @@ void print_reg(Register Reg, const MachineRegisterInfo &MRI, if (Name != "") { OS << '%' << Name; } else { - OS << '%' << Register::virtReg2Index(Reg); + OS << '%' << Reg.virtRegIndex(); } } else if (Reg < SIRI->getNumRegs()) { OS << '$'; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h index 52fa19a82b773..7aa053b9f7fe8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h @@ -52,7 +52,7 @@ using LiveSet = llvm::DenseMap; unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask, const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI); -void collectLiveSetPressure(const LiveSet &liveSet, +void collectLiveSetPressure(const LiveSet &LiveSet, const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI, unsigned &VPressure, unsigned &SPressure); From 9dbab90c299479105d4652935cfde044cf5c97c0 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Mon, 17 Mar 2025 22:16:43 -0700 Subject: [PATCH 22/25] Clang format --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 ++-- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index ec39b385ecbd2..d680e01e3f8fb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -397,8 +397,8 @@ static cl::opt // Enable Hot block rematerialize static cl::opt EnableHotBlockRemat("amdgpu-enable-hot-block-remat", - cl::desc("Enable HotBlock Rematerialize optimization"), - cl::init(false), cl::Hidden); + cl::desc("Enable HotBlock Rematerialize optimization"), + cl::init(false), cl::Hidden); // Enable GFX11+ VOPD static cl::opt diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 3c467c098a65e..14db2b39ef9d4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1332,7 +1332,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { bool isLowLatencyInstruction(const MachineInstr &MI) const; bool isHighLatencyDef(int Opc) const override; - bool isHighLatencyInstruction(const MachineInstr& MI) const { + bool isHighLatencyInstruction(const MachineInstr &MI) const { return isHighLatencyDef(MI.getOpcode()); } From ebcbb24c4f8123b5e34cfc3c0a3e01b0778f858e Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Tue, 18 Mar 2025 11:43:48 -0700 Subject: [PATCH 23/25] Fixing undef deprecator failures --- llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp | 4 ++-- llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp index 3feaa2f0f508f..55477bd39fb73 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp @@ -258,7 +258,7 @@ bool isExecUpdateForControlFlow(llvm::MachineInstr &MI) 
{ bool isSub0Sub1SingleDef(unsigned Reg, const MachineRegisterInfo &MRI) { // Support multi def for pattern of pointer: - // undef %808.sub0:sgpr_64 = COPY killed %795:sgpr_32 + // undef_ %808.sub0:sgpr_64 = COPY killed %795:sgpr_32 // %808.sub1:sgpr_64 = S_MOV_B32 0 bool HasSub0 = false; bool HasSub1 = false; @@ -296,7 +296,7 @@ bool isSub0Sub1SingleDef(unsigned Reg, const MachineRegisterInfo &MRI) { LaneBitmask getRegMask(const MachineOperand &MO, const MachineRegisterInfo &MRI) { - // We don't rely on read-undef flag because in case of tentative schedule + // We don't rely on read-undef_ flag because in case of tentative schedule // tracking it isn't set correctly yet. This works correctly however since // use mask has been tracked before using LIS. return MO.getSubReg() == 0 diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp index 19a63b7900645..0673346b11ab4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp @@ -420,7 +420,7 @@ void ExpDag::addDataDep() { Register Reg = MO.getReg(); // For case like: - // undef %808.sub0:sgpr_64 = COPY killed %795:sgpr_32 + // undef_ %808.sub0:sgpr_64 = COPY killed %795:sgpr_32 // %808.sub1:sgpr_64 = S_MOV_B32 0 // When partially write, link MI to previous def. if (MO.getSubReg() != 0) { From b5d143c2511f853732db1da1dc0a1b92be066ed2 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Tue, 18 Mar 2025 12:19:33 -0700 Subject: [PATCH 24/25] Ran latest format --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 40 ++++++++----------- llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp | 9 ++--- 2 files changed, 20 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index 3e691239ab2f1..8fecd9f7e2534 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -365,15 +365,13 @@ unsigned collectFnPressure(MachineFunction &MF, LiveIntervals *LIS, LLVM_DEBUG( const SIRegisterInfo *SIRI = ST->getRegisterInfo(); - dbgs() << "output live"; for (auto &It - : Status.MBBOutputLiveMap) { + dbgs() << "output live"; for (auto &It : Status.MBBOutputLiveMap) { unsigned Idx = It.first->getNumber(); auto LiveReg = It.second; dbgs() << "MBB" << Idx << ":"; llvm::dumpLiveSet(LiveReg, SIRI); } dbgs() << "input live"; - for (auto &It - : Status.MBBInputLiveMap) { + for (auto &It : Status.MBBInputLiveMap) { unsigned Idx = It.first->getNumber(); auto LiveReg = It.second; dbgs() << "MBB" << Idx << ":"; @@ -1811,10 +1809,9 @@ std::vector buildSubExpFromCandidates( Defs.emplace_back(&MI); } - LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; for (MachineInstr *MI - : Defs) { - MI->dump(); - } dbgs() << "\nFinished Candidate Defs End\n";); + LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; + for (MachineInstr *MI : Defs) { MI->dump(); } dbgs() + << "\nFinished Candidate Defs End\n";); // Build SubExp with CandidateDefs as Nodes, CandidateInput as input // Candidates as output. 
@@ -1999,13 +1996,11 @@ std::vector buildSubExpFromCandidatesTopBottom( Defs.emplace_back(&MI); } - LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; for (MachineInstr *MI - : Defs) { - MI->dump(); - } dbgs() << "\nFinished Candidate Defs End\n";); + LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; + for (MachineInstr *MI : Defs) { MI->dump(); } dbgs() + << "\nFinished Candidate Defs End\n";); - LLVM_DEBUG(dbgs() << "\nLocalCandidates:\n"; for (auto It - : LocalCandidates) { + LLVM_DEBUG(dbgs() << "\nLocalCandidates:\n"; for (auto It : LocalCandidates) { pressure::print_reg(It.first, MRI, SIRI, llvm::dbgs()); } dbgs() << "\nLocalCandidates End\n";); // Make sure all input reg are uniqueDef. @@ -3552,13 +3547,13 @@ groupPassThruByDefBlock(Remat *Remat, const GCNRPTracker::LiveRegSet &PassThrus, llvm::SmallVector> Result = Candidates.takeVector(); - LLVM_DEBUG(llvm::dbgs() << "Before sort candidates\n"; for (auto It - : Result) { - MachineBasicBlock *MBB = It.first; - auto &defInMBB = It.second; - MBB->dump(); - llvm::dumpLiveSet(defInMBB, SIRI); - } llvm::dbgs() << "end of candidates\n";); + LLVM_DEBUG( + llvm::dbgs() << "Before sort candidates\n"; for (auto It : Result) { + MachineBasicBlock *MBB = It.first; + auto &defInMBB = It.second; + MBB->dump(); + llvm::dumpLiveSet(defInMBB, SIRI); + } llvm::dbgs() << "end of candidates\n";); std::sort(Result.begin(), Result.end(), [](std::pair &It0, @@ -3566,8 +3561,7 @@ groupPassThruByDefBlock(Remat *Remat, const GCNRPTracker::LiveRegSet &PassThrus, return It0.first->getNumber() < It1.first->getNumber(); }); - LLVM_DEBUG(llvm::dbgs() << "After sort candidates\n"; for (auto It - : Result) { + LLVM_DEBUG(llvm::dbgs() << "After sort candidates\n"; for (auto It : Result) { MachineBasicBlock *MBB = It.first; auto &defInMBB = It.second; MBB->dump(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp index 0673346b11ab4..548bfa508c735 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp @@ -1023,8 +1023,7 @@ void HRB::buildLinear(std::vector &SUnits) { } LLVM_DEBUG( - dbgs() << "Chained Nodes:"; for (SUnit *SU - : ChainedNodes) { + dbgs() << "Chained Nodes:"; for (SUnit *SU : ChainedNodes) { dbgs() << " " << SU->NodeNum << "\n"; } for (unsigned i = 0; i < Lineages.size(); i++) { dbgs() << "Lineage" << i << ":"; @@ -1225,8 +1224,7 @@ void HRB::buildReachRelation(ArrayRef BotRoots) { } ReachMap.erase(&FakeEntry); - LLVM_DEBUG(for (Lineage &L - : Lineages) { + LLVM_DEBUG(for (Lineage &L : Lineages) { for (SUnit *SU : L.Nodes) { DenseSet &CurReach = ReachMap[SU]; dbgs() << SU->NodeNum << " reach: "; @@ -1687,8 +1685,7 @@ std::vector hrbSched(std::vector &SUnits, return ConfA > ConfB; }); - LLVM_DEBUG(dbgs() << "ReadyList:\n"; for (SUnit *SU - : ReadyList) { + LLVM_DEBUG(dbgs() << "ReadyList:\n"; for (SUnit *SU : ReadyList) { dbgs() << " " << SU->NodeNum; } dbgs() << "\n";); SUnit *Candidate = nullptr; From 87d9404f5a6f1e3921954380fc75ac1e88e72d59 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Mon, 31 Mar 2025 16:38:06 -0700 Subject: [PATCH 25/25] Fixed failing tests, and added tests --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp | 35 +- .../test/CodeGen/AMDGPU/remat/phi_pacifist.ll | 88 +++ .../CodeGen/AMDGPU/remat/phi_pacifist.mir | 372 ++++++++++++ .../CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir | 565 ++++++++++++++++++ 5 files changed, 1055 insertions(+), 7 deletions(-) create mode 100644 
llvm/test/CodeGen/AMDGPU/remat/phi_pacifist.ll create mode 100644 llvm/test/CodeGen/AMDGPU/remat/phi_pacifist.mir create mode 100644 llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index 8fecd9f7e2534..7d2e1a6d81db8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -2714,7 +2714,7 @@ bool tryHoldPacifist(MachineBasicBlock &MBB, LiveIntervals *LIS, SmallVector PacifistList; LLVM_DEBUG(dbgs() << "pacifist begin\n"); for (MachineInstr &MI : MBB) { - if (MI.isDebugInstr()) + if (MI.isDebugInstr() || MI.isPHI()) continue; if (collectPacifist(MI, InputLive, OutputLive, MRI)) { PacifistList.emplace_back(&MI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp index 55477bd39fb73..beace3a501a19 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp @@ -509,6 +509,14 @@ static void updateSubReg(MachineOperand &UseMO, } } +static unsigned getNumLanesIn32BitReg(Register Reg, const SIRegisterInfo *SIRI, + const MachineRegisterInfo &MRI) { + const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg); + const TargetRegisterClass *SubregRC = + SIRI->getSubRegisterClass(RC, AMDGPU::sub0); + return SubregRC->LaneMask.getNumLanes(); +} + bool reduceChannel(unsigned Offset, MachineInstr &MI, const MCInstrDesc &Desc, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, SlotIndexes *SlotIndexes) { @@ -526,7 +534,19 @@ bool reduceChannel(unsigned Offset, MachineInstr &MI, const MCInstrDesc &Desc, const llvm::TargetRegisterClass *NewRC = SIRI->getRegClass(Desc.operands().front().RegClass); - unsigned Size = NewRC->getLaneMask().getNumLanes(); + if (!NewRC->isAllocatable()) { + if (SIRI->isSGPRClass(NewRC)) + NewRC = SIRI->getSGPRClassForBitWidth(NewRC->MC->RegSizeInBits); + else if (SIRI->isVGPRClass(NewRC)) + NewRC = SIRI->getVGPRClassForBitWidth(NewRC->MC->RegSizeInBits); + else + return false; + + if (!NewRC->isAllocatable()) + return false; + } + + unsigned NumLanes = NewRC->getLaneMask().getNumLanes(); if (Offset > 0) { // Update offset operand in MI. MachineOperand *OffsetOp = @@ -573,8 +593,8 @@ bool reduceChannel(unsigned Offset, MachineInstr &MI, const MCInstrDesc &Desc, for (MachineOperand *UseMO : UseMOs) { updateSubReg(*UseMO, NewRC, Offset, SIRI); } - } else if (Size == 1) { - // Clear subReg when size is 1. + } else if (NumLanes == getNumLanesIn32BitReg(Reg, SIRI, MRI)) { + // Clear subReg when it's a single 32-bit reg. 
for (MachineOperand *UseMO : UseMOs) { UseMO->setSubReg(0); } @@ -630,7 +650,10 @@ bool removeUnusedLanes(llvm::MachineInstr &MI, MachineRegisterInfo &MRI, if (IsImm && Piece.Offset != 0) return false; - switch (Piece.Size) { + const unsigned Num32BitLanes = + Piece.Size / getNumLanesIn32BitReg(Reg, SIRI, MRI); + + switch (Num32BitLanes) { default: return false; case 1: @@ -645,7 +668,7 @@ bool removeUnusedLanes(llvm::MachineInstr &MI, MachineRegisterInfo &MRI, : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR), MRI, SIRI, SIII, SlotIndexes); case 3: - if (FullMask == 0xf) + if (FullMask == 0xff) return false; LLVM_FALLTHROUGH; case 4: @@ -657,7 +680,7 @@ bool removeUnusedLanes(llvm::MachineInstr &MI, MachineRegisterInfo &MRI, case 5: case 6: case 7: - if (FullMask == 0xff) + if (FullMask == 0xffff) return false; LLVM_FALLTHROUGH; case 8: diff --git a/llvm/test/CodeGen/AMDGPU/remat/phi_pacifist.ll b/llvm/test/CodeGen/AMDGPU/remat/phi_pacifist.ll new file mode 100644 index 0000000000000..3369486e0323a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/phi_pacifist.ll @@ -0,0 +1,88 @@ +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -amdgpu-enable-hot-block-remat -amdgpu-remat-enable-sub-exp-remat + +; Regression test for PHI being sinked to uses as a pacifist. +; Just checking that the test does not crash. + +; ModuleID = 'reduced.ll' +source_filename = "reduced.ll" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +target triple = "amdgcn--amdpal" + +define amdgpu_ps void @_amdgpu_ps_main(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, i32 %arg5, float %arg6, float %arg7, float %arg8, <2 x half> %arg9, i1 %arg10) #0 { +bb: + br label %bb19 + +bb11: ; preds = %bb19 + %i = bitcast i32 %i21 to float + %i12 = bitcast i32 %i23 to float + %i13 = fmul float 0.000000e+00, %i26 + %i14 = fmul float %i13, 0.000000e+00 + %i15 = fmul float %i12, %i + %i16 = fadd float %i15, %i14 + %i17 = select i1 false, float 0.000000e+00, float %i16 + %i18 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %arg4, float %arg8) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> %i18, <2 x half> %arg9, i1 false, i1 false) + ret void + +bb19: ; preds = %bb19, %bb + %i20 = phi i32 [ 0, %bb19 ], [ %arg5, %bb ] + %i21 = phi i32 [ %i35, %bb19 ], [ 0, %bb ] + %i22 = phi i32 [ %i38, %bb19 ], [ 0, %bb ] + %i23 = phi i32 [ %i60, %bb19 ], [ 0, %bb ] + %i24 = phi i32 [ %i61, %bb19 ], [ 0, %bb ] + %i25 = phi i32 [ %i62, %bb19 ], [ 0, %bb ] + %i26 = phi float [ %i39, %bb19 ], [ 0.000000e+00, %bb ] + %i27 = phi i32 [ %i49, %bb19 ], [ 0, %bb ] + %i28 = phi i32 [ %i50, %bb19 ], [ 0, %bb ] + %i29 = phi i32 [ %i51, %bb19 ], [ 0, %bb ] + %i30 = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32.v8i32(i32 1, i32 %i20, i32 0, <8 x i32> zeroinitializer, i32 0, i32 0) + %i31 = extractelement <4 x float> %i30, i64 0 + %i32 = fmul float %arg1, %i31 + %i33 = bitcast i32 %i22 to float + %i34 = fmul float %arg, %i32 + %i35 = select i1 %arg10, i32 %arg5, i32 %i21 + %i36 = fadd float 0.000000e+00, %i33 + %i37 = bitcast float %i36 to i32 + %i38 = select i1 %arg10, i32 %i22, i32 %i37 + %i39 = fadd float %i26, 1.000000e+00 + %i40 = bitcast i32 %i27 to float + %i41 = bitcast i32 %i28 to float + %i42 = bitcast i32 %i29 to float + %i43 = fadd float 0.000000e+00, %i40 + %i44 = fadd float 0.000000e+00, %i41 + %i45 = fadd float 
0.000000e+00, %i42 + %i46 = bitcast float %i43 to i32 + %i47 = bitcast float %i44 to i32 + %i48 = bitcast float %i45 to i32 + %i49 = select i1 %arg10, i32 %i27, i32 %i46 + %i50 = select i1 %arg10, i32 %i28, i32 %i47 + %i51 = select i1 %arg10, i32 %i29, i32 %i48 + %i52 = fmul float %i34, %arg7 + %i53 = bitcast i32 %i24 to float + %i54 = bitcast i32 %i25 to float + %i55 = fadd float %arg6, %i53 + %i56 = fadd float %arg2, %i54 + %i57 = bitcast float %i52 to i32 + %i58 = bitcast float %i55 to i32 + %i59 = bitcast float %i56 to i32 + %i60 = select i1 %arg10, i32 %i57, i32 %i23 + %i61 = select i1 %arg10, i32 %i58, i32 %i24 + %i62 = select i1 %arg10, i32 %i59, i32 %i25 + %i63 = sitofp i32 %i20 to float + %i64 = fcmp olt float %arg3, %i63 + br i1 %i64, label %bb11, label %bb19 +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) +declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #2 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read) +declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32.v8i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #3 + +attributes #0 = { "target-features"=",+wavefrontsize64,+cumode,-xnack" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } +attributes #3 = { nocallback nofree nosync nounwind willreturn memory(read) } diff --git a/llvm/test/CodeGen/AMDGPU/remat/phi_pacifist.mir b/llvm/test/CodeGen/AMDGPU/remat/phi_pacifist.mir new file mode 100644 index 0000000000000..e9a8486bfa6b1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/phi_pacifist.mir @@ -0,0 +1,372 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-sub-exp-remat | FileCheck %s + +# Regression test for PHI being sinked to uses as a pacifist. 
+ +# CHECK: bb.2.bb19: +# CHECK: PHI +# CHECK-NEXT: PHI +# CHECK-NEXT: PHI +# CHECK-NEXT: PHI +# CHECK-NEXT: PHI +# CHECK-NEXT: PHI +# CHECK-NEXT: PHI +# CHECK-NEXT: PHI +# CHECK-NEXT: PHI +# CHECK-NEXT: PHI +# CHECK-NEXT: PHI + +--- | + ; ModuleID = 'C:\llvm-project\llvm\test\CodeGen\AMDGPU\remat\phi_pacifist.ll' + source_filename = "reduced.ll" + target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" + target triple = "amdgcn" + + define amdgpu_ps void @_amdgpu_ps_main(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, i32 %arg5, float %arg6, float %arg7, float %arg8, <2 x half> %arg9, i1 %arg10) #0 { + bb: + br label %bb19, !amdgpu.uniform !0 + + bb11: ; preds = %bb19 + %i21.lcssa = phi i32 [ %i21, %bb19 ] + %i23.lcssa = phi i32 [ %i23, %bb19 ] + %i26.lcssa = phi float [ %i26, %bb19 ] + %.lcssa = phi i64 [ %0, %bb19 ] + call void @llvm.amdgcn.end.cf.i64(i64 %.lcssa) + %i = bitcast i32 %i21.lcssa to float + %i12 = bitcast i32 %i23.lcssa to float + %i13 = fmul float 0.000000e+00, %i26.lcssa + %i18 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %arg4, float %arg8) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> %i18, <2 x half> %arg9, i1 false, i1 false) + ret void + + bb19: ; preds = %bb19, %bb + %phi.broken = phi i64 [ %0, %bb19 ], [ 0, %bb ] + %i20 = phi i32 [ %arg5, %bb ], [ 0, %bb19 ] + %i21 = phi i32 [ 0, %bb ], [ %i35, %bb19 ] + %i22 = phi i32 [ 0, %bb ], [ %i38, %bb19 ] + %i23 = phi i32 [ 0, %bb ], [ %i60, %bb19 ] + %i24 = phi i32 [ 0, %bb ], [ %i61, %bb19 ] + %i25 = phi i32 [ 0, %bb ], [ %i62, %bb19 ] + %i26 = phi float [ 0.000000e+00, %bb ], [ %i39, %bb19 ] + %i27 = phi i32 [ 0, %bb ], [ %i49, %bb19 ] + %i28 = phi i32 [ 0, %bb ], [ %i50, %bb19 ] + %i29 = phi i32 [ 0, %bb ], [ %i51, %bb19 ] + %i30 = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32.v8i32(i32 1, i32 %i20, i32 0, <8 x i32> zeroinitializer, i32 0, i32 0) + %i31 = extractelement <4 x float> %i30, i64 0 + %i32 = fmul float %arg1, %i31 + %i33 = bitcast i32 %i22 to float + %i34 = fmul float %arg, %i32 + %i35 = select i1 %arg10, i32 %arg5, i32 %i21 + %i36 = fadd float 0.000000e+00, %i33 + %i37 = bitcast float %i36 to i32 + %i38 = select i1 %arg10, i32 %i22, i32 %i37 + %i39 = fadd float %i26, 1.000000e+00 + %i40 = bitcast i32 %i27 to float + %i41 = bitcast i32 %i28 to float + %i42 = bitcast i32 %i29 to float + %i43 = fadd float 0.000000e+00, %i40 + %i44 = fadd float 0.000000e+00, %i41 + %i45 = fadd float 0.000000e+00, %i42 + %i46 = bitcast float %i43 to i32 + %i47 = bitcast float %i44 to i32 + %i48 = bitcast float %i45 to i32 + %i49 = select i1 %arg10, i32 %i27, i32 %i46 + %i50 = select i1 %arg10, i32 %i28, i32 %i47 + %i51 = select i1 %arg10, i32 %i29, i32 %i48 + %i52 = fmul float %i34, %arg7 + %i53 = bitcast i32 %i24 to float + %i54 = bitcast i32 %i25 to float + %i55 = fadd float %arg6, %i53 + %i56 = fadd float %arg2, %i54 + %i57 = bitcast float %i52 to i32 + %i58 = bitcast float %i55 to i32 + %i59 = bitcast float %i56 to i32 + %i60 = select i1 %arg10, i32 %i57, i32 %i23 + %i61 = select i1 %arg10, i32 %i58, i32 %i24 + %i62 = select i1 %arg10, i32 %i59, i32 %i25 + %i63 = sitofp i32 %i20 to float + %i64 = fcmp olt float %arg3, %i63 + %0 = call i64 @llvm.amdgcn.if.break.i64(i1 %i64, i64 %phi.broken) + %1 = call i1 @llvm.amdgcn.loop.i64(i64 %0) + br i1 %1, label %bb11, label %bb19 + } + + ; Function 
Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) + declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 + + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) + declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #2 + + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read) + declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32.v8i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #3 + + ; Function Attrs: nocallback nofree nounwind willreturn memory(none) + declare i64 @llvm.amdgcn.if.break.i64(i1, i64) #4 + + ; Function Attrs: nocallback nofree nounwind willreturn + declare i1 @llvm.amdgcn.loop.i64(i64) #5 + + ; Function Attrs: nocallback nofree nounwind willreturn + declare void @llvm.amdgcn.end.cf.i64(i64) #5 + + attributes #0 = { "target-cpu"="gfx1010" "target-features"=",+wavefrontsize64,+cumode,-xnack" } + attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx1010" } + attributes #2 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) "target-cpu"="gfx1010" } + attributes #3 = { nocallback nofree nosync nounwind willreturn memory(read) "target-cpu"="gfx1010" } + attributes #4 = { nocallback nofree nounwind willreturn memory(none) } + attributes #5 = { nocallback nofree nounwind willreturn } + + !0 = !{} + +... +--- +name: _amdgpu_ps_main +alignment: 1 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +noPhis: false +isSSA: true +noVRegs: false +hasFakeUses: false +callsEHReturn: false +callsUnwindInit: false +hasEHContTarget: false +hasEHScopes: false +hasEHFunclets: false +isOutlined: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: false +registers: + - { id: 0, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 1, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 2, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 3, class: sreg_64, preferred-register: '', flags: [ ] } + - { id: 4, class: sreg_64, preferred-register: '', flags: [ ] } + - { id: 5, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 6, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 7, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 8, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 9, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 10, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 11, class: sgpr_32, preferred-register: '', flags: [ ] } + - { id: 12, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 13, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 14, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 15, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 16, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 17, class: sgpr_32, preferred-register: '', flags: [ ] } + - { id: 18, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 19, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 20, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 21, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 22, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 23, class: vgpr_32, preferred-register: '', flags: 
[ ] } + - { id: 24, class: sreg_64, preferred-register: '', flags: [ ] } + - { id: 25, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 26, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 27, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 28, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 29, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 30, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 31, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 32, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 33, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 34, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 35, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 36, class: sgpr_32, preferred-register: '', flags: [ ] } + - { id: 37, class: sreg_64_xexec, preferred-register: '', flags: [ ] } + - { id: 38, class: sreg_64, preferred-register: '', flags: [ ] } + - { id: 39, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 40, class: sgpr_32, preferred-register: '', flags: [ ] } + - { id: 41, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 42, class: sreg_64_xexec, preferred-register: '$vcc', flags: [ ] } + - { id: 43, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 44, class: sreg_32, preferred-register: '', flags: [ ] } + - { id: 45, class: sgpr_256, preferred-register: '', flags: [ ] } + - { id: 46, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 47, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 48, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 49, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 50, class: sreg_64_xexec, preferred-register: '', flags: [ ] } + - { id: 51, class: sgpr_32, preferred-register: '', flags: [ ] } + - { id: 52, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 53, class: sreg_64_xexec, preferred-register: '', flags: [ ] } + - { id: 54, class: sgpr_32, preferred-register: '', flags: [ ] } + - { id: 55, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 56, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 57, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 58, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 59, class: sreg_64_xexec, preferred-register: '', flags: [ ] } + - { id: 60, class: sreg_64_xexec, preferred-register: '', flags: [ ] } + - { id: 61, class: sreg_64_xexec, preferred-register: '', flags: [ ] } + - { id: 62, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 63, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 64, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 65, class: sreg_64_xexec, preferred-register: '', flags: [ ] } + - { id: 66, class: sreg_64_xexec, preferred-register: '', flags: [ ] } + - { id: 67, class: sreg_64_xexec, preferred-register: '', flags: [ ] } + - { id: 68, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 69, class: sreg_64, preferred-register: '$vcc', flags: [ ] } + - { id: 70, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 71, class: sreg_32, preferred-register: '', flags: [ ] } + - { id: 72, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 73, class: sreg_32, preferred-register: '', flags: [ ] } + - { id: 74, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 75, class: vgpr_32, preferred-register: '', flags: [ 
] } + - { id: 76, class: vgpr_32, preferred-register: '', flags: [ ] } +liveins: + - { reg: '$vgpr0', virtual-reg: '%25' } + - { reg: '$vgpr1', virtual-reg: '%26' } + - { reg: '$vgpr2', virtual-reg: '%27' } + - { reg: '$vgpr3', virtual-reg: '%28' } + - { reg: '$vgpr4', virtual-reg: '%29' } + - { reg: '$vgpr5', virtual-reg: '%30' } + - { reg: '$vgpr6', virtual-reg: '%31' } + - { reg: '$vgpr7', virtual-reg: '%32' } + - { reg: '$vgpr8', virtual-reg: '%33' } + - { reg: '$vgpr9', virtual-reg: '%34' } + - { reg: '$vgpr10', virtual-reg: '%35' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + isCalleeSavedInfoValid: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +entry_values: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: + explicitKernArgSize: 0 + maxKernArgAlign: 4 + ldsSize: 0 + gdsSize: 0 + dynLDSAlign: 1 + isEntryFunction: true + isChainFunction: false + noSignedZerosFPMath: false + memoryBound: false + waveLimiter: false + hasSpilledSGPRs: false + hasSpilledVGPRs: false + scratchRSrcReg: '$sgpr100_sgpr101_sgpr102_sgpr103' + frameOffsetReg: '$fp_reg' + stackPtrOffsetReg: '$sgpr32' + bytesInStackArgArea: 0 + returnsVoid: true + argumentInfo: + privateSegmentWaveByteOffset: { reg: '$sgpr0' } + psInputAddr: 2047 + psInputEnable: 2047 + maxMemoryClusterDWords: 8 + mode: + ieee: false + dx10-clamp: true + fp32-input-denormals: true + fp32-output-denormals: true + fp64-fp16-input-denormals: true + fp64-fp16-output-denormals: true + highBitsOf32BitAddress: 0 + occupancy: 20 + vgprForAGPRCopy: '' + sgprForEXECCopy: '$sgpr104_sgpr105' + longBranchReservedReg: '' + hasInitWholeWave: false +body: | + bb.0.bb: + successors: %bb.2(0x80000000) + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 + + %35:vgpr_32 = COPY $vgpr10 + %34:vgpr_32 = COPY $vgpr9 + %33:vgpr_32 = COPY $vgpr8 + %32:vgpr_32 = COPY $vgpr7 + %31:vgpr_32 = COPY $vgpr6 + %30:vgpr_32 = COPY $vgpr5 + %29:vgpr_32 = COPY $vgpr4 + %28:vgpr_32 = COPY $vgpr3 + %27:vgpr_32 = COPY $vgpr2 + %26:vgpr_32 = COPY $vgpr1 + %25:vgpr_32 = COPY $vgpr0 + %41:vgpr_32 = V_AND_B32_e64 1, %35, implicit $exec + %42:sreg_64_xexec = V_CMP_EQ_U32_e64 1, killed %41, implicit $exec + %39:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %38:sreg_64 = S_MOV_B64 0 + %76:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %51:sgpr_32 = S_MOV_B32 0 + %45:sgpr_256 = REG_SEQUENCE %51, %subreg.sub0, %51, %subreg.sub1, %51, %subreg.sub2, %51, %subreg.sub3, %51, %subreg.sub4, %51, %subreg.sub5, %51, %subreg.sub6, %51, %subreg.sub7 + S_BRANCH %bb.2 + + bb.1.bb11: + SI_END_CF %24, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + %70:vgpr_32 = nofpexcept V_CVT_PKRTZ_F16_F32_e64 0, %29, 0, %33, 0, 0, implicit $mode, implicit $exec + %71:sreg_32 = IMPLICIT_DEF + %72:vgpr_32 = COPY %71 + %73:sreg_32 = IMPLICIT_DEF + %74:vgpr_32 = COPY %73 + EXP 0, killed %70, %34, %72, %74, 0, -1, 0, implicit $exec + S_ENDPGM 0 + + bb.2.bb19: + successors: %bb.1(0x04000000), %bb.2(0x7c000000) + + %4:sreg_64 = PHI %38, %bb.0, %24, %bb.2 + %5:vgpr_32 = PHI %30, %bb.0, %76, %bb.2 + %6:vgpr_32 = PHI %39, 
%bb.0, %15, %bb.2 + %7:vgpr_32 = PHI %39, %bb.0, %16, %bb.2 + %8:vgpr_32 = PHI %39, %bb.0, %21, %bb.2 + %9:vgpr_32 = PHI %39, %bb.0, %22, %bb.2 + %10:vgpr_32 = PHI %39, %bb.0, %23, %bb.2 + %75:vgpr_32 = PHI %76, %bb.0, %55, %bb.2 + %12:vgpr_32 = PHI %39, %bb.0, %18, %bb.2 + %13:vgpr_32 = PHI %39, %bb.0, %19, %bb.2 + %14:vgpr_32 = PHI %39, %bb.0, %20, %bb.2 + %46:vgpr_32 = IMAGE_LOAD_V1_V2_nsa_gfx10 %5, %76, %45, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) + %48:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %26, 0, killed %46, 0, 0, implicit $mode, implicit $exec + %49:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %25, 0, killed %48, 0, 0, implicit $mode, implicit $exec + %15:vgpr_32 = V_CNDMASK_B32_e64 0, %6, 0, %30, %42, implicit $exec + %52:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %7, 0, 0, implicit $mode, implicit $exec + %16:vgpr_32 = V_CNDMASK_B32_e64 0, killed %52, 0, %7, %42, implicit $exec + %55:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 1065353216, 0, %75, 0, 0, implicit $mode, implicit $exec + %56:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %12, 0, 0, implicit $mode, implicit $exec + %57:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %13, 0, 0, implicit $mode, implicit $exec + %58:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %14, 0, 0, implicit $mode, implicit $exec + %18:vgpr_32 = V_CNDMASK_B32_e64 0, killed %56, 0, %12, %42, implicit $exec + %19:vgpr_32 = V_CNDMASK_B32_e64 0, killed %57, 0, %13, %42, implicit $exec + %20:vgpr_32 = V_CNDMASK_B32_e64 0, killed %58, 0, %14, %42, implicit $exec + %62:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %49, 0, %32, 0, 0, implicit $mode, implicit $exec + %63:vgpr_32 = nofpexcept V_ADD_F32_e64 0, %31, 0, %9, 0, 0, implicit $mode, implicit $exec + %64:vgpr_32 = nofpexcept V_ADD_F32_e64 0, %27, 0, %10, 0, 0, implicit $mode, implicit $exec + %21:vgpr_32 = V_CNDMASK_B32_e64 0, %8, 0, killed %62, %42, implicit $exec + %22:vgpr_32 = V_CNDMASK_B32_e64 0, %9, 0, killed %63, %42, implicit $exec + %23:vgpr_32 = V_CNDMASK_B32_e64 0, %10, 0, killed %64, %42, implicit $exec + %68:vgpr_32 = V_CVT_F32_I32_e64 %5, 0, 0, implicit $mode, implicit $exec + %69:sreg_64 = nofpexcept V_CMP_LT_F32_e64 0, %28, 0, killed %68, 0, implicit $mode, implicit $exec + %24:sreg_64 = SI_IF_BREAK killed %69, %4, implicit-def dead $scc + SI_LOOP %24, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.1 + +... diff --git a/llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir b/llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir new file mode 100644 index 0000000000000..9f5d402340329 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir @@ -0,0 +1,565 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-remat-enable-hot-block-remat-aggressive -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat | FileCheck %s + +# Check that the buffer loads have been moved to the use and the lanes are reduced +# correctly. 
+# +# CHECK: bb.2: +#========================================================================== +# X4_IMM, Using .x +# CHECK: %[[#reg0:]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %{{.+}}, 0, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg0]], %{{.+}}, 0, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg0]], %{{.+}}, 4, 0 +# X4_IMM, Using .xy +# CHECK: %[[#reg1:]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM %{{.+}}, 16, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg1]].sub0, %{{.+}}, 16, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg1]].sub1, %{{.+}}, 20, 0 +# X4_IMM, Using .xyz +# CHECK: %[[#reg2:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 32, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg2]].sub0, %{{.+}}, 32, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg2]].sub1, %{{.+}}, 36, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg2]].sub2, %{{.+}}, 40, 0 +# X4_IMM, Using .yz +# CHECK: %[[#reg3:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 48, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg3]].sub1, %{{.+}}, 48, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg3]].sub2, %{{.+}}, 52, 0 +# X4_IMM, Using .yzw +# CHECK: %[[#reg4:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 64, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg4]].sub1, %{{.+}}, 64, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg4]].sub2, %{{.+}}, 68, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg4]].sub3, %{{.+}}, 72, 0 +#========================================================================== +# X8_IMM, Using .x +# CHECK: %[[#reg5:]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %{{.+}}, 80, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg5]], %{{.+}}, 80, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg5]], %{{.+}}, 84, 0 +# X8_IMM, Using .xy +# CHECK: %[[#reg6:]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM %{{.+}}, 96, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg6]].sub0, %{{.+}}, 96, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg6]].sub1, %{{.+}}, 100, 0 +# X8_IMM, Using .xyz +# CHECK: %[[#reg7:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 112, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg7]].sub0, %{{.+}}, 112, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg7]].sub1, %{{.+}}, 116, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg7]].sub2, %{{.+}}, 120, 0 +# X8_IMM, Using .xyzw +# CHECK: %[[#reg8:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 128, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub0, %{{.+}}, 128, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub1, %{{.+}}, 132, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub2, %{{.+}}, 136, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub3, %{{.+}}, 140, 0 +# X8_IMM, Using .xyzw + 5th dword +# CHECK: %[[#reg9:]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %{{.+}}, 144, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub0, %{{.+}}, 144, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub1, %{{.+}}, 148, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub2, %{{.+}}, 152, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub3, %{{.+}}, 156, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub4, %{{.+}}, 160, 0 +#========================================================================== +# X16_IMM, Using .xy and .zw +# CHECK: %[[#reg10:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 160, 0 +# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg10]].sub0_sub1, %{{.+}}, 160, 0 +# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg10]].sub2_sub3, %{{.+}}, 164, 0 +#========================================================================== +# X4_SGPR, Using .x +# CHECK: %[[#reg11:]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %{{.+}}, %{{.+}}, 0 
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg11]], %{{.+}}, 176, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg11]], %{{.+}}, 180, 0 +# X8_SGPR, Using .xy +# CHECK: %[[#reg12:]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_SGPR %{{.+}}, %{{.+}}, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg12]].sub0, %{{.+}}, 192, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg12]].sub1, %{{.+}}, 196, 0 +# X16_SGPR, Using .xy + .zw +# CHECK: %[[#reg13:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR %{{.+}}, %{{.+}}, 0 +# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg13]].sub0_sub1, %{{.+}}, 208, 0 +# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg13]].sub2_sub3, %{{.+}}, 216, 0 +#========================================================================== +# +# +# CHECK: %[[#reg14:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 224, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg14]], %{{.+}}, 224, 0 +# CHECK: %[[#reg15:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 240, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg15]], %{{.+}}, 240, 0 +# CHECK: %[[#reg16:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 256, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg16]], %{{.+}}, 256, 0 +# CHECK: %[[#reg17:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 272, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg17]], %{{.+}}, 272, 0 +# CHECK: %[[#reg18:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 288, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg18]], %{{.+}}, 288, 0 +# CHECK: %[[#reg19:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 304, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg19]], %{{.+}}, 304, 0 +# CHECK: %[[#reg20:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 320, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg20]], %{{.+}}, 320, 0 +# CHECK: %[[#reg21:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 336, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg21]], %{{.+}}, 336, 0 +# CHECK: %[[#reg22:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 352, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg22]], %{{.+}}, 352, 0 +# CHECK: %[[#reg23:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 368, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg23]], %{{.+}}, 368, 0 +# CHECK: %[[#reg24:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 384, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg24]], %{{.+}}, 384, 0 +# CHECK: %[[#reg25:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 400, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg25]], %{{.+}}, 400, 0 +# CHECK: %[[#reg26:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 416, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg26]], %{{.+}}, 416, 0 +# CHECK: %[[#reg27:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 432, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg27]], %{{.+}}, 432, 0 +# CHECK: %[[#reg28:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 448, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg28]], %{{.+}}, 448, 0 +# CHECK: %[[#reg29:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 464, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg29]], %{{.+}}, 464, 0 +# CHECK: %[[#reg30:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 480, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg30]], %{{.+}}, 480, 0 +# CHECK: %[[#reg31:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 496, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg31]], %{{.+}}, 496, 0 +# CHECK: %[[#reg32:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 512, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg32]], %{{.+}}, 512, 0 +# CHECK: %[[#reg33:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 528, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg33]], %{{.+}}, 
528, 0 +# CHECK: %[[#reg34:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 544, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg34]], %{{.+}}, 544, 0 +# CHECK: %[[#reg35:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 560, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg35]], %{{.+}}, 560, 0 +# CHECK: %[[#reg36:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 576, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg36]], %{{.+}}, 576, 0 +# CHECK: %[[#reg37:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 592, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg37]], %{{.+}}, 592, 0 +# CHECK: %[[#reg38:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 608, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg38]], %{{.+}}, 608, 0 +# CHECK: %[[#reg39:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 624, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg39]], %{{.+}}, 624, 0 +# CHECK: %[[#reg40:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 640, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg40]], %{{.+}}, 640, 0 +# CHECK: %[[#reg41:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 656, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg41]], %{{.+}}, 656, 0 +# CHECK: %[[#reg42:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 672, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg42]], %{{.+}}, 672, 0 +# CHECK: %[[#reg43:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 688, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg43]], %{{.+}}, 688, 0 +# CHECK: %[[#reg44:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 704, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg44]], %{{.+}}, 704, 0 +# CHECK: %[[#reg45:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 720, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg45]], %{{.+}}, 720, 0 +# CHECK: %[[#reg46:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 736, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg46]], %{{.+}}, 736, 0 +# CHECK: %[[#reg47:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 752, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg47]], %{{.+}}, 752, 0 +# CHECK: %[[#reg48:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 768, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg48]], %{{.+}}, 768, 0 +# CHECK: %[[#reg49:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 784, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg49]], %{{.+}}, 784, 0 +# CHECK: %[[#reg50:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 800, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg50]], %{{.+}}, 800, 0 +# CHECK: %[[#reg51:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 816, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg51]], %{{.+}}, 816, 0 +# CHECK: %[[#reg52:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 832, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg52]], %{{.+}}, 832, 0 +# CHECK: %[[#reg53:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 848, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg53]], %{{.+}}, 848, 0 +# CHECK: %[[#reg54:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 864, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg54]], %{{.+}}, 864, 0 +# CHECK: %[[#reg55:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 880, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg55]], %{{.+}}, 880, 0 +# CHECK: %[[#reg56:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 896, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg56]], %{{.+}}, 896, 0 +# CHECK: %[[#reg57:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 912, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg57]], %{{.+}}, 912, 0 +# CHECK: %[[#reg58:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 928, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg58]], %{{.+}}, 928, 
0 +# CHECK: %[[#reg59:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 944, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg59]], %{{.+}}, 944, 0 +# CHECK: %[[#reg60:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 960, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg60]], %{{.+}}, 960, 0 +# CHECK: %[[#reg61:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 976, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg61]], %{{.+}}, 976, 0 +# CHECK: %[[#reg62:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 992, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg62]], %{{.+}}, 992, 0 +# CHECK: %[[#reg63:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 1008, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg63]], %{{.+}}, 1008, 0 + + +--- | + source_filename = ".\main.ll" + define amdgpu_ps void @main() #1 { + ret void + } + attributes #1 = { "target-cpu"="gfx1010" } + !llvm.ident = !{!0} + !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"} +... +--- +name: main +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } + - { reg: '$sgpr1' } + - { reg: '$sgpr2' } + - { reg: '$sgpr3' } + - { reg: '$sgpr4' } + - { reg: '$sgpr5' } + - { reg: '$sgpr6' } + - { reg: '$sgpr7' } + - { reg: '$sgpr8' } + - { reg: '$sgpr9' } + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $vgpr0, $vgpr1 + + %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1 + %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3 + %2:sgpr_128 = REG_SEQUENCE $sgpr8, %subreg.sub0, $sgpr9, %subreg.sub1, $sgpr10, %subreg.sub2, $sgpr11, %subreg.sub3 + + ; X4_IMM + %3000:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 0, 0 + %3001:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 16, 0 + %3002:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 32, 0 + %3003:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 48, 0 + %3004:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 64, 0 + + ; X8_IMM + %3005:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 80, 0 + %3006:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 96, 0 + %3007:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 112, 0 + %3008:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 128, 0 + %3009:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 144, 0 + + ; X16_IMM + %30010:sgpr_512 = S_BUFFER_LOAD_DWORDX16_IMM %2:sgpr_128, 160, 0 + + ; X4_SGPR + %50:sgpr_32 = COPY $sgpr0 + %30011:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR %2:sgpr_128, %50, 0 + + ; X8_SGPR + %51:sgpr_32 = COPY $sgpr1 + %30012:sgpr_256 = S_BUFFER_LOAD_DWORDX8_SGPR %2:sgpr_128, %51, 0 + + ; X16_SGPR + %52:sgpr_32 = COPY $sgpr2 + %30013:sgpr_512 = S_BUFFER_LOAD_DWORDX16_SGPR %2:sgpr_128, %52, 0 + + %30014:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 224, 0 + %30015:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 240, 0 + %30016:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 256, 0 + %30017:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 272, 0 + %30018:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 288, 0 + %30019:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 304, 0 + %30020:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 320, 0 + %30021:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 336, 0 + %30022:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 352, 0 + %30023:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 368, 0 + %30024:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 384, 0 + %30025:sgpr_128 =
S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 400, 0 + %30026:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 416, 0 + %30027:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 432, 0 + %30028:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 448, 0 + %30029:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 464, 0 + %30030:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 480, 0 + %30031:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 496, 0 + %30032:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 512, 0 + %30033:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 528, 0 + %30034:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 544, 0 + %30035:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 560, 0 + %30036:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 576, 0 + %30037:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 592, 0 + %30038:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 608, 0 + %30039:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 624, 0 + %30040:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 640, 0 + %30041:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 656, 0 + %30042:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 672, 0 + %30043:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 688, 0 + %30044:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 704, 0 + %30045:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 720, 0 + %30046:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 736, 0 + %30047:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 752, 0 + %30048:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 768, 0 + %30049:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 784, 0 + %30050:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 800, 0 + %30051:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 816, 0 + %30052:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 832, 0 + %30053:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 848, 0 + %30054:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 864, 0 + %30055:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 880, 0 + %30056:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 896, 0 + %30057:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 912, 0 + %30058:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 928, 0 + %30059:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 944, 0 + %30060:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 960, 0 + %30061:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 976, 0 + %30062:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 992, 0 + %30063:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 1008, 0 + + %100:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %101:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %102:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %103:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %104:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %105:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %106:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %107:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %108:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %109:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1010:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1011:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1012:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1013:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1014:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1015:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1016:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1017:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1018:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1019:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + 
%1020:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1021:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1022:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1023:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1024:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1025:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1026:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1027:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1028:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1029:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1030:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1031:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1032:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1033:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1034:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1035:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1036:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1037:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1038:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1039:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1040:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1041:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1042:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1043:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1044:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1045:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1046:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1047:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1048:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1049:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1050:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1051:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1052:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1053:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1054:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1055:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1056:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1057:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1058:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1059:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1060:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1061:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1062:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1063:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + + + %8000:vgpr_32 = IMPLICIT_DEF + %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %116:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + %8001:vgpr_32 = COPY %8000 + S_BRANCH %bb.2 + + bb.2: + + %3:vgpr_32 = IMPLICIT_DEF + ;========================================================================== + ; X4_IMM, Using .x + S_BUFFER_STORE_DWORD_IMM %3000.sub0, %1:sgpr_128, 0, 0 + S_BUFFER_STORE_DWORD_IMM %3000.sub0, %1:sgpr_128, 4, 0 ; Do it a second time, since the lane reduction triggers on clone, and clone only happens when there are multiple uses. 
+ + ; X4_IMM, Using .xy + S_BUFFER_STORE_DWORD_IMM %3001.sub0, %1:sgpr_128, 16, 0 + S_BUFFER_STORE_DWORD_IMM %3001.sub1, %1:sgpr_128, 20, 0 + + ; X4_IMM, Using .xyz + S_BUFFER_STORE_DWORD_IMM %3002.sub0, %1:sgpr_128, 32, 0 + S_BUFFER_STORE_DWORD_IMM %3002.sub1, %1:sgpr_128, 36, 0 + S_BUFFER_STORE_DWORD_IMM %3002.sub2, %1:sgpr_128, 40, 0 + + ; X4_IMM, Using .yz + S_BUFFER_STORE_DWORD_IMM %3003.sub1, %1:sgpr_128, 48, 0 + S_BUFFER_STORE_DWORD_IMM %3003.sub2, %1:sgpr_128, 52, 0 + + ; X4_IMM, Using .yzw + S_BUFFER_STORE_DWORD_IMM %3004.sub1, %1:sgpr_128, 64, 0 + S_BUFFER_STORE_DWORD_IMM %3004.sub2, %1:sgpr_128, 68, 0 + S_BUFFER_STORE_DWORD_IMM %3004.sub3, %1:sgpr_128, 72, 0 + + ;========================================================================== + ; X8_IMM, Using .x + S_BUFFER_STORE_DWORD_IMM %3005.sub0, %1:sgpr_128, 80, 0 + S_BUFFER_STORE_DWORD_IMM %3005.sub0, %1:sgpr_128, 84, 0 + + ; X8_IMM, Using .xy + S_BUFFER_STORE_DWORD_IMM %3006.sub0, %1:sgpr_128, 96, 0 + S_BUFFER_STORE_DWORD_IMM %3006.sub1, %1:sgpr_128, 100, 0 + + ; X8_IMM, Using .xyz + S_BUFFER_STORE_DWORD_IMM %3007.sub0, %1:sgpr_128, 112, 0 + S_BUFFER_STORE_DWORD_IMM %3007.sub1, %1:sgpr_128, 116, 0 + S_BUFFER_STORE_DWORD_IMM %3007.sub2, %1:sgpr_128, 120, 0 + + ; X8_IMM, Using .xyzw + S_BUFFER_STORE_DWORD_IMM %3008.sub0, %1:sgpr_128, 128, 0 + S_BUFFER_STORE_DWORD_IMM %3008.sub1, %1:sgpr_128, 132, 0 + S_BUFFER_STORE_DWORD_IMM %3008.sub2, %1:sgpr_128, 136, 0 + S_BUFFER_STORE_DWORD_IMM %3008.sub3, %1:sgpr_128, 140, 0 + + ; X8_IMM, Using .xyzw + 5th dword + S_BUFFER_STORE_DWORD_IMM %3009.sub0, %1:sgpr_128, 144, 0 + S_BUFFER_STORE_DWORD_IMM %3009.sub1, %1:sgpr_128, 148, 0 + S_BUFFER_STORE_DWORD_IMM %3009.sub2, %1:sgpr_128, 152, 0 + S_BUFFER_STORE_DWORD_IMM %3009.sub3, %1:sgpr_128, 156, 0 + S_BUFFER_STORE_DWORD_IMM %3009.sub4, %1:sgpr_128, 160, 0 + + ;========================================================================== + ; X16_IMM, Using .xy and .zw + S_BUFFER_STORE_DWORDX2_IMM %30010.sub0_sub1, %1:sgpr_128, 160, 0 + S_BUFFER_STORE_DWORDX2_IMM %30010.sub2_sub3, %1:sgpr_128, 164, 0 + + ;========================================================================== + ; X4_SGPR, Using .x + S_BUFFER_STORE_DWORD_IMM %30011.sub0, %1:sgpr_128, 176, 0 + S_BUFFER_STORE_DWORD_IMM %30011.sub0, %1:sgpr_128, 180, 0 + + ; X8_SGPR, Using .xy + S_BUFFER_STORE_DWORD_IMM %30012.sub0, %1:sgpr_128, 192, 0 + S_BUFFER_STORE_DWORD_IMM %30012.sub1, %1:sgpr_128, 196, 0 + + ; X16_SGPR, Using .xy + .zw + S_BUFFER_STORE_DWORDX2_IMM %30013.sub0_sub1, %1:sgpr_128, 208, 0 + S_BUFFER_STORE_DWORDX2_IMM %30013.sub2_sub3, %1:sgpr_128, 216, 0 + + ;========================================================================== + S_BUFFER_STORE_DWORDX4_IMM killed %30014:sgpr_128, %1:sgpr_128, 224, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30015:sgpr_128, %1:sgpr_128, 240, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30016:sgpr_128, %1:sgpr_128, 256, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30017:sgpr_128, %1:sgpr_128, 272, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30018:sgpr_128, %1:sgpr_128, 288, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30019:sgpr_128, %1:sgpr_128, 304, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30020:sgpr_128, %1:sgpr_128, 320, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30021:sgpr_128, %1:sgpr_128, 336, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30022:sgpr_128, %1:sgpr_128, 352, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30023:sgpr_128, %1:sgpr_128, 368, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30024:sgpr_128, %1:sgpr_128, 384, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30025:sgpr_128, 
%1:sgpr_128, 400, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30026:sgpr_128, %1:sgpr_128, 416, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30027:sgpr_128, %1:sgpr_128, 432, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30028:sgpr_128, %1:sgpr_128, 448, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30029:sgpr_128, %1:sgpr_128, 464, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30030:sgpr_128, %1:sgpr_128, 480, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30031:sgpr_128, %1:sgpr_128, 496, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30032:sgpr_128, %1:sgpr_128, 512, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30033:sgpr_128, %1:sgpr_128, 528, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30034:sgpr_128, %1:sgpr_128, 544, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30035:sgpr_128, %1:sgpr_128, 560, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30036:sgpr_128, %1:sgpr_128, 576, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30037:sgpr_128, %1:sgpr_128, 592, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30038:sgpr_128, %1:sgpr_128, 608, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30039:sgpr_128, %1:sgpr_128, 624, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30040:sgpr_128, %1:sgpr_128, 640, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30041:sgpr_128, %1:sgpr_128, 656, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30042:sgpr_128, %1:sgpr_128, 672, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30043:sgpr_128, %1:sgpr_128, 688, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30044:sgpr_128, %1:sgpr_128, 704, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30045:sgpr_128, %1:sgpr_128, 720, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30046:sgpr_128, %1:sgpr_128, 736, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30047:sgpr_128, %1:sgpr_128, 752, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30048:sgpr_128, %1:sgpr_128, 768, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30049:sgpr_128, %1:sgpr_128, 784, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30050:sgpr_128, %1:sgpr_128, 800, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30051:sgpr_128, %1:sgpr_128, 816, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30052:sgpr_128, %1:sgpr_128, 832, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30053:sgpr_128, %1:sgpr_128, 848, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30054:sgpr_128, %1:sgpr_128, 864, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30055:sgpr_128, %1:sgpr_128, 880, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30056:sgpr_128, %1:sgpr_128, 896, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30057:sgpr_128, %1:sgpr_128, 912, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30058:sgpr_128, %1:sgpr_128, 928, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30059:sgpr_128, %1:sgpr_128, 944, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30060:sgpr_128, %1:sgpr_128, 960, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30061:sgpr_128, %1:sgpr_128, 976, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30062:sgpr_128, %1:sgpr_128, 992, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30063:sgpr_128, %1:sgpr_128, 1008, 0 + + EXP 0, killed %100, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %101, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %102, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %103, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %104, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %105, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %106, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %107, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %108, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %109, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1010, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1011, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1012, 
%3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1013, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1014, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1015, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1016, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1017, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1018, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1019, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1020, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1021, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1022, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1023, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1024, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1025, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1026, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1027, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1028, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1029, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1030, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1031, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1032, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1033, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1034, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1035, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1036, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1037, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1038, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1039, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1040, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1041, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1042, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1043, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1044, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1045, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1046, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1047, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1048, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1049, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1050, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1051, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1052, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1053, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1054, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1055, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1056, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1057, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1058, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1059, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1060, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1061, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1062, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1063, %3, %3, %3, -1, -1, 15, implicit $exec + + + S_ENDPGM 0 +... + + + +