[AMDGPU] Added hot-block-rematerialize pass #136631
@llvm/pr-subscribers-backend-amdgpu

Author: Adam Yang (adam-yang)

Changes: This is a reduced PR from #126331. It adds a new optimization pass, hot-block-rematerialize, to the AMDGPU backend. The pass searches for instructions to sink or clone to their users to reduce register pressure.

Patch is 178.25 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/136631.diff

13 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 4ff761ec19b3c..1ba8e3e2a54d3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -530,6 +530,10 @@ extern char &GCNRewritePartialRegUsesID;
void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &);
extern char &AMDGPUWaitSGPRHazardsLegacyID;
+void initializeAMDGPUHotBlockRematerializePass(llvm::PassRegistry &);
+FunctionPass *createAMDGPUHotBlockRematerializePass();
+extern char &AMDGPUHotBlockRematerializeID;
+
namespace AMDGPU {
enum TargetIndex {
TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
new file mode 100644
index 0000000000000..3c5d592602c6f
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -0,0 +1,1532 @@
+//===- AMDGPUHotBlockRematerialize.cpp - AMDGPU Hot Block Rematerialize --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU hot block Rematerialize
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUMIRUtils.h"
+#include "AMDGPUOccupancyAndLatencyHelper.h"
+#include "GCNRegPressure.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+
+#define DEBUG_TYPE "amdgpu-hot-block-remat"
+
+using namespace llvm;
+
+static cl::opt<bool>
+ EnableAggressive("amdgpu-remat-enable-hot-block-remat-aggressive");
+static cl::opt<unsigned> TargetOccupancy("amdgpu-remat-target-occupancy");
+
+namespace {
+
+using InstSet = DenseSet<MachineInstr *>;
+using BlockSet = DenseSet<MachineBasicBlock *>;
+template <typename T> using BlockMap = MapVector<MachineBasicBlock *, T>;
+
+struct RematNode {
+ enum class RematKind {
+ Candidate, // Not ready yet.
+ OneDefOneUse,
+ Clone,
+ };
+ RematNode()
+ : Reg(0), DefMI(nullptr), InsertBlock(nullptr), InsertPointMI(nullptr),
+ Kind(RematKind::Candidate), Size(0) {}
+ RematNode(unsigned R, MachineInstr *MI, unsigned S)
+ : Reg(R), DefMI(MI), InsertBlock(nullptr), InsertPointMI(nullptr),
+ Kind(RematKind::Candidate), Size(S) {}
+ unsigned Reg;
+ MachineInstr *DefMI;
+ MachineBasicBlock *InsertBlock;
+ union {
+ MachineInstr *InsertPointMI;
+ unsigned UserCount;
+ };
+ RematKind Kind;
+ unsigned Size;
+};
+
+struct BlockLiveInfo {
+ MachineBasicBlock *BB;
+ unsigned MaxSReg;
+ unsigned MaxVReg;
+ // InputLive is the set of registers live into the block, i.e. live across block boundaries.
+ const GCNRPTracker::LiveRegSet InputLive;
+};
+
+struct RematStatus {
+ unsigned TargetOcc;
+ unsigned TargetVLimit;
+ unsigned TargetSLimit;
+ unsigned MaxVPressure;
+ unsigned MaxSPressure;
+ unsigned InputPhysicalVPressure;
+ unsigned InputPhysicalSPressure;
+ // Higher occupancy helps more than the latency cost needed to reach it.
+ bool MemBound;
+ // abs(VTargetOcc-STargetOcc) > 1.
+ bool NotBalance;
+ DenseMap<MachineBasicBlock *, GCNRegPressure> MBBPressureMap;
+ DenseMap<MachineBasicBlock *, GCNRPTracker::LiveRegSet> MBBInputLiveMap;
+ DenseMap<MachineBasicBlock *, GCNRPTracker::LiveRegSet> MBBOutputLiveMap;
+ // Collect MBBs which have memory writes. When moving instructions across
+ // MBBs, skip memory instructions if the MBB has a memory write. To keep this
+ // fast, just check mayStore and isBarrier.
+ DenseSet<MachineBasicBlock *> MemWriteMBBSet;
+};
+
+class AMDGPUHotBlockRematerialize : public MachineFunctionPass {
+
+public:
+ static char ID;
+
+ DenseSet<const MachineInstr *> TotalUniformInsts;
+ DenseSet<const MachineInstr *> SafeToRemoveInsts;
+ DenseSet<const MachineInstr *> DivergentInsts;
+ void removeInst(const MachineInstr *MI) {
+ TotalUniformInsts.erase(MI);
+ SafeToRemoveInsts.erase(MI);
+ DivergentInsts.erase(MI);
+ }
+
+ AMDGPUHotBlockRematerialize() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void applyCloneRemat(RematNode &Node, std::vector<BlockLiveInfo> &HotBlocks,
+ MachineDominatorTree *DT, MachineRegisterInfo &MRI,
+ SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII, MachineFunction &MF);
+ void applyRemat(MapVector<Register, RematNode> &RematMap,
+ std::vector<BlockLiveInfo> &HotBlocks,
+ MachineDominatorTree *DT, llvm::SlotIndexes *SlotIndexes,
+ MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII, MachineFunction &MF);
+ bool hotBlockRemat(MachineFunction &MF, MachineLoopInfo *MLI,
+ LiveIntervals *LIS, MachineDominatorTree *DT,
+ MachinePostDominatorTree *PDT, bool &IsNearTarget);
+
+ StringRef getPassName() const override { return "AMDGPU rematerialize"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineLoopInfoWrapperPass>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addRequired<MachinePostDominatorTreeWrapperPass>();
+ AU.addRequired<SlotIndexesWrapperPass>();
+ AU.addRequired<LiveIntervalsWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+MachineBasicBlock::iterator adjustInsertPointToAvoidSccSmash(
+ MachineInstr *InstructionToMove, MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator CurrentInsertPoint, MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
+ const bool WillSmashScc =
+ InstructionToMove->modifiesRegister(AMDGPU::SCC, SIRI);
+ if (WillSmashScc) {
+ CurrentInsertPoint = llvm::findOrCreateInsertionPointForSccDef(
+ MBB, CurrentInsertPoint, SIRI, SIII, &MRI);
+ }
+
+ return CurrentInsertPoint;
+}
+
+DenseMap<MachineBasicBlock *, BlockSet> reduceClonedMBBs(
+ unsigned Reg, BlockMap<SmallVector<MachineInstr *, 2>> &UserBlocks,
+ DenseSet<MachineBasicBlock *> &UserMBBSet,
+ std::vector<BlockLiveInfo> &HotBlocks, MachineDominatorTree *DT) {
+ // Collect hot blocks in which Reg is live-in.
+ DenseSet<MachineBasicBlock *> HotBlockSet;
+ for (BlockLiveInfo &HotBlock : HotBlocks) {
+ if (HotBlock.InputLive.count(Reg)) {
+ HotBlockSet.insert(HotBlock.BB);
+ }
+ }
+
+ // User blocks that dominate all hot blocks do not need a clone, because the
+ // value no longer crosses the hot blocks once the later blocks are cloned.
+ // User blocks that are dominated by all hot blocks can share clones, because
+ // past the hot blocks the register pressure is already acceptable.
+ DenseSet<MachineBasicBlock *> AfterHotRangeMBBs;
+ for (MachineBasicBlock *MBB : UserMBBSet) {
+ // Always clone in hot block.
+ if (HotBlockSet.count(MBB))
+ continue;
+
+ bool IsDomAllHotBlocks = true;
+ bool IsDomedByAllHotBlocks = true;
+ for (MachineBasicBlock *HotMBB : HotBlockSet) {
+ if (!DT->dominates(MBB, HotMBB)) {
+ IsDomAllHotBlocks = false;
+ }
+ if (!DT->dominates(HotMBB, MBB)) {
+ IsDomedByAllHotBlocks = false;
+ }
+ if (!IsDomAllHotBlocks && !IsDomedByAllHotBlocks) {
+ break;
+ }
+ }
+ if (IsDomAllHotBlocks) {
+ UserBlocks.erase(MBB);
+ } else if (IsDomedByAllHotBlocks) {
+ AfterHotRangeMBBs.insert(MBB);
+ }
+ }
+
+ // Split the after-hot-range block set using the dominator tree.
+ DenseMap<MachineBasicBlock *, BlockSet> DomMap;
+ if (!AfterHotRangeMBBs.empty()) {
+ for (MachineBasicBlock *MBB : AfterHotRangeMBBs) {
+ for (MachineBasicBlock *MBB2 : AfterHotRangeMBBs) {
+ if (MBB == MBB2)
+ continue;
+ if (DT->dominates(MBB, MBB2)) {
+ auto &Dom = DomMap[MBB];
+ Dom.insert(MBB2);
+ auto &Dom2 = DomMap[MBB2];
+ Dom.insert(Dom2.begin(), Dom2.end());
+ }
+ }
+ }
+ for (MachineBasicBlock *MBB : AfterHotRangeMBBs) {
+ auto &Dom = DomMap[MBB];
+ for (MachineBasicBlock *DomedMBB : Dom) {
+ // Remove domedMBB.
+ DomMap.erase(DomedMBB);
+ UserMBBSet.erase(DomedMBB);
+ }
+ }
+ }
+
+ return DomMap;
+}
+
+void updateUsers(unsigned Reg, unsigned NewReg, bool IsSubRegDef,
+ SmallVector<MachineInstr *, 2> &UserMIs) {
+ for (MachineInstr *UseMI : UserMIs) {
+ for (MachineOperand &MO : UseMI->operands()) {
+ if (!MO.isReg())
+ continue;
+ if (MO.getReg() == Reg) {
+ MO.setReg(NewReg);
+ if (IsSubRegDef)
+ MO.setSubReg(0);
+ }
+ }
+ }
+}
+
+void AMDGPUHotBlockRematerialize::applyCloneRemat(
+ RematNode &Node, std::vector<BlockLiveInfo> &HotBlocks,
+ MachineDominatorTree *DT, MachineRegisterInfo &MRI,
+ SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII, MachineFunction &MF) {
+ unsigned Reg = Node.Reg;
+
+ MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
+ auto DefOp = DefMI->getOperand(0);
+ const MCInstrDesc &Desc = DefMI->getDesc();
+ const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+ // When the unique def has subReg, just create newReg for the subReg part.
+ bool IsSubRegDef = false;
+ if (DefOp.getSubReg() != 0) {
+ RC = SIRI->getSubRegisterClass(RC, DefOp.getSubReg());
+ IsSubRegDef = true;
+ }
+ const DebugLoc DL = DefMI->getDebugLoc();
+ unsigned OpNum = DefMI->getNumOperands();
+
+ Node.Kind = RematNode::RematKind::Clone;
+
+ // Group users by block.
+ BlockMap<SmallVector<MachineInstr *, 2>> UserMap;
+ DenseSet<MachineBasicBlock *> UserMBBSet;
+ for (auto UseIt = MRI.use_instr_nodbg_begin(Reg);
+ UseIt != MRI.use_instr_nodbg_end();) {
+ MachineInstr &UseMI = *(UseIt++);
+ UserMap[UseMI.getParent()].emplace_back(&UseMI);
+ UserMBBSet.insert(UseMI.getParent());
+ }
+
+ DenseMap<MachineBasicBlock *, BlockSet> DomMap =
+ reduceClonedMBBs(Reg, UserMap, UserMBBSet, HotBlocks, DT);
+
+ for (auto UseIt : UserMap) {
+ MachineBasicBlock *MBB = UseIt.first;
+ // Skip same block uses.
+ if (MBB == DefMI->getParent()) {
+ continue;
+ }
+ // Skip MBBs that share a clone from another MBB.
+ if (UserMBBSet.count(MBB) == 0)
+ continue;
+
+ Register NewReg = MRI.createVirtualRegister(RC);
+ auto NewDef = BuildMI(MF, DL, Desc).addDef(NewReg);
+ for (unsigned I = 1; I < OpNum; I++) {
+ NewDef = NewDef.add(DefMI->getOperand(I));
+ }
+
+ MachineInstr *InsertPointMI = UseIt.second.front();
+ SlotIndex LastSlot = SlotIndexes->getInstructionIndex(*InsertPointMI);
+
+ for (MachineInstr *UseMI : UseIt.second) {
+ SlotIndex Slot = SlotIndexes->getInstructionIndex(*UseMI);
+ if (LastSlot > Slot) {
+ LastSlot = Slot;
+ InsertPointMI = UseMI;
+ }
+ }
+
+ MachineBasicBlock::iterator InsertPoint = adjustInsertPointToAvoidSccSmash(
+ DefMI, InsertPointMI->getParent(), InsertPointMI, MRI, SIRI, SIII);
+
+ for (MachineMemOperand *MO : DefMI->memoperands()) {
+ NewDef->addMemOperand(MF, MO);
+ }
+
+ MBB->insert(InsertPoint, NewDef);
+
+ SlotIndexes->insertMachineInstrInMaps(*NewDef);
+
+ SmallVector<MachineInstr *, 2> &UserMIs = UseIt.second;
+ updateUsers(Reg, NewReg, IsSubRegDef, UserMIs);
+
+ // Update users in dominated MBBs.
+ auto DomMapIt = DomMap.find(MBB);
+ if (DomMapIt != DomMap.end()) {
+ for (MachineBasicBlock *UpdateMBB : DomMapIt->second) {
+ SmallVector<MachineInstr *, 2> &UserMIs = UserMap[UpdateMBB];
+ updateUsers(Reg, NewReg, IsSubRegDef, UserMIs);
+ }
+ }
+
+ llvm::removeUnusedLanes(*NewDef.getInstr(), MRI, SIRI, SIII, SlotIndexes);
+ }
+ if (MRI.use_empty(Reg)) {
+ SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI);
+ }
+}
+
+void applyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI,
+ SlotIndexes *SlotIndexes,
+ const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII) {
+ MachineInstr *DefMI = Node.DefMI;
+ MachineInstr *InsertPointMI = Node.InsertPointMI;
+ MachineBasicBlock *MBB = nullptr;
+
+ // Find a valid insert point.
+ MachineBasicBlock::iterator InsertPoint;
+ if (InsertPointMI) {
+ InsertPoint = InsertPointMI->getIterator();
+ MBB = InsertPointMI->getParent();
+ } else {
+ InsertPoint = Node.InsertBlock->getFirstTerminator();
+ MBB = Node.InsertBlock;
+ }
+
+ InsertPoint = adjustInsertPointToAvoidSccSmash(DefMI, MBB, InsertPoint, MRI,
+ SIRI, SIII);
+
+ // Move instruction to new location.
+ DefMI->removeFromParent();
+ InsertPoint->getParent()->insert(InsertPoint, DefMI);
+
+ // Update slot index.
+ SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI);
+ SlotIndexes->insertMachineInstrInMaps(*DefMI);
+}
+
+void AMDGPUHotBlockRematerialize::applyRemat(
+ MapVector<Register, RematNode> &RematMap,
+ std::vector<BlockLiveInfo> &HotBlocks, MachineDominatorTree *DT,
+ llvm::SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, MachineFunction &MF) {
+ std::vector<RematNode> UpdateList;
+ for (auto &It : RematMap) {
+ UpdateList.emplace_back(It.second);
+ }
+ // Sort the update list by slot index to make sure a def is moved before its
+ // use. If a use were moved before its def, it might no longer be the first use.
+ std::sort(UpdateList.begin(), UpdateList.end(),
+ [&SlotIndexes](RematNode &I, RematNode &J) {
+ SlotIndex A = SlotIndexes->getInstructionIndex(*I.DefMI);
+ SlotIndex B = SlotIndexes->getInstructionIndex(*J.DefMI);
+ return A < B;
+ });
+
+ for (RematNode &Node : UpdateList) {
+ if (Node.Kind == RematNode::RematKind::OneDefOneUse) {
+ applyOneDefOneUseRemat(Node, MRI, SlotIndexes, SIRI, SIII);
+ } else if (Node.Kind == RematNode::RematKind::Clone) {
+ applyCloneRemat(Node, HotBlocks, DT, MRI, SlotIndexes, SIRI, SIII, MF);
+ }
+ }
+}
+
+unsigned collectMBBPressure(MachineBasicBlock &MBB, LiveIntervals *LIS,
+ const GCNSubtarget *ST, unsigned &MaxVPressure,
+ unsigned &MaxSPressure, RematStatus &Status) {
+ // Skip processing current block if It has only debug instructions
+ if (MBB.getFirstNonDebugInstr() == MBB.end())
+ return ST->getOccupancyWithNumVGPRs(0);
+ auto BBEnd = MBB.rbegin();
+ GCNUpwardRPTracker RPTracker(*LIS);
+ // R.End doesn't point to the boundary instruction.
+ // Skip Debug instr.
+ if (!llvm::getNonDebugMBBEnd(BBEnd, MBB))
+ return ST->getOccupancyWithNumVGPRs(0);
+
+ GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[&MBB];
+ RPTracker.reset(*BBEnd, &OutputLive, true);
+
+ for (auto I = MBB.rbegin(), B = MBB.rend(); I != B;) {
+ MachineInstr &MI = (*I++);
+ RPTracker.recede(MI);
+ if (MI.mayStore() || (MI.isBarrier() && MI.getOpcode() != AMDGPU::S_BRANCH))
+ Status.MemWriteMBBSet.insert(&MBB);
+ }
+
+ GCNRegPressure RP = RPTracker.getMaxPressureAndReset();
+ unsigned SPressure = RP.getMaxSGPR();
+ if (SPressure > MaxSPressure) {
+ MaxSPressure = SPressure;
+ }
+ if (RP.getVGPRNum(ST->hasGFX90AInsts()) > MaxVPressure) {
+ MaxVPressure = RP.getVGPRNum(ST->hasGFX90AInsts());
+ }
+ Status.MBBPressureMap[&MBB] = RP;
+ return RP.getOccupancy(*ST);
+}
+
+unsigned collectFnPressure(MachineFunction &MF, LiveIntervals *LIS,
+ const MachineRegisterInfo &MRI,
+ const GCNSubtarget *ST, unsigned &MaxVPressure,
+ unsigned &MaxSPressure, RematStatus &Status) {
+ unsigned TgtOcc = ST->getOccupancyWithWorkGroupSizes(MF).second;
+ // If the function has only one block, the input/output virtual live sets are empty.
+ if (MF.size() > 1) {
+ // Build the input/output live register sets first.
+ auto *SlotIndexes = LIS->getSlotIndexes();
+ DenseMap<MachineBasicBlock *, SlotIndex> MBBInputSlotMap;
+ DenseMap<MachineBasicBlock *, SlotIndex> MBBOutputSlotMap;
+ for (MachineBasicBlock &MBB : MF) {
+ auto BBBegin = MBB.getFirstNonDebugInstr();
+ if (BBBegin != MBB.end()) {
+ auto SI = SlotIndexes->getInstructionIndex(*BBBegin);
+ MBBInputSlotMap[&MBB] = SI;
+ }
+
+ auto BBEnd = MBB.rbegin();
+
+ // R.End doesn't point to the boundary instruction.
+ // Skip Debug instr.
+ if (llvm::getNonDebugMBBEnd(BBEnd, MBB)) {
+ auto SI = SlotIndexes->getInstructionIndex(*BBEnd);
+ MBBOutputSlotMap[&MBB] = SI;
+ }
+ }
+
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ auto Reg = Register::index2VirtReg(I);
+ if (!LIS->hasInterval(Reg))
+ continue;
+
+ const auto &LI = LIS->getInterval(Reg);
+
+ // Skip local live intervals to make computing live input/output faster.
+ if (llvm::isLocalLiveInterval(LI, SlotIndexes))
+ continue;
+
+ for (auto InputIt : MBBInputSlotMap) {
+ MachineBasicBlock *MBB = InputIt.first;
+ auto SI = InputIt.second;
+
+ auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI);
+ if (LiveMask.any())
+ Status.MBBInputLiveMap[MBB][Reg] |= LiveMask;
+ }
+
+ for (auto OutputIt : MBBOutputSlotMap) {
+ MachineBasicBlock *MBB = OutputIt.first;
+ auto SI = OutputIt.second;
+
+ auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI);
+ if (LiveMask.any())
+ Status.MBBOutputLiveMap[MBB][Reg] |= LiveMask;
+ }
+ }
+ }
+
+ LLVM_DEBUG(
+ const SIRegisterInfo *SIRI = ST->getRegisterInfo();
+ dbgs() << "output live"; for (auto &It : Status.MBBOutputLiveMap) {
+ unsigned Idx = It.first->getNumber();
+ auto LiveReg = It.second;
+ dbgs() << "MBB" << Idx << ":";
+ llvm::dumpLiveSet(LiveReg, SIRI);
+ } dbgs() << "input live";
+ for (auto &It : Status.MBBInputLiveMap) {
+ unsigned Idx = It.first->getNumber();
+ auto LiveReg = It.second;
+ dbgs() << "MBB" << Idx << ":";
+ llvm::dumpLiveSet(LiveReg, SIRI);
+ });
+
+ for (auto It = MF.begin(); It != MF.end(); ++It) {
+ MachineBasicBlock &MBB = *It;
+ unsigned Occ =
+ collectMBBPressure(MBB, LIS, ST, MaxVPressure, MaxSPressure, Status);
+ if (TgtOcc > Occ)
+ TgtOcc = Occ;
+ }
+ return TgtOcc;
+}
+
+RematStatus getRematStatus(MachineFunction &MF, MachineLoopInfo *MLI,
+ LiveIntervals *LIS, const MachineRegisterInfo &MRI,
+ const GCNSubtarget *ST) {
+ unsigned MaxSPressure = 0;
+ unsigned MaxVPressure = 0;
+ RematStatus Status;
+ unsigned TgtOcc =
+ collectFnPressure(MF, LIS, MRI, ST, MaxVPressure, MaxSPressure, Status);
+ const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second;
+ if (TgtOcc >= MaxOcc) {
+ Status.TargetOcc = TgtOcc;
+ Status.TargetVLimit = 0;
+ Status.TargetSLimit = 0;
+ Status.MaxVPressure = 0;
+ Status.MaxSPressure = 0;
+ Status.InputPhysicalVPressure = 0;
+ Status.InputPhysicalSPressure = 0;
+ Status.MemBound = false;
+ Status.NotBalance = false;
+ return Status;
+ }
+
+ MaxSPressure += RegForVCC;
+ MaxVPressure = std::min(MaxVPressure, ST->getMaxNumVGPRs(MF));
+ unsigned STgtOcc = ST->getOccupancyWithNumSGPRs(MaxSPressure);
+ unsigned VTgtOcc = ST->getOccupancyWithNumVGPRs(MaxVPressure);
+
+ llvm::SchedScore TotalScore = llvm::collectLatency(MF, *ST, MLI);
+ bool MemBound =
+ TotalScore.isMemBound(TgtOcc, std::max(STgtOcc, VTgtOcc) - TgtOcc);
+
+ bool NotBalance = false;
+
+ const unsigned MaxOccupancy = ST->AMDGPUSubtarget::getMaxWavesPerEU();
+ // Currently, only an SGPR-bound occupancy limit can be fixed with remat.
+ if (STgtOcc < VTgtOcc) {
+ unsigned BigOcc = std::max(STgtOcc, VTgtOcc);
+ // Raise TgtOcc when SGPR and VGPR occupancy are not balanced.
+ if (BigOcc > TgtOcc) {
+ TgtOcc = BigOcc;
+ NotBalance = ...
[truncated]
shiltian left a comment:
Just a quick drive-by with some surface-level style comments. Can you go through your entire PR and make sure everything conforms to LLVM coding style?
Is there any data supporting that this improves performance? How much does it hurt, and where?
It would also benefit from some description of the idea.
Did a style pass.
For our hardware, the non-aggressive SGPR code path (which is included in this change) is enabled by default. If we turn it off, occupancy goes down across the board in our retail shader test set; among the shaders that showed any change in performance, occupancy dropped by ~1 on average. For real timing measurements on PIX captures, the total frame time of our performance suite goes down by 1% with the pass turned off.
arsenm left a comment:
Only started skimming, but this is a huge amount of code and disproportionately large compared to the new tests. Why is this done while maintaining LiveIntervals in codegen? We know which blocks are hot in the IR, can't we just do trivial block cloning there?
bool isSccLiveAt(llvm::MachineBasicBlock *MBB,
This function shouldn't be added. It's reinventing physical register liveness tracking for the 100th time in LLVM. Either directly use LiveIntervals, or for fuzzier usage MBB::computeRegisterLiveness, or use tracking in LiveRegUnits in passes with post-RA physreg tracking
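For illustration, the kind of query being suggested might look roughly like the sketch below. The helper name is hypothetical; it only wraps the existing MachineBasicBlock::computeRegisterLiveness API and treats an unknown answer conservatively as "possibly live".

#include "SIRegisterInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"

using namespace llvm;

// Hypothetical helper: ask the existing MBB API whether SCC may be live at
// the insertion point instead of re-deriving physreg liveness by hand.
static bool isSccPossiblyLiveAt(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator InsertPt,
                                const SIRegisterInfo *SIRI) {
  // LQR_Unknown is treated as "possibly live" to stay conservative.
  return MBB.computeRegisterLiveness(SIRI, AMDGPU::SCC, InsertPt,
                                     /*Neighborhood=*/16) !=
         MachineBasicBlock::LQR_Dead;
}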
Rewrote this whole SCC situation to use LiveIntervals instead, and added tests to verify the functionality.
// If no safe location can be found in the block we can save and restore
// SCC around MI. There is no way to directly read or write SCC so we use
// s_cselect to read the current value of SCC and s_cmp to write the saved
// value back to SCC.
Should just use COPY and let that lower to the s_cselect
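As a sketch of that suggestion (hypothetical helper and names; it assumes COPYs to and from SCC are legal at this point and get lowered by the target to the appropriate s_cselect/s_cmp, as the comment above implies):

#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

// Hypothetical helper: save and restore SCC around MI with plain COPYs and
// let copy lowering pick the concrete instructions, rather than emitting
// s_cselect/s_cmp directly in the pass.
static void saveAndRestoreSccAround(MachineInstr &MI, MachineRegisterInfo &MRI,
                                    const SIInstrInfo *TII) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register SavedScc = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  // Save SCC just before the instruction that will clobber it.
  BuildMI(MBB, MI, DL, TII->get(TargetOpcode::COPY), SavedScc)
      .addReg(AMDGPU::SCC);
  // Restore SCC immediately after it.
  BuildMI(MBB, std::next(MI.getIterator()), DL, TII->get(TargetOpcode::COPY),
          AMDGPU::SCC)
      .addReg(SavedScc);
}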
Done.
static std::vector<unsigned>
getMinimalSpanningSubRegIdxSetForLaneMask(const TargetRegisterInfo *TRI,
This is reinventing getCoveringSubRegIndexes?
This is removed, along with the remove lanes functionality.
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief AMDGPU hot block Rematerialize
Can we do this in the IR? The complexity is about 100x higher to do this on MachineIR while dealing with LiveIntervals
Agreed, it would make certain things much easier. It's been tried, but measuring liveness was an issue. It was also difficult to get an accurate picture of the register pressure, partly because the distinction between uniform vector operations and scalar operations was unclear.
// SGPR has alignment requirement, cannot get accurate reg number.
The alignment requirement should not matter for spill purposes, we're going to be copying 32-bits at a time. Is this about allocation granularity? Shouldn't hardcode that either
It's referring to allocation granularity, but in reality it's just a heuristic bias number. Changed its name to reflect this fact.
SchedScore collectLatency(MachineFunction &MF, const llvm::GCNSubtarget &ST,
This is reinventing MachineTraceMetrics?
Partly. It has GPU (or even AMDGPU) specific metrics that MachineTraceMetrics won't give us.
Latest changes:
This is a reduced PR from #126331.
HotBlockRematerialize (HBR) is an optimization pass for reducing register pressure and increasing occupancy by sinking and cloning instructions.
In the original change, there are several code paths targeting different ways of reducing register pressure, plus option toggles to control aggressiveness. This change only includes the default and aggressive SGPR code paths.
By default, if the max SGPR count is near the wave limit (and is at risk of spilling into VGPRs) or occupancy is limited by SGPRs, HBR tries to reduce the SGPR count by sinking or cloning single scalar instructions to their users.
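A much-simplified sketch of that default SGPR path, for illustration only (this is not the code in the patch; candidate selection, cloning for multi-use defs, legality checks such as SCC and memory ordering, and SlotIndexes/LiveIntervals updates are all omitted):

#include "SIRegisterInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

// Illustrative only: while a hot block is over its SGPR budget, sink
// cross-block single-use SGPR defs next to their one user.
static bool sinkSingleUseSgprDefs(MachineBasicBlock &HotMBB,
                                  MachineRegisterInfo &MRI,
                                  const SIRegisterInfo *SIRI,
                                  unsigned SgprPressure, unsigned SgprLimit) {
  bool Changed = false;
  for (unsigned I = 0, E = MRI.getNumVirtRegs();
       I != E && SgprPressure > SgprLimit; ++I) {
    Register Reg = Register::index2VirtReg(I);
    const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
    if (!RC || !SIRI->isSGPRClass(RC))
      continue;
    MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
    // Only defs outside the hot block with exactly one non-debug user.
    if (!Def || Def->getParent() == &HotMBB || !MRI.hasOneNonDBGUse(Reg))
      continue;
    MachineInstr &OnlyUse = *MRI.use_instr_nodbg_begin(Reg);
    if (OnlyUse.getParent() == Def->getParent())
      continue;
    // Sink the def right before its single user and credit the freed SGPRs.
    Def->removeFromParent();
    OnlyUse.getParent()->insert(OnlyUse.getIterator(), Def);
    SgprPressure -= SIRI->getRegSizeInBits(*RC) / 32;
    Changed = true;
  }
  return Changed;
}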