diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 4ff761ec19b3c..1ba8e3e2a54d3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -530,6 +530,10 @@ extern char &GCNRewritePartialRegUsesID; void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &); extern char &AMDGPUWaitSGPRHazardsLegacyID; +void initializeAMDGPUHotBlockRematerializePass(llvm::PassRegistry &); +FunctionPass *createAMDGPUHotBlockRematerializePass(); +extern char &AMDGPUHotBlockRematerializeID; + namespace AMDGPU { enum TargetIndex { TI_CONSTDATA_START, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp new file mode 100644 index 0000000000000..b00d286c938f8 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -0,0 +1,1559 @@ +//===- AMDGPUHotBlockRematerialize.cpp - AMDGPU Hot BlockRematerialize ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU hot block Rematerialize +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUMIRUtils.h" +#include "AMDGPUOccupancyAndLatencyHelper.h" +#include "GCNRegPressure.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/SlotIndexes.h" + +#define DEBUG_TYPE "amdgpu-hot-block-remat" + +using namespace llvm; + +static cl::opt + EnableAggressiveSgpr("amdgpu-remat-enable-hot-block-remat-aggressive-sgpr"); +static cl::opt TargetOccupancy("amdgpu-remat-target-occupancy"); + +namespace { + +typedef DenseSet InstSet; +typedef DenseSet BlockSet; +template using BlockMap = MapVector; + +struct RematNode { + enum class RematKind { + Candidate, // Not ready yet. + OneDefOneUse, + Clone, + }; + RematNode() + : Reg(0), DefMI(nullptr), InsertBlock(nullptr), InsertPointMI(nullptr), + Kind(RematKind::Candidate), Size(0) {} + RematNode(unsigned R, MachineInstr *MI, unsigned S) + : Reg(R), DefMI(MI), InsertBlock(nullptr), InsertPointMI(nullptr), + Kind(RematKind::Candidate), Size(S) {} + Register Reg; + MachineInstr *DefMI; + MachineBasicBlock *InsertBlock; + union { + MachineInstr *InsertPointMI; + unsigned UserCount; + }; + RematKind Kind; + unsigned Size; // This is actually the Gain of the candidate. +}; + +struct BlockLiveInfo { + MachineBasicBlock *BB; + unsigned MaxSReg; + unsigned MaxVReg; + // Input live is the live reg which cross block. + const GCNRPTracker::LiveRegSet InputLive; +}; + +struct RematStatus { + unsigned TargetOcc; + unsigned TargetVLimit; + unsigned TargetSLimit; + unsigned MaxVPressure; + unsigned MaxSPressure; + unsigned InputPhysicalVPressure; + unsigned InputPhysicalSPressure; + // More occupancy can help more than latency cost to reach It. + bool MemBound; + // abs(VTargetOcc-STargetOcc) > 1. + bool NotBalance; + DenseMap MBBPressureMap; + DenseMap MBBInputLiveMap; + DenseMap MBBOutputLiveMap; + // Collect MBBs which has memory write. 
When move instructions cross MBB, skip + // mem inst if the MBB has memory write. To make things fast, just check + // mayStore and isBarrier. + DenseSet MemWriteMBBSet; +}; + +class AMDGPUHotBlockRematerialize : public MachineFunctionPass { + +public: + static char ID; + + DenseSet TotalUniformInsts; + DenseSet SafeToRemoveInsts; + DenseSet DivergentInsts; + void removeInst(const MachineInstr *MI) { + TotalUniformInsts.erase(MI); + SafeToRemoveInsts.erase(MI); + DivergentInsts.erase(MI); + } + + AMDGPUHotBlockRematerialize() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + void applyCloneRemat(RematNode &Node, std::vector &HotBlocks, + MachineDominatorTree *DT, MachineRegisterInfo &MRI, + SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, LiveIntervals *LIS, + MachineFunction &MF); + void applyRemat(MapVector &RematMap, + std::vector &HotBlocks, + MachineDominatorTree *DT, llvm::SlotIndexes *SlotIndexes, + MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, LiveIntervals *LIS, + MachineFunction &MF); + bool hotBlockRemat(MachineFunction &MF, MachineLoopInfo *MLI, + LiveIntervals *LIS, MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, bool &IsNearTarget); + + StringRef getPassName() const override { return "AMDGPU rematerialize"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +MachineBasicBlock::iterator adjustInsertPointToAvoidSccSmash( + MachineInstr *InstructionToMove, MachineBasicBlock *MBB, + MachineBasicBlock::iterator CurrentInsertPoint, MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, LiveIntervals *LIS) { + const bool WillSmashScc = + InstructionToMove->modifiesRegister(AMDGPU::SCC, SIRI); + if (WillSmashScc) { + CurrentInsertPoint = llvm::findOrCreateInsertionPointForSccDef( + MBB, CurrentInsertPoint, SIRI, SIII, &MRI, LIS); + } + + return CurrentInsertPoint; +} + +DenseMap reduceClonedMBBs( + Register Reg, BlockMap> &UserBlocks, + DenseSet &UserMBBSet, + std::vector &HotBlocks, MachineDominatorTree *DT) { + // Collect hot blocks which Exp is live in. + DenseSet HotBlockSet; + for (BlockLiveInfo &HotBlock : HotBlocks) { + if (HotBlock.InputLive.count(Reg)) { + HotBlockSet.insert(HotBlock.BB); + } + } + + // For userBlocks which dominate all hotBlocks, don't need to clone because + // the value not cross hotBlocks when later blocks are cloned. + // For userBlocks which dominated by all hotBlocks, they could share clones + // because once after hot block, the pressure is OK. + DenseSet AfterHotRangeMBBs; + for (MachineBasicBlock *MBB : UserMBBSet) { + // Always clone in hot block. + if (HotBlockSet.count(MBB)) + continue; + + bool IsDomAllHotBlocks = true; + bool IsDomedByAllHotBlocks = true; + for (MachineBasicBlock *HotMBB : HotBlockSet) { + if (!DT->dominates(MBB, HotMBB)) + IsDomAllHotBlocks = false; + if (!DT->dominates(HotMBB, MBB)) + IsDomedByAllHotBlocks = false; + if (!IsDomAllHotBlocks && !IsDomedByAllHotBlocks) + break; + } + if (IsDomAllHotBlocks) + UserBlocks.erase(MBB); + else if (IsDomedByAllHotBlocks) + AfterHotRangeMBBs.insert(MBB); + } + + // Split after hotRange block set by domtree. 
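+ // For example (illustrative, hypothetical block numbers): if the
+ // after-hot-range user blocks are bb.7, bb.8 and bb.9 and bb.7 dominates the
+ // other two, DomMap[bb.7] becomes {bb.8, bb.9}; bb.8 and bb.9 are then erased
+ // from UserMBBSet and later share the clone inserted for bb.7.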
+ DenseMap DomMap; + if (!AfterHotRangeMBBs.empty()) { + for (MachineBasicBlock *MBB : AfterHotRangeMBBs) { + for (MachineBasicBlock *MBB2 : AfterHotRangeMBBs) { + if (MBB == MBB2) + continue; + if (DT->dominates(MBB, MBB2)) { + auto &Dom = DomMap[MBB]; + Dom.insert(MBB2); + auto &Dom2 = DomMap[MBB2]; + Dom.insert(Dom2.begin(), Dom2.end()); + } + } + } + for (MachineBasicBlock *MBB : AfterHotRangeMBBs) { + auto &Dom = DomMap[MBB]; + for (MachineBasicBlock *DomedMBB : Dom) { + // Remove domedMBB. + DomMap.erase(DomedMBB); + UserMBBSet.erase(DomedMBB); + } + } + } + + return DomMap; +} + +void updateUsers(Register Reg, unsigned NewReg, bool IsSubRegDef, + SmallVector &UserMIs) { + for (MachineInstr *UseMI : UserMIs) { + for (MachineOperand &MO : UseMI->operands()) { + if (!MO.isReg()) + continue; + if (MO.getReg() == Reg) { + MO.setReg(NewReg); + if (IsSubRegDef) + MO.setSubReg(0); + } + } + } +} + +void AMDGPUHotBlockRematerialize::applyCloneRemat( + RematNode &Node, std::vector &HotBlocks, + MachineDominatorTree *DT, MachineRegisterInfo &MRI, + SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, LiveIntervals *LIS, MachineFunction &MF) { + Register Reg = Node.Reg; + MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); + + const MCInstrDesc &Desc = DefMI->getDesc(); + const TargetRegisterClass *RC = + SIRI->getAllocatableClass(SIII->getOpRegClass(*DefMI, 0)); + const bool IsSubRegDef = DefMI->getOperand(0).getSubReg() != 0; + + const DebugLoc &DL = DefMI->getDebugLoc(); + const unsigned OpNum = DefMI->getNumOperands(); + + Node.Kind = RematNode::RematKind::Clone; + + // Group user in same blocks. + BlockMap> UserMap; + DenseSet UserMBBSet; + for (auto UseIt = MRI.use_instr_nodbg_begin(Reg); + UseIt != MRI.use_instr_nodbg_end();) { + MachineInstr &UseMI = *(UseIt++); + UserMap[UseMI.getParent()].emplace_back(&UseMI); + UserMBBSet.insert(UseMI.getParent()); + } + + DenseMap DomMap = + reduceClonedMBBs(Reg, UserMap, UserMBBSet, HotBlocks, DT); + + for (auto UseIt : UserMap) { + MachineBasicBlock *MBB = UseIt.first; + // Skip same block uses. + if (MBB == DefMI->getParent()) + continue; + // Skip MBB which share clone from other MBBs. + if (UserMBBSet.count(MBB) == 0) + continue; + + Register NewReg = MRI.createVirtualRegister(RC); + auto NewDef = BuildMI(MF, DL, Desc).addDef(NewReg); + for (unsigned I = 1; I < OpNum; I++) + NewDef = NewDef.add(DefMI->getOperand(I)); + + MachineInstr *InsertPointMI = UseIt.second.front(); + SlotIndex LastSlot = SlotIndexes->getInstructionIndex(*InsertPointMI); + + for (MachineInstr *UseMI : UseIt.second) { + SlotIndex Slot = SlotIndexes->getInstructionIndex(*UseMI); + if (LastSlot > Slot) { + LastSlot = Slot; + InsertPointMI = UseMI; + } + } + + MachineBasicBlock::iterator InsertPoint = adjustInsertPointToAvoidSccSmash( + DefMI, InsertPointMI->getParent(), InsertPointMI, MRI, SIRI, SIII, LIS); + + for (MachineMemOperand *MO : DefMI->memoperands()) { + NewDef->addMemOperand(MF, MO); + } + + MBB->insert(InsertPoint, NewDef); + + SlotIndexes->insertMachineInstrInMaps(*NewDef); + + SmallVector &UserMIs = UseIt.second; + updateUsers(Reg, NewReg, IsSubRegDef, UserMIs); + + // update users in dom MBBs. 
+ auto DomMapIt = DomMap.find(MBB); + if (DomMapIt != DomMap.end()) { + for (MachineBasicBlock *UpdateMBB : DomMapIt->second) { + SmallVector &UserMIs = UserMap[UpdateMBB]; + updateUsers(Reg, NewReg, IsSubRegDef, UserMIs); + } + } + } + if (MRI.use_empty(Reg)) { + SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI); + } +} + +void applyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI, + SlotIndexes *SlotIndexes, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, + LiveIntervals *LIS) { + MachineInstr *DefMI = Node.DefMI; + MachineInstr *InsertPointMI = Node.InsertPointMI; + MachineBasicBlock *MBB = nullptr; + + // Find a valid insert point. + MachineBasicBlock::iterator InsertPoint; + if (InsertPointMI) { + InsertPoint = InsertPointMI->getIterator(); + MBB = InsertPointMI->getParent(); + } else { + InsertPoint = Node.InsertBlock->getFirstTerminator(); + MBB = Node.InsertBlock; + } + + InsertPoint = adjustInsertPointToAvoidSccSmash(DefMI, MBB, InsertPoint, MRI, + SIRI, SIII, LIS); + + // Move instruction to new location. + DefMI->removeFromParent(); + InsertPoint->getParent()->insert(InsertPoint, DefMI); + + // Update slot index. + SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI); + SlotIndexes->insertMachineInstrInMaps(*DefMI); +} + +void AMDGPUHotBlockRematerialize::applyRemat( + MapVector &RematMap, + std::vector &HotBlocks, MachineDominatorTree *DT, + llvm::SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, LiveIntervals *LIS, + MachineFunction &MF) { + std::vector UpdateList; + for (auto &It : RematMap) + UpdateList.emplace_back(It.second); + + // Sort update list with slotIndex to make sure def moved before use. + // If use moved before def, It might not be the first use anymore. + std::sort(UpdateList.begin(), UpdateList.end(), + [&SlotIndexes](RematNode &I, RematNode &J) { + SlotIndex A = SlotIndexes->getInstructionIndex(*I.DefMI); + SlotIndex B = SlotIndexes->getInstructionIndex(*J.DefMI); + return A < B; + }); + + for (RematNode &Node : UpdateList) { + if (Node.Kind == RematNode::RematKind::OneDefOneUse) + applyOneDefOneUseRemat(Node, MRI, SlotIndexes, SIRI, SIII, LIS); + else if (Node.Kind == RematNode::RematKind::Clone) + applyCloneRemat(Node, HotBlocks, DT, MRI, SlotIndexes, SIRI, SIII, LIS, + MF); + } +} + +unsigned collectMBBPressure(MachineBasicBlock &MBB, LiveIntervals *LIS, + const GCNSubtarget *ST, unsigned &MaxVPressure, + unsigned &MaxSPressure, RematStatus &Status) { + // Skip processing current block if It has only debug instructions + if (MBB.getFirstNonDebugInstr() == MBB.end()) + return ST->getOccupancyWithNumVGPRs(0); + auto BBEnd = MBB.rbegin(); + GCNUpwardRPTracker RPTracker(*LIS); + // R.End doesn't point to the boundary instruction. + // Skip Debug instr. 
+ if (!llvm::getNonDebugMBBEnd(BBEnd, MBB)) + return ST->getOccupancyWithNumVGPRs(0); + + GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[&MBB]; + RPTracker.reset(*BBEnd, &OutputLive, true); + + for (auto I = MBB.rbegin(), B = MBB.rend(); I != B;) { + MachineInstr &MI = (*I++); + RPTracker.recede(MI); + if (MI.mayStore() || (MI.isBarrier() && MI.getOpcode() != AMDGPU::S_BRANCH)) + Status.MemWriteMBBSet.insert(&MBB); + } + + GCNRegPressure RP = RPTracker.getMaxPressureAndReset(); + unsigned SPressure = RP.getMaxSGPR(); + if (SPressure > MaxSPressure) + MaxSPressure = SPressure; + if (RP.getVGPRNum(ST->hasGFX90AInsts()) > MaxVPressure) + MaxVPressure = RP.getVGPRNum(ST->hasGFX90AInsts()); + Status.MBBPressureMap[&MBB] = RP; + return RP.getOccupancy(*ST); +} + +unsigned collectFnPressure(MachineFunction &MF, LiveIntervals *LIS, + const MachineRegisterInfo &MRI, + const GCNSubtarget *ST, unsigned &MaxVPressure, + unsigned &MaxSPressure, RematStatus &Status) { + unsigned TgtOcc = ST->getOccupancyWithWorkGroupSizes(MF).second; + // If only have one block, input/ouput virtual live set are empty. + if (MF.size() > 1) { + // Build input output live reg first. + auto *SlotIndexes = LIS->getSlotIndexes(); + DenseMap MBBInputSlotMap; + DenseMap MBBOutputSlotMap; + for (MachineBasicBlock &MBB : MF) { + auto BBBegin = MBB.getFirstNonDebugInstr(); + if (BBBegin != MBB.end()) { + auto SI = SlotIndexes->getInstructionIndex(*BBBegin); + MBBInputSlotMap[&MBB] = SI; + } + + auto BBEnd = MBB.rbegin(); + + // R.End doesn't point to the boundary instruction. + // Skip Debug instr. + if (llvm::getNonDebugMBBEnd(BBEnd, MBB)) { + auto SI = SlotIndexes->getInstructionIndex(*BBEnd); + MBBOutputSlotMap[&MBB] = SI; + } + } + + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + auto Reg = Register::index2VirtReg(I); + if (!LIS->hasInterval(Reg)) + continue; + + const auto &LI = LIS->getInterval(Reg); + + // Skip local live interval to make live input/ouput faster. 
+ if (LIS->intervalIsInOneMBB(LI)) + continue; + + for (auto InputIt : MBBInputSlotMap) { + MachineBasicBlock *MBB = InputIt.first; + auto SI = InputIt.second; + + auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI); + if (LiveMask.any()) + Status.MBBInputLiveMap[MBB][Reg] |= LiveMask; + } + + for (auto OutputIt : MBBOutputSlotMap) { + MachineBasicBlock *MBB = OutputIt.first; + auto SI = OutputIt.second; + + auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI); + if (LiveMask.any()) + Status.MBBOutputLiveMap[MBB][Reg] |= LiveMask; + } + } + } + + LLVM_DEBUG( + const SIRegisterInfo *SIRI = ST->getRegisterInfo(); + dbgs() << "output live"; for (auto &It : Status.MBBOutputLiveMap) { + unsigned Idx = It.first->getNumber(); + auto LiveReg = It.second; + dbgs() << "MBB" << Idx << ":"; + llvm::dumpLiveSet(LiveReg, SIRI); + } dbgs() << "input live"; + for (auto &It : Status.MBBInputLiveMap) { + unsigned Idx = It.first->getNumber(); + auto LiveReg = It.second; + dbgs() << "MBB" << Idx << ":"; + llvm::dumpLiveSet(LiveReg, SIRI); + }); + + for (auto It = MF.begin(); It != MF.end(); ++It) { + MachineBasicBlock &MBB = *It; + unsigned Occ = + collectMBBPressure(MBB, LIS, ST, MaxVPressure, MaxSPressure, Status); + if (TgtOcc > Occ) + TgtOcc = Occ; + } + return TgtOcc; +} + +RematStatus getRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, + LiveIntervals *LIS, const MachineRegisterInfo &MRI, + const GCNSubtarget *ST) { + unsigned MaxSPressure = 0; + unsigned MaxVPressure = 0; + RematStatus Status; + unsigned TgtOcc = + collectFnPressure(MF, LIS, MRI, ST, MaxVPressure, MaxSPressure, Status); + const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second; + if (TgtOcc >= MaxOcc) { + Status.TargetOcc = TgtOcc; + Status.TargetVLimit = 0; + Status.TargetSLimit = 0; + Status.MaxVPressure = 0; + Status.MaxSPressure = 0; + Status.InputPhysicalVPressure = 0; + Status.InputPhysicalSPressure = 0; + Status.MemBound = false; + Status.NotBalance = false; + return Status; + } + + MaxSPressure += RegForVCC; + MaxVPressure = std::min(MaxVPressure, ST->getMaxNumVGPRs(MF)); + unsigned STgtOcc = ST->getOccupancyWithNumSGPRs(MaxSPressure); + unsigned VTgtOcc = ST->getOccupancyWithNumVGPRs(MaxVPressure); + + llvm::SchedScore TotalScore = llvm::collectLatency(MF, *ST, MLI); + bool MemBound = + TotalScore.isMemBound(TgtOcc, std::max(STgtOcc, VTgtOcc) - TgtOcc); + + bool NotBalance = false; + + const unsigned MaxOccupancy = ST->AMDGPUSubtarget::getMaxWavesPerEU(); + // Currently, only sgpr bound can be fixed with remat. + if (STgtOcc < VTgtOcc) { + unsigned BigOcc = std::max(STgtOcc, VTgtOcc); + // Change TgtOcc to in case sgpr and vgpr is not balance. + if (BigOcc > TgtOcc) { + TgtOcc = BigOcc; + NotBalance = true; + if (TgtOcc >= MaxOccupancy) + TgtOcc = MaxOccupancy - 1; + } + } + + // Collect input physical pressure. + const SIRegisterInfo *SIRI = ST->getRegisterInfo(); + + unsigned VInputPressure = 0; + uint64_t SInputMask = 0; + for (const auto &Livein : MRI.liveins()) { + const Register Reg = Livein.first; + const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg); + assert(Reg.isPhysical() && "input must be physical reg"); + Register RegSize = RC->getLaneMask().getNumLanes(); + if (SIRI->isVGPR(MRI, Reg)) { + VInputPressure += RegSize; + } else { + unsigned RegIndex = SIRI->getHWRegIndex(Reg); + uint64_t Mask = ((1 << RegSize) - 1) << RegIndex; + SInputMask |= Mask; + } + } + // SGPR need to align to 4 for the 4dowrd/8dword descriptors which cause high + // pressure. 
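+ // For example (illustrative live-in assignment): a 64-bit pointer passed in
+ // s[4:5] only sets bits inside the aligned group s[4:7], so that whole group
+ // is charged and SInputPressure increases by 4.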
+ unsigned SInputPressure = 0; + uint64_t Mask = 0xf; + while (Mask != 0) { + if (Mask & SInputMask) + SInputPressure += 4; + Mask = Mask << 4; + } + + // If balanced, try next occupancy. + TgtOcc = NotBalance ? TgtOcc : (TgtOcc + 1); + + auto CC = MF.getFunction().getCallingConv(); + bool IsPsCs = CC == CallingConv::AMDGPU_CS || CC == CallingConv::AMDGPU_PS; + // For shader profiles other than ps/cs, set target profile max as 4. + if (!IsPsCs) { + TgtOcc = TgtOcc > 4 ? 4 : TgtOcc; + } + if (TargetOccupancy) + TgtOcc = TargetOccupancy; + + unsigned SLimit = ST->getMaxNumSGPRs(TgtOcc, true); + unsigned VLimit = ST->getMaxNumVGPRs(TgtOcc); + + Status.TargetOcc = TgtOcc; + Status.TargetVLimit = VLimit; + Status.TargetSLimit = SLimit; + Status.MaxVPressure = MaxVPressure; + Status.MaxSPressure = MaxSPressure; + Status.InputPhysicalVPressure = VInputPressure; + Status.InputPhysicalSPressure = SInputPressure; + Status.MemBound = MemBound; + Status.NotBalance = NotBalance; + return Status; +} + +// For case like +// %477:sreg_32_xm0 = S_AND_B32 %472.sub0:sreg_64_xexec, %304:sreg_32_xm0, +// implicit-def dead $scc; xb.uniform +// S_CMP_EQ_U32 %302:sreg_32_xm0, %475:sreg_32_xm0, implicit-def $scc; +// xb.uniform %2489:sreg_32_xm0 = S_CSELECT_B32 %477:sreg_32_xm0, 16, implicit +// killed $scc; xb.uniform +// Sink S_AND right before S_CSELECT will overwrite SCC. +// To avoid It, skip case when DefMI and UseMI has implicit define use. +bool isImplicitDefUse(MachineInstr *DefMI, MachineInstr *UseMI) { + if (DefMI->getDesc().NumImplicitDefs == 0) + return false; + + auto *TRI = DefMI->getMF()->getSubtarget().getRegisterInfo(); + for (MachineOperand &Def : DefMI->implicit_operands()) { + if (!Def.isReg()) + continue; + if (Def.isUse()) + continue; + Register Reg = Def.getReg(); + if (UseMI->readsRegister(Reg, TRI)) + return true; + } + return false; +} + +bool nearSgprSpill(unsigned MaxSPressure, const GCNSubtarget *ST, + MachineFunction &MF) { + unsigned MaxSGPR = ST->getAddressableNumSGPRs(); + const SIMachineFunctionInfo *MFI = MF.getInfo(); + Register ScratchRSrcReg = MFI->getScratchRSrcReg(); + if (ScratchRSrcReg) + MaxSGPR -= 4; + + const unsigned AlignmentDelta = 3; + MaxSGPR -= AlignmentDelta; + + return MaxSPressure > MaxSGPR; +} + +// Skip live reg remated to other block. +void updateLiveInfo( + const MapVector &RematMap, + GCNRPTracker::LiveRegSet &LiveSet, + const GCNRPTracker::LiveRegSet &InputLive, const MachineBasicBlock *CurBB, + DenseMap &RPOTIndexMap) { + for (auto &It : RematMap) { + Register Reg = It.first; + // Skip reg not in live set. + if (!LiveSet.count(Reg)) + continue; + // Skip reg already in input set. + // Input set will be taken care in getReducedSize. + if (InputLive.count(Reg)) + continue; + + auto &Node = It.second; + if (Node.Kind == RematNode::RematKind::OneDefOneUse) { + MachineBasicBlock *InsertBB = Node.InsertBlock; + // If LiveInfo.BB is after InsertBB in Reverse post order, the def is + // still before LiveInfo.BB, It is still live. + unsigned LiveBBIndex = RPOTIndexMap[CurBB]; + unsigned InsertBBIndex = RPOTIndexMap[InsertBB]; + if (LiveBBIndex > InsertBBIndex) + continue; + } + // Already in remat map, don't need to check again, remove from + // candidate. + LiveSet.erase(Reg); + } +} + +// Returns the actual register saving that would be achieved by moving or +// cloning this instruction. It's essentially: +// +// size(defs) - size(uses) +// +// Note if it is not safe to move/clone this instruction, this function returns +// 0. 
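+// For example (illustrative, hypothetical registers): for
+//   %200:sgpr_128 = S_LOAD_DWORDX4_IMM %90:sgpr_64, 0, 0
+// with %90 uniquely defined, the gain is 128 - 64 = 64 bits, while a def that
+// reads VCC or any physical register returns 0.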
+// +int rematGainInBits(MachineInstr *DefMI, Register Reg, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + bool IsVGPR) { + int RematSize = SIRI->getRegSizeInBits(*MRI.getRegClass(Reg)); + for (MachineOperand &MO : DefMI->operands()) { + if (MO.isImm()) + continue; + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + if (MO.isTied()) + continue; + + if (MO.getReg() == AMDGPU::EXEC) + continue; + + // Don't move user of VCC. + if (MO.getReg() == AMDGPU::VCC) { + RematSize = 0; + break; + } + Register Reg = MO.getReg(); + + // Don't move physical register use. + if (Reg.isPhysical()) { + RematSize = 0; + break; + } + + if (IsVGPR != SIRI->isVGPR(MRI, Reg)) { + // Not support mix of v and s when remat now. + // TODO: count possible pressure change here. + RematSize = 0; + break; + } + bool IsSingleDef = MRI.hasOneDef(Reg); + if (!IsSingleDef) { + IsSingleDef = llvm::isSub0Sub1SingleDef(Reg, MRI); + } + + if (IsSingleDef) { + // The reg might share with other candidates, check It here. + // Count share reg in getReducedSize. + if (EnableAggressiveSgpr) { + // In case of aggressive remat, treat multi use reg as shared reg and + // ignore size of shared reg. + if (!MRI.hasOneNonDBGUse(Reg)) + continue; + } + const TargetRegisterClass *OpRC = MRI.getRegClass(Reg); + if (unsigned SubIdx = MO.getSubReg()) { + if (OpRC) + OpRC = SIRI->getSubRegisterClass(OpRC, SubIdx); + } + int InputSize = SIRI->getRegSizeInBits(*OpRC); + // If input not live in hotspot, move It cross hotspot should have + // less reg then DefMi. + if (RematSize > InputSize) { + RematSize -= InputSize; + continue; + } + } + + RematSize = 0; + break; + } + return RematSize; +} + +MachineBasicBlock *findNonLoopDominator(MachineBasicBlock *BB, + MachineDominatorTree *DT, + MachineLoopInfo *LI) { + while (LI->getLoopDepth(BB) > 0) { + MachineDomTreeNode *N = DT->getNode(BB); + if (N == nullptr) + return nullptr; + MachineDomTreeNode *IDom = N->getIDom(); + if (IDom == nullptr) + return nullptr; + + BB = IDom->getBlock(); + } + + return BB; +} + +MachineBasicBlock *nearestCommonDominator(MachineDominatorTree *DT, + BlockSet &Blocks) { + auto I = Blocks.begin(), E = Blocks.end(); + + MachineBasicBlock *DomB = cast(*(I++)); + while (I != E) { + MachineBasicBlock *B = cast(*(I++)); + DomB = DT->findNearestCommonDominator(DomB, B); + if (DomB == nullptr) + return nullptr; + } + // For split block like: + // bb.42: + // %632.sub2:vreg_128 = V_MOV_B32_e32 %717.sub2:vreg_128, implicit $exec, + // // implicit $exec + // %130:sreg_64 = S_AND_SAVEEXEC_B64 %533:sreg_64, implicitdef $exec, + // implicitdef $scc, implicit $exec + // + // bb.68: + //; predecessors: %bb.42 + // successors: %bb.45(0x40000000), %bb.43(0x40000000); %bb.45(50.00%), + // %bb.43(50.00%) + // + // SI_MASK_BRANCH %bb.43, implicit $exec + // S_BRANCH %bb.45 + // which is from + // bb.42: + //%129:vgpr_32 = V_MOV_B32_e32 killed %548:vgpr_32, implicit $exec, implicit + //$exec %130:sreg_64 = S_AND_SAVEEXEC_B64 %533:sreg_64, implicitdef $exec, + // SI_MASK_BRANCH %bb.43, implicit $exec + // S_BRANCH %bb.45 + // The real common dom is bb.42. + // TODO: use _term version of exec update instructions so don't need this + // anymore. 
+ if (DomB && DomB->pred_size() == 1 && !DomB->empty()) { + // Upstreaming note: This used to be SI_MASK_BRANCH + if (DomB->begin()->getOpcode() == AMDGPU::S_CBRANCH_EXECZ) { + MachineBasicBlock *Pred = *DomB->pred_begin(); + if (Pred->succ_size() == 1 && + (Pred->empty() || !Pred->back().isBranch())) { + DomB = Pred; + } + } + } + + return DomB; +} + +MachineBasicBlock * +findInsertBlock(MachineInstr &DefMI, Register Reg, MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, MachineLoopInfo *MLI, + const MachineRegisterInfo &MRI, bool MemBound) { + + BlockSet BBSet; + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + BBSet.insert(UseMI.getParent()); + } + if (BBSet.empty()) + return nullptr; + + MachineBasicBlock *BB = *BBSet.begin(); + if (BBSet.size() > 1) { + MachineBasicBlock *BDom = nearestCommonDominator(DT, BBSet); + if (!BDom) + return nullptr; + BB = BDom; + } + // Try to find non loop dominator. + if (!MemBound) { + BB = findNonLoopDominator(BB, DT, MLI); + } + if (!BB) + return nullptr; + + // If BB is already a hot block, move to BB will not help. + // hotBlockRemat will fail It when process BB. + + // Must reachable from DefMI. + if (!llvm::reach_block(DefMI.getParent(), DT, PDT, MLI, BB)) + return nullptr; + + return BB; +} + +// Maybe expensive to be called all over the place +bool isUsedByPhi(MachineInstr *DefMI, MachineRegisterInfo &MRI) { + for (auto &Def : DefMI->defs()) { + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Def.getReg())) { + if (UseMI.isPHI()) + return true; + } + } + return false; +} + +bool isSafeToMoveOrClone(MachineInstr *DefMI, MachineRegisterInfo &MRI) { + // Do not move PHI nodes + if (isUsedByPhi(DefMI, MRI)) + return false; + + unsigned OpNum = DefMI->getNumOperands(); + // Only move DefMI which all operand is unique def. + for (unsigned I = 0; I < OpNum; I++) { + MachineOperand &Op = DefMI->getOperand(I); + if (!Op.isReg()) + continue; + if (!Op.getReg().isPhysical() && !MRI.getUniqueVRegDef(Op.getReg()) && + !llvm::isSub0Sub1SingleDef(Op.getReg(), MRI)) { + return false; + } + } + return true; +} + +void addOneDefOneUseCandidate(std::vector *OutRematList, + int *OutRematCnt, const RematNode &Node, + MachineRegisterInfo &MRI, + MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, + MachineLoopInfo *MLI, bool IsVGPR, + bool MemBound) { + Register Reg = Node.Reg; + MachineInstr *DefMI = Node.DefMI; + + unsigned Size = Node.Size; + MachineInstr *UseMI = &*MRI.use_nodbg_instructions(Reg).begin(); + MachineBasicBlock *InsertBB = UseMI->getParent(); + + // For VGPR, always move next to the only user to avoid wqm or exec issue. + // But doing this will cause issue when DefMI is in wqm user not in + // wqm. Disable VGPR remat for now. + // TODO: make sure single user don't need wqm. + if (!IsVGPR) { + if (MachineBasicBlock *NewInsertBB = + findInsertBlock(*DefMI, Reg, DT, PDT, MLI, MRI, MemBound)) { + if (InsertBB != NewInsertBB) { + InsertBB = NewInsertBB; + // If can find a non-loop insert block, go to the insert block. + if (DefMI->getParent() != InsertBB) { + if (!InsertBB->empty()) { + auto It = InsertBB->getFirstNonPHI(); + It = skipDebugInstructionsForward(It, InsertBB->end()); + if (It == InsertBB->end()) + UseMI = nullptr; + else + UseMI = &*It; + } + } + } + } + } + + if (IsVGPR) { + // Don't count reg in same block for valu. + if (UseMI->getParent() == DefMI->getParent()) + return; + } + + // Skip case when DefMI has implicit define which used by UseMI. 
+ if (isImplicitDefUse(DefMI, UseMI)) { + return; + } + + RematNode FilteredNode = Node; + FilteredNode.InsertBlock = InsertBB; + FilteredNode.InsertPointMI = UseMI; + FilteredNode.Kind = RematNode::RematKind::OneDefOneUse; + OutRematList->emplace_back(FilteredNode); + *OutRematCnt += Size; +} + +// Build remat candidates from the registers in `CandidateRegSet`. +void buildRematCandiates(std::vector *OutCandidates, + DenseSet *PinnedRegSet, + GCNRPTracker::LiveRegSet &CandidateRegSet, + const MachineRegisterInfo &MRI, + const SIInstrInfo *SIII, const SIRegisterInfo *SIRI, + bool IsVGPR) { + + for (const auto &LiveRegIt : CandidateRegSet) { + Register Reg = LiveRegIt.first; + // Skip unsafe reg. + if (PinnedRegSet->count(Reg)) + continue; + + if (SIRI->isVGPR(MRI, Reg) != IsVGPR) + continue; + bool IsSafeCandidate = true; + MachineInstr *MI = MRI.getUniqueVRegDef(Reg); + if (MI) { + if (IsVGPR) { + // Only remat valu now. + if (!SIII->isVALU(MI->getOpcode()) && MI->getOpcode() != AMDGPU::COPY) + IsSafeCandidate = false; + if (MI->getOpcode() == AMDGPU::COPY) { + // Make sure src is unique define. + if (MI->getOperand(1).isReg() && + nullptr == MRI.getUniqueVRegDef(MI->getOperand(1).getReg())) + IsSafeCandidate = false; + } else { + // Skip convergent valu. + if (MI->isConvergent()) + IsSafeCandidate = false; + } + } + // Skip inst has more than 1 def. + if (MI->getDesc().NumDefs > 1) + IsSafeCandidate = false; + } else { + IsSafeCandidate = false; + } + + if (IsSafeCandidate) { + int Gain = rematGainInBits(MI, Reg, MRI, SIRI, IsVGPR); + if (Gain > 0) + OutCandidates->emplace_back(RematNode(Reg, MI, Gain >> 5)); + else + IsSafeCandidate = false; + } + // Save unsafe reg. + if (!IsSafeCandidate) + PinnedRegSet->insert(Reg); + } + + // Sort by gain. + std::sort(OutCandidates->begin(), OutCandidates->end(), + [](RematNode &I, RematNode &J) { return I.Size > J.Size; }); +} + +void addCloneCandidate(std::vector *OutRematList, int *OutRematCnt, + DenseSet *OutPinnedRegSet, + std::vector &&CloneList, + const MachineRegisterInfo &MRI) { + // Group user in same blocks. + std::vector UserSetList(CloneList.size()); + + for (size_t I = 0; I < CloneList.size(); I++) { + auto *Node = CloneList[I]; + Register Reg = Node->Reg; + MachineInstr *DefMI = Node->DefMI; + // Group user in same blocks. + BlockSet &UserSet = UserSetList[I]; + + for (auto UseIt = MRI.use_instr_nodbg_begin(Reg); + UseIt != MRI.use_instr_nodbg_end();) { + MachineInstr &UseMI = *(UseIt++); + UserSet.insert(UseMI.getParent()); + } + + if (UserSet.size() == 1) { + // All users are in same block with DefMI. + if (*UserSet.begin() == DefMI->getParent()) { + // Mark cannot remat for now. + // TODO: try to split if is bigger than 4 and only used once per + // channel. + OutPinnedRegSet->insert(Reg); + continue; + } + } + + int Size = Node->Size; + Size <<= 16; + // Pack userSet size to size. + Size |= UserSet.size(); + Node->UserCount = Size; + } + + std::sort(CloneList.begin(), CloneList.end(), + // Sort based on userSet size. + [](const RematNode *A, const RematNode *B) { + static constexpr int Mask = 0xffff; + return (A->UserCount & Mask) < (B->UserCount & Mask); + }); + + for (RematNode *Node : CloneList) { + Node->Kind = RematNode::RematKind::Clone; + OutRematList->emplace_back(*Node); + *OutRematCnt += Node->Size; + } +} + +// Filter `Candidates` into `OutRematList` based on whether +// safe to move, and decides on the actual type of Candidate (move vs cline). 
+// +// Updates `OutPinnedRegSet` with registers that cannot/should not be moved. +// +// Returns the accumulated size of all filtered candidates. +// +int filterRematCandiates(std::vector *OutRematList, + DenseSet *OutPinnedRegSet, + std::vector &&Candidates, + MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, MachineLoopInfo *MLI, + MachineRegisterInfo &MRI, bool IsVGPR, bool MemBound) { + int RematCnt = 0; + // Work one def one use first. + for (auto &Node : Candidates) { + Register Reg = Node.Reg; + if (!MRI.hasOneNonDBGUse(Reg)) + continue; + + MachineInstr *DefMI = Node.DefMI; + if (!isSafeToMoveOrClone(DefMI, MRI)) { + OutPinnedRegSet->insert(Reg); + continue; + } + + addOneDefOneUseCandidate(OutRematList, &RematCnt, Node, MRI, DT, PDT, MLI, + IsVGPR, MemBound); + } + + if (!IsVGPR) { + std::vector CloneList; + // Try multi use case. + for (auto &Node : Candidates) { + Register Reg = Node.Reg; + if (MRI.hasOneNonDBGUse(Reg)) + continue; + + MachineInstr *DefMI = Node.DefMI; + if (!isSafeToMoveOrClone(DefMI, MRI)) { + OutPinnedRegSet->insert(Reg); + continue; + } + + // Clone for each user. + CloneList.emplace_back(&Node); + } + + addCloneCandidate(OutRematList, &RematCnt, OutPinnedRegSet, + std::move(CloneList), MRI); + } + + return RematCnt; +} + +// Calculate the reduced register pressure of RematMap w.r.t. the BB associated +// with LiveInfo. +// Returns the number of registers reduced, and the instructions associated with +// the reduction nodes into `OutReducedInsts`. +int getReducedSize(const MapVector &RematMap, + GCNRPTracker::LiveRegSet &CanidateSet, + const MachineRegisterInfo &MRI, + const BlockLiveInfo &LiveInfo, + DenseMap &RPOTIndexMap, + InstSet *OutReducedInsts) { + int ReducedSize = 0; + for (const auto &It : RematMap) { + Register Reg = It.first; + + if (!CanidateSet.count(Reg)) + continue; + + bool IsReduced = false; + auto &Node = It.second; + if (Node.Kind == RematNode::RematKind::OneDefOneUse) { + MachineBasicBlock *InsertBB = Node.InsertBlock; + // If LiveInfo.BB is before InsertBB in Reverse post order, the def is + // moved after LiveInfo.BB, It is not live anymore. + unsigned LiveBBIndex = RPOTIndexMap[LiveInfo.BB]; + unsigned InsertBBIndex = RPOTIndexMap[InsertBB]; + if (LiveBBIndex < InsertBBIndex) + IsReduced = true; + } else { + // Clone. + IsReduced = true; + // If has use in LiveInfo.BB, could not reduce from input live. + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + if (UseMI.getParent() == LiveInfo.BB) { + IsReduced = false; + break; + } + } + } + if (IsReduced) { + ReducedSize += Node.Size; + OutReducedInsts->insert(Node.DefMI); + } + + // Already in remat map, don't need to check again, remove from candidate. + CanidateSet.erase(Reg); + } + + return ReducedSize; +} + +static unsigned getNumLanesIn32BitReg(bool IsVgpr) { + const TargetRegisterClass *RC = + IsVgpr ? &AMDGPU::VGPR_32RegClass : &AMDGPU::SGPR_32RegClass; + return RC->LaneMask.getNumLanes(); +} + +// Calculate the amount of OVERLAPPING register pressure among all +// the instructions in `ReducedInsts`. E.g for: +// x = COPY a:sgpr_32 +// y = COPY a:sgpr_32 +// This function would return 1. 
+int getSharedReducedSize(const InstSet &ReducedInsts, bool IsVGPR, + const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI) { + int SharedSize = 0; + DenseMap SharedRegMaskMap; + for (MachineInstr *DefMI : ReducedInsts) { + for (MachineOperand &MO : DefMI->operands()) { + if (MO.isImm()) + continue; + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + if (MO.isTied()) + continue; + Register Reg = MO.getReg(); + + if (Reg == AMDGPU::EXEC) + continue; + if (!Reg.isVirtual()) + continue; + + if (IsVGPR != SIRI->isVGPR(MRI, MO.getReg())) + // Not support mix of v and s when remat now. + continue; + + const TargetRegisterClass *OpRC = MRI.getRegClass(Reg); + const int MOSize = SIRI->getRegSizeInBits(*OpRC) >> 5; + + unsigned Mask = 0; + // FIXME: Lane mask is now in the granularity of 16-bit lanes. + if (unsigned SubIdx = MO.getSubReg()) { + OpRC = SIRI->getSubRegisterClass(OpRC, SubIdx); + int SubMOSize = SIRI->getRegSizeInBits(*OpRC) >> 5; + Mask = (1 << SubMOSize) - 1; + } else { + Mask = (1 << MOSize) - 1; + } + auto SharedRegIt = SharedRegMaskMap.find(Reg); + if (SharedRegIt == SharedRegMaskMap.end()) { + SharedRegMaskMap[Reg] = LaneBitmask(Mask); + } else { + unsigned PrevMask = SharedRegIt->second.getAsInteger(); + if (unsigned SharedMask = (PrevMask & Mask)) { + // Some thing is shared. + for (int I = 0; I < MOSize; I++) { + if (SharedMask & (1 << I)) { + SharedSize += 1; + } + } + } + LaneBitmask MoMask = LaneBitmask(Mask | PrevMask); + SharedRegMaskMap[Reg] = MoMask; + } + } + } + + const unsigned NumLanesPerReg = getNumLanesIn32BitReg(IsVGPR); + return SharedSize / NumLanesPerReg; +} + +void dumpRematMap(MapVector &RematMap, + const SIRegisterInfo *SIRI) { + dbgs() << "\n rematMap: \n"; + for (auto It : RematMap) { + int Reg = It.first; + dbgs() << printReg(Reg, SIRI); + dbgs() << "\n"; + } +} +int DebugBlockIndex = 42; +void dumpHotBlock(const GCNRPTracker::LiveRegSet &LiveSet, + MapVector &VRematMap, + MapVector &SRematMap, int BlockIndex, + const SIRegisterInfo *SIRI) { + if (DebugBlockIndex != BlockIndex) + return; + llvm::dumpLiveSet(LiveSet, SIRI); + dumpRematMap(VRematMap, SIRI); + dumpRematMap(SRematMap, SIRI); +} + +void dumpCandidates(std::vector &RematCandidates, int BlockIndex, + const SIRegisterInfo *SIRI) { + if (DebugBlockIndex != BlockIndex) + return; + dbgs() << "\n Candidates: \n"; + unsigned TotalSize = 0; + for (RematNode &Node : RematCandidates) { + dbgs() << printReg(Node.Reg, SIRI) << " size:" << Node.Size; + dbgs() << "\n"; + TotalSize += Node.Size; + } + dbgs() << "Total Size:" << TotalSize << "\n"; +} + +// A heuristic number for keeping the target SGPR number away from the limit. 
+constexpr unsigned SgprLimitBias = 10; + +bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF, + MachineLoopInfo *MLI, + LiveIntervals *LIS, + MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, + bool &IsNearTarget) { + const GCNSubtarget *ST = &MF.getSubtarget(); + + const SIInstrInfo *SIII = ST->getInstrInfo(); + const SIRegisterInfo *SIRI = ST->getRegisterInfo(); + + ReversePostOrderTraversal RPOT(&MF); + DenseMap RPOTIndexMap; + for (const MachineBasicBlock *MBB : RPOT) + RPOTIndexMap[MBB] = RPOTIndexMap.size(); + + auto &MRI = MF.getRegInfo(); + + bool IsUpdated = false; + RematStatus Status = getRematStatus(MF, MLI, LIS, MRI, ST); + + const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second; + if (Status.TargetOcc >= MaxOcc) + return false; + + // Early checks + { + int InitialRematSCnt = Status.MaxSPressure - Status.TargetSLimit; + // when agressive sgpr remat, reserve some for allocation lost. + if (EnableAggressiveSgpr) + InitialRematSCnt += SgprLimitBias; + + bool InitialIsSGPRSpill = false; + if (InitialRematSCnt > 0) + InitialIsSGPRSpill = nearSgprSpill(Status.MaxSPressure, ST, MF); + + const bool InitialIsForceRematSgpr = + InitialIsSGPRSpill || Status.NotBalance; + + // If bound by lds, skip. + if (Status.TargetOcc > ST->getOccupancyWithWorkGroupSizes(MF).second && + !InitialIsForceRematSgpr) + return false; + } + + MachineBasicBlock *EntryMBB = &MF.front(); + + auto *SlotIndexes = LIS->getSlotIndexes(); + + // Reg which already marked remat. + MapVector VRematMap; + MapVector SRematMap; + // Reg which cannot move around to remat. + DenseSet PinnedRegSet; + std::vector HotBlocks; + for (auto It = po_begin(EntryMBB); It != po_end(EntryMBB); It++) { + MachineBasicBlock *MBB = *It; + auto &RP = Status.MBBPressureMap[MBB]; + // ignore block not hot. + + if (RP.getVGPRNum(ST->hasGFX90AInsts()) < Status.TargetVLimit && + (RP.getMaxSGPR() + RegForVCC + Status.InputPhysicalSPressure) < + Status.TargetSLimit) + continue; + // Collect reg pressure. + unsigned MaxVPressure = 0; + unsigned MaxSPressure = 0; + const GCNRPTracker::LiveRegSet InputLive = Status.MBBInputLiveMap[MBB]; + + const GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[MBB]; + LLVM_DEBUG( + dumpHotBlock(InputLive, VRematMap, SRematMap, MBB->getNumber(), SIRI)); + + GCNDownwardRPTracker Tracker(*LIS); + + Tracker.reset(*MBB->begin(), &InputLive); + + for (MachineInstr &MI : *MBB) { + if (MI.isDebugInstr()) + continue; + Tracker.advance(); + auto LISLR = Tracker.getLiveRegs(); + // Update live set for things already remated. + updateLiveInfo(VRematMap, LISLR, InputLive, MBB, RPOTIndexMap); + updateLiveInfo(SRematMap, LISLR, InputLive, MBB, RPOTIndexMap); + + const GCNRPTracker::LiveRegSet &LiveSet = LISLR; + unsigned VPressure = 0; + unsigned SPressure = 0; + collectLiveSetPressure(LiveSet, MRI, SIRI, VPressure, SPressure); + if (MaxVPressure < VPressure) + MaxVPressure = VPressure; + if (MaxSPressure < SPressure) + MaxSPressure = SPressure; + } + MaxSPressure += RegForVCC + Status.InputPhysicalSPressure; + if (MaxVPressure <= Status.TargetVLimit && + MaxSPressure <= Status.TargetSLimit) + continue; + + // Build block live info. + // Use outputLive for EntryMBB. + BlockLiveInfo LiveInfo = {MBB, MaxSPressure, MaxVPressure, + MBB != EntryMBB ? InputLive : OutputLive}; + // Skip entry block when save hotBlock to reduce clone because not clone in + // entry block. 
+ if (MBB != EntryMBB) + HotBlocks.emplace_back(LiveInfo); + GCNRPTracker::LiveRegSet CandidateRegs = LiveInfo.InputLive; + + // Update reg pressure based on remat list. + InstSet VReducedInsts; + InstSet SReducedInsts; + int VReduced = getReducedSize(VRematMap, CandidateRegs, MRI, LiveInfo, + RPOTIndexMap, &VReducedInsts); + int SReduced = getReducedSize(SRematMap, CandidateRegs, MRI, LiveInfo, + RPOTIndexMap, &SReducedInsts); + + // Calculate size need to be remat for this BB. + const int RematVCnt = MaxVPressure - VReduced - Status.TargetVLimit; + const int RematSCnt = MaxSPressure - SReduced - Status.TargetSLimit; + + bool IsSGPRSpill = false; + if (RematSCnt > 0) + IsSGPRSpill = nearSgprSpill(MaxSPressure, ST, MF); + + bool IsForceRematSgpr = IsSGPRSpill || Status.NotBalance; + // Try to add candidates into remat list. + + int NewRematSCnt = 0; + if (RematSCnt > 0) { + // Build candidate nodes. + std::vector SRematCandidates; + buildRematCandiates(&SRematCandidates, &PinnedRegSet, CandidateRegs, MRI, + SIII, SIRI, /*IsVGPR*/ false); + + LLVM_DEBUG(dumpCandidates(SRematCandidates, MBB->getNumber(), SIRI)); + std::vector SRematList; + // Filter candidates. + NewRematSCnt = + filterRematCandiates(&SRematList, &PinnedRegSet, + std::move(SRematCandidates), DT, PDT, MLI, MRI, + /*IsVGPR*/ false, Status.MemBound); + if (NewRematSCnt > RematSCnt) { + // Has enough remat node to cover rematCnt. + int RematCnt = 0; + for (RematNode &Node : SRematList) { + SRematMap[Node.Reg] = Node; + RematCnt += Node.Size; + // Stop if the size had reached the required amount, unless + // aggressive is set. + if (RematCnt > RematSCnt && !EnableAggressiveSgpr) + break; + } + NewRematSCnt = 0; + } else { + for (RematNode &Node : SRematList) { + SReducedInsts.insert(Node.DefMI); + } + // Check shared size. These are reg uses that are shared among all the + // instructions. The overlap will not actually contribute to the + // pressure increase when an instruction is moved/cloned, so it can be + // treated as a gain. + int SharedReducedSize = + getSharedReducedSize(SReducedInsts, /*IsVGPR*/ false, MRI, SIRI); + + int LocalGains = 0; + if (((NewRematSCnt + SharedReducedSize) + (int)SgprLimitBias) >= + RematSCnt) { + for (RematNode &Node : SRematList) + SRematMap[Node.Reg] = Node; + } else { + if (!IsForceRematSgpr) + return false; + for (RematNode &Node : SRematList) + SRematMap[Node.Reg] = Node; + // Find local one def one use candidates. + for (MachineInstr &MI : *MBB) { + if (MI.isDebugInstr()) + continue; + if (MI.getDesc().NumDefs != 1) + continue; + MachineOperand &DstMO = MI.getOperand(0); + Register Reg = DstMO.getReg(); + if (!SIRI->isSGPRReg(MRI, Reg)) + continue; + if (!MRI.hasOneNonDBGUse(Reg)) + continue; + if (!MRI.hasOneDef(Reg)) + continue; + if (Reg.isPhysical()) + continue; + MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(Reg); + if (UseMI.getParent() != MBB) + continue; + int Gain = rematGainInBits(&MI, Reg, MRI, SIRI, + /*IsVGPR*/ false); + if (Gain > 0) { + // Skip case when DefMI has implicit define which used by UseMI. + if (isImplicitDefUse(&MI, &UseMI)) + continue; + RematNode Node = {Reg, &MI, (unsigned)Gain >> 5}; + Node.InsertPointMI = &UseMI; + Node.Kind = RematNode::RematKind::OneDefOneUse; + SRematMap[Reg] = Node; + LocalGains += Node.Size; + } + } + } + NewRematSCnt = + RematSCnt - NewRematSCnt - SharedReducedSize - LocalGains; + } + } + // If works, continue. + + // Collect live range from hot inst. + // find common live range in hot insts. + // Remat these common live range. 
+ // Apply the remat. + + int NewRematVCnt = 0; + if (RematVCnt > 0) { + // TODO: V remat. + } + + bool NeedSRemat = RematSCnt > 0; + bool NeedVRemat = RematVCnt > 0; + // If sgpr spill, always do remat. + bool IsSRematOK = + (NewRematSCnt <= 0 && !SRematMap.empty()) || IsForceRematSgpr; + bool IsVRematOK = + (Status.NotBalance || NewRematVCnt <= 0) && !VRematMap.empty(); + if (NeedSRemat && NeedVRemat) { + if (IsVRematOK && IsSRematOK) + IsUpdated = true; + else if (IsSGPRSpill) + IsUpdated = true; + } else if (NeedSRemat) { + if (IsSRematOK) + IsUpdated = true; + } else if (NeedVRemat) { + if (IsVRematOK) + IsUpdated = true; + } + // TODO: what to do when cannot reach target? + if (NewRematSCnt > 0) { + if ((unsigned)NewRematSCnt <= ST->getSGPRAllocGranule()) { + IsNearTarget = true; + } else { + if (!IsSGPRSpill) + return false; + } + } + } + + if (SRematMap.empty() && VRematMap.empty()) { + return IsUpdated; + } + + if (!SRematMap.empty()) { + IsUpdated = true; + applyRemat(SRematMap, HotBlocks, DT, SlotIndexes, MRI, SIRI, SIII, LIS, MF); + LLVM_DEBUG(llvm::dbgs() << "after hotremat"; MF.print(dbgs());); + } + + // Balance between vector and scalar if possible. + return IsUpdated; +} + +bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) { + if (MF.size() < 2) + return false; + LiveIntervals *LIS = &getAnalysis().getLIS(); + MachineDominatorTree *DT = + &getAnalysis().getDomTree(); + MachinePostDominatorTree *PDT = + &getAnalysis().getPostDomTree(); + MachineLoopInfo *MLI = &getAnalysis().getLI(); + + bool IsNearTarget = false; + return hotBlockRemat(MF, MLI, LIS, DT, PDT, IsNearTarget); +} + +} // namespace + +INITIALIZE_PASS_BEGIN(AMDGPUHotBlockRematerialize, DEBUG_TYPE, + "AMDGPU rematerialize", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) +INITIALIZE_PASS_END(AMDGPUHotBlockRematerialize, DEBUG_TYPE, + "AMDGPU rematerialize", false, false) + +char AMDGPUHotBlockRematerialize::ID = 0; +char &llvm::AMDGPUHotBlockRematerializeID = AMDGPUHotBlockRematerialize::ID; + +FunctionPass *llvm::createAMDGPUHotBlockRematerializePass() { + return new AMDGPUHotBlockRematerialize(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp new file mode 100644 index 0000000000000..4c55d172018d4 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp @@ -0,0 +1,254 @@ +//===------- AMDGPUMIRUtils.cpp - Helpers for MIR passes ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Helper functions for MIR passes. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUMIRUtils.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" + +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +#define DEBUG_TYPE "xb-mir-util" +using namespace llvm; + +namespace llvm { +bool getNonDebugMBBEnd(MachineBasicBlock::reverse_iterator &BBEnd, + MachineBasicBlock &MBB) { + // R.End doesn't point to the boundary instruction. + // Skip Debug instr. + while (BBEnd != MBB.rend() && BBEnd->isDebugInstr()) + BBEnd++; + return BBEnd != MBB.rend(); +} +} // namespace llvm + +namespace { + +// LoopInfo contains a mapping from basic block to the innermost loop. Find +// the outermost loop in the loop nest that contains BB. +const MachineLoop *getOutermostLoop(const MachineLoopInfo *LI, + const MachineBasicBlock *BB) { + const MachineLoop *L = LI->getLoopFor(BB); + if (L) { + while (const MachineLoop *Parent = L->getParentLoop()) + L = Parent; + } + return L; +} + +bool loopContainsBoth(const MachineLoopInfo *LI, const MachineBasicBlock *BB1, + const MachineBasicBlock *BB2) { + const MachineLoop *L1 = getOutermostLoop(LI, BB1); + const MachineLoop *L2 = getOutermostLoop(LI, BB2); + return L1 != nullptr && L1 == L2; +} + +} // namespace + +namespace llvm { + +bool isSccLiveAt(const MachineInstr &MI, LiveIntervals *LIS) { + if (!LIS) + return true; + const TargetRegisterInfo *TRI = MI.getMF()->getSubtarget().getRegisterInfo(); + LiveRange &LR = + LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI)); + SlotIndex Idx = LIS->getInstructionIndex(MI); + return LR.liveAt(Idx); +} + +// +// This function is useful for when we need to insert a new +// instruction that defines scc in a block and we need to find +// a location that will not smash the existing value. +// +// Starting at `BeforeInst` it will look backwards to try to find +// a place in the block where scc is dead so we can insert our new +// def there. If no location can be found it will save and restore +// scc around BeforeInst. This way BeforeInst can safely be used +// as the new insert location. +// +MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef( + MachineBasicBlock *MBB, MachineBasicBlock::iterator MI, + const TargetRegisterInfo *TRI, const SIInstrInfo *TII, + MachineRegisterInfo *MRI, LiveIntervals *LIS, + SccDefInsertPointConstraintFlags Constraints) { + // If SCC is dead at MI when we can use MI as the insert point. + if (!llvm::isSccLiveAt(*MI, LIS)) + return MI; + + const bool CheckForExecWrite = + Constraints & SccDefInsertPointConstraintFlags::NoExecWrite; + + MachineBasicBlock::reverse_iterator Start = MI.getReverse(); + + // Otherwise, walk backwards through the block looking for a location where + // SCC is dead. + for (MachineBasicBlock::reverse_iterator It = Start, End = MBB->rend(); + It != End; ++It) { + // If the instruction modifies exec then we cannot use it as + // an insertion point (if that is a constraint from the caller). + // The check for EXEC works for both wave64 and wave32 because + // it will also catch Writes to the subregisters (e.g. exec_lo). + if (CheckForExecWrite && It->modifiesRegister(AMDGPU::EXEC, TRI)) + break; + + if (!llvm::isSccLiveAt(*It, LIS)) + return It->getIterator(); + } + + // If no safe location can be found in the block we can save and restore + // SCC around MI. 
There is no way to directly read or Write SCC so we use + // s_cselect to read the current value of SCC and s_cmp to Write the saved + // value back to SCC. + // + // The generated code will look like this; + // + // %SavedSCC = COPY $scc # Save SCC + // <----- Newly created safe insert point. + // MI + // $scc = COPY %SavedSCC # Restore SCC + // + Register TmpScc = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + DebugLoc DL = MI->getDebugLoc(); + auto CopyFrom = + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), TmpScc).addReg(AMDGPU::SCC); + auto CopyTo = BuildMI(*MBB, std::next(MI->getIterator()), DL, + TII->get(AMDGPU::COPY), AMDGPU::SCC) + .addReg(TmpScc); + + // Cut the live segment. + auto SlotIndexes = LIS->getSlotIndexes(); + SlotIndexes->insertMachineInstrInMaps(*CopyFrom); + SlotIndexes->insertMachineInstrInMaps(*CopyTo); + LiveRange &LR = + LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI)); + auto OldSegment = *LR.getSegmentContaining(LIS->getInstructionIndex(*MI)); + LiveRange::Segment NewSegA( + OldSegment.start, + SlotIndexes->getInstructionIndex(*CopyFrom).getRegSlot(), + OldSegment.valno); + LiveRange::Segment NewSegB(LIS->getInstructionIndex(*CopyTo).getRegSlot(), + OldSegment.end, OldSegment.valno); + LR.removeSegment(OldSegment); + LR.addSegment(NewSegA); + LR.addSegment(NewSegB); + + return MI; +} + +void dumpLiveSet(const LiveSet &LiveSet, const SIRegisterInfo *SIRI) { + + dbgs() << "\n live set: \n"; + for (auto It : LiveSet) { + int Reg = It.first; + dbgs() << printReg(Reg, SIRI); + if (It.second.any()) + dbgs() << " mask:" << It.second.getAsInteger(); + dbgs() << "\n"; + } +} + +unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask, + const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI) { + unsigned Size = SIRI->getRegSizeInBits(*MRI.getRegClass(Reg)); + Size >>= 5; + if (Mask.any()) { + if (unsigned MaskSize = Mask.getNumLanes()) { + if (MaskSize < Size) + Size = MaskSize; + } + } + return Size; +} + +void collectLiveSetPressure(const LiveSet &LiveSet, + const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, unsigned &VPressure, + unsigned &SPressure) { + VPressure = 0; + SPressure = 0; + for (auto LiveIt : LiveSet) { + unsigned Reg = LiveIt.first; + unsigned Size = getRegSize(Reg, LiveIt.second, MRI, SIRI); + if (SIRI->isVGPR(MRI, Reg)) + VPressure += Size; + else + SPressure += Size; + } +} + +bool isSub0Sub1SingleDef(unsigned Reg, const MachineRegisterInfo &MRI) { + // Support multi def for pattern of pointer: + // undef_ %808.sub0:sgpr_64 = COPY killed %795:sgpr_32 + // %808.sub1:sgpr_64 = S_MOV_B32 0 + bool HasSub0 = false; + bool HasSub1 = false; + for (MachineOperand &UserDefMO : MRI.def_operands(Reg)) { + if (unsigned SubReg = UserDefMO.getSubReg()) { + bool IsSingleSubReg = false; + switch (SubReg) { + default: + break; + case AMDGPU::sub0: + if (!HasSub0) { + HasSub0 = true; + IsSingleSubReg = true; + } + break; + case AMDGPU::sub1: + if (!HasSub1) { + HasSub1 = true; + IsSingleSubReg = true; + } + break; + } + if (!IsSingleSubReg) { + HasSub0 = false; + break; + } + } else { + HasSub0 = false; + break; + } + } + + return (HasSub0 && HasSub1); +} + +bool reach_block(MachineBasicBlock *FromBB, MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, MachineLoopInfo *LI, + MachineBasicBlock *ToBB) { + if (FromBB == ToBB) + return true; + + if (DT->dominates(FromBB, ToBB)) + return true; + + if (PDT->dominates(ToBB, FromBB)) + return true; + + if (loopContainsBoth(LI, ToBB, FromBB)) + return 
true; + + // TODO: cover case hotBB in loop, + // one block in that loop dom BB or + // BB post dom one block in that loop. + return false; +} +} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h new file mode 100644 index 0000000000000..14cd350398f4c --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h @@ -0,0 +1,94 @@ +//===------- AMDGPUMIRUtils.h - Helpers for MIR passes --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Helper functions for MIR passes. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMIRUTILS_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMIRUTILS_H + +#include "llvm/ADT/DenseSet.h" +#include "llvm/CodeGen/MachineBasicBlock.h" + +namespace llvm { + +class LiveInterval; +class LiveIntervals; +class SlotIndexes; +class MachineRegisterInfo; +class SIRegisterInfo; +class SIInstrInfo; +class MachineDominatorTree; +class MachinePostDominatorTree; + +constexpr unsigned RegForVCC = 2; + +bool getNonDebugMBBEnd(llvm::MachineBasicBlock::reverse_iterator &BBEnd, + llvm::MachineBasicBlock &MBB); + +bool isSub0Sub1SingleDef(unsigned Reg, const llvm::MachineRegisterInfo &MRI); + +using LiveSet = llvm::DenseMap; +void dumpLiveSet(const LiveSet &LiveSet, const llvm::SIRegisterInfo *SIRI); + +bool isSccLiveAt(const MachineInstr &MI, LiveIntervals *LIS); + +// An enum used to pass additional constraints to +// `FindOrCreateInsertionPointForSccDef()`. This will further +// constrain the location where the scc def can be inserted. +enum SccDefInsertPointConstraintFlags { + None = 0, // No additional constraints. + NoExecWrite = 1, // Should be no modification of exec between BeforeInst and + // insert point. +}; + +// Look for a safe place to insert an instruction that defines scc. +// +// +// This function is useful for when we need to insert a new +// instruction that defines scc in a block and we need to find +// a location that will not smash the existing value. +// +// Starting at `BeforeInst` it will look backwards to try to find +// a place in the block where scc is dead so we can insert our new +// def there. If no location can be found it will save and restore +// scc around BeforeInst. This way BeforeInst can safely be used +// as the new insert location. +// +llvm::MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef( + llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator BeforeInst, + const llvm::TargetRegisterInfo *TRI, const llvm::SIInstrInfo *TII, + llvm::MachineRegisterInfo *MRI, LiveIntervals *LIS, + SccDefInsertPointConstraintFlags Constraints = + SccDefInsertPointConstraintFlags::None); + +// For inst like S_BUFFER_LOAD_DWORDX16, change to S_BUFFER_LOAD_DWORDX4 if only +// used 4 lanes. 
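+// For example (illustrative, hypothetical registers): if
+//   %300 = S_BUFFER_LOAD_DWORDX16_IMM %desc, 0, 0
+// is only read through %300.sub0_sub1_sub2_sub3, the load can be shrunk to an
+// S_BUFFER_LOAD_DWORDX4_IMM, freeing the other twelve SGPRs.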
+bool removeUnusedLanes(llvm::MachineInstr &MI, llvm::MachineRegisterInfo &MRI,
+                       const llvm::SIRegisterInfo *TRI,
+                       const llvm::SIInstrInfo *TII,
+                       llvm::SlotIndexes *SlotIndexes);
+
+unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask,
+                    const llvm::MachineRegisterInfo &MRI,
+                    const llvm::SIRegisterInfo *SIRI);
+void collectLiveSetPressure(const LiveSet &LiveSet,
+                            const llvm::MachineRegisterInfo &MRI,
+                            const llvm::SIRegisterInfo *SIRI,
+                            unsigned &VPressure, unsigned &SPressure);
+
+bool reach_block(llvm::MachineBasicBlock *FromBB,
+                 llvm::MachineDominatorTree *DT,
+                 llvm::MachinePostDominatorTree *PDT, llvm::MachineLoopInfo *LI,
+                 llvm::MachineBasicBlock *ToBB);
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
new file mode 100644
index 0000000000000..6160fe5471376
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
@@ -0,0 +1,162 @@
+//==- AMDGPUOccupancyAndLatencyHelper.cpp - Helpers for occupancy + latency ==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==------------------------------------------------------------------------==//
+//
+/// \file
+/// \brief Helper functions for occupancy and latency.
+//
+//==------------------------------------------------------------------------==//
+
+#include "AMDGPUOccupancyAndLatencyHelper.h"
+#include "GCNSubtarget.h"
+#include "SIInstrInfo.h"
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+
+#include <cmath>
+
+namespace llvm {
+
+void SchedScore::sum(const SchedScore &S, unsigned LoopDepth) {
+  unsigned LoopCount = LoopDepth > 0 ? std::pow(3, LoopDepth) : 1;
+  LatencyHide += LoopCount * S.LatencyHide;
+  MemLatency += LoopCount * S.MemLatency;
+  MixAlu += LoopCount * S.MixAlu;
+  Alu += LoopCount * S.Alu;
+  Lds += LoopCount * S.Lds;
+  SgprSpill |= S.SgprSpill;
+}
+// Does more occupancy give more performance?
+bool SchedScore::isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc) const {
+  unsigned Gain = latencyGain(TargetOccupancy, ExtraOcc);
+  // 10% is good enough.
+  if ((10 * Gain) >= Alu)
+    return true;
+  return false;
+}
+
+unsigned SchedScore::latencyGain(unsigned TgtOcc, unsigned ExtraOcc) const {
+  unsigned Latency = MemLatency;
+  return (Latency / (TgtOcc)) - (Latency / (TgtOcc + ExtraOcc));
+}
+
+// AMDGPULatencyTracker
+AMDGPULatencyTracker::AMDGPULatencyTracker(const GCNSubtarget &ST)
+    : SIII(ST.getInstrInfo()), ItineraryData(ST.getInstrItineraryData()) {}
+
+void AMDGPULatencyTracker::scan(const MachineInstr &MI) {
+  if (MI.isDebugInstr())
+    return;
+  int Latency = SIII->getInstrLatency(ItineraryData, MI);
+  // If inside latency hide.
+  if (!LatencyMIs.empty()) {
+    bool IsWaitCnt = false;
+    for (auto &MO : MI.operands()) {
+      if (MO.isReg()) {
+        Register Reg = MO.getReg();
+        auto It = LatencyMIs.find(Reg);
+        if (It != LatencyMIs.end()) {
+          IsWaitCnt = true;
+          // If MI uses a mem result, update latency to the mem latency.
+          int Cycle = It->second;
+          if (Cycle > Latency)
+            Latency = Cycle;
+        }
+      }
+    }
+    // Update latency for each mem latency inst.
+    for (auto It = LatencyMIs.begin(); It != LatencyMIs.end();) {
+      auto Prev = It;
+      auto L = (It++);
+      int Cycle = L->second;
+      if (Cycle <= Latency) {
+        // Only left cycles.
+        // Remove the reg.
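+        // The tracked load's remaining cycles are covered by this
+        // instruction's latency, i.e. the load has completed, so stop
+        // tracking its destination reg.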
+ LatencyMIs.erase(Prev); + if (IsWaitCnt && Cycle == Latency) { + Score.MemLatency += Cycle; + // Only count memLatency once, the rest is hide. + IsWaitCnt = false; + } else { + // Hide cycle or count mem latency? + Score.LatencyHide += Cycle; + } + } else { + L->second -= Latency; + // Hide latency. + Score.LatencyHide += Latency; + } + } + + } else { + // TODO: check branch/lds? + // TODO: check prevVAlu? + auto GetAluStatus = [](const MachineInstr &MI, + const llvm::SIInstrInfo *SIII) { + AluStatus Status = AluStatus::Nothing; + if (SIII->isVALU(MI.getOpcode())) + Status = AluStatus::Vector; + else if (SIII->isSALU(MI.getOpcode())) + Status = AluStatus::Scalar; + return Status; + }; + AluStatus Status = GetAluStatus(MI, SIII); + + switch (PrevStatus) { + case AluStatus::Nothing: { + Score.Alu += Latency; + Score.MixAlu += Latency; + PrevStatus = Status; + } break; + case AluStatus::Vector: + case AluStatus::Scalar: { + Score.Alu += Latency; + // Ignore mix alu. + if (PrevStatus != Status) + PrevStatus = AluStatus::Nothing; + else + Score.MixAlu += Latency; + } break; + } + } + // Update latency inst. + if (SIII->isHighLatencyDef(MI.getOpcode()) && MI.mayLoad()) { + Register Reg = MI.getOperand(0).getReg(); + // TODO: get correct latency. + // SIII->getInstrLatency(ItinerayData, MI); + constexpr unsigned kHighLetency = 180; + LatencyMIs[Reg] = kHighLetency; + } else if (SIII->isLowLatencyInstruction(MI) && MI.mayLoad()) { + Register Reg = MI.getOperand(0).getReg(); + // TODO: get correct latency. + // SIII->getInstrLatency(ItinerayData, MI); + constexpr unsigned kLowLetency = 35; + LatencyMIs[Reg] = kLowLetency; + } +} + +SchedScore collectLatency(MachineFunction &MF, const llvm::GCNSubtarget &ST, + const llvm::MachineLoopInfo *MLI) { + SchedScore TotalScore; + for (auto &MFI : MF) { + MachineBasicBlock &MBB = MFI; + MachineBasicBlock::iterator Next; + AMDGPULatencyTracker LatencyTracker(ST); + for (auto &MI : MBB) + LatencyTracker.scan(MI); + unsigned LoopDepth = 0; + if (MLI) + LoopDepth = MLI->getLoopDepth(&MBB); + TotalScore.sum(LatencyTracker.Score, LoopDepth); + } + return TotalScore; +} + +} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h new file mode 100644 index 0000000000000..9c63fa7e6b4a4 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h @@ -0,0 +1,75 @@ +//==- AMDGPUOccupancyAndLatencyHelper.cpp - Helpers for occupancy + latency ==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Helper functions for occupancy and latency. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUOCCUPANCYANDLATENCYHELPER_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUOCCUPANCYANDLATENCYHELPER_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/MC/MCInstrItineraries.h" + +namespace llvm { + +class MachineInstr; +class MachineFunction; +class GCNSubtarget; +class MachineLoopInfo; +class SIInstrInfo; + +struct SchedScore { + // Score for this Sched result. + unsigned Occupancy = 0; + bool SgprSpill = false; + unsigned LatencyHide = 0; // Only latency hide will split 2 load into 2 pass? + unsigned MemLatency = 0; // Only save mem latency. + // We want mem latency small and hide big. 
+  // Compare memLatency - hide * Occ; smaller is better.
+  unsigned MixAlu = 0; // VAlu and SAlu can run in parallel if Occ > 1.
+  unsigned Alu = 0;    // Avoid sequences of s_alu insts with count less than
+                       // the occupancy.
+  unsigned Lds = 0;    // Todo: count lds.
+  SchedScore() {}
+
+  void sum(const SchedScore &S, unsigned LoopDepth = 0);
+  bool isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc = 1) const;
+  // More latency can be hidden with ExtraOcc.
+  unsigned latencyGain(unsigned TargetOccupancy, unsigned ExtraOcc) const;
+};
+
+struct AMDGPULatencyTracker {
+  AMDGPULatencyTracker(const llvm::GCNSubtarget &ST);
+  const llvm::SIInstrInfo *SIII;
+  const llvm::InstrItineraryData *ItineraryData;
+  // Latency MI dst reg to cycle map.
+  llvm::DenseMap<unsigned, int> LatencyMIs;
+  SchedScore Score;
+  // Low latency MIs do not wait.
+  unsigned HideLatency = 0;
+  unsigned MemLatency = 0;
+  // For simplicity, only consider the mixture as one valu and one salu.
+  // Not grouped for now.
+  unsigned PrevSAlu = 0;
+  unsigned PrevVAlu = 0;
+  enum class AluStatus {
+    Nothing,
+    Vector,
+    Scalar,
+  } PrevStatus = AluStatus::Nothing;
+  void scan(const llvm::MachineInstr &MI);
+};
+
+SchedScore collectLatency(llvm::MachineFunction &MF,
+                          const llvm::GCNSubtarget &ST,
+                          const llvm::MachineLoopInfo *MLI = nullptr);
+
+} // namespace llvm
+#endif
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 90e3489ced923..9c1aec6cd047d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -397,6 +397,12 @@ static cl::opt<bool>
                          cl::desc("Enable s_delay_alu insertion"),
                          cl::init(true), cl::Hidden);
 
+// Enable Hot block rematerialize.
+static cl::opt<bool>
+    EnableHotBlockRemat("amdgpu-enable-hot-block-remat",
+                        cl::desc("Enable HotBlock Rematerialize optimization"),
+                        cl::init(false), cl::Hidden);
+
 // Enable GFX11+ VOPD
 static cl::opt<bool>
     EnableVOPD("amdgpu-enable-vopd",
@@ -521,6 +527,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUAtomicOptimizerPass(*PR);
   initializeAMDGPULowerKernelArgumentsPass(*PR);
   initializeAMDGPUPromoteKernelArgumentsPass(*PR);
+  initializeAMDGPUHotBlockRematerializePass(*PR);
   initializeAMDGPULowerKernelAttributesPass(*PR);
   initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(*PR);
   initializeAMDGPUPostLegalizerCombinerPass(*PR);
@@ -1539,6 +1546,10 @@ void GCNPassConfig::addOptimizedRegAlloc() {
   if (TM->getOptLevel() > CodeGenOptLevel::Less)
     insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
 
+  // Rematerialize must be run before phi elimination.
+  if (isPassEnabled(EnableHotBlockRemat))
+    addPass(&AMDGPUHotBlockRematerializeID);
+
   TargetPassConfig::addOptimizedRegAlloc();
 }
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 09a3096602fc3..79fdbba1d0db1 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -59,6 +59,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUFrameLowering.cpp
   AMDGPUGlobalISelDivergenceLowering.cpp
   AMDGPUGlobalISelUtils.cpp
+  AMDGPUHotBlockRematerialize.cpp
   AMDGPUHSAMetadataStreamer.cpp
   AMDGPUInsertDelayAlu.cpp
   AMDGPUInstCombineIntrinsic.cpp
@@ -81,10 +82,12 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUMacroFusion.cpp
   AMDGPUMCInstLower.cpp
   AMDGPUMemoryUtils.cpp
+  AMDGPUMIRUtils.cpp
   AMDGPUIGroupLP.cpp
   AMDGPUMCResourceInfo.cpp
   AMDGPUMarkLastScratchLoad.cpp
   AMDGPUMIRFormatter.cpp
+  AMDGPUOccupancyAndLatencyHelper.cpp
   AMDGPUPerfHintAnalysis.cpp
   AMDGPUPostLegalizerCombiner.cpp
AMDGPUPreLegalizerCombiner.cpp diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index f74d12cfab0c0..7f76d14eb9ab0 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -549,22 +549,26 @@ bool GCNDownwardRPTracker::advanceBeforeNext(MachineInstr *MI, if (!S.liveAt(SI)) { if (It == LiveRegs.end()) { It = LiveRegs.find(MO.getReg()); - if (It == LiveRegs.end()) + if (!MRI->isSSA() && It == LiveRegs.end()) llvm_unreachable("register isn't live"); } - auto PrevMask = It->second; - It->second &= ~S.LaneMask; - CurPressure.inc(MO.getReg(), PrevMask, It->second, *MRI); + if (It != LiveRegs.end()) { + auto PrevMask = It->second; + It->second &= ~S.LaneMask; + CurPressure.inc(MO.getReg(), PrevMask, It->second, *MRI); + } } } if (It != LiveRegs.end() && It->second.none()) LiveRegs.erase(It); } else if (!LI.liveAt(SI)) { auto It = LiveRegs.find(MO.getReg()); - if (It == LiveRegs.end()) + if (!MRI->isSSA() && It == LiveRegs.end()) llvm_unreachable("register isn't live"); - CurPressure.inc(MO.getReg(), It->second, LaneBitmask::getNone(), *MRI); - LiveRegs.erase(It); + if (It != LiveRegs.end()) { + CurPressure.inc(MO.getReg(), It->second, LaneBitmask::getNone(), *MRI); + LiveRegs.erase(It); + } } } diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 7554b9f578fcb..aa4b3f948b726 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -47,6 +47,10 @@ struct GCNRegPressure { void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); } + unsigned getMaxSGPR() const { + return std::max(getSGPRNum(), getSGPRTuplesWeight()); + } + /// \returns the SGPR32 pressure unsigned getSGPRNum() const { return Value[SGPR32]; } /// \returns the aggregated ArchVGPR32, AccVGPR32 pressure dependent upon \p diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir new file mode 100644 index 0000000000000..d6c6173cd523e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir @@ -0,0 +1,179 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-hot-block-remat-aggressive-sgpr | FileCheck %s + +# Check that the loads have been moved to the use +# CHECK: bb.0: +# CHECK-NOT: S_LOAD_DWORDX4_IMM +# CHECK: bb.2: +# CHECK: %t0:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 0, 0 +# CHECK: KILL %t0 +# CHECK: %t2:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 16, 0 +# CHECK: KILL %t2 +# CHECK: %t4:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 32, 0 +# CHECK: KILL %t4 +# CHECK: %t6:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 48, 0 +# CHECK: KILL %t6 +# CHECK: %t8:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 64, 0 +# CHECK: KILL %t8 +# CHECK: %t10:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 80, 0 +# CHECK: KILL %t10 +# CHECK: %t12:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 96, 0 +# CHECK: KILL %t12 +# CHECK: %t14:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 112, 0 +# CHECK: KILL %t14 +# CHECK: %t16:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 128, 0 +# CHECK: KILL %t16 +# CHECK: %t18:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 144, 0 +# CHECK: KILL %t18 +# CHECK: %t20:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 160, 0 +# CHECK: KILL %t20 +# CHECK: %t22:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 176, 0 +# CHECK: KILL %t22 +# CHECK: %t24:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 192, 0 +# CHECK: KILL %t24 +# CHECK: %t26:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 208, 0 +# CHECK: KILL %t26 +# CHECK: 
%t28:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 224, 0 +# CHECK: KILL %t28 +# CHECK: %t30:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 240, 0 +# CHECK: KILL %t30 +# CHECK: %t32:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 256, 0 +# CHECK: KILL %t32 +# CHECK: %t34:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 272, 0 +# CHECK: KILL %t34 +# CHECK: %t36:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 288, 0 +# CHECK: KILL %t36 +# CHECK: %t38:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 304, 0 +# CHECK: KILL %t38 +# CHECK: %t40:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 320, 0 +# CHECK: KILL %t40 +# CHECK: %t42:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 336, 0 +# CHECK: KILL %t42 +# CHECK: %t44:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 352, 0 +# CHECK: KILL %t44 +# CHECK: %t46:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 368, 0 +# CHECK: KILL %t46 +# CHECK: %t48:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 384, 0 +# CHECK: KILL %t48 +# CHECK: %t50:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 400, 0 +# CHECK: KILL %t50 +# CHECK: %t52:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 416, 0 +# CHECK: KILL %t52 +# CHECK: %t54:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 432, 0 +# CHECK: KILL %t54 +# CHECK: %t56:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 448, 0 +# CHECK: KILL %t56 +# CHECK: %t58:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 464, 0 +# CHECK: KILL %t58 +# CHECK: %t60:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 480, 0 +# CHECK: KILL %t60 +# CHECK: %t62:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 496, 0 +# CHECK: KILL %t62 + + +--- | + define amdgpu_ps void @main() { + ret void + } +... +--- +name: main +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + ; To inflate vgpr count + %v0:vreg_1024 = IMPLICIT_DEF + %v1:vreg_1024 = IMPLICIT_DEF + %v2:vreg_1024 = IMPLICIT_DEF + %v3:vreg_1024 = IMPLICIT_DEF + + %ptr:sgpr_64 = IMPLICIT_DEF + + ; Defs + %t0:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 0, 0 + %t2:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 16, 0 + %t4:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 32, 0 + %t6:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 48, 0 + %t8:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 64, 0 + %t10:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 80, 0 + %t12:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 96, 0 + %t14:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 112, 0 + %t16:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 128, 0 + %t18:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 144, 0 + %t20:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 160, 0 + %t22:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 176, 0 + %t24:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 192, 0 + %t26:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 208, 0 + %t28:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 224, 0 + %t30:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 240, 0 + %t32:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 256, 0 + %t34:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 272, 0 + %t36:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 288, 0 + %t38:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 304, 0 + %t40:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 320, 0 + %t42:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 336, 0 + %t44:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 352, 0 + %t46:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 368, 0 + %t48:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 384, 0 + %t50:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 400, 0 + %t52:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 416, 0 + %t54:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 432, 0 + %t56:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 448, 0 + %t58:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 464, 0 + %t60:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 480, 0 + %t62:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 496, 0 + + + ; Branch + %cmp:sreg_32_xm0 = V_CMP_GT_F32_e64 0, 0, 0, %v0.sub0, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %cmp:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + 
successors: %bb.2 + S_BRANCH %bb.2 + + bb.2: + KILL %t0 + KILL %t2 + KILL %t4 + KILL %t6 + KILL %t8 + KILL %t10 + KILL %t12 + KILL %t14 + KILL %t16 + KILL %t18 + KILL %t20 + KILL %t22 + KILL %t24 + KILL %t26 + KILL %t28 + KILL %t30 + KILL %t32 + KILL %t34 + KILL %t36 + KILL %t38 + KILL %t40 + KILL %t42 + KILL %t44 + KILL %t46 + KILL %t48 + KILL %t50 + KILL %t52 + KILL %t54 + KILL %t56 + KILL %t58 + KILL %t60 + KILL %t62 + + + ; Some uses to inflate vgpr count + KILL %v0 + KILL %v1 + KILL %v2 + KILL %v3 + S_ENDPGM 0 +... + \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_long_scc.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_long_scc.mir new file mode 100644 index 0000000000000..a4e9c69d53b7c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_long_scc.mir @@ -0,0 +1,575 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-hot-block-remat-aggressive-sgpr | FileCheck %s + +# This test checks that when there are no safe spot to clone/move instructions that +# modify $scc, a safe spot is created for it. + +# CHECK: bb.0: +# CHECK-NOT: S_NOT_B32: +# CHECK: bb.2: +# Save scc +# CHECK: %[[#scc0:]]:sreg_32_xm0 = COPY $scc +# CHECK: %t0:sgpr_32 = S_NOT_B32 0 +# CHECK: KILL %t0 +# All subsequent moves are placed within the safe spot created for the first one. +# CHECK: %t2:sgpr_32 = S_NOT_B32 1 +# CHECK: %t4:sgpr_32 = S_NOT_B32 2 +# CHECK: %t6:sgpr_32 = S_NOT_B32 3 +# CHECK: %t8:sgpr_32 = S_NOT_B32 4 +# CHECK: %t10:sgpr_32 = S_NOT_B32 5 +# CHECK: %t12:sgpr_32 = S_NOT_B32 6 +# CHECK: %t14:sgpr_32 = S_NOT_B32 7 +# CHECK: %t16:sgpr_32 = S_NOT_B32 8 +# CHECK: %t18:sgpr_32 = S_NOT_B32 9 +# CHECK: %t20:sgpr_32 = S_NOT_B32 10 +# CHECK: %t22:sgpr_32 = S_NOT_B32 11 +# CHECK: %t24:sgpr_32 = S_NOT_B32 12 +# CHECK: %t26:sgpr_32 = S_NOT_B32 13 +# CHECK: %t28:sgpr_32 = S_NOT_B32 14 +# CHECK: %t30:sgpr_32 = S_NOT_B32 15 +# CHECK: %t32:sgpr_32 = S_NOT_B32 16 +# CHECK: %t34:sgpr_32 = S_NOT_B32 17 +# CHECK: %t36:sgpr_32 = S_NOT_B32 18 +# CHECK: %t38:sgpr_32 = S_NOT_B32 19 +# CHECK: %t40:sgpr_32 = S_NOT_B32 20 +# CHECK: %t42:sgpr_32 = S_NOT_B32 21 +# CHECK: %t44:sgpr_32 = S_NOT_B32 22 +# CHECK: %t46:sgpr_32 = S_NOT_B32 23 +# CHECK: %t48:sgpr_32 = S_NOT_B32 24 +# CHECK: %t50:sgpr_32 = S_NOT_B32 25 +# CHECK: %t52:sgpr_32 = S_NOT_B32 26 +# CHECK: %t54:sgpr_32 = S_NOT_B32 27 +# CHECK: %t56:sgpr_32 = S_NOT_B32 28 +# CHECK: %t58:sgpr_32 = S_NOT_B32 29 +# CHECK: %t60:sgpr_32 = S_NOT_B32 30 +# CHECK: %t62:sgpr_32 = S_NOT_B32 31 +# CHECK: %t64:sgpr_32 = S_NOT_B32 32 +# CHECK: %t66:sgpr_32 = S_NOT_B32 33 +# CHECK: %t68:sgpr_32 = S_NOT_B32 34 +# CHECK: %t70:sgpr_32 = S_NOT_B32 35 +# CHECK: %t72:sgpr_32 = S_NOT_B32 36 +# CHECK: %t74:sgpr_32 = S_NOT_B32 37 +# CHECK: %t76:sgpr_32 = S_NOT_B32 38 +# CHECK: %t78:sgpr_32 = S_NOT_B32 39 +# CHECK: %t80:sgpr_32 = S_NOT_B32 40 +# CHECK: %t82:sgpr_32 = S_NOT_B32 41 +# CHECK: %t84:sgpr_32 = S_NOT_B32 42 +# CHECK: %t86:sgpr_32 = S_NOT_B32 43 +# CHECK: %t88:sgpr_32 = S_NOT_B32 44 +# CHECK: %t90:sgpr_32 = S_NOT_B32 45 +# CHECK: %t92:sgpr_32 = S_NOT_B32 46 +# CHECK: %t94:sgpr_32 = S_NOT_B32 47 +# CHECK: %t96:sgpr_32 = S_NOT_B32 48 +# CHECK: %t98:sgpr_32 = S_NOT_B32 49 +# CHECK: %t100:sgpr_32 = S_NOT_B32 50 +# CHECK: %t102:sgpr_32 = S_NOT_B32 51 +# CHECK: %t104:sgpr_32 = S_NOT_B32 52 +# CHECK: %t106:sgpr_32 = S_NOT_B32 53 +# CHECK: %t108:sgpr_32 = S_NOT_B32 54 +# CHECK: %t110:sgpr_32 = S_NOT_B32 55 +# CHECK: %t112:sgpr_32 = S_NOT_B32 56 +# CHECK: 
%t114:sgpr_32 = S_NOT_B32 57 +# CHECK: %t116:sgpr_32 = S_NOT_B32 58 +# CHECK: %t118:sgpr_32 = S_NOT_B32 59 +# CHECK: %t120:sgpr_32 = S_NOT_B32 60 +# CHECK: %t122:sgpr_32 = S_NOT_B32 61 +# CHECK: %t124:sgpr_32 = S_NOT_B32 62 +# CHECK: %t126:sgpr_32 = S_NOT_B32 63 +# CHECK: %t128:sgpr_32 = S_NOT_B32 64 +# CHECK: %t130:sgpr_32 = S_NOT_B32 65 +# CHECK: %t132:sgpr_32 = S_NOT_B32 66 +# CHECK: %t134:sgpr_32 = S_NOT_B32 67 +# CHECK: %t136:sgpr_32 = S_NOT_B32 68 +# CHECK: %t138:sgpr_32 = S_NOT_B32 69 +# CHECK: %t140:sgpr_32 = S_NOT_B32 70 +# CHECK: %t142:sgpr_32 = S_NOT_B32 71 +# CHECK: %t144:sgpr_32 = S_NOT_B32 72 +# CHECK: %t146:sgpr_32 = S_NOT_B32 73 +# CHECK: %t148:sgpr_32 = S_NOT_B32 74 +# CHECK: %t150:sgpr_32 = S_NOT_B32 75 +# CHECK: %t152:sgpr_32 = S_NOT_B32 76 +# CHECK: %t154:sgpr_32 = S_NOT_B32 77 +# CHECK: %t156:sgpr_32 = S_NOT_B32 78 +# CHECK: %t158:sgpr_32 = S_NOT_B32 79 +# CHECK: %t160:sgpr_32 = S_NOT_B32 80 +# CHECK: %t162:sgpr_32 = S_NOT_B32 81 +# CHECK: %t164:sgpr_32 = S_NOT_B32 82 +# CHECK: %t166:sgpr_32 = S_NOT_B32 83 +# CHECK: %t168:sgpr_32 = S_NOT_B32 84 +# CHECK: %t170:sgpr_32 = S_NOT_B32 85 +# CHECK: %t172:sgpr_32 = S_NOT_B32 86 +# CHECK: %t174:sgpr_32 = S_NOT_B32 87 +# CHECK: %t176:sgpr_32 = S_NOT_B32 88 +# CHECK: %t178:sgpr_32 = S_NOT_B32 89 +# CHECK: %t180:sgpr_32 = S_NOT_B32 90 +# CHECK: %t182:sgpr_32 = S_NOT_B32 91 +# CHECK: %t184:sgpr_32 = S_NOT_B32 92 +# CHECK: %t186:sgpr_32 = S_NOT_B32 93 +# CHECK: %t188:sgpr_32 = S_NOT_B32 94 +# CHECK: %t190:sgpr_32 = S_NOT_B32 95 +# CHECK: %t192:sgpr_32 = S_NOT_B32 96 +# CHECK: %t194:sgpr_32 = S_NOT_B32 97 +# CHECK: %t196:sgpr_32 = S_NOT_B32 98 +# CHECK: %t198:sgpr_32 = S_NOT_B32 99 +# CHECK: %t200:sgpr_32 = S_NOT_B32 100 +# CHECK: %t202:sgpr_32 = S_NOT_B32 101 +# CHECK: %t204:sgpr_32 = S_NOT_B32 102 +# CHECK: %t206:sgpr_32 = S_NOT_B32 103 +# CHECK: %t208:sgpr_32 = S_NOT_B32 104 +# CHECK: %t210:sgpr_32 = S_NOT_B32 105 +# CHECK: %t212:sgpr_32 = S_NOT_B32 106 +# CHECK: %t214:sgpr_32 = S_NOT_B32 107 +# CHECK: %t216:sgpr_32 = S_NOT_B32 108 +# CHECK: %t218:sgpr_32 = S_NOT_B32 109 +# CHECK: %t220:sgpr_32 = S_NOT_B32 110 +# CHECK: %t222:sgpr_32 = S_NOT_B32 111 +# CHECK: %t224:sgpr_32 = S_NOT_B32 112 +# CHECK: %t226:sgpr_32 = S_NOT_B32 113 +# CHECK: %t228:sgpr_32 = S_NOT_B32 114 +# CHECK: %t230:sgpr_32 = S_NOT_B32 115 +# CHECK: %t232:sgpr_32 = S_NOT_B32 116 +# CHECK: %t234:sgpr_32 = S_NOT_B32 117 +# CHECK: %t236:sgpr_32 = S_NOT_B32 118 +# CHECK: %t238:sgpr_32 = S_NOT_B32 119 +# CHECK: %t240:sgpr_32 = S_NOT_B32 120 +# CHECK: %t242:sgpr_32 = S_NOT_B32 121 +# CHECK: %t244:sgpr_32 = S_NOT_B32 122 +# CHECK: %t246:sgpr_32 = S_NOT_B32 123 +# CHECK: %t248:sgpr_32 = S_NOT_B32 124 +# CHECK: %t250:sgpr_32 = S_NOT_B32 125 +# CHECK: %t252:sgpr_32 = S_NOT_B32 126 +# CHECK: %t254:sgpr_32 = S_NOT_B32 127 +# Restore scc +# CHECK: $scc = COPY %[[#scc0]] +# CHECK: KILL %t2 +# CHECK: KILL %t4 +# CHECK: KILL %t6 +# CHECK: KILL %t8 +# CHECK: KILL %t10 +# CHECK: KILL %t12 +# CHECK: KILL %t14 +# CHECK: KILL %t16 +# CHECK: KILL %t18 +# CHECK: KILL %t20 +# CHECK: KILL %t22 +# CHECK: KILL %t24 +# CHECK: KILL %t26 +# CHECK: KILL %t28 +# CHECK: KILL %t30 +# CHECK: KILL %t32 +# CHECK: KILL %t34 +# CHECK: KILL %t36 +# CHECK: KILL %t38 +# CHECK: KILL %t40 +# CHECK: KILL %t42 +# CHECK: KILL %t44 +# CHECK: KILL %t46 +# CHECK: KILL %t48 +# CHECK: KILL %t50 +# CHECK: KILL %t52 +# CHECK: KILL %t54 +# CHECK: KILL %t56 +# CHECK: KILL %t58 +# CHECK: KILL %t60 +# CHECK: KILL %t62 +# CHECK: KILL %t64 +# CHECK: KILL %t66 +# CHECK: KILL %t68 +# CHECK: KILL %t70 +# CHECK: KILL %t72 +# 
CHECK: KILL %t74 +# CHECK: KILL %t76 +# CHECK: KILL %t78 +# CHECK: KILL %t80 +# CHECK: KILL %t82 +# CHECK: KILL %t84 +# CHECK: KILL %t86 +# CHECK: KILL %t88 +# CHECK: KILL %t90 +# CHECK: KILL %t92 +# CHECK: KILL %t94 +# CHECK: KILL %t96 +# CHECK: KILL %t98 +# CHECK: KILL %t100 +# CHECK: KILL %t102 +# CHECK: KILL %t104 +# CHECK: KILL %t106 +# CHECK: KILL %t108 +# CHECK: KILL %t110 +# CHECK: KILL %t112 +# CHECK: KILL %t114 +# CHECK: KILL %t116 +# CHECK: KILL %t118 +# CHECK: KILL %t120 +# CHECK: KILL %t122 +# CHECK: KILL %t124 +# CHECK: KILL %t126 +# CHECK: KILL %t128 +# CHECK: KILL %t130 +# CHECK: KILL %t132 +# CHECK: KILL %t134 +# CHECK: KILL %t136 +# CHECK: KILL %t138 +# CHECK: KILL %t140 +# CHECK: KILL %t142 +# CHECK: KILL %t144 +# CHECK: KILL %t146 +# CHECK: KILL %t148 +# CHECK: KILL %t150 +# CHECK: KILL %t152 +# CHECK: KILL %t154 +# CHECK: KILL %t156 +# CHECK: KILL %t158 +# CHECK: KILL %t160 +# CHECK: KILL %t162 +# CHECK: KILL %t164 +# CHECK: KILL %t166 +# CHECK: KILL %t168 +# CHECK: KILL %t170 +# CHECK: KILL %t172 +# CHECK: KILL %t174 +# CHECK: KILL %t176 +# CHECK: KILL %t178 +# CHECK: KILL %t180 +# CHECK: KILL %t182 +# CHECK: KILL %t184 +# CHECK: KILL %t186 +# CHECK: KILL %t188 +# CHECK: KILL %t190 +# CHECK: KILL %t192 +# CHECK: KILL %t194 +# CHECK: KILL %t196 +# CHECK: KILL %t198 +# CHECK: KILL %t200 +# CHECK: KILL %t202 +# CHECK: KILL %t204 +# CHECK: KILL %t206 +# CHECK: KILL %t208 +# CHECK: KILL %t210 +# CHECK: KILL %t212 +# CHECK: KILL %t214 +# CHECK: KILL %t216 +# CHECK: KILL %t218 +# CHECK: KILL %t220 +# CHECK: KILL %t222 +# CHECK: KILL %t224 +# CHECK: KILL %t226 +# CHECK: KILL %t228 +# CHECK: KILL %t230 +# CHECK: KILL %t232 +# CHECK: KILL %t234 +# CHECK: KILL %t236 +# CHECK: KILL %t238 +# CHECK: KILL %t240 +# CHECK: KILL %t242 +# CHECK: KILL %t244 +# CHECK: KILL %t246 +# CHECK: KILL %t248 +# CHECK: KILL %t250 +# CHECK: KILL %t252 +# CHECK: KILL %t254 + + +--- | + define amdgpu_ps void @main() { + ret void + } +... 
+--- +name: main +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + ; To inflate vgpr count + %v0:vreg_1024 = IMPLICIT_DEF + %v1:vreg_1024 = IMPLICIT_DEF + %v2:vreg_1024 = IMPLICIT_DEF + %v3:vreg_1024 = IMPLICIT_DEF + + ; Defs + %t0:sgpr_32 = S_NOT_B32 0, implicit-def $scc + %t2:sgpr_32 = S_NOT_B32 1, implicit-def $scc + %t4:sgpr_32 = S_NOT_B32 2, implicit-def $scc + %t6:sgpr_32 = S_NOT_B32 3, implicit-def $scc + %t8:sgpr_32 = S_NOT_B32 4, implicit-def $scc + %t10:sgpr_32 = S_NOT_B32 5, implicit-def $scc + %t12:sgpr_32 = S_NOT_B32 6, implicit-def $scc + %t14:sgpr_32 = S_NOT_B32 7, implicit-def $scc + %t16:sgpr_32 = S_NOT_B32 8, implicit-def $scc + %t18:sgpr_32 = S_NOT_B32 9, implicit-def $scc + %t20:sgpr_32 = S_NOT_B32 10, implicit-def $scc + %t22:sgpr_32 = S_NOT_B32 11, implicit-def $scc + %t24:sgpr_32 = S_NOT_B32 12, implicit-def $scc + %t26:sgpr_32 = S_NOT_B32 13, implicit-def $scc + %t28:sgpr_32 = S_NOT_B32 14, implicit-def $scc + %t30:sgpr_32 = S_NOT_B32 15, implicit-def $scc + %t32:sgpr_32 = S_NOT_B32 16, implicit-def $scc + %t34:sgpr_32 = S_NOT_B32 17, implicit-def $scc + %t36:sgpr_32 = S_NOT_B32 18, implicit-def $scc + %t38:sgpr_32 = S_NOT_B32 19, implicit-def $scc + %t40:sgpr_32 = S_NOT_B32 20, implicit-def $scc + %t42:sgpr_32 = S_NOT_B32 21, implicit-def $scc + %t44:sgpr_32 = S_NOT_B32 22, implicit-def $scc + %t46:sgpr_32 = S_NOT_B32 23, implicit-def $scc + %t48:sgpr_32 = S_NOT_B32 24, implicit-def $scc + %t50:sgpr_32 = S_NOT_B32 25, implicit-def $scc + %t52:sgpr_32 = S_NOT_B32 26, implicit-def $scc + %t54:sgpr_32 = S_NOT_B32 27, implicit-def $scc + %t56:sgpr_32 = S_NOT_B32 28, implicit-def $scc + %t58:sgpr_32 = S_NOT_B32 29, implicit-def $scc + %t60:sgpr_32 = S_NOT_B32 30, implicit-def $scc + %t62:sgpr_32 = S_NOT_B32 31, implicit-def $scc + %t64:sgpr_32 = S_NOT_B32 32, implicit-def $scc + %t66:sgpr_32 = S_NOT_B32 33, implicit-def $scc + %t68:sgpr_32 = S_NOT_B32 34, implicit-def $scc + %t70:sgpr_32 = S_NOT_B32 35, implicit-def $scc + %t72:sgpr_32 = S_NOT_B32 36, implicit-def $scc + %t74:sgpr_32 = S_NOT_B32 37, implicit-def $scc + %t76:sgpr_32 = S_NOT_B32 38, implicit-def $scc + %t78:sgpr_32 = S_NOT_B32 39, implicit-def $scc + %t80:sgpr_32 = S_NOT_B32 40, implicit-def $scc + %t82:sgpr_32 = S_NOT_B32 41, implicit-def $scc + %t84:sgpr_32 = S_NOT_B32 42, implicit-def $scc + %t86:sgpr_32 = S_NOT_B32 43, implicit-def $scc + %t88:sgpr_32 = S_NOT_B32 44, implicit-def $scc + %t90:sgpr_32 = S_NOT_B32 45, implicit-def $scc + %t92:sgpr_32 = S_NOT_B32 46, implicit-def $scc + %t94:sgpr_32 = S_NOT_B32 47, implicit-def $scc + %t96:sgpr_32 = S_NOT_B32 48, implicit-def $scc + %t98:sgpr_32 = S_NOT_B32 49, implicit-def $scc + %t100:sgpr_32 = S_NOT_B32 50, implicit-def $scc + %t102:sgpr_32 = S_NOT_B32 51, implicit-def $scc + %t104:sgpr_32 = S_NOT_B32 52, implicit-def $scc + %t106:sgpr_32 = S_NOT_B32 53, implicit-def $scc + %t108:sgpr_32 = S_NOT_B32 54, implicit-def $scc + %t110:sgpr_32 = S_NOT_B32 55, implicit-def $scc + %t112:sgpr_32 = S_NOT_B32 56, implicit-def $scc + %t114:sgpr_32 = S_NOT_B32 57, implicit-def $scc + %t116:sgpr_32 = S_NOT_B32 58, implicit-def $scc + %t118:sgpr_32 = S_NOT_B32 59, implicit-def $scc + %t120:sgpr_32 = S_NOT_B32 60, implicit-def $scc + %t122:sgpr_32 = S_NOT_B32 61, implicit-def $scc + %t124:sgpr_32 = S_NOT_B32 62, implicit-def $scc + %t126:sgpr_32 = S_NOT_B32 63, implicit-def $scc + %t128:sgpr_32 = S_NOT_B32 64, implicit-def $scc + %t130:sgpr_32 = S_NOT_B32 65, implicit-def $scc + %t132:sgpr_32 = S_NOT_B32 66, implicit-def $scc + %t134:sgpr_32 = 
S_NOT_B32 67, implicit-def $scc + %t136:sgpr_32 = S_NOT_B32 68, implicit-def $scc + %t138:sgpr_32 = S_NOT_B32 69, implicit-def $scc + %t140:sgpr_32 = S_NOT_B32 70, implicit-def $scc + %t142:sgpr_32 = S_NOT_B32 71, implicit-def $scc + %t144:sgpr_32 = S_NOT_B32 72, implicit-def $scc + %t146:sgpr_32 = S_NOT_B32 73, implicit-def $scc + %t148:sgpr_32 = S_NOT_B32 74, implicit-def $scc + %t150:sgpr_32 = S_NOT_B32 75, implicit-def $scc + %t152:sgpr_32 = S_NOT_B32 76, implicit-def $scc + %t154:sgpr_32 = S_NOT_B32 77, implicit-def $scc + %t156:sgpr_32 = S_NOT_B32 78, implicit-def $scc + %t158:sgpr_32 = S_NOT_B32 79, implicit-def $scc + %t160:sgpr_32 = S_NOT_B32 80, implicit-def $scc + %t162:sgpr_32 = S_NOT_B32 81, implicit-def $scc + %t164:sgpr_32 = S_NOT_B32 82, implicit-def $scc + %t166:sgpr_32 = S_NOT_B32 83, implicit-def $scc + %t168:sgpr_32 = S_NOT_B32 84, implicit-def $scc + %t170:sgpr_32 = S_NOT_B32 85, implicit-def $scc + %t172:sgpr_32 = S_NOT_B32 86, implicit-def $scc + %t174:sgpr_32 = S_NOT_B32 87, implicit-def $scc + %t176:sgpr_32 = S_NOT_B32 88, implicit-def $scc + %t178:sgpr_32 = S_NOT_B32 89, implicit-def $scc + %t180:sgpr_32 = S_NOT_B32 90, implicit-def $scc + %t182:sgpr_32 = S_NOT_B32 91, implicit-def $scc + %t184:sgpr_32 = S_NOT_B32 92, implicit-def $scc + %t186:sgpr_32 = S_NOT_B32 93, implicit-def $scc + %t188:sgpr_32 = S_NOT_B32 94, implicit-def $scc + %t190:sgpr_32 = S_NOT_B32 95, implicit-def $scc + %t192:sgpr_32 = S_NOT_B32 96, implicit-def $scc + %t194:sgpr_32 = S_NOT_B32 97, implicit-def $scc + %t196:sgpr_32 = S_NOT_B32 98, implicit-def $scc + %t198:sgpr_32 = S_NOT_B32 99, implicit-def $scc + %t200:sgpr_32 = S_NOT_B32 100, implicit-def $scc + %t202:sgpr_32 = S_NOT_B32 101, implicit-def $scc + %t204:sgpr_32 = S_NOT_B32 102, implicit-def $scc + %t206:sgpr_32 = S_NOT_B32 103, implicit-def $scc + %t208:sgpr_32 = S_NOT_B32 104, implicit-def $scc + %t210:sgpr_32 = S_NOT_B32 105, implicit-def $scc + %t212:sgpr_32 = S_NOT_B32 106, implicit-def $scc + %t214:sgpr_32 = S_NOT_B32 107, implicit-def $scc + %t216:sgpr_32 = S_NOT_B32 108, implicit-def $scc + %t218:sgpr_32 = S_NOT_B32 109, implicit-def $scc + %t220:sgpr_32 = S_NOT_B32 110, implicit-def $scc + %t222:sgpr_32 = S_NOT_B32 111, implicit-def $scc + %t224:sgpr_32 = S_NOT_B32 112, implicit-def $scc + %t226:sgpr_32 = S_NOT_B32 113, implicit-def $scc + %t228:sgpr_32 = S_NOT_B32 114, implicit-def $scc + %t230:sgpr_32 = S_NOT_B32 115, implicit-def $scc + %t232:sgpr_32 = S_NOT_B32 116, implicit-def $scc + %t234:sgpr_32 = S_NOT_B32 117, implicit-def $scc + %t236:sgpr_32 = S_NOT_B32 118, implicit-def $scc + %t238:sgpr_32 = S_NOT_B32 119, implicit-def $scc + %t240:sgpr_32 = S_NOT_B32 120, implicit-def $scc + %t242:sgpr_32 = S_NOT_B32 121, implicit-def $scc + %t244:sgpr_32 = S_NOT_B32 122, implicit-def $scc + %t246:sgpr_32 = S_NOT_B32 123, implicit-def $scc + %t248:sgpr_32 = S_NOT_B32 124, implicit-def $scc + %t250:sgpr_32 = S_NOT_B32 125, implicit-def $scc + %t252:sgpr_32 = S_NOT_B32 126, implicit-def $scc + %t254:sgpr_32 = S_NOT_B32 127, implicit-def $scc + + + ; Def scc + $scc = IMPLICIT_DEF + + ; Branch + %cmp:sreg_32_xm0 = V_CMP_GT_F32_e64 0, 0, 0, %v0.sub0, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %cmp:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + liveins: $scc + successors: %bb.2 + S_BRANCH %bb.2 + + bb.2: + liveins: $scc + ; Uses + KILL %t0 + KILL %t2 + KILL %t4 + KILL %t6 + KILL %t8 + KILL %t10 + KILL %t12 + KILL %t14 + KILL %t16 + KILL %t18 + KILL %t20 + KILL %t22 + KILL %t24 + 
KILL %t26 + KILL %t28 + KILL %t30 + KILL %t32 + KILL %t34 + KILL %t36 + KILL %t38 + KILL %t40 + KILL %t42 + KILL %t44 + KILL %t46 + KILL %t48 + KILL %t50 + KILL %t52 + KILL %t54 + KILL %t56 + KILL %t58 + KILL %t60 + KILL %t62 + KILL %t64 + KILL %t66 + KILL %t68 + KILL %t70 + KILL %t72 + KILL %t74 + KILL %t76 + KILL %t78 + KILL %t80 + KILL %t82 + KILL %t84 + KILL %t86 + KILL %t88 + KILL %t90 + KILL %t92 + KILL %t94 + KILL %t96 + KILL %t98 + KILL %t100 + KILL %t102 + KILL %t104 + KILL %t106 + KILL %t108 + KILL %t110 + KILL %t112 + KILL %t114 + KILL %t116 + KILL %t118 + KILL %t120 + KILL %t122 + KILL %t124 + KILL %t126 + KILL %t128 + KILL %t130 + KILL %t132 + KILL %t134 + KILL %t136 + KILL %t138 + KILL %t140 + KILL %t142 + KILL %t144 + KILL %t146 + KILL %t148 + KILL %t150 + KILL %t152 + KILL %t154 + KILL %t156 + KILL %t158 + KILL %t160 + KILL %t162 + KILL %t164 + KILL %t166 + KILL %t168 + KILL %t170 + KILL %t172 + KILL %t174 + KILL %t176 + KILL %t178 + KILL %t180 + KILL %t182 + KILL %t184 + KILL %t186 + KILL %t188 + KILL %t190 + KILL %t192 + KILL %t194 + KILL %t196 + KILL %t198 + KILL %t200 + KILL %t202 + KILL %t204 + KILL %t206 + KILL %t208 + KILL %t210 + KILL %t212 + KILL %t214 + KILL %t216 + KILL %t218 + KILL %t220 + KILL %t222 + KILL %t224 + KILL %t226 + KILL %t228 + KILL %t230 + KILL %t232 + KILL %t234 + KILL %t236 + KILL %t238 + KILL %t240 + KILL %t242 + KILL %t244 + KILL %t246 + KILL %t248 + KILL %t250 + KILL %t252 + KILL %t254 + + KILL $scc + + ; Some uses to inflate vgpr count + KILL %v0 + KILL %v1 + KILL %v2 + KILL %v3 + S_ENDPGM 0 +... + \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_no_scc.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_no_scc.mir new file mode 100644 index 0000000000000..39d21dbda3819 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_no_scc.mir @@ -0,0 +1,564 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-hot-block-remat-aggressive-sgpr | FileCheck %s + +# This test checks that scalar instructions that define $scc are not sunk into ranges where $scc is live +# CHECK: bb.0: +# CHECK-NOT: S_NOT_B32: +# CHECK: bb.2: +# CHECK: %t0:sgpr_32 = S_NOT_B32 0 +# CHECK: %t2:sgpr_32 = S_NOT_B32 1 +# CHECK: %t4:sgpr_32 = S_NOT_B32 2 +# CHECK: %t6:sgpr_32 = S_NOT_B32 3 +# CHECK: %t8:sgpr_32 = S_NOT_B32 4 +# CHECK: %t10:sgpr_32 = S_NOT_B32 5 +# CHECK: %t12:sgpr_32 = S_NOT_B32 6 +# CHECK: %t14:sgpr_32 = S_NOT_B32 7 +# CHECK: %t16:sgpr_32 = S_NOT_B32 8 +# CHECK: %t18:sgpr_32 = S_NOT_B32 9 +# CHECK: %t20:sgpr_32 = S_NOT_B32 10 +# CHECK: %t22:sgpr_32 = S_NOT_B32 11 +# CHECK: %t24:sgpr_32 = S_NOT_B32 12 +# CHECK: %t26:sgpr_32 = S_NOT_B32 13 +# CHECK: %t28:sgpr_32 = S_NOT_B32 14 +# CHECK: %t30:sgpr_32 = S_NOT_B32 15 +# CHECK: %t32:sgpr_32 = S_NOT_B32 16 +# CHECK: %t34:sgpr_32 = S_NOT_B32 17 +# CHECK: %t36:sgpr_32 = S_NOT_B32 18 +# CHECK: %t38:sgpr_32 = S_NOT_B32 19 +# CHECK: %t40:sgpr_32 = S_NOT_B32 20 +# CHECK: %t42:sgpr_32 = S_NOT_B32 21 +# CHECK: %t44:sgpr_32 = S_NOT_B32 22 +# CHECK: %t46:sgpr_32 = S_NOT_B32 23 +# CHECK: %t48:sgpr_32 = S_NOT_B32 24 +# CHECK: %t50:sgpr_32 = S_NOT_B32 25 +# CHECK: %t52:sgpr_32 = S_NOT_B32 26 +# CHECK: %t54:sgpr_32 = S_NOT_B32 27 +# CHECK: %t56:sgpr_32 = S_NOT_B32 28 +# CHECK: %t58:sgpr_32 = S_NOT_B32 29 +# CHECK: %t60:sgpr_32 = S_NOT_B32 30 +# CHECK: %t62:sgpr_32 = S_NOT_B32 31 +# CHECK: %t64:sgpr_32 = S_NOT_B32 32 +# CHECK: %t66:sgpr_32 = S_NOT_B32 33 +# CHECK: %t68:sgpr_32 = S_NOT_B32 34 +# 
CHECK: %t70:sgpr_32 = S_NOT_B32 35 +# CHECK: %t72:sgpr_32 = S_NOT_B32 36 +# CHECK: %t74:sgpr_32 = S_NOT_B32 37 +# CHECK: %t76:sgpr_32 = S_NOT_B32 38 +# CHECK: %t78:sgpr_32 = S_NOT_B32 39 +# CHECK: %t80:sgpr_32 = S_NOT_B32 40 +# CHECK: %t82:sgpr_32 = S_NOT_B32 41 +# CHECK: %t84:sgpr_32 = S_NOT_B32 42 +# CHECK: %t86:sgpr_32 = S_NOT_B32 43 +# CHECK: %t88:sgpr_32 = S_NOT_B32 44 +# CHECK: %t90:sgpr_32 = S_NOT_B32 45 +# CHECK: %t92:sgpr_32 = S_NOT_B32 46 +# CHECK: %t94:sgpr_32 = S_NOT_B32 47 +# CHECK: %t96:sgpr_32 = S_NOT_B32 48 +# CHECK: %t98:sgpr_32 = S_NOT_B32 49 +# CHECK: %t100:sgpr_32 = S_NOT_B32 50 +# CHECK: %t102:sgpr_32 = S_NOT_B32 51 +# CHECK: %t104:sgpr_32 = S_NOT_B32 52 +# CHECK: %t106:sgpr_32 = S_NOT_B32 53 +# CHECK: %t108:sgpr_32 = S_NOT_B32 54 +# CHECK: %t110:sgpr_32 = S_NOT_B32 55 +# CHECK: %t112:sgpr_32 = S_NOT_B32 56 +# CHECK: %t114:sgpr_32 = S_NOT_B32 57 +# CHECK: %t116:sgpr_32 = S_NOT_B32 58 +# CHECK: %t118:sgpr_32 = S_NOT_B32 59 +# CHECK: %t120:sgpr_32 = S_NOT_B32 60 +# CHECK: %t122:sgpr_32 = S_NOT_B32 61 +# CHECK: %t124:sgpr_32 = S_NOT_B32 62 +# CHECK: %t126:sgpr_32 = S_NOT_B32 63 +# CHECK: %t128:sgpr_32 = S_NOT_B32 64 +# CHECK: %t130:sgpr_32 = S_NOT_B32 65 +# CHECK: %t132:sgpr_32 = S_NOT_B32 66 +# CHECK: %t134:sgpr_32 = S_NOT_B32 67 +# CHECK: %t136:sgpr_32 = S_NOT_B32 68 +# CHECK: %t138:sgpr_32 = S_NOT_B32 69 +# CHECK: %t140:sgpr_32 = S_NOT_B32 70 +# CHECK: %t142:sgpr_32 = S_NOT_B32 71 +# CHECK: %t144:sgpr_32 = S_NOT_B32 72 +# CHECK: %t146:sgpr_32 = S_NOT_B32 73 +# CHECK: %t148:sgpr_32 = S_NOT_B32 74 +# CHECK: %t150:sgpr_32 = S_NOT_B32 75 +# CHECK: %t152:sgpr_32 = S_NOT_B32 76 +# CHECK: %t154:sgpr_32 = S_NOT_B32 77 +# CHECK: %t156:sgpr_32 = S_NOT_B32 78 +# CHECK: %t158:sgpr_32 = S_NOT_B32 79 +# CHECK: %t160:sgpr_32 = S_NOT_B32 80 +# CHECK: %t162:sgpr_32 = S_NOT_B32 81 +# CHECK: %t164:sgpr_32 = S_NOT_B32 82 +# CHECK: %t166:sgpr_32 = S_NOT_B32 83 +# CHECK: %t168:sgpr_32 = S_NOT_B32 84 +# CHECK: %t170:sgpr_32 = S_NOT_B32 85 +# CHECK: %t172:sgpr_32 = S_NOT_B32 86 +# CHECK: %t174:sgpr_32 = S_NOT_B32 87 +# CHECK: %t176:sgpr_32 = S_NOT_B32 88 +# CHECK: %t178:sgpr_32 = S_NOT_B32 89 +# CHECK: %t180:sgpr_32 = S_NOT_B32 90 +# CHECK: %t182:sgpr_32 = S_NOT_B32 91 +# CHECK: %t184:sgpr_32 = S_NOT_B32 92 +# CHECK: %t186:sgpr_32 = S_NOT_B32 93 +# CHECK: %t188:sgpr_32 = S_NOT_B32 94 +# CHECK: %t190:sgpr_32 = S_NOT_B32 95 +# CHECK: %t192:sgpr_32 = S_NOT_B32 96 +# CHECK: %t194:sgpr_32 = S_NOT_B32 97 +# CHECK: %t196:sgpr_32 = S_NOT_B32 98 +# CHECK: %t198:sgpr_32 = S_NOT_B32 99 +# CHECK: %t200:sgpr_32 = S_NOT_B32 100 +# CHECK: %t202:sgpr_32 = S_NOT_B32 101 +# CHECK: %t204:sgpr_32 = S_NOT_B32 102 +# CHECK: %t206:sgpr_32 = S_NOT_B32 103 +# CHECK: %t208:sgpr_32 = S_NOT_B32 104 +# CHECK: %t210:sgpr_32 = S_NOT_B32 105 +# CHECK: %t212:sgpr_32 = S_NOT_B32 106 +# CHECK: %t214:sgpr_32 = S_NOT_B32 107 +# CHECK: %t216:sgpr_32 = S_NOT_B32 108 +# CHECK: %t218:sgpr_32 = S_NOT_B32 109 +# CHECK: %t220:sgpr_32 = S_NOT_B32 110 +# CHECK: %t222:sgpr_32 = S_NOT_B32 111 +# CHECK: %t224:sgpr_32 = S_NOT_B32 112 +# CHECK: %t226:sgpr_32 = S_NOT_B32 113 +# CHECK: %t228:sgpr_32 = S_NOT_B32 114 +# CHECK: %t230:sgpr_32 = S_NOT_B32 115 +# CHECK: %t232:sgpr_32 = S_NOT_B32 116 +# CHECK: %t234:sgpr_32 = S_NOT_B32 117 +# CHECK: %t236:sgpr_32 = S_NOT_B32 118 +# CHECK: %t238:sgpr_32 = S_NOT_B32 119 +# CHECK: %t240:sgpr_32 = S_NOT_B32 120 +# CHECK: %t242:sgpr_32 = S_NOT_B32 121 +# CHECK: %t244:sgpr_32 = S_NOT_B32 122 +# CHECK: %t246:sgpr_32 = S_NOT_B32 123 +# CHECK: %t248:sgpr_32 = S_NOT_B32 124 +# CHECK: %t250:sgpr_32 = S_NOT_B32 
125 +# CHECK: %t252:sgpr_32 = S_NOT_B32 126 +# CHECK: %t254:sgpr_32 = S_NOT_B32 127 +# CHECK: KILL %t0 +# CHECK: KILL %t2 +# CHECK: KILL %t4 +# CHECK: KILL %t6 +# CHECK: KILL %t8 +# CHECK: KILL %t10 +# CHECK: KILL %t12 +# CHECK: KILL %t14 +# CHECK: KILL %t16 +# CHECK: KILL %t18 +# CHECK: KILL %t20 +# CHECK: KILL %t22 +# CHECK: KILL %t24 +# CHECK: KILL %t26 +# CHECK: KILL %t28 +# CHECK: KILL %t30 +# CHECK: KILL %t32 +# CHECK: KILL %t34 +# CHECK: KILL %t36 +# CHECK: KILL %t38 +# CHECK: KILL %t40 +# CHECK: KILL %t42 +# CHECK: KILL %t44 +# CHECK: KILL %t46 +# CHECK: KILL %t48 +# CHECK: KILL %t50 +# CHECK: KILL %t52 +# CHECK: KILL %t54 +# CHECK: KILL %t56 +# CHECK: KILL %t58 +# CHECK: KILL %t60 +# CHECK: KILL %t62 +# CHECK: KILL %t64 +# CHECK: KILL %t66 +# CHECK: KILL %t68 +# CHECK: KILL %t70 +# CHECK: KILL %t72 +# CHECK: KILL %t74 +# CHECK: KILL %t76 +# CHECK: KILL %t78 +# CHECK: KILL %t80 +# CHECK: KILL %t82 +# CHECK: KILL %t84 +# CHECK: KILL %t86 +# CHECK: KILL %t88 +# CHECK: KILL %t90 +# CHECK: KILL %t92 +# CHECK: KILL %t94 +# CHECK: KILL %t96 +# CHECK: KILL %t98 +# CHECK: KILL %t100 +# CHECK: KILL %t102 +# CHECK: KILL %t104 +# CHECK: KILL %t106 +# CHECK: KILL %t108 +# CHECK: KILL %t110 +# CHECK: KILL %t112 +# CHECK: KILL %t114 +# CHECK: KILL %t116 +# CHECK: KILL %t118 +# CHECK: KILL %t120 +# CHECK: KILL %t122 +# CHECK: KILL %t124 +# CHECK: KILL %t126 +# CHECK: KILL %t128 +# CHECK: KILL %t130 +# CHECK: KILL %t132 +# CHECK: KILL %t134 +# CHECK: KILL %t136 +# CHECK: KILL %t138 +# CHECK: KILL %t140 +# CHECK: KILL %t142 +# CHECK: KILL %t144 +# CHECK: KILL %t146 +# CHECK: KILL %t148 +# CHECK: KILL %t150 +# CHECK: KILL %t152 +# CHECK: KILL %t154 +# CHECK: KILL %t156 +# CHECK: KILL %t158 +# CHECK: KILL %t160 +# CHECK: KILL %t162 +# CHECK: KILL %t164 +# CHECK: KILL %t166 +# CHECK: KILL %t168 +# CHECK: KILL %t170 +# CHECK: KILL %t172 +# CHECK: KILL %t174 +# CHECK: KILL %t176 +# CHECK: KILL %t178 +# CHECK: KILL %t180 +# CHECK: KILL %t182 +# CHECK: KILL %t184 +# CHECK: KILL %t186 +# CHECK: KILL %t188 +# CHECK: KILL %t190 +# CHECK: KILL %t192 +# CHECK: KILL %t194 +# CHECK: KILL %t196 +# CHECK: KILL %t198 +# CHECK: KILL %t200 +# CHECK: KILL %t202 +# CHECK: KILL %t204 +# CHECK: KILL %t206 +# CHECK: KILL %t208 +# CHECK: KILL %t210 +# CHECK: KILL %t212 +# CHECK: KILL %t214 +# CHECK: KILL %t216 +# CHECK: KILL %t218 +# CHECK: KILL %t220 +# CHECK: KILL %t222 +# CHECK: KILL %t224 +# CHECK: KILL %t226 +# CHECK: KILL %t228 +# CHECK: KILL %t230 +# CHECK: KILL %t232 +# CHECK: KILL %t234 +# CHECK: KILL %t236 +# CHECK: KILL %t238 +# CHECK: KILL %t240 +# CHECK: KILL %t242 +# CHECK: KILL %t244 +# CHECK: KILL %t246 +# CHECK: KILL %t248 +# CHECK: KILL %t250 +# CHECK: KILL %t252 +# CHECK: KILL %t254 + + +--- | + define amdgpu_ps void @main() { + ret void + } +... 
+--- +name: main +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + ; To inflate vgpr count + %v0:vreg_1024 = IMPLICIT_DEF + %v1:vreg_1024 = IMPLICIT_DEF + %v2:vreg_1024 = IMPLICIT_DEF + %v3:vreg_1024 = IMPLICIT_DEF + + ; Defs + %t0:sgpr_32 = S_NOT_B32 0, implicit-def $scc + %t2:sgpr_32 = S_NOT_B32 1, implicit-def $scc + %t4:sgpr_32 = S_NOT_B32 2, implicit-def $scc + %t6:sgpr_32 = S_NOT_B32 3, implicit-def $scc + %t8:sgpr_32 = S_NOT_B32 4, implicit-def $scc + %t10:sgpr_32 = S_NOT_B32 5, implicit-def $scc + %t12:sgpr_32 = S_NOT_B32 6, implicit-def $scc + %t14:sgpr_32 = S_NOT_B32 7, implicit-def $scc + %t16:sgpr_32 = S_NOT_B32 8, implicit-def $scc + %t18:sgpr_32 = S_NOT_B32 9, implicit-def $scc + %t20:sgpr_32 = S_NOT_B32 10, implicit-def $scc + %t22:sgpr_32 = S_NOT_B32 11, implicit-def $scc + %t24:sgpr_32 = S_NOT_B32 12, implicit-def $scc + %t26:sgpr_32 = S_NOT_B32 13, implicit-def $scc + %t28:sgpr_32 = S_NOT_B32 14, implicit-def $scc + %t30:sgpr_32 = S_NOT_B32 15, implicit-def $scc + %t32:sgpr_32 = S_NOT_B32 16, implicit-def $scc + %t34:sgpr_32 = S_NOT_B32 17, implicit-def $scc + %t36:sgpr_32 = S_NOT_B32 18, implicit-def $scc + %t38:sgpr_32 = S_NOT_B32 19, implicit-def $scc + %t40:sgpr_32 = S_NOT_B32 20, implicit-def $scc + %t42:sgpr_32 = S_NOT_B32 21, implicit-def $scc + %t44:sgpr_32 = S_NOT_B32 22, implicit-def $scc + %t46:sgpr_32 = S_NOT_B32 23, implicit-def $scc + %t48:sgpr_32 = S_NOT_B32 24, implicit-def $scc + %t50:sgpr_32 = S_NOT_B32 25, implicit-def $scc + %t52:sgpr_32 = S_NOT_B32 26, implicit-def $scc + %t54:sgpr_32 = S_NOT_B32 27, implicit-def $scc + %t56:sgpr_32 = S_NOT_B32 28, implicit-def $scc + %t58:sgpr_32 = S_NOT_B32 29, implicit-def $scc + %t60:sgpr_32 = S_NOT_B32 30, implicit-def $scc + %t62:sgpr_32 = S_NOT_B32 31, implicit-def $scc + %t64:sgpr_32 = S_NOT_B32 32, implicit-def $scc + %t66:sgpr_32 = S_NOT_B32 33, implicit-def $scc + %t68:sgpr_32 = S_NOT_B32 34, implicit-def $scc + %t70:sgpr_32 = S_NOT_B32 35, implicit-def $scc + %t72:sgpr_32 = S_NOT_B32 36, implicit-def $scc + %t74:sgpr_32 = S_NOT_B32 37, implicit-def $scc + %t76:sgpr_32 = S_NOT_B32 38, implicit-def $scc + %t78:sgpr_32 = S_NOT_B32 39, implicit-def $scc + %t80:sgpr_32 = S_NOT_B32 40, implicit-def $scc + %t82:sgpr_32 = S_NOT_B32 41, implicit-def $scc + %t84:sgpr_32 = S_NOT_B32 42, implicit-def $scc + %t86:sgpr_32 = S_NOT_B32 43, implicit-def $scc + %t88:sgpr_32 = S_NOT_B32 44, implicit-def $scc + %t90:sgpr_32 = S_NOT_B32 45, implicit-def $scc + %t92:sgpr_32 = S_NOT_B32 46, implicit-def $scc + %t94:sgpr_32 = S_NOT_B32 47, implicit-def $scc + %t96:sgpr_32 = S_NOT_B32 48, implicit-def $scc + %t98:sgpr_32 = S_NOT_B32 49, implicit-def $scc + %t100:sgpr_32 = S_NOT_B32 50, implicit-def $scc + %t102:sgpr_32 = S_NOT_B32 51, implicit-def $scc + %t104:sgpr_32 = S_NOT_B32 52, implicit-def $scc + %t106:sgpr_32 = S_NOT_B32 53, implicit-def $scc + %t108:sgpr_32 = S_NOT_B32 54, implicit-def $scc + %t110:sgpr_32 = S_NOT_B32 55, implicit-def $scc + %t112:sgpr_32 = S_NOT_B32 56, implicit-def $scc + %t114:sgpr_32 = S_NOT_B32 57, implicit-def $scc + %t116:sgpr_32 = S_NOT_B32 58, implicit-def $scc + %t118:sgpr_32 = S_NOT_B32 59, implicit-def $scc + %t120:sgpr_32 = S_NOT_B32 60, implicit-def $scc + %t122:sgpr_32 = S_NOT_B32 61, implicit-def $scc + %t124:sgpr_32 = S_NOT_B32 62, implicit-def $scc + %t126:sgpr_32 = S_NOT_B32 63, implicit-def $scc + %t128:sgpr_32 = S_NOT_B32 64, implicit-def $scc + %t130:sgpr_32 = S_NOT_B32 65, implicit-def $scc + %t132:sgpr_32 = S_NOT_B32 66, implicit-def $scc + %t134:sgpr_32 = 
S_NOT_B32 67, implicit-def $scc + %t136:sgpr_32 = S_NOT_B32 68, implicit-def $scc + %t138:sgpr_32 = S_NOT_B32 69, implicit-def $scc + %t140:sgpr_32 = S_NOT_B32 70, implicit-def $scc + %t142:sgpr_32 = S_NOT_B32 71, implicit-def $scc + %t144:sgpr_32 = S_NOT_B32 72, implicit-def $scc + %t146:sgpr_32 = S_NOT_B32 73, implicit-def $scc + %t148:sgpr_32 = S_NOT_B32 74, implicit-def $scc + %t150:sgpr_32 = S_NOT_B32 75, implicit-def $scc + %t152:sgpr_32 = S_NOT_B32 76, implicit-def $scc + %t154:sgpr_32 = S_NOT_B32 77, implicit-def $scc + %t156:sgpr_32 = S_NOT_B32 78, implicit-def $scc + %t158:sgpr_32 = S_NOT_B32 79, implicit-def $scc + %t160:sgpr_32 = S_NOT_B32 80, implicit-def $scc + %t162:sgpr_32 = S_NOT_B32 81, implicit-def $scc + %t164:sgpr_32 = S_NOT_B32 82, implicit-def $scc + %t166:sgpr_32 = S_NOT_B32 83, implicit-def $scc + %t168:sgpr_32 = S_NOT_B32 84, implicit-def $scc + %t170:sgpr_32 = S_NOT_B32 85, implicit-def $scc + %t172:sgpr_32 = S_NOT_B32 86, implicit-def $scc + %t174:sgpr_32 = S_NOT_B32 87, implicit-def $scc + %t176:sgpr_32 = S_NOT_B32 88, implicit-def $scc + %t178:sgpr_32 = S_NOT_B32 89, implicit-def $scc + %t180:sgpr_32 = S_NOT_B32 90, implicit-def $scc + %t182:sgpr_32 = S_NOT_B32 91, implicit-def $scc + %t184:sgpr_32 = S_NOT_B32 92, implicit-def $scc + %t186:sgpr_32 = S_NOT_B32 93, implicit-def $scc + %t188:sgpr_32 = S_NOT_B32 94, implicit-def $scc + %t190:sgpr_32 = S_NOT_B32 95, implicit-def $scc + %t192:sgpr_32 = S_NOT_B32 96, implicit-def $scc + %t194:sgpr_32 = S_NOT_B32 97, implicit-def $scc + %t196:sgpr_32 = S_NOT_B32 98, implicit-def $scc + %t198:sgpr_32 = S_NOT_B32 99, implicit-def $scc + %t200:sgpr_32 = S_NOT_B32 100, implicit-def $scc + %t202:sgpr_32 = S_NOT_B32 101, implicit-def $scc + %t204:sgpr_32 = S_NOT_B32 102, implicit-def $scc + %t206:sgpr_32 = S_NOT_B32 103, implicit-def $scc + %t208:sgpr_32 = S_NOT_B32 104, implicit-def $scc + %t210:sgpr_32 = S_NOT_B32 105, implicit-def $scc + %t212:sgpr_32 = S_NOT_B32 106, implicit-def $scc + %t214:sgpr_32 = S_NOT_B32 107, implicit-def $scc + %t216:sgpr_32 = S_NOT_B32 108, implicit-def $scc + %t218:sgpr_32 = S_NOT_B32 109, implicit-def $scc + %t220:sgpr_32 = S_NOT_B32 110, implicit-def $scc + %t222:sgpr_32 = S_NOT_B32 111, implicit-def $scc + %t224:sgpr_32 = S_NOT_B32 112, implicit-def $scc + %t226:sgpr_32 = S_NOT_B32 113, implicit-def $scc + %t228:sgpr_32 = S_NOT_B32 114, implicit-def $scc + %t230:sgpr_32 = S_NOT_B32 115, implicit-def $scc + %t232:sgpr_32 = S_NOT_B32 116, implicit-def $scc + %t234:sgpr_32 = S_NOT_B32 117, implicit-def $scc + %t236:sgpr_32 = S_NOT_B32 118, implicit-def $scc + %t238:sgpr_32 = S_NOT_B32 119, implicit-def $scc + %t240:sgpr_32 = S_NOT_B32 120, implicit-def $scc + %t242:sgpr_32 = S_NOT_B32 121, implicit-def $scc + %t244:sgpr_32 = S_NOT_B32 122, implicit-def $scc + %t246:sgpr_32 = S_NOT_B32 123, implicit-def $scc + %t248:sgpr_32 = S_NOT_B32 124, implicit-def $scc + %t250:sgpr_32 = S_NOT_B32 125, implicit-def $scc + %t252:sgpr_32 = S_NOT_B32 126, implicit-def $scc + %t254:sgpr_32 = S_NOT_B32 127, implicit-def $scc + + + ; Branch + %cmp:sreg_32_xm0 = V_CMP_GT_F32_e64 0, 0, 0, %v0.sub0, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %cmp:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + S_BRANCH %bb.2 + + bb.2: + $scc = IMPLICIT_DEF + ; Uses + KILL %t0 + KILL %t2 + KILL %t4 + KILL %t6 + KILL %t8 + KILL %t10 + KILL %t12 + KILL %t14 + KILL %t16 + KILL %t18 + KILL %t20 + KILL %t22 + KILL %t24 + KILL %t26 + KILL %t28 + KILL %t30 + KILL %t32 
+ KILL %t34 + KILL %t36 + KILL %t38 + KILL %t40 + KILL %t42 + KILL %t44 + KILL %t46 + KILL %t48 + KILL %t50 + KILL %t52 + KILL %t54 + KILL %t56 + KILL %t58 + KILL %t60 + KILL %t62 + KILL %t64 + KILL %t66 + KILL %t68 + KILL %t70 + KILL %t72 + KILL %t74 + KILL %t76 + KILL %t78 + KILL %t80 + KILL %t82 + KILL %t84 + KILL %t86 + KILL %t88 + KILL %t90 + KILL %t92 + KILL %t94 + KILL %t96 + KILL %t98 + KILL %t100 + KILL %t102 + KILL %t104 + KILL %t106 + KILL %t108 + KILL %t110 + KILL %t112 + KILL %t114 + KILL %t116 + KILL %t118 + KILL %t120 + KILL %t122 + KILL %t124 + KILL %t126 + KILL %t128 + KILL %t130 + KILL %t132 + KILL %t134 + KILL %t136 + KILL %t138 + KILL %t140 + KILL %t142 + KILL %t144 + KILL %t146 + KILL %t148 + KILL %t150 + KILL %t152 + KILL %t154 + KILL %t156 + KILL %t158 + KILL %t160 + KILL %t162 + KILL %t164 + KILL %t166 + KILL %t168 + KILL %t170 + KILL %t172 + KILL %t174 + KILL %t176 + KILL %t178 + KILL %t180 + KILL %t182 + KILL %t184 + KILL %t186 + KILL %t188 + KILL %t190 + KILL %t192 + KILL %t194 + KILL %t196 + KILL %t198 + KILL %t200 + KILL %t202 + KILL %t204 + KILL %t206 + KILL %t208 + KILL %t210 + KILL %t212 + KILL %t214 + KILL %t216 + KILL %t218 + KILL %t220 + KILL %t222 + KILL %t224 + KILL %t226 + KILL %t228 + KILL %t230 + KILL %t232 + KILL %t234 + KILL %t236 + KILL %t238 + KILL %t240 + KILL %t242 + KILL %t244 + KILL %t246 + KILL %t248 + KILL %t250 + KILL %t252 + KILL %t254 + + KILL $scc + + ; Some uses to inflate vgpr count + KILL %v0 + KILL %v1 + KILL %v2 + KILL %v3 + S_ENDPGM 0 +... + \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_phi.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_phi.mir new file mode 100644 index 0000000000000..305bf87a6120e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_phi.mir @@ -0,0 +1,304 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-hot-block-remat-aggressive-sgpr | FileCheck %s + +# This test simply checks that GCNDownwardRPTracker does not crash when PHIs are present +# CHECK: S_ENDPGM + + +--- | + define amdgpu_ps void @main() { + ret void + } +... 
+--- +name: main +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + ; To inflate vgpr count + %v0:vreg_1024 = IMPLICIT_DEF + %v1:vreg_1024 = IMPLICIT_DEF + %v2:vreg_1024 = IMPLICIT_DEF + %v3:vreg_1024 = IMPLICIT_DEF + + ; Defs + %t0:sgpr_32 = S_NOT_B32 0, implicit-def $scc + %t2:sgpr_32 = S_NOT_B32 1, implicit-def $scc + %t4:sgpr_32 = S_NOT_B32 2, implicit-def $scc + %t6:sgpr_32 = S_NOT_B32 3, implicit-def $scc + %t8:sgpr_32 = S_NOT_B32 4, implicit-def $scc + %t10:sgpr_32 = S_NOT_B32 5, implicit-def $scc + %t12:sgpr_32 = S_NOT_B32 6, implicit-def $scc + %t14:sgpr_32 = S_NOT_B32 7, implicit-def $scc + %t16:sgpr_32 = S_NOT_B32 8, implicit-def $scc + %t18:sgpr_32 = S_NOT_B32 9, implicit-def $scc + %t20:sgpr_32 = S_NOT_B32 10, implicit-def $scc + %t22:sgpr_32 = S_NOT_B32 11, implicit-def $scc + %t24:sgpr_32 = S_NOT_B32 12, implicit-def $scc + %t26:sgpr_32 = S_NOT_B32 13, implicit-def $scc + %t28:sgpr_32 = S_NOT_B32 14, implicit-def $scc + %t30:sgpr_32 = S_NOT_B32 15, implicit-def $scc + %t32:sgpr_32 = S_NOT_B32 16, implicit-def $scc + %t34:sgpr_32 = S_NOT_B32 17, implicit-def $scc + %t36:sgpr_32 = S_NOT_B32 18, implicit-def $scc + %t38:sgpr_32 = S_NOT_B32 19, implicit-def $scc + %t40:sgpr_32 = S_NOT_B32 20, implicit-def $scc + %t42:sgpr_32 = S_NOT_B32 21, implicit-def $scc + %t44:sgpr_32 = S_NOT_B32 22, implicit-def $scc + %t46:sgpr_32 = S_NOT_B32 23, implicit-def $scc + %t48:sgpr_32 = S_NOT_B32 24, implicit-def $scc + %t50:sgpr_32 = S_NOT_B32 25, implicit-def $scc + %t52:sgpr_32 = S_NOT_B32 26, implicit-def $scc + %t54:sgpr_32 = S_NOT_B32 27, implicit-def $scc + %t56:sgpr_32 = S_NOT_B32 28, implicit-def $scc + %t58:sgpr_32 = S_NOT_B32 29, implicit-def $scc + %t60:sgpr_32 = S_NOT_B32 30, implicit-def $scc + %t62:sgpr_32 = S_NOT_B32 31, implicit-def $scc + %t64:sgpr_32 = S_NOT_B32 32, implicit-def $scc + %t66:sgpr_32 = S_NOT_B32 33, implicit-def $scc + %t68:sgpr_32 = S_NOT_B32 34, implicit-def $scc + %t70:sgpr_32 = S_NOT_B32 35, implicit-def $scc + %t72:sgpr_32 = S_NOT_B32 36, implicit-def $scc + %t74:sgpr_32 = S_NOT_B32 37, implicit-def $scc + %t76:sgpr_32 = S_NOT_B32 38, implicit-def $scc + %t78:sgpr_32 = S_NOT_B32 39, implicit-def $scc + %t80:sgpr_32 = S_NOT_B32 40, implicit-def $scc + %t82:sgpr_32 = S_NOT_B32 41, implicit-def $scc + %t84:sgpr_32 = S_NOT_B32 42, implicit-def $scc + %t86:sgpr_32 = S_NOT_B32 43, implicit-def $scc + %t88:sgpr_32 = S_NOT_B32 44, implicit-def $scc + %t90:sgpr_32 = S_NOT_B32 45, implicit-def $scc + %t92:sgpr_32 = S_NOT_B32 46, implicit-def $scc + %t94:sgpr_32 = S_NOT_B32 47, implicit-def $scc + %t96:sgpr_32 = S_NOT_B32 48, implicit-def $scc + %t98:sgpr_32 = S_NOT_B32 49, implicit-def $scc + %t100:sgpr_32 = S_NOT_B32 50, implicit-def $scc + %t102:sgpr_32 = S_NOT_B32 51, implicit-def $scc + %t104:sgpr_32 = S_NOT_B32 52, implicit-def $scc + %t106:sgpr_32 = S_NOT_B32 53, implicit-def $scc + %t108:sgpr_32 = S_NOT_B32 54, implicit-def $scc + %t110:sgpr_32 = S_NOT_B32 55, implicit-def $scc + %t112:sgpr_32 = S_NOT_B32 56, implicit-def $scc + %t114:sgpr_32 = S_NOT_B32 57, implicit-def $scc + %t116:sgpr_32 = S_NOT_B32 58, implicit-def $scc + %t118:sgpr_32 = S_NOT_B32 59, implicit-def $scc + %t120:sgpr_32 = S_NOT_B32 60, implicit-def $scc + %t122:sgpr_32 = S_NOT_B32 61, implicit-def $scc + %t124:sgpr_32 = S_NOT_B32 62, implicit-def $scc + %t126:sgpr_32 = S_NOT_B32 63, implicit-def $scc + %t128:sgpr_32 = S_NOT_B32 64, implicit-def $scc + %t130:sgpr_32 = S_NOT_B32 65, implicit-def $scc + %t132:sgpr_32 = S_NOT_B32 66, implicit-def $scc + %t134:sgpr_32 = 
S_NOT_B32 67, implicit-def $scc + %t136:sgpr_32 = S_NOT_B32 68, implicit-def $scc + %t138:sgpr_32 = S_NOT_B32 69, implicit-def $scc + %t140:sgpr_32 = S_NOT_B32 70, implicit-def $scc + %t142:sgpr_32 = S_NOT_B32 71, implicit-def $scc + %t144:sgpr_32 = S_NOT_B32 72, implicit-def $scc + %t146:sgpr_32 = S_NOT_B32 73, implicit-def $scc + %t148:sgpr_32 = S_NOT_B32 74, implicit-def $scc + %t150:sgpr_32 = S_NOT_B32 75, implicit-def $scc + %t152:sgpr_32 = S_NOT_B32 76, implicit-def $scc + %t154:sgpr_32 = S_NOT_B32 77, implicit-def $scc + %t156:sgpr_32 = S_NOT_B32 78, implicit-def $scc + %t158:sgpr_32 = S_NOT_B32 79, implicit-def $scc + %t160:sgpr_32 = S_NOT_B32 80, implicit-def $scc + %t162:sgpr_32 = S_NOT_B32 81, implicit-def $scc + %t164:sgpr_32 = S_NOT_B32 82, implicit-def $scc + %t166:sgpr_32 = S_NOT_B32 83, implicit-def $scc + %t168:sgpr_32 = S_NOT_B32 84, implicit-def $scc + %t170:sgpr_32 = S_NOT_B32 85, implicit-def $scc + %t172:sgpr_32 = S_NOT_B32 86, implicit-def $scc + %t174:sgpr_32 = S_NOT_B32 87, implicit-def $scc + %t176:sgpr_32 = S_NOT_B32 88, implicit-def $scc + %t178:sgpr_32 = S_NOT_B32 89, implicit-def $scc + %t180:sgpr_32 = S_NOT_B32 90, implicit-def $scc + %t182:sgpr_32 = S_NOT_B32 91, implicit-def $scc + %t184:sgpr_32 = S_NOT_B32 92, implicit-def $scc + %t186:sgpr_32 = S_NOT_B32 93, implicit-def $scc + %t188:sgpr_32 = S_NOT_B32 94, implicit-def $scc + %t190:sgpr_32 = S_NOT_B32 95, implicit-def $scc + %t192:sgpr_32 = S_NOT_B32 96, implicit-def $scc + %t194:sgpr_32 = S_NOT_B32 97, implicit-def $scc + %t196:sgpr_32 = S_NOT_B32 98, implicit-def $scc + %t198:sgpr_32 = S_NOT_B32 99, implicit-def $scc + %t200:sgpr_32 = S_NOT_B32 100, implicit-def $scc + %t202:sgpr_32 = S_NOT_B32 101, implicit-def $scc + %t204:sgpr_32 = S_NOT_B32 102, implicit-def $scc + %t206:sgpr_32 = S_NOT_B32 103, implicit-def $scc + %t208:sgpr_32 = S_NOT_B32 104, implicit-def $scc + %t210:sgpr_32 = S_NOT_B32 105, implicit-def $scc + %t212:sgpr_32 = S_NOT_B32 106, implicit-def $scc + %t214:sgpr_32 = S_NOT_B32 107, implicit-def $scc + %t216:sgpr_32 = S_NOT_B32 108, implicit-def $scc + %t218:sgpr_32 = S_NOT_B32 109, implicit-def $scc + %t220:sgpr_32 = S_NOT_B32 110, implicit-def $scc + %t222:sgpr_32 = S_NOT_B32 111, implicit-def $scc + %t224:sgpr_32 = S_NOT_B32 112, implicit-def $scc + %t226:sgpr_32 = S_NOT_B32 113, implicit-def $scc + %t228:sgpr_32 = S_NOT_B32 114, implicit-def $scc + %t230:sgpr_32 = S_NOT_B32 115, implicit-def $scc + %t232:sgpr_32 = S_NOT_B32 116, implicit-def $scc + %t234:sgpr_32 = S_NOT_B32 117, implicit-def $scc + %t236:sgpr_32 = S_NOT_B32 118, implicit-def $scc + %t238:sgpr_32 = S_NOT_B32 119, implicit-def $scc + %t240:sgpr_32 = S_NOT_B32 120, implicit-def $scc + %t242:sgpr_32 = S_NOT_B32 121, implicit-def $scc + %t244:sgpr_32 = S_NOT_B32 122, implicit-def $scc + %t246:sgpr_32 = S_NOT_B32 123, implicit-def $scc + %t248:sgpr_32 = S_NOT_B32 124, implicit-def $scc + %t250:sgpr_32 = S_NOT_B32 125, implicit-def $scc + %t252:sgpr_32 = S_NOT_B32 126, implicit-def $scc + %t254:sgpr_32 = S_NOT_B32 127, implicit-def $scc + + + ; Branch + %cmp:sreg_32_xm0 = V_CMP_GT_F32_e64 0, 0, 0, %v0.sub0, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %cmp:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + %s0:sgpr_32 = IMPLICIT_DEF + S_BRANCH %bb.2 + + bb.2: + %phi0:sgpr_32 = PHI %t0, %bb.0, %s0, %bb.1 + %phi2:sgpr_32 = PHI %t2, %bb.0, %s0, %bb.1 + %phi4:sgpr_32 = PHI %t4, %bb.0, %s0, %bb.1 + %phi6:sgpr_32 = PHI %t6, %bb.0, %s0, %bb.1 + %phi8:sgpr_32 = 
PHI %t8, %bb.0, %s0, %bb.1 + %phi10:sgpr_32 = PHI %t10, %bb.0, %s0, %bb.1 + %phi12:sgpr_32 = PHI %t12, %bb.0, %s0, %bb.1 + %phi14:sgpr_32 = PHI %t14, %bb.0, %s0, %bb.1 + %phi16:sgpr_32 = PHI %t16, %bb.0, %s0, %bb.1 + %phi18:sgpr_32 = PHI %t18, %bb.0, %s0, %bb.1 + %phi20:sgpr_32 = PHI %t20, %bb.0, %s0, %bb.1 + %phi22:sgpr_32 = PHI %t22, %bb.0, %s0, %bb.1 + %phi24:sgpr_32 = PHI %t24, %bb.0, %s0, %bb.1 + %phi26:sgpr_32 = PHI %t26, %bb.0, %s0, %bb.1 + %phi28:sgpr_32 = PHI %t28, %bb.0, %s0, %bb.1 + %phi30:sgpr_32 = PHI %t30, %bb.0, %s0, %bb.1 + %phi32:sgpr_32 = PHI %t32, %bb.0, %s0, %bb.1 + %phi34:sgpr_32 = PHI %t34, %bb.0, %s0, %bb.1 + %phi36:sgpr_32 = PHI %t36, %bb.0, %s0, %bb.1 + %phi38:sgpr_32 = PHI %t38, %bb.0, %s0, %bb.1 + %phi40:sgpr_32 = PHI %t40, %bb.0, %s0, %bb.1 + %phi42:sgpr_32 = PHI %t42, %bb.0, %s0, %bb.1 + %phi44:sgpr_32 = PHI %t44, %bb.0, %s0, %bb.1 + %phi46:sgpr_32 = PHI %t46, %bb.0, %s0, %bb.1 + %phi48:sgpr_32 = PHI %t48, %bb.0, %s0, %bb.1 + %phi50:sgpr_32 = PHI %t50, %bb.0, %s0, %bb.1 + %phi52:sgpr_32 = PHI %t52, %bb.0, %s0, %bb.1 + %phi54:sgpr_32 = PHI %t54, %bb.0, %s0, %bb.1 + %phi56:sgpr_32 = PHI %t56, %bb.0, %s0, %bb.1 + %phi58:sgpr_32 = PHI %t58, %bb.0, %s0, %bb.1 + %phi60:sgpr_32 = PHI %t60, %bb.0, %s0, %bb.1 + %phi62:sgpr_32 = PHI %t62, %bb.0, %s0, %bb.1 + %phi64:sgpr_32 = PHI %t64, %bb.0, %s0, %bb.1 + %phi66:sgpr_32 = PHI %t66, %bb.0, %s0, %bb.1 + %phi68:sgpr_32 = PHI %t68, %bb.0, %s0, %bb.1 + %phi70:sgpr_32 = PHI %t70, %bb.0, %s0, %bb.1 + %phi72:sgpr_32 = PHI %t72, %bb.0, %s0, %bb.1 + %phi74:sgpr_32 = PHI %t74, %bb.0, %s0, %bb.1 + %phi76:sgpr_32 = PHI %t76, %bb.0, %s0, %bb.1 + %phi78:sgpr_32 = PHI %t78, %bb.0, %s0, %bb.1 + %phi80:sgpr_32 = PHI %t80, %bb.0, %s0, %bb.1 + %phi82:sgpr_32 = PHI %t82, %bb.0, %s0, %bb.1 + %phi84:sgpr_32 = PHI %t84, %bb.0, %s0, %bb.1 + %phi86:sgpr_32 = PHI %t86, %bb.0, %s0, %bb.1 + %phi88:sgpr_32 = PHI %t88, %bb.0, %s0, %bb.1 + %phi90:sgpr_32 = PHI %t90, %bb.0, %s0, %bb.1 + %phi92:sgpr_32 = PHI %t92, %bb.0, %s0, %bb.1 + %phi94:sgpr_32 = PHI %t94, %bb.0, %s0, %bb.1 + %phi96:sgpr_32 = PHI %t96, %bb.0, %s0, %bb.1 + %phi98:sgpr_32 = PHI %t98, %bb.0, %s0, %bb.1 + %phi100:sgpr_32 = PHI %t100, %bb.0, %s0, %bb.1 + %phi102:sgpr_32 = PHI %t102, %bb.0, %s0, %bb.1 + %phi104:sgpr_32 = PHI %t104, %bb.0, %s0, %bb.1 + %phi106:sgpr_32 = PHI %t106, %bb.0, %s0, %bb.1 + %phi108:sgpr_32 = PHI %t108, %bb.0, %s0, %bb.1 + %phi110:sgpr_32 = PHI %t110, %bb.0, %s0, %bb.1 + %phi112:sgpr_32 = PHI %t112, %bb.0, %s0, %bb.1 + %phi114:sgpr_32 = PHI %t114, %bb.0, %s0, %bb.1 + %phi116:sgpr_32 = PHI %t116, %bb.0, %s0, %bb.1 + %phi118:sgpr_32 = PHI %t118, %bb.0, %s0, %bb.1 + %phi120:sgpr_32 = PHI %t120, %bb.0, %s0, %bb.1 + %phi122:sgpr_32 = PHI %t122, %bb.0, %s0, %bb.1 + %phi124:sgpr_32 = PHI %t124, %bb.0, %s0, %bb.1 + %phi126:sgpr_32 = PHI %t126, %bb.0, %s0, %bb.1 + %phi128:sgpr_32 = PHI %t128, %bb.0, %s0, %bb.1 + %phi130:sgpr_32 = PHI %t130, %bb.0, %s0, %bb.1 + %phi132:sgpr_32 = PHI %t132, %bb.0, %s0, %bb.1 + %phi134:sgpr_32 = PHI %t134, %bb.0, %s0, %bb.1 + %phi136:sgpr_32 = PHI %t136, %bb.0, %s0, %bb.1 + %phi138:sgpr_32 = PHI %t138, %bb.0, %s0, %bb.1 + %phi140:sgpr_32 = PHI %t140, %bb.0, %s0, %bb.1 + %phi142:sgpr_32 = PHI %t142, %bb.0, %s0, %bb.1 + %phi144:sgpr_32 = PHI %t144, %bb.0, %s0, %bb.1 + %phi146:sgpr_32 = PHI %t146, %bb.0, %s0, %bb.1 + %phi148:sgpr_32 = PHI %t148, %bb.0, %s0, %bb.1 + %phi150:sgpr_32 = PHI %t150, %bb.0, %s0, %bb.1 + %phi152:sgpr_32 = PHI %t152, %bb.0, %s0, %bb.1 + %phi154:sgpr_32 = PHI %t154, %bb.0, %s0, %bb.1 + %phi156:sgpr_32 = PHI %t156, %bb.0, 
%s0, %bb.1 + %phi158:sgpr_32 = PHI %t158, %bb.0, %s0, %bb.1 + %phi160:sgpr_32 = PHI %t160, %bb.0, %s0, %bb.1 + %phi162:sgpr_32 = PHI %t162, %bb.0, %s0, %bb.1 + %phi164:sgpr_32 = PHI %t164, %bb.0, %s0, %bb.1 + %phi166:sgpr_32 = PHI %t166, %bb.0, %s0, %bb.1 + %phi168:sgpr_32 = PHI %t168, %bb.0, %s0, %bb.1 + %phi170:sgpr_32 = PHI %t170, %bb.0, %s0, %bb.1 + %phi172:sgpr_32 = PHI %t172, %bb.0, %s0, %bb.1 + %phi174:sgpr_32 = PHI %t174, %bb.0, %s0, %bb.1 + %phi176:sgpr_32 = PHI %t176, %bb.0, %s0, %bb.1 + %phi178:sgpr_32 = PHI %t178, %bb.0, %s0, %bb.1 + %phi180:sgpr_32 = PHI %t180, %bb.0, %s0, %bb.1 + %phi182:sgpr_32 = PHI %t182, %bb.0, %s0, %bb.1 + %phi184:sgpr_32 = PHI %t184, %bb.0, %s0, %bb.1 + %phi186:sgpr_32 = PHI %t186, %bb.0, %s0, %bb.1 + %phi188:sgpr_32 = PHI %t188, %bb.0, %s0, %bb.1 + %phi190:sgpr_32 = PHI %t190, %bb.0, %s0, %bb.1 + %phi192:sgpr_32 = PHI %t192, %bb.0, %s0, %bb.1 + %phi194:sgpr_32 = PHI %t194, %bb.0, %s0, %bb.1 + %phi196:sgpr_32 = PHI %t196, %bb.0, %s0, %bb.1 + %phi198:sgpr_32 = PHI %t198, %bb.0, %s0, %bb.1 + %phi200:sgpr_32 = PHI %t200, %bb.0, %s0, %bb.1 + %phi202:sgpr_32 = PHI %t202, %bb.0, %s0, %bb.1 + %phi204:sgpr_32 = PHI %t204, %bb.0, %s0, %bb.1 + %phi206:sgpr_32 = PHI %t206, %bb.0, %s0, %bb.1 + %phi208:sgpr_32 = PHI %t208, %bb.0, %s0, %bb.1 + %phi210:sgpr_32 = PHI %t210, %bb.0, %s0, %bb.1 + %phi212:sgpr_32 = PHI %t212, %bb.0, %s0, %bb.1 + %phi214:sgpr_32 = PHI %t214, %bb.0, %s0, %bb.1 + %phi216:sgpr_32 = PHI %t216, %bb.0, %s0, %bb.1 + %phi218:sgpr_32 = PHI %t218, %bb.0, %s0, %bb.1 + %phi220:sgpr_32 = PHI %t220, %bb.0, %s0, %bb.1 + %phi222:sgpr_32 = PHI %t222, %bb.0, %s0, %bb.1 + %phi224:sgpr_32 = PHI %t224, %bb.0, %s0, %bb.1 + %phi226:sgpr_32 = PHI %t226, %bb.0, %s0, %bb.1 + %phi228:sgpr_32 = PHI %t228, %bb.0, %s0, %bb.1 + %phi230:sgpr_32 = PHI %t230, %bb.0, %s0, %bb.1 + %phi232:sgpr_32 = PHI %t232, %bb.0, %s0, %bb.1 + %phi234:sgpr_32 = PHI %t234, %bb.0, %s0, %bb.1 + %phi236:sgpr_32 = PHI %t236, %bb.0, %s0, %bb.1 + %phi238:sgpr_32 = PHI %t238, %bb.0, %s0, %bb.1 + %phi240:sgpr_32 = PHI %t240, %bb.0, %s0, %bb.1 + %phi242:sgpr_32 = PHI %t242, %bb.0, %s0, %bb.1 + %phi244:sgpr_32 = PHI %t244, %bb.0, %s0, %bb.1 + %phi246:sgpr_32 = PHI %t246, %bb.0, %s0, %bb.1 + %phi248:sgpr_32 = PHI %t248, %bb.0, %s0, %bb.1 + %phi250:sgpr_32 = PHI %t250, %bb.0, %s0, %bb.1 + %phi252:sgpr_32 = PHI %t252, %bb.0, %s0, %bb.1 + %phi254:sgpr_32 = PHI %t254, %bb.0, %s0, %bb.1 + + + ; Some uses to inflate vgpr count + KILL %v0 + KILL %v1 + KILL %v2 + KILL %v3 + S_ENDPGM 0 +... 
+ \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_scc.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_scc.mir new file mode 100644 index 0000000000000..94e86a61c09d6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_scc.mir @@ -0,0 +1,564 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-hot-block-remat-aggressive-sgpr | FileCheck %s + +# This test checks that instructions that define $scc are sunk to their users. +# CHECK: bb.0: +# CHECK-NOT: S_NOT_B32 +# CHECK: bb.2: +# CHECK: %t0:sgpr_32 = S_NOT_B32 0 +# CHECK: KILL %t0 +# CHECK: %t2:sgpr_32 = S_NOT_B32 1 +# CHECK: KILL %t2 +# CHECK: %t4:sgpr_32 = S_NOT_B32 2 +# CHECK: KILL %t4 +# CHECK: %t6:sgpr_32 = S_NOT_B32 3 +# CHECK: KILL %t6 +# CHECK: %t8:sgpr_32 = S_NOT_B32 4 +# CHECK: KILL %t8 +# CHECK: %t10:sgpr_32 = S_NOT_B32 5 +# CHECK: KILL %t10 +# CHECK: %t12:sgpr_32 = S_NOT_B32 6 +# CHECK: KILL %t12 +# CHECK: %t14:sgpr_32 = S_NOT_B32 7 +# CHECK: KILL %t14 +# CHECK: %t16:sgpr_32 = S_NOT_B32 8 +# CHECK: KILL %t16 +# CHECK: %t18:sgpr_32 = S_NOT_B32 9 +# CHECK: KILL %t18 +# CHECK: %t20:sgpr_32 = S_NOT_B32 10 +# CHECK: KILL %t20 +# CHECK: %t22:sgpr_32 = S_NOT_B32 11 +# CHECK: KILL %t22 +# CHECK: %t24:sgpr_32 = S_NOT_B32 12 +# CHECK: KILL %t24 +# CHECK: %t26:sgpr_32 = S_NOT_B32 13 +# CHECK: KILL %t26 +# CHECK: %t28:sgpr_32 = S_NOT_B32 14 +# CHECK: KILL %t28 +# CHECK: %t30:sgpr_32 = S_NOT_B32 15 +# CHECK: KILL %t30 +# CHECK: %t32:sgpr_32 = S_NOT_B32 16 +# CHECK: KILL %t32 +# CHECK: %t34:sgpr_32 = S_NOT_B32 17 +# CHECK: KILL %t34 +# CHECK: %t36:sgpr_32 = S_NOT_B32 18 +# CHECK: KILL %t36 +# CHECK: %t38:sgpr_32 = S_NOT_B32 19 +# CHECK: KILL %t38 +# CHECK: %t40:sgpr_32 = S_NOT_B32 20 +# CHECK: KILL %t40 +# CHECK: %t42:sgpr_32 = S_NOT_B32 21 +# CHECK: KILL %t42 +# CHECK: %t44:sgpr_32 = S_NOT_B32 22 +# CHECK: KILL %t44 +# CHECK: %t46:sgpr_32 = S_NOT_B32 23 +# CHECK: KILL %t46 +# CHECK: %t48:sgpr_32 = S_NOT_B32 24 +# CHECK: KILL %t48 +# CHECK: %t50:sgpr_32 = S_NOT_B32 25 +# CHECK: KILL %t50 +# CHECK: %t52:sgpr_32 = S_NOT_B32 26 +# CHECK: KILL %t52 +# CHECK: %t54:sgpr_32 = S_NOT_B32 27 +# CHECK: KILL %t54 +# CHECK: %t56:sgpr_32 = S_NOT_B32 28 +# CHECK: KILL %t56 +# CHECK: %t58:sgpr_32 = S_NOT_B32 29 +# CHECK: KILL %t58 +# CHECK: %t60:sgpr_32 = S_NOT_B32 30 +# CHECK: KILL %t60 +# CHECK: %t62:sgpr_32 = S_NOT_B32 31 +# CHECK: KILL %t62 +# CHECK: %t64:sgpr_32 = S_NOT_B32 32 +# CHECK: KILL %t64 +# CHECK: %t66:sgpr_32 = S_NOT_B32 33 +# CHECK: KILL %t66 +# CHECK: %t68:sgpr_32 = S_NOT_B32 34 +# CHECK: KILL %t68 +# CHECK: %t70:sgpr_32 = S_NOT_B32 35 +# CHECK: KILL %t70 +# CHECK: %t72:sgpr_32 = S_NOT_B32 36 +# CHECK: KILL %t72 +# CHECK: %t74:sgpr_32 = S_NOT_B32 37 +# CHECK: KILL %t74 +# CHECK: %t76:sgpr_32 = S_NOT_B32 38 +# CHECK: KILL %t76 +# CHECK: %t78:sgpr_32 = S_NOT_B32 39 +# CHECK: KILL %t78 +# CHECK: %t80:sgpr_32 = S_NOT_B32 40 +# CHECK: KILL %t80 +# CHECK: %t82:sgpr_32 = S_NOT_B32 41 +# CHECK: KILL %t82 +# CHECK: %t84:sgpr_32 = S_NOT_B32 42 +# CHECK: KILL %t84 +# CHECK: %t86:sgpr_32 = S_NOT_B32 43 +# CHECK: KILL %t86 +# CHECK: %t88:sgpr_32 = S_NOT_B32 44 +# CHECK: KILL %t88 +# CHECK: %t90:sgpr_32 = S_NOT_B32 45 +# CHECK: KILL %t90 +# CHECK: %t92:sgpr_32 = S_NOT_B32 46 +# CHECK: KILL %t92 +# CHECK: %t94:sgpr_32 = S_NOT_B32 47 +# CHECK: KILL %t94 +# CHECK: %t96:sgpr_32 = S_NOT_B32 48 +# CHECK: KILL %t96 +# CHECK: %t98:sgpr_32 = S_NOT_B32 49 +# CHECK: KILL %t98 +# CHECK: %t100:sgpr_32 = S_NOT_B32 50 +# CHECK: KILL 
%t100 +# CHECK: %t102:sgpr_32 = S_NOT_B32 51 +# CHECK: KILL %t102 +# CHECK: %t104:sgpr_32 = S_NOT_B32 52 +# CHECK: KILL %t104 +# CHECK: %t106:sgpr_32 = S_NOT_B32 53 +# CHECK: KILL %t106 +# CHECK: %t108:sgpr_32 = S_NOT_B32 54 +# CHECK: KILL %t108 +# CHECK: %t110:sgpr_32 = S_NOT_B32 55 +# CHECK: KILL %t110 +# CHECK: %t112:sgpr_32 = S_NOT_B32 56 +# CHECK: KILL %t112 +# CHECK: %t114:sgpr_32 = S_NOT_B32 57 +# CHECK: KILL %t114 +# CHECK: %t116:sgpr_32 = S_NOT_B32 58 +# CHECK: KILL %t116 +# CHECK: %t118:sgpr_32 = S_NOT_B32 59 +# CHECK: KILL %t118 +# CHECK: %t120:sgpr_32 = S_NOT_B32 60 +# CHECK: KILL %t120 +# CHECK: %t122:sgpr_32 = S_NOT_B32 61 +# CHECK: KILL %t122 +# CHECK: %t124:sgpr_32 = S_NOT_B32 62 +# CHECK: KILL %t124 +# CHECK: %t126:sgpr_32 = S_NOT_B32 63 +# CHECK: KILL %t126 +# CHECK: %t128:sgpr_32 = S_NOT_B32 64 +# CHECK: KILL %t128 +# CHECK: %t130:sgpr_32 = S_NOT_B32 65 +# CHECK: KILL %t130 +# CHECK: %t132:sgpr_32 = S_NOT_B32 66 +# CHECK: KILL %t132 +# CHECK: %t134:sgpr_32 = S_NOT_B32 67 +# CHECK: KILL %t134 +# CHECK: %t136:sgpr_32 = S_NOT_B32 68 +# CHECK: KILL %t136 +# CHECK: %t138:sgpr_32 = S_NOT_B32 69 +# CHECK: KILL %t138 +# CHECK: %t140:sgpr_32 = S_NOT_B32 70 +# CHECK: KILL %t140 +# CHECK: %t142:sgpr_32 = S_NOT_B32 71 +# CHECK: KILL %t142 +# CHECK: %t144:sgpr_32 = S_NOT_B32 72 +# CHECK: KILL %t144 +# CHECK: %t146:sgpr_32 = S_NOT_B32 73 +# CHECK: KILL %t146 +# CHECK: %t148:sgpr_32 = S_NOT_B32 74 +# CHECK: KILL %t148 +# CHECK: %t150:sgpr_32 = S_NOT_B32 75 +# CHECK: KILL %t150 +# CHECK: %t152:sgpr_32 = S_NOT_B32 76 +# CHECK: KILL %t152 +# CHECK: %t154:sgpr_32 = S_NOT_B32 77 +# CHECK: KILL %t154 +# CHECK: %t156:sgpr_32 = S_NOT_B32 78 +# CHECK: KILL %t156 +# CHECK: %t158:sgpr_32 = S_NOT_B32 79 +# CHECK: KILL %t158 +# CHECK: %t160:sgpr_32 = S_NOT_B32 80 +# CHECK: KILL %t160 +# CHECK: %t162:sgpr_32 = S_NOT_B32 81 +# CHECK: KILL %t162 +# CHECK: %t164:sgpr_32 = S_NOT_B32 82 +# CHECK: KILL %t164 +# CHECK: %t166:sgpr_32 = S_NOT_B32 83 +# CHECK: KILL %t166 +# CHECK: %t168:sgpr_32 = S_NOT_B32 84 +# CHECK: KILL %t168 +# CHECK: %t170:sgpr_32 = S_NOT_B32 85 +# CHECK: KILL %t170 +# CHECK: %t172:sgpr_32 = S_NOT_B32 86 +# CHECK: KILL %t172 +# CHECK: %t174:sgpr_32 = S_NOT_B32 87 +# CHECK: KILL %t174 +# CHECK: %t176:sgpr_32 = S_NOT_B32 88 +# CHECK: KILL %t176 +# CHECK: %t178:sgpr_32 = S_NOT_B32 89 +# CHECK: KILL %t178 +# CHECK: %t180:sgpr_32 = S_NOT_B32 90 +# CHECK: KILL %t180 +# CHECK: %t182:sgpr_32 = S_NOT_B32 91 +# CHECK: KILL %t182 +# CHECK: %t184:sgpr_32 = S_NOT_B32 92 +# CHECK: KILL %t184 +# CHECK: %t186:sgpr_32 = S_NOT_B32 93 +# CHECK: KILL %t186 +# CHECK: %t188:sgpr_32 = S_NOT_B32 94 +# CHECK: KILL %t188 +# CHECK: %t190:sgpr_32 = S_NOT_B32 95 +# CHECK: KILL %t190 +# CHECK: %t192:sgpr_32 = S_NOT_B32 96 +# CHECK: KILL %t192 +# CHECK: %t194:sgpr_32 = S_NOT_B32 97 +# CHECK: KILL %t194 +# CHECK: %t196:sgpr_32 = S_NOT_B32 98 +# CHECK: KILL %t196 +# CHECK: %t198:sgpr_32 = S_NOT_B32 99 +# CHECK: KILL %t198 +# CHECK: %t200:sgpr_32 = S_NOT_B32 100 +# CHECK: KILL %t200 +# CHECK: %t202:sgpr_32 = S_NOT_B32 101 +# CHECK: KILL %t202 +# CHECK: %t204:sgpr_32 = S_NOT_B32 102 +# CHECK: KILL %t204 +# CHECK: %t206:sgpr_32 = S_NOT_B32 103 +# CHECK: KILL %t206 +# CHECK: %t208:sgpr_32 = S_NOT_B32 104 +# CHECK: KILL %t208 +# CHECK: %t210:sgpr_32 = S_NOT_B32 105 +# CHECK: KILL %t210 +# CHECK: %t212:sgpr_32 = S_NOT_B32 106 +# CHECK: KILL %t212 +# CHECK: %t214:sgpr_32 = S_NOT_B32 107 +# CHECK: KILL %t214 +# CHECK: %t216:sgpr_32 = S_NOT_B32 108 +# CHECK: KILL %t216 +# CHECK: %t218:sgpr_32 = S_NOT_B32 109 +# CHECK: KILL 
%t218 +# CHECK: %t220:sgpr_32 = S_NOT_B32 110 +# CHECK: KILL %t220 +# CHECK: %t222:sgpr_32 = S_NOT_B32 111 +# CHECK: KILL %t222 +# CHECK: %t224:sgpr_32 = S_NOT_B32 112 +# CHECK: KILL %t224 +# CHECK: %t226:sgpr_32 = S_NOT_B32 113 +# CHECK: KILL %t226 +# CHECK: %t228:sgpr_32 = S_NOT_B32 114 +# CHECK: KILL %t228 +# CHECK: %t230:sgpr_32 = S_NOT_B32 115 +# CHECK: KILL %t230 +# CHECK: %t232:sgpr_32 = S_NOT_B32 116 +# CHECK: KILL %t232 +# CHECK: %t234:sgpr_32 = S_NOT_B32 117 +# CHECK: KILL %t234 +# CHECK: %t236:sgpr_32 = S_NOT_B32 118 +# CHECK: KILL %t236 +# CHECK: %t238:sgpr_32 = S_NOT_B32 119 +# CHECK: KILL %t238 +# CHECK: %t240:sgpr_32 = S_NOT_B32 120 +# CHECK: KILL %t240 +# CHECK: %t242:sgpr_32 = S_NOT_B32 121 +# CHECK: KILL %t242 +# CHECK: %t244:sgpr_32 = S_NOT_B32 122 +# CHECK: KILL %t244 +# CHECK: %t246:sgpr_32 = S_NOT_B32 123 +# CHECK: KILL %t246 +# CHECK: %t248:sgpr_32 = S_NOT_B32 124 +# CHECK: KILL %t248 +# CHECK: %t250:sgpr_32 = S_NOT_B32 125 +# CHECK: KILL %t250 +# CHECK: %t252:sgpr_32 = S_NOT_B32 126 +# CHECK: KILL %t252 +# CHECK: %t254:sgpr_32 = S_NOT_B32 127 +# CHECK: KILL %t254 + + +--- | + define amdgpu_ps void @main() { + ret void + } +... +--- +name: main +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + ; To inflate vgpr count + %v0:vreg_1024 = IMPLICIT_DEF + %v1:vreg_1024 = IMPLICIT_DEF + %v2:vreg_1024 = IMPLICIT_DEF + %v3:vreg_1024 = IMPLICIT_DEF + + ; Defs + %t0:sgpr_32 = S_NOT_B32 0, implicit-def $scc + %t2:sgpr_32 = S_NOT_B32 1, implicit-def $scc + %t4:sgpr_32 = S_NOT_B32 2, implicit-def $scc + %t6:sgpr_32 = S_NOT_B32 3, implicit-def $scc + %t8:sgpr_32 = S_NOT_B32 4, implicit-def $scc + %t10:sgpr_32 = S_NOT_B32 5, implicit-def $scc + %t12:sgpr_32 = S_NOT_B32 6, implicit-def $scc + %t14:sgpr_32 = S_NOT_B32 7, implicit-def $scc + %t16:sgpr_32 = S_NOT_B32 8, implicit-def $scc + %t18:sgpr_32 = S_NOT_B32 9, implicit-def $scc + %t20:sgpr_32 = S_NOT_B32 10, implicit-def $scc + %t22:sgpr_32 = S_NOT_B32 11, implicit-def $scc + %t24:sgpr_32 = S_NOT_B32 12, implicit-def $scc + %t26:sgpr_32 = S_NOT_B32 13, implicit-def $scc + %t28:sgpr_32 = S_NOT_B32 14, implicit-def $scc + %t30:sgpr_32 = S_NOT_B32 15, implicit-def $scc + %t32:sgpr_32 = S_NOT_B32 16, implicit-def $scc + %t34:sgpr_32 = S_NOT_B32 17, implicit-def $scc + %t36:sgpr_32 = S_NOT_B32 18, implicit-def $scc + %t38:sgpr_32 = S_NOT_B32 19, implicit-def $scc + %t40:sgpr_32 = S_NOT_B32 20, implicit-def $scc + %t42:sgpr_32 = S_NOT_B32 21, implicit-def $scc + %t44:sgpr_32 = S_NOT_B32 22, implicit-def $scc + %t46:sgpr_32 = S_NOT_B32 23, implicit-def $scc + %t48:sgpr_32 = S_NOT_B32 24, implicit-def $scc + %t50:sgpr_32 = S_NOT_B32 25, implicit-def $scc + %t52:sgpr_32 = S_NOT_B32 26, implicit-def $scc + %t54:sgpr_32 = S_NOT_B32 27, implicit-def $scc + %t56:sgpr_32 = S_NOT_B32 28, implicit-def $scc + %t58:sgpr_32 = S_NOT_B32 29, implicit-def $scc + %t60:sgpr_32 = S_NOT_B32 30, implicit-def $scc + %t62:sgpr_32 = S_NOT_B32 31, implicit-def $scc + %t64:sgpr_32 = S_NOT_B32 32, implicit-def $scc + %t66:sgpr_32 = S_NOT_B32 33, implicit-def $scc + %t68:sgpr_32 = S_NOT_B32 34, implicit-def $scc + %t70:sgpr_32 = S_NOT_B32 35, implicit-def $scc + %t72:sgpr_32 = S_NOT_B32 36, implicit-def $scc + %t74:sgpr_32 = S_NOT_B32 37, implicit-def $scc + %t76:sgpr_32 = S_NOT_B32 38, implicit-def $scc + %t78:sgpr_32 = S_NOT_B32 39, implicit-def $scc + %t80:sgpr_32 = S_NOT_B32 40, implicit-def $scc + %t82:sgpr_32 = S_NOT_B32 41, implicit-def $scc + %t84:sgpr_32 = S_NOT_B32 42, implicit-def $scc + %t86:sgpr_32 = S_NOT_B32 43, implicit-def 
$scc + %t88:sgpr_32 = S_NOT_B32 44, implicit-def $scc + %t90:sgpr_32 = S_NOT_B32 45, implicit-def $scc + %t92:sgpr_32 = S_NOT_B32 46, implicit-def $scc + %t94:sgpr_32 = S_NOT_B32 47, implicit-def $scc + %t96:sgpr_32 = S_NOT_B32 48, implicit-def $scc + %t98:sgpr_32 = S_NOT_B32 49, implicit-def $scc + %t100:sgpr_32 = S_NOT_B32 50, implicit-def $scc + %t102:sgpr_32 = S_NOT_B32 51, implicit-def $scc + %t104:sgpr_32 = S_NOT_B32 52, implicit-def $scc + %t106:sgpr_32 = S_NOT_B32 53, implicit-def $scc + %t108:sgpr_32 = S_NOT_B32 54, implicit-def $scc + %t110:sgpr_32 = S_NOT_B32 55, implicit-def $scc + %t112:sgpr_32 = S_NOT_B32 56, implicit-def $scc + %t114:sgpr_32 = S_NOT_B32 57, implicit-def $scc + %t116:sgpr_32 = S_NOT_B32 58, implicit-def $scc + %t118:sgpr_32 = S_NOT_B32 59, implicit-def $scc + %t120:sgpr_32 = S_NOT_B32 60, implicit-def $scc + %t122:sgpr_32 = S_NOT_B32 61, implicit-def $scc + %t124:sgpr_32 = S_NOT_B32 62, implicit-def $scc + %t126:sgpr_32 = S_NOT_B32 63, implicit-def $scc + %t128:sgpr_32 = S_NOT_B32 64, implicit-def $scc + %t130:sgpr_32 = S_NOT_B32 65, implicit-def $scc + %t132:sgpr_32 = S_NOT_B32 66, implicit-def $scc + %t134:sgpr_32 = S_NOT_B32 67, implicit-def $scc + %t136:sgpr_32 = S_NOT_B32 68, implicit-def $scc + %t138:sgpr_32 = S_NOT_B32 69, implicit-def $scc + %t140:sgpr_32 = S_NOT_B32 70, implicit-def $scc + %t142:sgpr_32 = S_NOT_B32 71, implicit-def $scc + %t144:sgpr_32 = S_NOT_B32 72, implicit-def $scc + %t146:sgpr_32 = S_NOT_B32 73, implicit-def $scc + %t148:sgpr_32 = S_NOT_B32 74, implicit-def $scc + %t150:sgpr_32 = S_NOT_B32 75, implicit-def $scc + %t152:sgpr_32 = S_NOT_B32 76, implicit-def $scc + %t154:sgpr_32 = S_NOT_B32 77, implicit-def $scc + %t156:sgpr_32 = S_NOT_B32 78, implicit-def $scc + %t158:sgpr_32 = S_NOT_B32 79, implicit-def $scc + %t160:sgpr_32 = S_NOT_B32 80, implicit-def $scc + %t162:sgpr_32 = S_NOT_B32 81, implicit-def $scc + %t164:sgpr_32 = S_NOT_B32 82, implicit-def $scc + %t166:sgpr_32 = S_NOT_B32 83, implicit-def $scc + %t168:sgpr_32 = S_NOT_B32 84, implicit-def $scc + %t170:sgpr_32 = S_NOT_B32 85, implicit-def $scc + %t172:sgpr_32 = S_NOT_B32 86, implicit-def $scc + %t174:sgpr_32 = S_NOT_B32 87, implicit-def $scc + %t176:sgpr_32 = S_NOT_B32 88, implicit-def $scc + %t178:sgpr_32 = S_NOT_B32 89, implicit-def $scc + %t180:sgpr_32 = S_NOT_B32 90, implicit-def $scc + %t182:sgpr_32 = S_NOT_B32 91, implicit-def $scc + %t184:sgpr_32 = S_NOT_B32 92, implicit-def $scc + %t186:sgpr_32 = S_NOT_B32 93, implicit-def $scc + %t188:sgpr_32 = S_NOT_B32 94, implicit-def $scc + %t190:sgpr_32 = S_NOT_B32 95, implicit-def $scc + %t192:sgpr_32 = S_NOT_B32 96, implicit-def $scc + %t194:sgpr_32 = S_NOT_B32 97, implicit-def $scc + %t196:sgpr_32 = S_NOT_B32 98, implicit-def $scc + %t198:sgpr_32 = S_NOT_B32 99, implicit-def $scc + %t200:sgpr_32 = S_NOT_B32 100, implicit-def $scc + %t202:sgpr_32 = S_NOT_B32 101, implicit-def $scc + %t204:sgpr_32 = S_NOT_B32 102, implicit-def $scc + %t206:sgpr_32 = S_NOT_B32 103, implicit-def $scc + %t208:sgpr_32 = S_NOT_B32 104, implicit-def $scc + %t210:sgpr_32 = S_NOT_B32 105, implicit-def $scc + %t212:sgpr_32 = S_NOT_B32 106, implicit-def $scc + %t214:sgpr_32 = S_NOT_B32 107, implicit-def $scc + %t216:sgpr_32 = S_NOT_B32 108, implicit-def $scc + %t218:sgpr_32 = S_NOT_B32 109, implicit-def $scc + %t220:sgpr_32 = S_NOT_B32 110, implicit-def $scc + %t222:sgpr_32 = S_NOT_B32 111, implicit-def $scc + %t224:sgpr_32 = S_NOT_B32 112, implicit-def $scc + %t226:sgpr_32 = S_NOT_B32 113, implicit-def $scc + %t228:sgpr_32 = S_NOT_B32 114, 
implicit-def $scc + %t230:sgpr_32 = S_NOT_B32 115, implicit-def $scc + %t232:sgpr_32 = S_NOT_B32 116, implicit-def $scc + %t234:sgpr_32 = S_NOT_B32 117, implicit-def $scc + %t236:sgpr_32 = S_NOT_B32 118, implicit-def $scc + %t238:sgpr_32 = S_NOT_B32 119, implicit-def $scc + %t240:sgpr_32 = S_NOT_B32 120, implicit-def $scc + %t242:sgpr_32 = S_NOT_B32 121, implicit-def $scc + %t244:sgpr_32 = S_NOT_B32 122, implicit-def $scc + %t246:sgpr_32 = S_NOT_B32 123, implicit-def $scc + %t248:sgpr_32 = S_NOT_B32 124, implicit-def $scc + %t250:sgpr_32 = S_NOT_B32 125, implicit-def $scc + %t252:sgpr_32 = S_NOT_B32 126, implicit-def $scc + %t254:sgpr_32 = S_NOT_B32 127, implicit-def $scc + + + ; Branch + %cmp:sreg_32_xm0 = V_CMP_GT_F32_e64 0, 0, 0, %v0.sub0, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %cmp:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + S_BRANCH %bb.2 + + bb.2: + + ; Uses + KILL %t0 + KILL %t2 + KILL %t4 + KILL %t6 + KILL %t8 + KILL %t10 + KILL %t12 + KILL %t14 + KILL %t16 + KILL %t18 + KILL %t20 + KILL %t22 + KILL %t24 + KILL %t26 + KILL %t28 + KILL %t30 + KILL %t32 + KILL %t34 + KILL %t36 + KILL %t38 + KILL %t40 + KILL %t42 + KILL %t44 + KILL %t46 + KILL %t48 + KILL %t50 + KILL %t52 + KILL %t54 + KILL %t56 + KILL %t58 + KILL %t60 + KILL %t62 + KILL %t64 + KILL %t66 + KILL %t68 + KILL %t70 + KILL %t72 + KILL %t74 + KILL %t76 + KILL %t78 + KILL %t80 + KILL %t82 + KILL %t84 + KILL %t86 + KILL %t88 + KILL %t90 + KILL %t92 + KILL %t94 + KILL %t96 + KILL %t98 + KILL %t100 + KILL %t102 + KILL %t104 + KILL %t106 + KILL %t108 + KILL %t110 + KILL %t112 + KILL %t114 + KILL %t116 + KILL %t118 + KILL %t120 + KILL %t122 + KILL %t124 + KILL %t126 + KILL %t128 + KILL %t130 + KILL %t132 + KILL %t134 + KILL %t136 + KILL %t138 + KILL %t140 + KILL %t142 + KILL %t144 + KILL %t146 + KILL %t148 + KILL %t150 + KILL %t152 + KILL %t154 + KILL %t156 + KILL %t158 + KILL %t160 + KILL %t162 + KILL %t164 + KILL %t166 + KILL %t168 + KILL %t170 + KILL %t172 + KILL %t174 + KILL %t176 + KILL %t178 + KILL %t180 + KILL %t182 + KILL %t184 + KILL %t186 + KILL %t188 + KILL %t190 + KILL %t192 + KILL %t194 + KILL %t196 + KILL %t198 + KILL %t200 + KILL %t202 + KILL %t204 + KILL %t206 + KILL %t208 + KILL %t210 + KILL %t212 + KILL %t214 + KILL %t216 + KILL %t218 + KILL %t220 + KILL %t222 + KILL %t224 + KILL %t226 + KILL %t228 + KILL %t230 + KILL %t232 + KILL %t234 + KILL %t236 + KILL %t238 + KILL %t240 + KILL %t242 + KILL %t244 + KILL %t246 + KILL %t248 + KILL %t250 + KILL %t252 + KILL %t254 + + + + ; Some uses to inflate vgpr count + KILL %v0 + KILL %v1 + KILL %v2 + KILL %v3 + S_ENDPGM 0 +... + \ No newline at end of file