From b6eb3b304f0a193a3660d921eae1401ed85ff1b2 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Mon, 17 Mar 2025 20:24:04 -0700 Subject: [PATCH 01/25] Added rematerialize pass and test. --- .../include/llvm/CodeGen/TargetRegisterInfo.h | 8 + llvm/lib/CodeGen/TargetRegisterInfo.cpp | 91 + llvm/lib/Target/AMDGPU/AMDGPU.h | 4 + .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 4665 +++++++++++++++++ llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp | 2241 ++++++++ llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h | 217 + .../AMDGPU/AMDGPUMirDivergenceAnalysis.cpp | 2767 ++++++++++ .../AMDGPU/AMDGPUMirDivergenceAnalysis.h | 281 + .../AMDGPUMirSyncDependenceAnalysis.cpp | 511 ++ .../AMDGPU/AMDGPUMirSyncDependenceAnalysis.h | 98 + .../AMDGPUOccupancyAndLatencyHelper.cpp | 188 + .../AMDGPU/AMDGPUOccupancyAndLatencyHelper.h | 74 + llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp | 1790 +++++++ llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h | 197 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 1 + llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h | 106 + llvm/lib/Target/AMDGPU/CMakeLists.txt | 6 + llvm/lib/Target/AMDGPU/GCNRegPressure.h | 4 + llvm/lib/Target/AMDGPU/SIInstrInfo.h | 3 + .../CodeGen/AMDGPU/remat/vector_to_scalar.mir | 405 ++ 20 files changed, 13657 insertions(+) create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h create mode 100644 llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index e4fad8f9ec869..974cd8a5f36b4 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -430,6 +430,14 @@ class TargetRegisterInfo : public MCRegisterInfo { LaneBitmask LaneMask, SmallVectorImpl &Indexes) const; + /// Return the set of sub register indexes that minimally cover the given + /// lane mask for the given register class. + /// + /// \returns an empty set if there is no set of covering sub registers. 
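
A standalone sketch of the greedy covering described above, with plain uint32_t masks standing in for LaneBitmask and vector positions standing in for sub-register indexes; greedyLaneCover is a hypothetical name and the snippet is illustrative only.

#include <bitset>
#include <cstdint>
#include <limits>
#include <vector>

// Greedy cover: consider only candidates that stay inside the requested
// lanes, preferring the one covering the most remaining lanes while
// overlapping the fewest lanes already covered; an empty result means the
// request cannot be spanned.
std::vector<unsigned> greedyLaneCover(uint32_t Want,
                                      const std::vector<uint32_t> &IdxMasks) {
  std::vector<unsigned> Result;
  uint32_t Remaining = Want;
  while (Remaining) {
    int Best = -1;
    int BestScore = std::numeric_limits<int>::min();
    for (unsigned I = 0; I < IdxMasks.size(); ++I) {
      uint32_t M = IdxMasks[I];
      if (M & ~Want)
        continue; // would touch lanes outside the request
      int Score = static_cast<int>(std::bitset<32>(M & Remaining).count()) -
                  static_cast<int>(std::bitset<32>(M & ~Remaining).count());
      if (Score > BestScore) {
        BestScore = Score;
        Best = static_cast<int>(I);
      }
    }
    if (Best < 0 || (IdxMasks[Best] & Remaining) == 0)
      return {}; // no progress possible: the lanes cannot be spanned
    Result.push_back(static_cast<unsigned>(Best));
    Remaining &= ~IdxMasks[Best];
  }
  return Result;
}
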
+ std::vector + getMinimalSpanningSubRegIdxSetForLaneMask(const TargetRegisterClass *RC, + LaneBitmask mask) const; + /// The lane masks returned by getSubRegIndexLaneMask() above can only be /// used to determine if sub-registers overlap - they can't be used to /// determine if a set of sub-registers completely cover another diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp index 701a9f8d72a65..d458648fd8bd8 100644 --- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -727,3 +727,94 @@ void TargetRegisterInfo::dumpReg(Register Reg, unsigned SubRegIndex, dbgs() << printReg(Reg, TRI, SubRegIndex) << "\n"; } #endif + +std::vector +TargetRegisterInfo::getMinimalSpanningSubRegIdxSetForLaneMask( + const TargetRegisterClass *RC, LaneBitmask mask) const { + // TODO: this could replace the code it was copied from in SplitKit.cpp + + // First pass: Try to find a perfectly matching subregister index. + // If none exists find the one covering the most lanemask bits. + SmallVector PossibleIndexes; + unsigned BestIdx = 0; + const LaneBitmask avoid = ~mask; + { + unsigned BestCover = 0; + for (unsigned Idx = 1, E = getNumSubRegIndices(); Idx < E; ++Idx) { + // Is this index even compatible with the given class? + if (getSubClassWithSubReg(RC, Idx) != RC) + continue; + LaneBitmask SubRegMask = getSubRegIndexLaneMask(Idx); + // Early exit if we found a perfect match. + if (SubRegMask == mask) { + BestIdx = Idx; + break; + } + + // The index must not cover any lanes outside + if ((SubRegMask & avoid).any()) + continue; + + unsigned PopCount = SubRegMask.getNumLanes(); + PossibleIndexes.push_back(Idx); + if (PopCount > BestCover) { + BestCover = PopCount; + BestIdx = Idx; + } + } + } + + // Abort if we cannot possibly implement the COPY with the given indexes. + if (BestIdx == 0) { + LLVM_DEBUG(dbgs() << "Unable to find minimal spanning sub register(s) for " + << getRegClassName(RC) << " mask " << PrintLaneMask(mask) + << '\n'); + assert(false && "Impossible to span reg class"); + return std::vector(); + } + + std::vector result; + result.push_back(BestIdx); + + // Greedy heuristic: Keep iterating keeping the best covering subreg index + // each time. + mask &= ~(getSubRegIndexLaneMask(BestIdx)); + while (mask.any()) { + BestIdx = 0; + int BestCover = std::numeric_limits::min(); + for (unsigned Idx : PossibleIndexes) { + LaneBitmask SubRegMask = getSubRegIndexLaneMask(Idx); + // Early exit if we found a perfect match. + if (SubRegMask == mask) { + BestIdx = Idx; + break; + } + + // Guaranteed above + assert((SubRegMask & avoid).none()); + + // Try to cover as much of the remaining lanes as possible but as few of + // the already covered lanes as possible. 
+ int Cover = (SubRegMask & mask).getNumLanes() - + (SubRegMask & ~mask).getNumLanes(); + if (Cover > BestCover) { + BestCover = Cover; + BestIdx = Idx; + } + } + + if (BestIdx == 0) { + LLVM_DEBUG(dbgs() << "Unable to find minimal spanning sub register(s) for " + << getRegClassName(RC) << " mask " << PrintLaneMask(mask) + << '\n'); + assert(false && "Impossible to span reg class"); + return std::vector(); + } + + result.push_back(BestIdx); + mask &= ~getSubRegIndexLaneMask(BestIdx); + } + + return result; +} + diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index f5c2b09c84806..24e9bb358d519 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -491,6 +491,10 @@ extern char &GCNRewritePartialRegUsesID; void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &); extern char &AMDGPUWaitSGPRHazardsLegacyID; +void initializeAMDGPUHotBlockRematerializePass(llvm::PassRegistry &); +FunctionPass *createAMDGPUHotBlockRematerializePass(); +extern char &AMDGPUHotBlockRematerializeID; + namespace AMDGPU { enum TargetIndex { TI_CONSTDATA_START, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp new file mode 100644 index 0000000000000..44ebaa2d51bec --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -0,0 +1,4665 @@ +//===-- AMDGPUHotBlockRematerialize.cpp - AMDGPU Hot Block Rematerialize-------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU hot block Rematerialize +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "AMDGPUMirDivergenceAnalysis.h" +#include "AMDGPUSubExpDag.h" +#include "AMDGPUVMemDegreeDAG.h" +#include "AMDGPUOccupancyAndLatencyHelper.h" +#include "GCNRegPressure.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "SIMachineFunctionInfo.h" +#include "AMDGPUMIRUtils.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/RegisterPressure.h" +#include "llvm/CodeGen/SlotIndexes.h" + +#include +#define DEBUG_TYPE "amdgpu-hot-block-remat" + +using namespace llvm; + +static cl::opt TargetOccupancy("amdgpu-remat-target-occupancy"); +static cl::opt EnableAggressive("amdgpu-remat-enable-hot-block-remat-aggressive"); +static cl::opt EnableSubExpAggressive("amdgpu-remat-enable-sub-exp-remat-aggressive"); +static cl::opt EnableSubExpClone("amdgpu-remat-enable-sub-exp-remat-clone"); +static cl::opt EnableVmemDegree("amdgpu-remat-enable-vmem-degree"); +static cl::opt EnableInBlockRemat("amdgpu-remat-enable-in-blk-remat"); +static cl::opt EnableSubExp("amdgpu-remat-enable-sub-exp-remat"); +static cl::opt EnableUniformVectorToScalar("amdgpu-remat-enable-late-float-vtos"); +static cl::opt EnableSubExpMinReg("amdgpu-remat-enable-sub-exp-remat-min-reg"); + +namespace { +typedef DenseSet InstSet; +typedef DenseSet BlockSet; +template +using BlockMap = MapVector; + +// Rematerialize 
in a single pass instead of doing in register allcation. +// If in register allocation, fail to rematerialize will cause spill. +class AMDGPUHotBlockRematerialize : public MachineFunctionPass { + +public: + static char ID; + + DenseSet TotalUniformInsts; + DenseSet SafeToRemoveInsts; + DenseSet DivergentInsts; + void RemoveInst(const MachineInstr *MI) { + TotalUniformInsts.erase(MI); + SafeToRemoveInsts.erase(MI); + DivergentInsts.erase(MI); + } + + AMDGPUHotBlockRematerialize() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "AMDGPU rematerialize"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +typedef AMDGPUHotBlockRematerialize Remat; + +} // end anonymous namespace + +// Util functions. +namespace { + +MachineBasicBlock * +nearest_common_dominator(MachineDominatorTree *DT, + BlockSet &Blocks) { + auto I = Blocks.begin(), E = Blocks.end(); + + MachineBasicBlock *DomB = cast(*(I++)); + while (I != E) { + MachineBasicBlock *B = cast(*(I++)); + DomB = DT->findNearestCommonDominator(DomB, B); + if (DomB == nullptr) + return nullptr; + } + // For split block like: + // bb.42: + // %632.sub2:vreg_128 = V_MOV_B32_e32 %717.sub2:vreg_128, implicit $exec, + // // implicit $exec + // %130:sreg_64 = S_AND_SAVEEXEC_B64 %533:sreg_64, implicitdef $exec, + // implicitdef $scc, implicit $exec + // + // bb.68: + //; predecessors: %bb.42 + // successors: %bb.45(0x40000000), %bb.43(0x40000000); %bb.45(50.00%), + // %bb.43(50.00%) + // + // SI_MASK_BRANCH %bb.43, implicit $exec + // S_BRANCH %bb.45 + // which is from + // bb.42: + //%129:vgpr_32 = V_MOV_B32_e32 killed %548:vgpr_32, implicit $exec, implicit + //$exec %130:sreg_64 = S_AND_SAVEEXEC_B64 %533:sreg_64, implicitdef $exec, + // SI_MASK_BRANCH %bb.43, implicit $exec + // S_BRANCH %bb.45 + // The real common dom is bb.42. + // TODO: use _term version of exec update instructions so don't need this + // anymore. + if (DomB && DomB->pred_size() == 1 && !DomB->empty()) { + // Upstreaming note: This used to be SI_MASK_BRANCH + if (DomB->begin()->getOpcode() == AMDGPU::S_CBRANCH_EXECZ) { + MachineBasicBlock *Pred = *DomB->pred_begin(); + if (Pred->succ_size() == 1 && + (Pred->empty() || !Pred->back().isBranch())) { + DomB = Pred; + } + } + } + + return DomB; +} + +MachineBasicBlock *find_non_loop_dominator(MachineBasicBlock *BB, + MachineDominatorTree *DT, + MachineLoopInfo *LI) { + while (LI->getLoopDepth(BB) > 0) { + MachineDomTreeNode *N = DT->getNode(BB); + if (N == nullptr) + return nullptr; + MachineDomTreeNode *IDom = N->getIDom(); + if (IDom == nullptr) + return nullptr; + + BB = IDom->getBlock(); + } + + return BB; +} + +MachineBasicBlock * +FindInsertBlock(MachineInstr &DefMI, unsigned Reg, MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, MachineLoopInfo *MLI, + const MachineRegisterInfo &MRI, bool bMemBound) { + + BlockSet BBSet; + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + BBSet.insert(UseMI.getParent()); + } + if (BBSet.size() == 0) + return nullptr; + + MachineBasicBlock *BB = *BBSet.begin(); + if (BBSet.size() > 1) { + MachineBasicBlock *BDom = nearest_common_dominator(DT, BBSet); + if (!BDom) + return nullptr; + BB = BDom; + } + // Try to find non loop dominator. 
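
A condensed sketch of the insert-block choice FindInsertBlock makes here, assuming the template parameters stand in for MachineBasicBlock, MachineDominatorTree and MachineLoopInfo; chooseInsertBlock is a hypothetical name. The idea: take the nearest common dominator of all use blocks, then walk up the dominator tree until outside any loop, unless the function is memory bound and staying put is acceptable.

#include <vector>

template <typename BlockT, typename DomTreeT, typename LoopInfoT>
BlockT *chooseInsertBlock(const std::vector<BlockT *> &UseBlocks, DomTreeT &DT,
                          LoopInfoT &LI, bool MemBound) {
  if (UseBlocks.empty())
    return nullptr;
  // Join point of all uses.
  BlockT *BB = UseBlocks.front();
  for (BlockT *Other : UseBlocks) {
    BB = DT.findNearestCommonDominator(BB, Other);
    if (!BB)
      return nullptr;
  }
  // Hoist out of loops unless the function is memory bound, where the extra
  // latency of leaving the def in place is acceptable.
  while (!MemBound && LI.getLoopDepth(BB) > 0) {
    auto *IDom = DT.getNode(BB)->getIDom();
    if (!IDom)
      return nullptr;
    BB = IDom->getBlock();
  }
  return BB;
}
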
+ if (!bMemBound) { + BB = find_non_loop_dominator(BB, DT, MLI); + } + if (!BB) + return nullptr; + + // If BB is already a hot block, move to BB will not help. + // hotBlockRemat will fail it when process BB. + + // Must reachable from DefMI. + if (!llvm::reach_block(DefMI.getParent(), DT, PDT, MLI, BB)) + return nullptr; + + return BB; +} + +bool IsSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) { + unsigned OpNum = DefMI->getNumOperands(); + + // Only move DefMI which all operand is unique def. + for (unsigned i = 0; i < OpNum; i++) { + MachineOperand &Op = DefMI->getOperand(i); + if (!Op.isReg()) + continue; + if (!MRI.getUniqueVRegDef(Op.getReg()) && + !llvm::IsSub0Sub1SingleDef(Op.getReg(), MRI)) { + return false; + } + } + return true; +} + + +// SGPR has alignment requirment, cannot get accurate reg number. +const unsigned NearTargetRegLimit = 10; +bool nearSgprSpill(unsigned maxSPressure, const GCNSubtarget *ST, MachineFunction &MF) { + unsigned maxSGPR = ST->getAddressableNumSGPRs(); + const SIMachineFunctionInfo *MFI = MF.getInfo(); + unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); + if (ScratchRSrcReg) + maxSGPR -= 4; + + const unsigned AlignmentDelta = 3; + maxSGPR -= AlignmentDelta; + + return maxSPressure > maxSGPR; +} + +struct RematStatus { + unsigned TargetOcc; + unsigned TargetVLimit; + unsigned TargetSLimit; + unsigned MaxVPressure; + unsigned MaxSPressure; + unsigned InputPhysicalVPressure; + unsigned InputPhysicalSPressure; + // More occupancy can help more than latency cost to reach it. + bool bMemBound; + // abs(VTargetOcc-STargetOcc) > 1. + bool bNotBalance; + DenseMap MBBPressureMap; + DenseMap MBBInputLiveMap; + DenseMap MBBOutputLiveMap; + // Collect MBBs which has memory write. When move instructions cross MBB, skip + // mem inst if the MBB has memory write. To make things fast, just check + // mayStore and isBarrier. + DenseSet MemWriteMBBSet; +}; + +unsigned CollectMBBPressure( + MachineBasicBlock &MBB, LiveIntervals *LIS, const MachineRegisterInfo &MRI, + const GCNSubtarget *ST, unsigned &maxVPressure, unsigned &maxSPressure, + RematStatus &status) { + // Skip processing current block if it has only debug instructions + if (MBB.getFirstNonDebugInstr() == MBB.end()) + return ST->getOccupancyWithNumVGPRs(0); + auto BBEnd = MBB.rbegin(); + GCNUpwardRPTracker RPTracker(*LIS); + // R.End doesn't point to the boundary instruction. + // Skip Debug instr. 
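
A minimal restatement of the nearSgprSpill heuristic defined above, with the subtarget numbers passed in explicitly instead of queried from GCNSubtarget; nearSgprSpillSketch is a hypothetical name.

// Illustrative sketch: the addressable SGPR budget is reduced by the
// 4-register scratch descriptor (when present) and a small alignment slack
// before comparing against the peak SGPR pressure.
bool nearSgprSpillSketch(unsigned MaxSPressure, unsigned AddressableSGPRs,
                         bool HasScratchRSrcReg) {
  unsigned Budget = AddressableSGPRs;
  if (HasScratchRSrcReg)
    Budget -= 4;                     // scratch resource descriptor uses 4 SGPRs
  const unsigned AlignmentDelta = 3; // slack for SGPR tuple alignment
  Budget -= AlignmentDelta;
  return MaxSPressure > Budget;      // "near spill" once the peak exceeds the budget
}
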
+ if (!llvm::GetNonDebugMBBEnd(BBEnd, MBB)) + return ST->getOccupancyWithNumVGPRs(0); + + GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[&MBB]; + RPTracker.reset(*BBEnd, &outputLive, true); + + for (auto I = MBB.rbegin(), B = MBB.rend(); I != B;) { + MachineInstr &MI = (*I++); + RPTracker.recede(MI); + if (MI.mayStore() || (MI.isBarrier() && MI.getOpcode() != AMDGPU::S_BRANCH)) + status.MemWriteMBBSet.insert(&MBB); + } + + GCNRegPressure RP = RPTracker.getMaxPressureAndReset(); + unsigned sPressure = RP.getMaxSGPR(); + if (sPressure > maxSPressure) { + maxSPressure = sPressure; + } + if (RP.getVGPRNum(ST->hasGFX90AInsts()) > maxVPressure) { + maxVPressure = RP.getVGPRNum(ST->hasGFX90AInsts()); + } + status.MBBPressureMap[&MBB] = RP; + return RP.getOccupancy(*ST); +} + +unsigned CollectFnPressure( + MachineFunction &MF, LiveIntervals *LIS, const MachineRegisterInfo &MRI, + const GCNSubtarget *ST, unsigned &maxVPressure, unsigned &maxSPressure, + RematStatus &status) { + unsigned TgtOcc = ST->getOccupancyWithLocalMemSize(MF); + // If only have one block, input/ouput virtual live set are empty. + if (MF.size() > 1) { + // Build input output live reg first. + auto *SlotIndexes = LIS->getSlotIndexes(); + DenseMap MBBInputSlotMap; + DenseMap MBBOutputSlotMap; + for (MachineBasicBlock &MBB : MF) { + auto BBBegin = MBB.getFirstNonDebugInstr(); + if (BBBegin != MBB.end()) { + auto SI = SlotIndexes->getInstructionIndex(*BBBegin); + MBBInputSlotMap[&MBB] = SI; + } + + auto BBEnd = MBB.rbegin(); + + // R.End doesn't point to the boundary instruction. + // Skip Debug instr. + if (llvm::GetNonDebugMBBEnd(BBEnd, MBB)) { + auto SI = SlotIndexes->getInstructionIndex(*BBEnd); + MBBOutputSlotMap[&MBB] = SI; + } + } + + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + auto Reg = Register::index2VirtReg(I); + if (!LIS->hasInterval(Reg)) + continue; + + LaneBitmask LiveMask; + const auto &LI = LIS->getInterval(Reg); + + // Skip local live interval to make live input/ouput faster. 
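
A rough standalone illustration of what "local" means in this filter, with Segment and blockOfSlot as hypothetical stand-ins for LiveRange segments and SlotIndexes lookups: an interval whose every segment starts and ends in the same block can never be live across a block boundary, so it contributes nothing to the per-block input/output live sets.

#include <vector>

struct Segment { unsigned StartSlot, EndSlot; }; // stand-in for LiveRange::Segment

// Illustrative sketch: true when the whole live range sits inside one block.
bool isLocalLiveIntervalSketch(const std::vector<Segment> &Segments,
                               unsigned (*blockOfSlot)(unsigned)) {
  if (Segments.empty())
    return true;
  unsigned BB = blockOfSlot(Segments.front().StartSlot);
  for (const Segment &S : Segments)
    if (blockOfSlot(S.StartSlot) != BB || blockOfSlot(S.EndSlot) != BB)
      return false; // crosses a block boundary: not local
  return true;
}
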
+ if (llvm::isLocalLiveInterval(LI, SlotIndexes)) + continue; + + for (auto inputIt : MBBInputSlotMap) { + MachineBasicBlock *MBB = inputIt.first; + auto SI = inputIt.second; + + auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI); + if (LiveMask.any()) + status.MBBInputLiveMap[MBB][Reg] |= LiveMask; + } + + for (auto outputIt : MBBOutputSlotMap) { + MachineBasicBlock *MBB = outputIt.first; + auto SI = outputIt.second; + + auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI); + if (LiveMask.any()) + status.MBBOutputLiveMap[MBB][Reg] |= LiveMask; + } + } + } + + LLVM_DEBUG( + const SIRegisterInfo *SIRI = ST->getRegisterInfo(); + dbgs() << "output live"; for (auto &it + : status.MBBOutputLiveMap) { + unsigned Idx = it.first->getNumber(); + auto LiveReg = it.second; + dbgs() << "MBB" << Idx << ":"; + llvm::dumpLiveSet(LiveReg, SIRI); + } dbgs() << "input live"; + for (auto &it + : status.MBBInputLiveMap) { + unsigned Idx = it.first->getNumber(); + auto LiveReg = it.second; + dbgs() << "MBB" << Idx << ":"; + llvm::dumpLiveSet(LiveReg, SIRI); + }); + + for (auto it = MF.begin(); it != MF.end(); ++it) { + MachineBasicBlock &MBB = *it; + unsigned Occ = CollectMBBPressure(MBB, LIS, MRI, ST, maxVPressure, + maxSPressure, status); + if (TgtOcc > Occ) + TgtOcc = Occ; + } + return TgtOcc; +} +RematStatus +GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS, + const MachineRegisterInfo &MRI, const GCNSubtarget *ST) { + unsigned maxSPressure = 0; + unsigned maxVPressure = 0; + RematStatus status; + unsigned TgtOcc = CollectFnPressure(MF, LIS, MRI, ST, maxVPressure, + maxSPressure, status); + const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second; + if (TgtOcc >= MaxOcc) { + status.TargetOcc = TgtOcc; + status.TargetVLimit = 0; + status.TargetSLimit = 0; + status.MaxVPressure = 0; + status.MaxSPressure = 0; + status.InputPhysicalVPressure = 0; + status.InputPhysicalSPressure = 0; + status.bMemBound = false; + status.bNotBalance = false; + return status; + } + + maxSPressure += RegForVCC; + maxVPressure = std::min(maxVPressure, ST->getMaxNumVGPRs(MF)); + unsigned STgtOcc = ST->getOccupancyWithNumSGPRs(maxSPressure); + unsigned VTgtOcc = ST->getOccupancyWithNumVGPRs(maxVPressure); + + llvm::SchedScore totalScore = llvm::CollectLatency(MF, *ST, MLI); + bool bMemBound = + totalScore.isMemBound(TgtOcc, std::max(STgtOcc, VTgtOcc) - TgtOcc); + + bool bNotBalance = false; + + const unsigned MaxOccupancy = ST->AMDGPUSubtarget::getMaxWavesPerEU(); + // Currently, only sgpr bound can be fixed with remat. + if (STgtOcc < VTgtOcc) { + unsigned bigOcc = std::max(STgtOcc, VTgtOcc); + // Change TgtOcc to bigOcc in case sgpr and vgpr is not balance. + if (bigOcc > TgtOcc) { + TgtOcc = bigOcc; + bNotBalance = true; + if (TgtOcc >= MaxOccupancy) + TgtOcc = MaxOccupancy-1; + } + } + + // Collect input physical pressure. + const SIRegisterInfo *SIRI = ST->getRegisterInfo(); + + unsigned vInputPressure = 0; + uint64_t sInputMask = 0; + for (const auto &livein : MRI.liveins()) { + const Register Reg = livein.first; + const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg); + assert(Reg.isPhysical() && "input must be physical reg"); + unsigned RegSize = RC->getLaneMask().getNumLanes(); + if (SIRI->isVGPR(MRI, Reg)) { + vInputPressure += RegSize; + } else { + unsigned RegIndex = SIRI->getHWRegIndex(Reg); + uint64_t mask = ((1 << RegSize) - 1 ) << RegIndex; + sInputMask |= mask; + } + } + // SGPR need to align to 4 for the 4dowrd/8dword descriptors which cause high + // pressure. 
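
A worked standalone version of the 4-register-group counting that follows; countAlignedSgprInputs is a hypothetical name and the example mask is illustrative.

#include <cstdint>

// Illustrative sketch: every 4-register group touched by a live input SGPR
// costs the full 4 registers, because descriptor operands are 4/8-dword
// aligned.
unsigned countAlignedSgprInputs(uint64_t LiveSgprMask) {
  unsigned Pressure = 0;
  for (uint64_t Group = 0xfull; Group != 0; Group <<= 4)
    if (Group & LiveSgprMask)
      Pressure += 4; // whole group counts as occupied
  return Pressure;
}
// e.g. countAlignedSgprInputs(0b100011) == 8: SGPR0-1 and SGPR5 touch the
// groups 0-3 and 4-7, so both groups are charged.
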
+ unsigned sInputPressure = 0; + uint64_t mask = 0xf; + while (mask != 0) { + if (mask & sInputMask) { + sInputPressure += 4; + } + mask = mask << 4; + } + + + // If balanced, try next occupancy. + TgtOcc = bNotBalance ? TgtOcc : (TgtOcc + 1); + + auto CC = MF.getFunction().getCallingConv(); + bool IsPsCs = CC == CallingConv::AMDGPU_CS || CC == CallingConv::AMDGPU_PS; + // For shader profiles other than ps/cs, set target profile max as 4. + if (!IsPsCs) { + TgtOcc = TgtOcc > 4 ? 4 : TgtOcc; + } + if (TargetOccupancy) + TgtOcc = TargetOccupancy; + + unsigned SLimit = ST->getMaxNumSGPRs(TgtOcc, true); + unsigned VLimit = ST->getMaxNumVGPRs(TgtOcc); + + status.TargetOcc = TgtOcc; + status.TargetVLimit = VLimit; + status.TargetSLimit = SLimit; + status.MaxVPressure = maxVPressure; + status.MaxSPressure = maxSPressure; + status.InputPhysicalVPressure = vInputPressure; + status.InputPhysicalSPressure = sInputPressure; + status.bMemBound = bMemBound; + status.bNotBalance = bNotBalance; + return status; +} + +} // namespace + +// Remat. +namespace { + +struct RematNode { + enum class RematKind { + Candidate, // Not ready yet. + OneDefOneUse, + Clone, + }; + RematNode() + : Reg(0), DefMI(nullptr), Kind(RematKind::Candidate), + InsertPointMI(nullptr), InsertBlock(nullptr), Size(0) {} + RematNode(unsigned R, MachineInstr *MI, unsigned S) + : Reg(R), DefMI(MI), Kind(RematKind::Candidate), InsertPointMI(nullptr), + InsertBlock(nullptr), Size(S) {} + RematNode(const RematNode &N) + : Reg(N.Reg), DefMI(N.DefMI), Kind(N.Kind), + InsertPointMI(N.InsertPointMI), InsertBlock(N.InsertBlock), + Size(N.Size) {} + unsigned Reg; + MachineInstr *DefMI; + MachineBasicBlock *InsertBlock; + union { + MachineInstr *InsertPointMI; + unsigned UserCount; + }; + RematKind Kind; + unsigned Size; +}; + +struct BlockLiveInfo { + MachineBasicBlock *BB; + unsigned maxSReg; + unsigned maxVReg; + // Input live is the live reg which cross block. + const GCNRPTracker::LiveRegSet inputLive; +}; + +// Skip live reg remated to other block. +void UpdateLiveInfo(MapVector &RematMap, + GCNRPTracker::LiveRegSet &LiveSet, + const GCNRPTracker::LiveRegSet &inputLive, + MachineBasicBlock *CurBB, + DenseMap &RPOTIndexMap) { + for (auto &it : RematMap) { + unsigned Reg = it.first; + // Skip reg not in live set. + if (!LiveSet.count(Reg)) + continue; + // Skip reg already in input set. + // Input set will be taken care in GetReducedSize. + if (inputLive.count(Reg)) + continue; + + auto &Node = it.second; + if (Node.Kind == RematNode::RematKind::OneDefOneUse) { + MachineBasicBlock *InsertBB = Node.InsertBlock; + // If LiveInfo.BB is after InsertBB in Reverse post order, the def is + // still before LiveInfo.BB, it is still live. + unsigned LiveBBIndex = RPOTIndexMap[CurBB]; + unsigned InsertBBIndex = RPOTIndexMap[InsertBB]; + if (LiveBBIndex > InsertBBIndex) { + continue; + } + } + // Already in remat map, don't need to check again, remove from + // candidate. + LiveSet.erase(Reg); + } +} + +int GetSharedReducedSize(InstSet &ReducedInsts, bool bVGPR, + const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI) { + + // Find shared operand in ReducedInsts. 
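
A minimal sketch of the lane-mask bookkeeping GetSharedReducedSize performs below, using plain uint32_t masks in place of LaneBitmask; addOperandAndCountShared is a hypothetical name. Each register operand of a reduced def records its lanes, and lanes already recorded for the same register by another candidate count as shared, so their size can be credited back.

#include <bitset>
#include <cstdint>
#include <map>

// Illustrative sketch: returns how many lanes of (Reg, LaneMask) were already
// recorded by an earlier candidate operand, then merges the mask.
int addOperandAndCountShared(std::map<unsigned, uint32_t> &SeenLanes,
                             unsigned Reg, uint32_t LaneMask) {
  uint32_t &Prev = SeenLanes[Reg];
  int Shared = static_cast<int>(std::bitset<32>(Prev & LaneMask).count());
  Prev |= LaneMask;
  return Shared;
}
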
+ int SharedSize = 0; + DenseMap SharedRegMaskMap; + for (MachineInstr *DefMI : ReducedInsts) { + for (MachineOperand &MO : DefMI->operands()) { + if (MO.isImm()) + continue; + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + if (MO.isTied()) + continue; + Register Reg = MO.getReg(); + + if (Reg == AMDGPU::EXEC) + continue; + if (!Reg.isVirtual()) + continue; + + bool isVGPR = SIRI->isVGPR(MRI, MO.getReg()); + if (bVGPR != isVGPR) { + // Not support mix of v and s when remat now. + continue; + } + + const TargetRegisterClass *OpRC = MRI.getRegClass(Reg); + int MOSize = SIRI->getRegSizeInBits(*OpRC) >> 5; + unsigned Mask; + if (unsigned SubIdx = MO.getSubReg()) { + OpRC = SIRI->getSubRegisterClass(OpRC, SubIdx); + int SubMOSize = SIRI->getRegSizeInBits(*OpRC) >> 5; + Mask = (1 << SubMOSize) - 1; + } else { + Mask = (1 << MOSize) - 1; + } + auto SharedRegIt = SharedRegMaskMap.find(Reg); + if (SharedRegIt == SharedRegMaskMap.end()) { + SharedRegMaskMap[Reg] = LaneBitmask(Mask); + } else { + unsigned PrevMask = SharedRegIt->second.getAsInteger(); + if (unsigned SharedMask = (PrevMask & Mask)) { + // Some thing is shared. + for (int i = 0; i < MOSize; i++) { + if (SharedMask & (1 << i)) { + SharedSize += 1; + } + } + } + LaneBitmask MoMask = LaneBitmask(Mask | PrevMask); + SharedRegMaskMap[Reg] = MoMask; + } + } + } + return SharedSize; +} + +int GetReducedSize(MapVector &RematMap, bool bVGPR, + GCNRPTracker::LiveRegSet &CanidateSet, + InstSet &ReducedInsts, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + BlockLiveInfo &LiveInfo, + DenseMap &RPOTIndexMap) { + int ReducedSize = 0; + for (auto &it : RematMap) { + unsigned Reg = it.first; + + if (!CanidateSet.count(Reg)) + continue; + + bool bReduced = false; + auto &Node = it.second; + if (Node.Kind == RematNode::RematKind::OneDefOneUse) { + MachineBasicBlock *InsertBB = Node.InsertBlock; + // If LiveInfo.BB is before InsertBB in Reverse post order, the def is + // moved after LiveInfo.BB, it is not live anymore. + unsigned LiveBBIndex = RPOTIndexMap[LiveInfo.BB]; + unsigned InsertBBIndex = RPOTIndexMap[InsertBB]; + if (LiveBBIndex < InsertBBIndex) + bReduced = true; + } else { + // Clone. + bReduced = true; + // If has use in LiveInfo.BB, could not reduce from input live. + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + if (UseMI.getParent() == LiveInfo.BB) { + bReduced = false; + break; + } + } + } + if (bReduced) { + ReducedSize += Node.Size; + ReducedInsts.insert(Node.DefMI); + } + + // Already in remat map, don't need to check again, remove from candidate. + CanidateSet.erase(Reg); + } + + return ReducedSize; +} + +int RematGain(MachineInstr *DefMI, unsigned Reg, + GCNRPTracker::LiveRegSet &CandidateRegSet, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + bool bVGPR) { + int rematSize = SIRI->getRegSizeInBits(*MRI.getRegClass(Reg)); + for (MachineOperand &MO : DefMI->operands()) { + if (MO.isImm()) + continue; + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + if (MO.isTied()) + continue; + + if (MO.getReg() == AMDGPU::EXEC) + continue; + + // Don't move user of VCC. + if (MO.getReg() == AMDGPU::VCC) { + rematSize = 0; + break; + } + Register Reg = MO.getReg(); + + // Don't move physical register use. + if (Reg.isPhysical()) { + rematSize = 0; + break; + } + + bool isVGPR = SIRI->isVGPR(MRI, Reg); + if (bVGPR != isVGPR) { + // Not support mix of v and s when remat now. + // TODO: count possible pressure change here. 
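
A simplified standalone sketch of the gain model in RematGain, covering only register-use operands and omitting the aggressive-mode handling of multi-use inputs; OperandInfo and rematGainSketch are hypothetical names. The saving starts at the def's size, each single-def input is charged back because it becomes live across the hot region instead, and any unsafe operand zeroes the gain.

#include <vector>

struct OperandInfo {
  unsigned SizeInBits; // size of the (sub)register the operand reads
  bool SingleDef;      // operand has a unique def
  bool Unsafe;         // physical reg, VCC use, mixed VGPR/SGPR, ...
};

// Illustrative sketch; gain is in bits, callers divide by 32 (>> 5) to get
// dwords, as the pass does above.
int rematGainSketch(unsigned DefSizeInBits, const std::vector<OperandInfo> &Ops) {
  int Gain = static_cast<int>(DefSizeInBits);
  for (const OperandInfo &Op : Ops) {
    if (Op.Unsafe || !Op.SingleDef)
      return 0; // cannot profitably remat this def
    if (Gain <= static_cast<int>(Op.SizeInBits))
      return 0; // input as large as what remains: no net win
    Gain -= static_cast<int>(Op.SizeInBits);
  }
  return Gain;
}
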
+ rematSize = 0; + break; + } + bool bSingleDef = MRI.hasOneDef(Reg); + if (!bSingleDef) { + bSingleDef = llvm::IsSub0Sub1SingleDef(Reg, MRI); + } + + if (bSingleDef) { + // The reg might share with other candidates, but not check it here. + // Count share reg in GetReducedSize. + if (EnableAggressive) { + // In case of aggressive remat, treat multi use reg as shared reg and + // ignore size of shared reg. + if (!MRI.hasOneNonDBGUse(Reg)) + continue; + } + const TargetRegisterClass *OpRC = MRI.getRegClass(Reg); + if (unsigned SubIdx = MO.getSubReg()) { + if (OpRC) + OpRC = SIRI->getSubRegisterClass(OpRC, SubIdx); + } + int inputSize = SIRI->getRegSizeInBits(*OpRC); + // If input not live in hotspot, move it cross hotspot should have + // less reg then DefMi. + if (rematSize > inputSize) { + rematSize -= inputSize; + continue; + } + } + + rematSize = 0; + break; + } + return rematSize; +} + +void BuildRematCandiates(std::vector &Candidates, + GCNRPTracker::LiveRegSet &CandidateRegSet, + DenseSet &PinnedRegSet, + const MachineRegisterInfo &MRI, + const SIInstrInfo *SIII, const SIRegisterInfo *SIRI, + bool bVGPR) { + + for (auto liveRegIt : CandidateRegSet) { + unsigned Reg = liveRegIt.first; + // Skip unsafe reg. + if (PinnedRegSet.count(Reg)) + continue; + + bool isVGPR = SIRI->isVGPR(MRI, Reg); + if (isVGPR != bVGPR) + continue; + bool bSafeCandidate = true; + MachineInstr *MI = MRI.getUniqueVRegDef(Reg); + if (MI) { + if (bVGPR) { + // Only remat valu now. + if (!SIII->isVALU(MI->getOpcode()) && MI->getOpcode() != AMDGPU::COPY) + bSafeCandidate = false; + if (MI->getOpcode() == AMDGPU::COPY) { + // Make sure src is unique define. + if (MI->getOperand(1).isReg() && + nullptr == MRI.getUniqueVRegDef(MI->getOperand(1).getReg())) + bSafeCandidate = false; + } else { + // Skip convergent valu. + if (MI->isConvergent()) + bSafeCandidate = false; + } + } + // Skip inst has more than 1 def. + if (MI->getDesc().NumDefs > 1) + bSafeCandidate = false; + } else { + bSafeCandidate = false; + } + + if (bSafeCandidate) { + int gain = RematGain(MI, Reg, CandidateRegSet, MRI, SIRI, bVGPR); + if (gain > 0) { + Candidates.emplace_back(RematNode(Reg, MI, gain >> 5)); + } else { + bSafeCandidate = false; + } + } + // Save unsafe reg. + if (!bSafeCandidate) + PinnedRegSet.insert(Reg); + } + + // Sort by gain. + std::sort(Candidates.begin(), Candidates.end(), + [](RematNode &i, RematNode &j) { return i.Size > j.Size; }); +} + +// For case like +// %477:sreg_32_xm0 = S_AND_B32 %472.sub0:sreg_64_xexec, %304:sreg_32_xm0, implicit-def dead $scc; xb.uniform +// S_CMP_EQ_U32 %302:sreg_32_xm0, %475:sreg_32_xm0, implicit-def $scc; xb.uniform +// %2489:sreg_32_xm0 = S_CSELECT_B32 %477:sreg_32_xm0, 16, implicit killed $scc; xb.uniform +// Sink S_AND right before S_CSELECT will overwrite SCC. +// To avoid it, skip case when DefMI and UseMI has implicit define use. 
+bool isImplicitDefUse(MachineInstr *DefMI, MachineInstr *UseMI) { + if (DefMI->getDesc().NumImplicitDefs == 0) + return false; + + auto *TRI = DefMI->getMF()->getSubtarget().getRegisterInfo(); + for (MachineOperand &def : DefMI->implicit_operands()) { + if (!def.isReg()) + continue; + if (def.isUse()) + continue; + unsigned Reg = def.getReg(); + if (UseMI->readsRegister(Reg, TRI)) + return true; + } + return false; +} + +void AddOneDefOneUseCandidate(RematNode &Node, + std::vector &RematList, + MachineRegisterInfo &MRI, int &rematCnt, + MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, + MachineLoopInfo *MLI, bool bVGPR, + bool bMemBound) { + unsigned Reg = Node.Reg; + MachineInstr *DefMI = Node.DefMI; + + unsigned size = Node.Size; + MachineInstr *UseMI = &*MRI.use_nodbg_instructions(Reg).begin(); + MachineBasicBlock *InsertBB = UseMI->getParent(); + + // For VGPR, always move next to the only user to avoid wqm or exec issue. + // But doing this will cause issue when DefMI is in wqm but single user not in + // wqm. Disable VGPR remat for now. + // TODO: make sure single user don't need wqm. + if (!bVGPR) { + if (MachineBasicBlock *NewInsertBB = + FindInsertBlock(*DefMI, Reg, DT, PDT, MLI, MRI, bMemBound)) { + if (InsertBB != NewInsertBB) { + InsertBB = NewInsertBB; + // If can find a non-loop insert block, go to the insert block. + if (DefMI->getParent() != InsertBB) { + if (!InsertBB->empty()) { + auto it = InsertBB->getFirstNonPHI(); + it = skipDebugInstructionsForward(it, InsertBB->end()); + if (it == InsertBB->end()) + UseMI = nullptr; + else + UseMI = &*it; + } + } + } + } + } + + if (bVGPR) { + // Don't count reg in same block for valu. + if (UseMI->getParent() == DefMI->getParent()) + return; + } + + // Skip case when DefMI has implicit define which used by UseMI. + if (isImplicitDefUse(DefMI, UseMI)) { + return; + } + + Node.InsertBlock = InsertBB; + Node.InsertPointMI = UseMI; + Node.Kind = RematNode::RematKind::OneDefOneUse; + RematList.emplace_back(Node); + rematCnt += size; +} + +void AddCloneCandidate(std::vector &cloneList, + std::vector &RematList, + DenseSet &PinnedRegSet, + MachineRegisterInfo &MRI, int &rematCnt, + SlotIndexes *SlotIndexes, MachineFunction &MF) { + // Group user in same blocks. + std::vector UserSetList(cloneList.size()); + + for (int i = 0; i < cloneList.size(); i++) { + auto *Node = cloneList[i]; + unsigned Reg = Node->Reg; + MachineInstr *DefMI = Node->DefMI; + // Group user in same blocks. + BlockSet &UserSet = UserSetList[i]; + + for (auto useIt = MRI.use_instr_nodbg_begin(Reg); + useIt != MRI.use_instr_nodbg_end();) { + MachineInstr &UseMI = *(useIt++); + UserSet.insert(UseMI.getParent()); + } + + if (UserSet.size() == 1) { + // All users are in same block with DefMI. + if (*UserSet.begin() == DefMI->getParent()) { + // Mark cannot remat for now. + // TODO: try to split if is bigger than 4 and only used once per + // channel. + PinnedRegSet.insert(Reg); + continue; + } + } + + int size = Node->Size; + size <<= 16; + // Pack userSet size to size. + size |= UserSet.size(); + Node->UserCount = size; + } + + std::sort(cloneList.begin(), cloneList.end(), + // Sort based on userSet size. 
+ [](const RematNode *a, const RematNode *b) { + static constexpr int mask = 0xffff; + return (a->UserCount & mask) < (b->UserCount & mask); + }); + + for (RematNode *Node : cloneList) { + Node->Kind = RematNode::RematKind::Clone; + RematList.emplace_back(*Node); + rematCnt += Node->Size; + } +} + +int FilterRematCandiates(std::vector &Candidates, + std::vector &RematList, + DenseSet &PinnedRegSet, + MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, MachineLoopInfo *MLI, + MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + MachineFunction &MF, SlotIndexes *SlotIndexes, + bool bVGPR, bool bMemBound) { + int rematCnt = 0; + // Work one def one use first. + for (auto &Node : Candidates) { + unsigned Reg = Node.Reg; + if (!MRI.hasOneNonDBGUse(Reg)) { + continue; + } + MachineInstr *DefMI = Node.DefMI; + if (!IsSafeToMove(DefMI, MRI)) { + PinnedRegSet.insert(Reg); + continue; + } + + AddOneDefOneUseCandidate(Node, RematList, MRI, rematCnt, DT, PDT, MLI, + bVGPR, bMemBound); + } + + if (!bVGPR) { + std::vector cloneList; + // Try multi use case. + for (auto &Node : Candidates) { + unsigned Reg = Node.Reg; + if (MRI.hasOneNonDBGUse(Reg)) { + continue; + } + MachineInstr *DefMI = Node.DefMI; + if (!IsSafeToMove(DefMI, MRI)) { + PinnedRegSet.insert(Reg); + continue; + } + + // Clone for each user. + cloneList.emplace_back(&Node); + } + + AddCloneCandidate(cloneList, RematList, PinnedRegSet, MRI, rematCnt, + SlotIndexes, MF); + } + + return rematCnt; +} + +void updateUsers(unsigned Reg, unsigned NewReg, bool bSubRegDef, + SmallVector &userMIs) { + for (MachineInstr *UseMI : userMIs) { + for (MachineOperand &MO : UseMI->operands()) { + if (!MO.isReg()) + continue; + if (MO.getReg() == Reg) { + MO.setReg(NewReg); + if (bSubRegDef) + MO.setSubReg(0); + } + } + } +} + +DenseMap reduceClonedMBBs( + unsigned Reg, BlockMap> &userBlocks, + DenseSet &UserMBBSet, + std::vector &hotBlocks, MachineDominatorTree *pDT) { + // Collect hot blocks which Exp is live in. + DenseSet hotBlockSet; + for (BlockLiveInfo &hotBlock : hotBlocks) { + if (hotBlock.inputLive.count(Reg)) { + hotBlockSet.insert(hotBlock.BB); + } + } + + + // For userBlocks which dominate all hotBlocks, don't need to clone because + // the value not cross hotBlocks when later blocks are cloned. + // For userBlocks which dominated by all hotBlocks, they could share clones + // because once after hot block, the pressure is OK. + DenseSet afterHotRangeMBBs; + for (MachineBasicBlock *MBB : UserMBBSet) { + // Always clone in hot block. + if (hotBlockSet.count(MBB)) + continue; + + bool bDomAllHotBlocks = true; + bool bDomedByAllHotBlocks = true; + for (MachineBasicBlock *hotMBB : hotBlockSet) { + if (!pDT->dominates(MBB, hotMBB)) { + bDomAllHotBlocks = false; + } + if (!pDT->dominates(hotMBB, MBB)) { + bDomedByAllHotBlocks = false; + } + if (!bDomAllHotBlocks && !bDomedByAllHotBlocks) { + break; + } + } + if (bDomAllHotBlocks) { + userBlocks.erase(MBB); + } else if (bDomedByAllHotBlocks) { + afterHotRangeMBBs.insert(MBB); + } + } + + // Split after hotRange block set by domtree. 
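
A compact standalone restatement of the classification just performed, where dominates() is a placeholder for the MachineDominatorTree query and classifyUserBlock is a hypothetical name: a user block that dominates every hot block keeps using the original value, a block dominated by every hot block sits after the pressure peak and can share one clone, and everything else (including the hot blocks themselves) gets its own clone.

#include <functional>
#include <vector>

enum class CloneAction { NoCloneNeeded, CloneInBlock, ShareAfterHotRange };

// Illustrative sketch over block ids rather than MachineBasicBlock pointers.
CloneAction classifyUserBlock(
    unsigned UserBB, const std::vector<unsigned> &HotBlocks,
    const std::function<bool(unsigned, unsigned)> &dominates) {
  bool DomsAllHot = true, DomedByAllHot = true;
  for (unsigned Hot : HotBlocks) {
    if (UserBB == Hot)
      return CloneAction::CloneInBlock; // always clone inside a hot block
    DomsAllHot &= dominates(UserBB, Hot);
    DomedByAllHot &= dominates(Hot, UserBB);
  }
  if (DomsAllHot)
    return CloneAction::NoCloneNeeded;      // value seen before any hot range
  if (DomedByAllHot)
    return CloneAction::ShareAfterHotRange; // past the peak: one shared clone
  return CloneAction::CloneInBlock;
}
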
+ DenseMap DomMap; + if (!afterHotRangeMBBs.empty()) { + for (auto it : afterHotRangeMBBs) { + MachineBasicBlock *MBB = it; + for (auto it2 : afterHotRangeMBBs) { + MachineBasicBlock *MBB2 = it2; + if (MBB == MBB2) + continue; + if (pDT->dominates(MBB, MBB2)) { + auto &Dom = DomMap[MBB]; + Dom.insert(MBB2); + auto &Dom2 = DomMap[MBB2]; + Dom.insert(Dom2.begin(), Dom2.end()); + } + } + } + for (auto it : afterHotRangeMBBs) { + MachineBasicBlock *MBB = it; + auto &Dom = DomMap[MBB]; + for (MachineBasicBlock *domedMBB : Dom) { + // Remove domedMBB. + DomMap.erase(domedMBB); + UserMBBSet.erase(domedMBB); + } + } + } + + return DomMap; +} + +// Look for an earlier insert point if the InstructionToMove +// writes to scc and scc is live at the CurrentInsertPoint. +static MachineBasicBlock::iterator AdjustInsertPointToAvoidSccSmash( + MachineInstr *InstructionToMove, + MachineBasicBlock *MBB, + MachineBasicBlock::iterator CurrentInsertPoint, + MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII +) +{ + const bool WillSmashScc = InstructionToMove->modifiesRegister(AMDGPU::SCC, SIRI); + if (WillSmashScc) + { + CurrentInsertPoint = llvm::FindOrCreateInsertionPointForSccDef(MBB, + CurrentInsertPoint, + SIRI, + SIII, + &MRI + ); + } + + return CurrentInsertPoint; +} + +// Look for an earlier insert point if the SubExp +// writes to scc and scc is live at the CurrentInsertPoint. +static MachineBasicBlock::iterator AdjustInsertPointForSubExpToAvoidSccSmash( + const SubExp &SubExpToMove, + MachineBasicBlock *MBB, + MachineBasicBlock::iterator CurrentInsertPoint, + MachineRegisterInfo& MRI, + const SIRegisterInfo* SIRI, + const SIInstrInfo* SIII +) +{ + const bool WillSmashScc = SubExpToMove.modifiesRegister(AMDGPU::SCC, SIRI); + if (WillSmashScc) + { + CurrentInsertPoint = llvm::FindOrCreateInsertionPointForSccDef(MBB, + CurrentInsertPoint, + SIRI, + SIII, + &MRI + ); + } + + return CurrentInsertPoint; +} + +// Return trun if moving MI to Location will smash a live scc value. +static bool WillSmashSccAtLocation( + MachineInstr* MI, + MachineBasicBlock* MBB, + MachineBasicBlock::iterator Location +) +{ + // It is ok to pass nullptr to `modifiesRegister` for TRI here since + // SCC has no subreg/suprereg relationships. + return MI->modifiesRegister(AMDGPU::SCC, nullptr) + && llvm::IsSccLiveAt(MBB, Location); +} + +void ApplyCloneRemat(Remat *Remat, + RematNode &Node, std::vector &hotBlocks, + MachineDominatorTree *pDT, MachineRegisterInfo &MRI, + SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, MachineFunction &MF) { + unsigned Reg = Node.Reg; + + MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); + auto DefOp = DefMI->getOperand(0); + const MCInstrDesc &Desc = DefMI->getDesc(); + const TargetRegisterClass *RC = MRI.getRegClass(Reg); + // When the unique def has subReg, just create newReg for the subReg part. + bool bSubRegDef = false; + if (DefOp.getSubReg() != 0) { + RC = SIRI->getSubRegisterClass(RC, DefOp.getSubReg()); + bSubRegDef = true; + } + const DebugLoc DL = DefMI->getDebugLoc(); + unsigned OpNum = DefMI->getNumOperands(); + + Node.Kind = RematNode::RematKind::Clone; + + // Group user in same blocks. 
+ BlockMap> UserMap; + DenseSet UserMBBSet; + for (auto useIt = MRI.use_instr_nodbg_begin(Reg); + useIt != MRI.use_instr_nodbg_end();) { + MachineInstr &UseMI = *(useIt++); + UserMap[UseMI.getParent()].emplace_back(&UseMI); + UserMBBSet.insert(UseMI.getParent()); + } + + DenseMap DomMap = + reduceClonedMBBs(Reg, UserMap, UserMBBSet, hotBlocks, pDT); + + for (auto useIt : UserMap) { + MachineBasicBlock *MBB = useIt.first; + // Skip same block uses. + if (MBB == DefMI->getParent()) { + continue; + } + // Skip MBB which share clone from other MBBs. + if (UserMBBSet.count(MBB) == 0) + continue; + + unsigned NewReg = MRI.createVirtualRegister(RC); + auto NewDef = BuildMI(MF, DL, Desc).addDef(NewReg); + for (unsigned i = 1; i < OpNum; i++) { + NewDef = NewDef.add(DefMI->getOperand(i)); + } + + MachineInstr *InsertPointMI = useIt.second.front(); + SlotIndex lastSlot = SlotIndexes->getInstructionIndex(*InsertPointMI); + + for (MachineInstr *UseMI : useIt.second) { + SlotIndex slot = SlotIndexes->getInstructionIndex(*UseMI); + if (lastSlot > slot) { + lastSlot = slot; + InsertPointMI = UseMI; + } + } + + MachineBasicBlock::iterator InsertPoint = AdjustInsertPointToAvoidSccSmash( + DefMI, InsertPointMI->getParent(), InsertPointMI, MRI, SIRI, SIII + ); + + for (MachineMemOperand *MO : DefMI->memoperands()) { + NewDef->addMemOperand(MF, MO); + } + + MBB->insert(InsertPoint, NewDef); + + SlotIndexes->insertMachineInstrInMaps(*NewDef); + + SmallVector &userMIs = useIt.second; + updateUsers(Reg, NewReg, bSubRegDef, userMIs); + + // update users in dom MBBs. + auto domMapIt = DomMap.find(MBB); + if (domMapIt != DomMap.end()) { + for (MachineBasicBlock *UpdateMBB : domMapIt->second) { + SmallVector &userMIs = UserMap[UpdateMBB]; + updateUsers(Reg, NewReg, bSubRegDef, userMIs); + } + } + + llvm::removeUnusedLanes(*NewDef.getInstr(), MRI, SIRI, SIII, SlotIndexes); + } + if (MRI.use_empty(Reg)) { + SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI); + Remat->RemoveInst(DefMI); + DefMI->eraseFromParent(); + } +} + +void ApplyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI, + SlotIndexes *slotIndexes, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { + MachineInstr *DefMI = Node.DefMI; + MachineInstr *InsertPointMI = Node.InsertPointMI; + MachineBasicBlock* MBB = nullptr; + + // Find a valid insert point. + MachineBasicBlock::iterator InsertPoint; + if (InsertPointMI) { + InsertPoint = InsertPointMI->getIterator(); + MBB = InsertPointMI->getParent(); + } else { + InsertPoint = Node.InsertBlock->getFirstTerminator(); + MBB = Node.InsertBlock; + } + + InsertPoint = AdjustInsertPointToAvoidSccSmash( + DefMI, MBB, InsertPoint, MRI, SIRI, SIII + ); + + // Move instruction to new location. + DefMI->removeFromParent(); + InsertPoint->getParent()->insert(InsertPoint, DefMI); + + // Update slot index. + slotIndexes->removeSingleMachineInstrFromMaps(*DefMI); + slotIndexes->insertMachineInstrInMaps(*DefMI); +} + +void ApplyRemat(Remat *Remat, MapVector &RematMap, + std::vector &hotBlocks, + MachineDominatorTree *pDT, SlotIndexes *slotIndexes, + MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, MachineFunction &MF) { + std::vector UpdateList; + for (auto &it : RematMap) { + UpdateList.emplace_back(it.second); + } + // Sort update list with slotIndex to make sure def moved before use. + // If use moved before def, it might not be the first use anymore. 
+ std::sort(UpdateList.begin(), UpdateList.end(), + [&slotIndexes](RematNode &i, RematNode &j) { + SlotIndex a = slotIndexes->getInstructionIndex(*i.DefMI); + SlotIndex b = slotIndexes->getInstructionIndex(*j.DefMI); + return a < b; + }); + + for (RematNode &Node : UpdateList) { + if (Node.Kind == RematNode::RematKind::OneDefOneUse) { + ApplyOneDefOneUseRemat(Node, MRI, slotIndexes, SIRI, SIII); + } else if (Node.Kind == RematNode::RematKind::Clone) { + ApplyCloneRemat(Remat, Node, hotBlocks, pDT, MRI, slotIndexes, SIRI, SIII, MF); + } + } +} + +void dumpRematMap(MapVector &RematMap, + const SIRegisterInfo *SIRI) { + dbgs() << "\n rematMap: \n"; + for (auto it : RematMap) { + int Reg = it.first; + dbgs() << printReg(Reg, SIRI); + dbgs() << "\n"; + } +} + +int DebugBlockIndex = 42; + +void dumpHotBlock(const GCNRPTracker::LiveRegSet &LiveSet, + MapVector &VRematMap, + MapVector &SRematMap, int BlockIndex, + const SIRegisterInfo *SIRI) { + if (DebugBlockIndex != BlockIndex) + return; + llvm::dumpLiveSet(LiveSet, SIRI); + dumpRematMap(VRematMap, SIRI); + dumpRematMap(SRematMap, SIRI); +} + +void dumpCandidates(std::vector &RematCandidates, int BlockIndex, + const SIRegisterInfo *SIRI) { + if (DebugBlockIndex != BlockIndex) + return; + dbgs() << "\n Candidates: \n"; + unsigned TotalSize = 0; + for (RematNode &Node : RematCandidates) { + dbgs() << printReg(Node.Reg, SIRI) << " size:" << Node.Size; + dbgs() << "\n"; + TotalSize += Node.Size; + } + dbgs() << "Total Size:" << TotalSize << "\n"; +} + +} // namespace + +bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, + LiveIntervals *LIS, MachineDominatorTree *pDT, + MachinePostDominatorTree *pPDT, bool &bNearTarget) { + const GCNSubtarget *ST = &MF.getSubtarget(); + + const SIInstrInfo *SIII = ST->getInstrInfo(); + const SIRegisterInfo *SIRI = ST->getRegisterInfo(); + + ReversePostOrderTraversal RPOT(&MF); + DenseMap RPOTIndexMap; + for (MachineBasicBlock *MBB : RPOT) { + RPOTIndexMap[MBB] = RPOTIndexMap.size(); + } + + auto &MRI = MF.getRegInfo(); + + bool bUpdated = false; + RematStatus status = GetRematStatus(MF, MLI, LIS, MRI, ST); + + const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second; + if (status.TargetOcc >= MaxOcc) + return false; + + unsigned VLimit = status.TargetVLimit; + unsigned SLimit = status.TargetSLimit; + + int rematSCnt = status.MaxSPressure - SLimit; + // when agressive sgpr remat, reserve some for allocation lost. + if (EnableAggressive) + rematSCnt += NearTargetRegLimit; + + bool bSGPRSpill = false; + if (rematSCnt > 0) { + bSGPRSpill = nearSgprSpill(status.MaxSPressure, ST, MF); + } + + bool bForceRematSgpr = bSGPRSpill | status.bNotBalance; + + // If bound by lds, skip. + if (status.TargetOcc > ST->getOccupancyWithLocalMemSize(MF) && + !bForceRematSgpr) + return false; + + MachineBasicBlock *EntryMBB = &MF.front(); + + auto *SlotIndexes = LIS->getSlotIndexes(); + + // Reg which already marked remat. + MapVector VRematMap; + MapVector SRematMap; + // Reg which cannot move around to remat. + DenseSet PinnedRegSet; + std::vector hotBlocks; + for (auto it = po_begin(EntryMBB); it != po_end(EntryMBB); it++) { + MachineBasicBlock *MBB = *it; + auto &RP = status.MBBPressureMap[MBB]; + // ignore block not hot. + if (RP.getVGPRNum(ST->hasGFX90AInsts()) < status.TargetVLimit && + (RP.getMaxSGPR() + RegForVCC + status.InputPhysicalSPressure) < + status.TargetSLimit) + continue; + // Collect reg pressure. 
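
A minimal restatement of the hot-block filter applied just above; BlockPressure and isHotBlock are hypothetical names, and the VCC reservation and physical SGPR inputs are taken as parameters.

// Illustrative sketch: mirrors the early "continue" that skips blocks whose
// pressure stays below the per-occupancy limits.
struct BlockPressure {
  unsigned VGPR;    // RP.getVGPRNum(...)
  unsigned MaxSGPR; // RP.getMaxSGPR()
};

bool isHotBlock(const BlockPressure &RP, unsigned TargetVLimit,
                unsigned TargetSLimit, unsigned RegForVcc,
                unsigned InputPhysicalSPressure) {
  return RP.VGPR >= TargetVLimit ||
         RP.MaxSGPR + RegForVcc + InputPhysicalSPressure >= TargetSLimit;
}
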
+ unsigned maxVPressure = 0; + unsigned maxSPressure = 0; + const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[MBB]; + + const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[MBB]; + LLVM_DEBUG( + dumpHotBlock(inputLive, VRematMap, SRematMap, MBB->getNumber(), SIRI)); + + GCNDownwardRPTracker Tracker(*LIS); + + Tracker.reset(*MBB->begin(), &inputLive); + + for (MachineInstr &MI : *MBB) { + if (MI.isDebugInstr()) + continue; + Tracker.advance(); + auto LISLR = Tracker.getLiveRegs(); + // Update live set for things already remated. + UpdateLiveInfo(VRematMap, LISLR, inputLive, MBB, RPOTIndexMap); + UpdateLiveInfo(SRematMap, LISLR, inputLive, MBB, RPOTIndexMap); + + const GCNRPTracker::LiveRegSet &liveSet = LISLR; + unsigned VPressure = 0; + unsigned SPressure = 0; + CollectLiveSetPressure(liveSet, MRI, SIRI, VPressure, SPressure); + if (maxVPressure < VPressure) + maxVPressure = VPressure; + if (maxSPressure < SPressure) + maxSPressure = SPressure; + } + maxSPressure += RegForVCC + status.InputPhysicalSPressure; + if (maxVPressure <= VLimit && maxSPressure <= SLimit) + continue; + + // Build block live info. + // Use outputLive for EntryMBB. + BlockLiveInfo LiveInfo = {MBB, maxSPressure, maxVPressure, + MBB != EntryMBB ? inputLive : outputLive}; + // Skip entry block when save hotBlock to reduce clone because not clone in + // entry block. + if (MBB != EntryMBB) + hotBlocks.emplace_back(LiveInfo); + GCNRPTracker::LiveRegSet CandidateRegs = LiveInfo.inputLive; + + // Update reg pressure based on remat list. + InstSet VReducedInsts; + InstSet SReducedInsts; + int VReduced = + GetReducedSize(VRematMap, /*bVGPR*/ true, CandidateRegs, VReducedInsts, + MRI, SIRI, LiveInfo, RPOTIndexMap); + int SReduced = + GetReducedSize(SRematMap, /*bVGPR*/ false, CandidateRegs, SReducedInsts, + MRI, SIRI, LiveInfo, RPOTIndexMap); + + // Calculate size need to be remat. + int rematVCnt = maxVPressure - VReduced - VLimit; + int rematSCnt = maxSPressure - SReduced - SLimit; + + bool bSGPRSpill = false; + if (rematSCnt > 0) { + bSGPRSpill = nearSgprSpill(maxSPressure, ST, MF); + } + bool bForceRematSgpr = bSGPRSpill | status.bNotBalance; + // Try to add candidates into remat list. + + int newRematSCnt = 0; + if (rematSCnt > 0) { + // Build candidate nodes. + std::vector SRematCandidates; + BuildRematCandiates(SRematCandidates, CandidateRegs, PinnedRegSet, MRI, + SIII, SIRI, /*bVGPR*/ false); + + LLVM_DEBUG(dumpCandidates(SRematCandidates, MBB->getNumber(), SIRI)); + std::vector SRematList; + // Filter candidates. + newRematSCnt = + FilterRematCandiates(SRematCandidates, SRematList, PinnedRegSet, pDT, + pPDT, MLI, MRI, SIRI, MF, SlotIndexes, + /*bVGPR*/ false, status.bMemBound); + if (newRematSCnt > rematSCnt) { + // Has enough remat node to cover rematCnt. + int rematCnt = 0; + for (RematNode &Node : SRematList) { + SRematMap[Node.Reg] = Node; + rematCnt += Node.Size; + if (rematCnt > rematSCnt && !EnableAggressive) + break; + } + newRematSCnt = 0; + } else { + + for (RematNode &Node : SRematList) { + SReducedInsts.insert(Node.DefMI); + } + // Check shared size. + int SharedReducedSize = + GetSharedReducedSize(SReducedInsts, /*bVGPR*/ false, MRI, SIRI); + if (((newRematSCnt + SharedReducedSize) + (int)NearTargetRegLimit) >= + rematSCnt) { + for (RematNode &Node : SRematList) { + SRematMap[Node.Reg] = Node; + } + } else { + if (!bForceRematSgpr) { + return false; + } else { + for (RematNode &Node : SRematList) { + SRematMap[Node.Reg] = Node; + } + // Find local one def one use candidates. 
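
A simplified standalone restatement of the SGPR budget decision made just above; enoughSgprRemat is a hypothetical name. The collected candidates are committed only if their summed gain, plus operands shared between candidates and a small near-target slack, covers the pressure still above the target SGPR limit, unless an imminent SGPR spill or an SGPR/VGPR imbalance forces remat anyway.

// Illustrative sketch of the budget check; the real code also updates the
// remat maps and falls back to local one-def-one-use candidates when forced.
bool enoughSgprRemat(int NeededSgprs, int CandidateGain, int SharedReducedSize,
                     int NearTargetSlack, bool ForceRemat) {
  if (CandidateGain > NeededSgprs)
    return true; // gain alone covers the excess
  if (CandidateGain + SharedReducedSize + NearTargetSlack >= NeededSgprs)
    return true; // close enough to the target
  return ForceRemat; // spill or imbalance: remat anyway
}
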
+ for (MachineInstr &MI : *MBB) { + if (MI.isDebugInstr()) + continue; + if (MI.getDesc().NumDefs != 1) + continue; + MachineOperand &DstMO = MI.getOperand(0); + Register Reg = DstMO.getReg(); + if (!SIRI->isSGPRReg(MRI, Reg)) + continue; + if (!MRI.hasOneNonDBGUse(Reg)) + continue; + if (!MRI.hasOneDef(Reg)) + continue; + if (Reg.isPhysical()) + continue; + MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(Reg); + if (UseMI.getParent() != MBB) + continue; + int gain = RematGain(&MI, Reg, CandidateRegs, MRI, SIRI, /*bVGPR*/false); + if (gain > 0) { + // Skip case when DefMI has implicit define which used by UseMI. + if (isImplicitDefUse(&MI, &UseMI)) { + continue; + } + RematNode Node = {Reg, &MI, (unsigned)gain >> 5}; + Node.InsertPointMI = &UseMI; + Node.Kind = RematNode::RematKind::OneDefOneUse; + SRematMap[Reg] = Node; + SharedReducedSize += Node.Size; + } + } + } + } + newRematSCnt = rematSCnt - newRematSCnt - SharedReducedSize; + } + } + // If works, continue. + + // Collect live range from hot inst. + // find common live range in hot insts. + // Remat these common live range. + // Apply the remat. + + int newRematVCnt = 0; + if (rematVCnt > 0) { + // TODO: V remat. + } + + bool bNeedSRemat = rematSCnt > 0; + bool bNeedVRemat = rematVCnt > 0; + // If sgpr spill, always do remat. + bool bSRematOK = + (newRematSCnt <= 0 && !SRematMap.empty()) || + bForceRematSgpr; + bool bVRematOK = + (status.bNotBalance || newRematVCnt <= 0) && !VRematMap.empty(); + if (bNeedSRemat && bNeedVRemat) { + if (bVRematOK && bSRematOK) { + bUpdated = true; + } else if (bSGPRSpill) { + bUpdated = true; + } + } else if (bNeedSRemat) { + if (bSRematOK) { + bUpdated = true; + } + } else if (bNeedVRemat) { + if (bVRematOK) { + bUpdated = true; + } + } + // TODO: what to do when cannot reach target? + if (newRematSCnt > 0) { + if (newRematSCnt <= NearTargetRegLimit) { + bNearTarget = true; + } else { + if (!bSGPRSpill) + return false; + } + } + } + + if (SRematMap.empty() && VRematMap.empty()) { + return bUpdated; + } + + if (!SRematMap.empty()) { + bUpdated = true; + ApplyRemat(Remat, SRematMap, hotBlocks, pDT, SlotIndexes, MRI, SIRI, SIII, MF); + LLVM_DEBUG(llvm::dbgs() << "after hotremat"; MF.print(dbgs());); + } + + // Balance between vector and scalar if possible. + return bUpdated; +} + +namespace { +bool isPhyRegUniqueDef(unsigned Reg, const MachineRegisterInfo &MRI) { + DenseSet DefMIs; + for (MachineInstr &DefMI : MRI.def_instructions(Reg)) { + // skip implicit def. + if (DefMI.getOpcode() == AMDGPU::IMPLICIT_DEF) + continue; + DefMIs.insert(&DefMI); + } + return DefMIs.size() == 1; +} + +static bool IsImplicitUseOfReg(const MachineOperand &MO, unsigned Reg) +{ + if (!MO.isImplicit() || !MO.isUse() || !MO.isReg()) + { + return false; + } + + return MO.getReg() == Reg; +} + +static bool IsImplicitDefOfReg(const MachineOperand &MO, unsigned Reg) +{ + if (!MO.isImplicit() || !MO.isDef() || !MO.isReg()) + { + return false; + } + + return MO.getReg() == Reg; +} + +static bool IsSafeRematCandidateUser(const MachineInstr *UseMI, const SIInstrInfo *SIII) +{ + // Make sure UseMI is not wqm like sample. + if (SIII->isWQM(UseMI->getOpcode())) + return false; + if (UseMI->getOpcode() == AMDGPU::PHI) + return false; + + return true; +} + +static bool isConvergent(Remat *Remat, const MachineInstr &MI) { + return MI.isConvergent() && + // This flag is set on readfirstlane's to indicate that they + // are redundant (the value being read is already uniform). 
+ // Normally, readfirstlanes are convergent, because different exec + // will cause a different value to be read; a known uniform + // readfirstlane is safe to move or clone and not actually convergent. + !Remat->TotalUniformInsts.count(&MI); +} + +bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, bool bSink) { + if (Reg.isPhysical()) + return false; + bool bVGPR = SIRI->isVGPR(MRI, Reg); + + MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); + if (!DefMI) + return false; + if (DefMI->getOpcode() == AMDGPU::PHI) + return false; + + // Skip convergent. + if (isConvergent(Remat, *DefMI)) + return false; + + // Skip inst has more than 1 def. + if (DefMI->getDesc().NumDefs > 1) + return false; + + unsigned OpNum = DefMI->getNumOperands(); + + // Only move DefMI which all operand is unique def. + for (unsigned i = 0; i < OpNum; i++) { + MachineOperand &Op = DefMI->getOperand(i); + if (!Op.isReg()) + continue; + Register OpReg = Op.getReg(); + if (IsImplicitUseOfReg(Op, AMDGPU::EXEC) || IsImplicitUseOfReg(Op, AMDGPU::EXEC_LO)) + continue; + if (IsImplicitUseOfReg(Op, AMDGPU::M0) && isPhyRegUniqueDef(OpReg, MRI)) + continue; + // Alow unused scc define. + if (Op.isImplicit() && Op.isDead() && Op.isDef()) + continue; + if (OpReg.isPhysical()) + return false; + if (!MRI.getUniqueVRegDef(OpReg) && !llvm::IsSub0Sub1SingleDef(OpReg, MRI)) { + return false; + } + } + + if (bVGPR && bSink) { + // Skip mem related inst. + if (DefMI->mayLoadOrStore()) { + return false; + } + + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + if (!IsSafeRematCandidateUser(&UseMI, SIII)) + return false; + } + } + + return true; +} + +std::vector buildSubExpFromCandidates( + Remat *Remat, + GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, + const MachineRegisterInfo &MRI, SlotIndexes *slotIndexes, + GCNRPTracker::LiveRegSet &unUsedPassThrus, + bool bAllowPartialUseInSubExp) { + InstSet CandidateDefs; + DenseSet RemovedCandidates; + std::vector CandidateRegs; + CandidateRegs.reserve(Candidates.size()); + for (auto it : Candidates) { + unsigned Reg = it.first; + CandidateRegs.emplace_back(Reg); + } + // Sort candidate by defMI order to make sure defMI has dependent check after + // all its dependent node. + std::sort(CandidateRegs.begin(), CandidateRegs.end(), + [&MRI, &slotIndexes](const unsigned a, unsigned b) { + MachineInstr *MIa = MRI.getUniqueVRegDef(a); + + MachineInstr *MIb = MRI.getUniqueVRegDef(b); + // Later instr first. + return !SlotIndex::isEarlierInstr( + slotIndexes->getInstructionIndex(*MIa), + slotIndexes->getInstructionIndex(*MIb)); + }); + + // If Candidate def has user in MBB, add it when allow partial candidates. + // And the subExp has the define could only be clone, cannot move cross blocks + // because user in MBB. + DenseSet PartialCandidates; + LLVM_DEBUG(dbgs() << "\nCandidate Defs:\n";); + for (unsigned Reg : CandidateRegs) { + MachineInstr *MI = MRI.getUniqueVRegDef(Reg); + bool bHasNoCandidatesSameBlockUser = false; + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + if (UseMI.getParent() == MI->getParent()) { + if (UseMI.getNumExplicitDefs() == 1) { + // Skip user which already in Candidates. 
+ unsigned UserDefReg = UseMI.getOperand(0).getReg(); + if (Candidates.count(UserDefReg) > 0 && + RemovedCandidates.count(UserDefReg) == 0) + continue; + } + if (!bAllowPartialUseInSubExp) + bHasNoCandidatesSameBlockUser = true; + else + PartialCandidates.insert(MI); + break; + } + } + if (bHasNoCandidatesSameBlockUser) { + RemovedCandidates.insert(Reg); + continue; + } + LLVM_DEBUG(MI->dump()); + CandidateDefs.insert(MI); + } + LLVM_DEBUG(dbgs() << "\nCandidate Defs End\n";); + + if (CandidateDefs.empty()) + return std::vector(); + for (unsigned Reg : RemovedCandidates) { + unUsedPassThrus[Reg] = Candidates[Reg]; + Candidates.erase(Reg); + } + + // iterate MBB backward. + // add inst which only used for candidate defines. + for (auto it = MBB->rbegin(); it != MBB->rend(); it++) { + MachineInstr &MI = *it; + if (CandidateDefs.count(&MI) > 0) { + continue; + } + + if (isConvergent(Remat, MI)) + continue; + // Skip if MI is not safe to move. + if (MI.getNumDefs() != 1) { + // allow to move unused implicit def. + bool bDeadImplictDef = false; + for (MachineOperand &MO : MI.implicit_operands()) { + if (!MO.isReg()) + continue; + if (!MO.isDef()) + continue; + bDeadImplictDef = MO.isDead(); + } + if (!bDeadImplictDef) + continue; + } + + unsigned Reg = -1; + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + if (!MO.isDef()) + continue; + Reg = MO.getReg(); + break; + } + + if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*bSink*/true)) + continue; + + // If all users of MI are in candidate defs, add MI into candidate defs. + // If part of user of MI is in candidate defs, add MI into candidate defs + // when allow partialUse. + bool bAllUserInCandidate = true; + bool bHasCandidateUser = false; + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + if (CandidateDefs.count(&UseMI) == 0) + bAllUserInCandidate = false; + else + bHasCandidateUser = true; + } + if (!bHasCandidateUser) + continue; + if (!bAllUserInCandidate) { + if (!bAllowPartialUseInSubExp) + continue; + PartialCandidates.insert(&MI); + } + + CandidateDefs.insert(&MI); + } + + // Collect input for CandidateDefs. + GCNRPTracker::LiveRegSet CandidateInput; + for (MachineInstr *MI : CandidateDefs) { + for (MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + Register Reg = MO.getReg(); + if (MO.isImplicit() && Reg.isPhysical()) + continue; + + MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); + assert((DefMI || llvm::IsSub0Sub1SingleDef(Reg, MRI)) && + "UseMI should be safe to move"); + if (DefMI && CandidateDefs.count(DefMI) > 0) + continue; + // Add to input. + CandidateInput[Reg] |= llvm::getRegMask(MO, MRI); + } + } + + // Build defs in order. + std::vector defs; + defs.reserve(CandidateDefs.size()); + for (MachineInstr &MI : *MBB) { + MachineInstr *pMI = &MI; + if (CandidateDefs.count(pMI) == 0) + continue; + defs.emplace_back(pMI); + } + + LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; for (MachineInstr *MI + : defs) { + MI->dump(); + } dbgs() << "\nFinished Candidate Defs End\n";); + + // Build SubExp with CandidateDefs as Nodes, CandidateInput as input + // Candidates as output. 
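+  // The dag splits the candidate defs into independent sub-expressions. When
+  // partial use inside the block is allowed, a sub-expression that contains a
+  // partially used def can only be cloned, not moved, since the original def
+  // must stay here for its remaining in-block users.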
+ ExpDag dag(MRI, SIRI, SIII, /*bJoinInput*/ true); + dag.build(CandidateInput, Candidates, defs); + if (bAllowPartialUseInSubExp) { + for (auto &subExp : dag.SubExps) { + for (auto *MI : subExp.SUnits) { + if (PartialCandidates.count(MI)) { + subExp.bCloneOnly = true; + break; + } + } + } + } + return dag.SubExps; +} + + +std::vector buildSubExpFromCandidatesTopBottom( + Remat* Remat, + GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, + const MachineRegisterInfo &MRI, SlotIndexes *slotIndexes) { + InstSet CandidateDefs; + + LLVM_DEBUG(dbgs() << "\nCandidate Defs:\n";); + for (auto it : Candidates) { + unsigned Reg = it.first; + MachineInstr *MI = MRI.getUniqueVRegDef(Reg); + + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + if (isConvergent(Remat, UseMI)) + continue; + MachineBasicBlock *UseMBB = UseMI.getParent(); + if (UseMBB == MI->getParent()) + continue; + assert(UseMBB == MBB && "block mismatch"); + // If all operands in CandidateRegs, add to candidateDefs. + bool bHasOpRegNotInCandidates = false; + for (MachineOperand &MO : UseMI.operands()) { + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + Register OpReg = MO.getReg(); + if (MO.isImplicit() && OpReg.isPhysical()) + continue; + if (Candidates.count(OpReg) == 0) { + bHasOpRegNotInCandidates = true; + break; + } + } + if (bHasOpRegNotInCandidates) + continue; + + LLVM_DEBUG(UseMI.dump()); + CandidateDefs.insert(&UseMI); + } + } + LLVM_DEBUG(dbgs() << "\nCandidate Defs End\n";); + + if (CandidateDefs.empty()) + return std::vector(); + + // iterate MBB. + GCNRPTracker::LiveRegSet LocalCandidates = Candidates; + // add inst which only used by candidate defines. + for (auto it = MBB->begin(); it != MBB->end(); it++) { + MachineInstr &MI = *it; + if (CandidateDefs.count(&MI) > 0) { + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + if (!MO.isDef()) + continue; + Register Reg = MO.getReg(); + if (Reg.isPhysical()) + continue; + LocalCandidates[Reg]; + } + continue; + } + + // Skip if MI is not safe to move. + if (isConvergent(Remat, MI)) + continue; + + if (MI.getNumDefs() != 1) + continue; + + if (MI.mayLoadOrStore()) { + continue; + } + + unsigned Reg = -1; + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + if (!MO.isDef()) + continue; + Reg = MO.getReg(); + break; + } + + // Still use bsink to skip mem load/store. + // if (!isSafeCandidate(Reg, MRI, SIRI, SIII, /*bSink*/true)) + // continue; + + // If all user of MI is in candidate defs, add MI into candidate defs. + bool bAllOperandInCandidate = true; + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + Register OpReg = MO.getReg(); + if (LocalCandidates.count(OpReg)) + continue; + + if (MO.isImplicit() && + (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO)) + continue; + if (OpReg.isPhysical()) { + bAllOperandInCandidate = false; + break; + } + MachineInstr *OpMI = MRI.getUniqueVRegDef(OpReg); + if (!OpMI) { + bAllOperandInCandidate = false; + break; + } + if (CandidateDefs.count(OpMI) == 0) { + bAllOperandInCandidate = false; + break; + } + if (MO.isTied()) + continue; + } + if (!bAllOperandInCandidate) + continue; + LLVM_DEBUG(llvm::dbgs() << "Add local candidates:"; + pressure::print_reg(Reg, MRI, SIRI, llvm::dbgs());); + LocalCandidates[Reg]; + CandidateDefs.insert(&MI); + } + + // Collect input for CandidateDefs. 
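+  // Inputs are the registers a candidate def reads that are not produced by
+  // another candidate def; implicit EXEC uses and physical registers are
+  // ignored.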
+ GCNRPTracker::LiveRegSet CandidateInput; + for (MachineInstr *MI : CandidateDefs) { + for (MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + Register Reg = MO.getReg(); + if (MO.isImplicit() && (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)) + continue; + if (Reg.isPhysical()) + continue; + MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); + if (!DefMI) { + // Skip local def which is not unique. + if (MO.isTied()) + continue; + if (Candidates.count(Reg) == 0 && LocalCandidates.count(Reg) != 0) + continue; + } + assert((DefMI || llvm::IsSub0Sub1SingleDef(Reg, MRI)) && + "UseMI should be safe to move"); + if (DefMI && CandidateDefs.count(DefMI) > 0) + continue; + // Add to input. + CandidateInput[Reg] = llvm::getRegMask(MO, MRI); + } + } + + // Build defs in order. + std::vector defs; + defs.reserve(CandidateDefs.size()); + for (MachineInstr &MI : *MBB) { + MachineInstr *pMI = &MI; + if (CandidateDefs.count(pMI) == 0) + continue; + defs.emplace_back(pMI); + } + + LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; for (MachineInstr *MI + : defs) { + MI->dump(); + } dbgs() << "\nFinished Candidate Defs End\n";); + + LLVM_DEBUG(dbgs() << "\nLocalCandidates:\n"; for (auto it + : LocalCandidates) { + pressure::print_reg(it.first, MRI, SIRI, llvm::dbgs()); + } dbgs() << "\nLocalCandidates End\n";); + // Make sure all input reg are uniqueDef. + // Input is Candidates, output is? + // Build SubExp with CandidateDefs as Nodes, CandidateInput as input + // Candidates as output. + ExpDag dag(MRI, SIRI, SIII, /*bJoinInput*/ true); + dag.build(Candidates, LocalCandidates, defs); + return dag.SubExps; +} + + +void print_vreg(Register Reg, const MachineRegisterInfo &MRI) { + if (Reg.isVirtual()) { + StringRef Name = MRI.getVRegName(Reg); + if (Name != "") { + dbgs() << '%' << Name; + } else { + dbgs() << '%' << Register::virtReg2Index(Reg); + } + } +} + +MachineBasicBlock *FindTargetBlock(unsigned Reg, MachineBasicBlock *FromBB, + const MachineRegisterInfo &MRI, + MachineDominatorTree *pDT) { + BlockSet userBlocks; + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + MachineBasicBlock *UserBB = UseMI.getParent(); + // Skip current BB. + if (UserBB != FromBB) + userBlocks.insert(UserBB); + else + // When has user in FromBB, userBlock will be FromBB. + return nullptr; + } + if (userBlocks.empty()) + return nullptr; + MachineBasicBlock *userBlock = nearest_common_dominator(pDT, userBlocks); + if (!pDT->dominates(FromBB, userBlock)) { + return nullptr; + } + if (userBlock == FromBB) + return nullptr; + return userBlock; +} + +void ApplySubExpMoveNearUser(SubExp &Exp, const MachineRegisterInfo &MRI, + MachineDominatorTree *pDT, + SlotIndexes *slotIndexes, + const SIInstrInfo *SIII, + const SIRegisterInfo *SIRI) { + // Move from bottom. + MachineBasicBlock *FromBB = Exp.FromBB; + for (auto it = Exp.SUnits.rbegin(); it != Exp.SUnits.rend(); it++) { + MachineInstr *DefMI = *it; + if (DefMI->getNumExplicitDefs() != 1) + continue; + + unsigned Reg = DefMI->getOperand(0).getReg(); + MachineBasicBlock *ToBB = FindTargetBlock(Reg, FromBB, MRI, pDT); + if (!ToBB) + continue; + + // Do not overwrite a live scc. + MachineBasicBlock::iterator InsertPoint = ToBB->SkipPHIsAndLabels(ToBB->begin()); + if (WillSmashSccAtLocation(DefMI, ToBB, InsertPoint)) + continue; + + DefMI->removeFromParent(); + assert(!llvm::isExecUpdateForControlFlow(*InsertPoint) && "invalid insert point"); + ToBB->insert(InsertPoint, DefMI); + // Debug insts don't need slot index. 
+ if (DefMI->isDebugInstr()) + continue; + // Update slot index. + slotIndexes->removeSingleMachineInstrFromMaps(*DefMI); + slotIndexes->insertMachineInstrInMaps(*DefMI); + } +} + + +void ApplySubExpMoveNearDefine(SubExp &Exp, MachineRegisterInfo &MRI, + MachineDominatorTree *pDT, + SlotIndexes *slotIndexes, + const SIInstrInfo *SIII, + const SIRegisterInfo *SIRI) { + // Move from top. + // Find lowest input def. + MachineBasicBlock *ToBB = Exp.ToBB; + assert(!ToBB->empty() && "ToBB have instructions for define of input nodes"); + auto Terminator = ToBB->getFirstTerminator(); + if (Terminator == ToBB->end() && ToBB->succ_size() == 1) { + MachineInstr &EndMI = *ToBB->rbegin(); + if (SIII->isSchedulingBoundary(EndMI, ToBB, *ToBB->getParent())) + // Insert before the scheduling boundary instruction. + Terminator = EndMI.getIterator(); + else + // No boundary so just insert inst at the end of the block. + Terminator = ToBB->end(); + } + + Terminator = AdjustInsertPointForSubExpToAvoidSccSmash( + Exp, ToBB, Terminator, MRI, SIRI, SIII + ); + + for (auto it = Exp.SUnits.begin(); it != Exp.SUnits.end(); it++) { + MachineInstr *DefMI = *it; + if (DefMI->getNumExplicitDefs() != 1) + continue; + if (SIII->isEXP(DefMI->getOpcode())) + continue; + if (DefMI->mayStore()) + continue; + // Find def for DefMI operands as insert point. + DefMI->removeFromParent(); + ToBB->insert(Terminator, DefMI); + + // Debug insts don't need slot index. + if (DefMI->isDebugInstr()) + continue; + // Update slot index. + slotIndexes->removeSingleMachineInstrFromMaps(*DefMI); + slotIndexes->insertMachineInstrInMaps(*DefMI); + } +} + +DenseSet buildCloneSet(ExpDag &dag, + DenseSet &dagBottoms, + GCNRPTracker::LiveRegSet &usedOutput) { + DenseSet copySet; + for (auto it = dag.SUnits.rbegin(); it != dag.SUnits.rend(); it++) { + SUnit &SU = *it; + // Skip non-inst node. + if (!SU.isInstr()) + continue; + MachineInstr *MI = SU.getInstr(); + if (dagBottoms.find(&SU) != dagBottoms.end()) { + bool bUsed = false; + // For bottom SU, if in usedOutput, add to copySet; + for (MachineOperand &DefMO : MI->defs()) { + if (!DefMO.isReg()) + continue; + unsigned Reg = DefMO.getReg(); + if (usedOutput.count(Reg) > 0) { + bUsed = true; + break; + } + } + if (bUsed) { + copySet.insert(MI); + continue; + } + // bottom SU may still have succNode when it used both inExp and outExp. + // So continue check succNode. + } + + // If any SuccNode is in copySet, add to copySet. + bool bSuccCopied = false; + for (SDep &SucDep : SU.Succs) { + SUnit *SucSU = SucDep.getSUnit(); + MachineInstr *SuccMI = SucSU->getInstr(); + if (copySet.count(SuccMI) > 0) { + bSuccCopied = true; + break; + } + } + if (bSuccCopied) + copySet.insert(MI); + } + return copySet; +} + +void updateUsers(SmallVector &userMIs, + DenseMap &RegMap) { + + for (MachineInstr *UserMI : userMIs) { + for (MachineOperand &MO : UserMI->uses()) { + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + auto it = RegMap.find(Reg); + if (it == RegMap.end()) + continue; + unsigned NewReg = it->second; + MO.setReg(NewReg); + } + } +} + +struct HotBlock { + MachineBasicBlock *MBB = nullptr; + GCNRPTracker::LiveRegSet inputLive; + std::pair maxPressures; + // Info about vmemLd. + int vmemLdInputSize; + int vmemLdOutputSize; +}; + +DenseMap reduceClonedMBBs( + SubExp &Exp, + MapVector> &userBlocks, + DenseMap &userBlocksLiveRegs, + std::vector &hotBlocks, MachineDominatorTree *pDT) { + // Collect hot blocks which Exp is live in. 
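+  // A block is hot for this expression if any of the expression's bottom
+  // (output) registers is live into it; those blocks always keep their own
+  // clone.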
+ DenseSet hotBlockSet; + for (HotBlock &hotBlock : hotBlocks) { + for (unsigned Reg : Exp.BottomRegs) { + if (hotBlock.inputLive.count(Reg)) { + hotBlockSet.insert(hotBlock.MBB); + break; + } + } + } + + // For userBlocks which dominate all hotBlocks, don't need to clone because + // the value not cross hotBlocks when later blocks are cloned. + // For userBlocks which dominated by all hotBlocks, they could share clones + // because once after hot block, the pressure is OK. + DenseSet afterHotRangeMBBs; + for (auto it : userBlocksLiveRegs) { + MachineBasicBlock *MBB = it.first; + // Always clone in hot block. + if (hotBlockSet.count(MBB)) + continue; + + bool bDomAllHotBlocks = true; + bool bDomedByAllHotBlocks = true; + for (MachineBasicBlock *hotMBB : hotBlockSet) { + if (!pDT->dominates(MBB, hotMBB)) { + bDomAllHotBlocks = false; + } + if (!pDT->dominates(hotMBB, MBB)) { + bDomedByAllHotBlocks = false; + } + if (!bDomAllHotBlocks && !bDomedByAllHotBlocks) { + break; + } + } + if (bDomAllHotBlocks) { + userBlocks.erase(MBB); + } else if (bDomedByAllHotBlocks) { + afterHotRangeMBBs.insert(MBB); + } + } + + // Split after hotRange block set by domtree. + DenseMap DomMap; + if (!afterHotRangeMBBs.empty()) { + for (auto it : afterHotRangeMBBs) { + MachineBasicBlock *MBB = it; + for (auto it2 : afterHotRangeMBBs) { + MachineBasicBlock *MBB2 = it2; + if (MBB == MBB2) + continue; + if (pDT->dominates(MBB, MBB2)) { + auto &Dom = DomMap[MBB]; + Dom.insert(MBB2); + auto &Dom2 = DomMap[MBB2]; + Dom.insert(Dom2.begin(), Dom2.end()); + } + } + } + for (auto it : afterHotRangeMBBs) { + MachineBasicBlock *MBB = it; + auto &usedOutput = userBlocksLiveRegs[MBB]; + auto &Dom = DomMap[MBB]; + for (MachineBasicBlock *domedMBB : Dom) { + // Merge domed use to MBB use. + mergeLiveRegSet(usedOutput, userBlocksLiveRegs[domedMBB]); + // Remove domedMBB. + DomMap.erase(domedMBB); + userBlocksLiveRegs.erase(domedMBB); + } + } + } + + return DomMap; +} + +void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, + MachineDominatorTree *pDT, + MachineRegisterInfo &MRI, + SlotIndexes *slotIndexes, const SIInstrInfo *SIII, + const SIRegisterInfo *SIRI) { + MapVector> userBlocks; + DenseMap userBlocksLiveRegs; + for (unsigned Reg : Exp.BottomRegs) { + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + MachineBasicBlock *UserBB = UseMI.getParent(); + // Skip current BB. + if (UserBB == Exp.FromBB) + continue; + + userBlocks[UserBB].emplace_back(&UseMI); + auto &userLives = userBlocksLiveRegs[UserBB]; + for (MachineOperand &MO : UseMI.uses()) { + if (!MO.isReg()) + continue; + unsigned UseReg = MO.getReg(); + if (Reg != UseReg) + continue; + userLives[Reg] |= getRegMask(MO, MRI); + } + } + } + // Build dag for SubExp to help remove unused inst when clone. + ExpDag dag(MRI, SIRI, SIII, /*bJoinInput*/ true); + dag.build(Exp.inputLive, Exp.outputLive, Exp.SUnits); + DenseSet dagBottoms; + for (SUnit &SU : dag.SUnits) { + if (!SU.isInstr()) + continue; + if (SU.NumSuccs == 0) { + dagBottoms.insert(&SU); + } else { + MachineInstr *MI = SU.getInstr(); + // Add SU which def value in Exp.outputLive. + for (MachineOperand &DefMO : MI->defs()) { + if (!DefMO.isReg()) + continue; + unsigned Reg = DefMO.getReg(); + if (Exp.BottomRegs.count(Reg) > 0) { + dagBottoms.insert(&SU); + break; + } + } + } + } + + // For userBlocks which dominate all hotBlocks, don't need to clone because + // the value not cross hotBlocks when later blocks are cloned. 
+ // For userBlocks which dominated by all hotBlocks, they could share clones + // because once after hot block, the pressure is OK. + DenseMap DomMap = + reduceClonedMBBs(Exp, userBlocks, userBlocksLiveRegs, hotBlocks, pDT); + + // Sort to make stable order. + std::sort(userBlocks.begin(), userBlocks.end(), + [](std::pair>& it0, + std::pair>& it1) { + return it0.first->getNumber() < it1.first->getNumber(); + }); + + const bool bModifiesScc = Exp.modifiesRegister(AMDGPU::SCC, SIRI); + + // Clone for each userBlocks. Not share clone thru dom tree which cannot help + // reg pressure. + for (auto it : userBlocks) { + MachineBasicBlock *MBB = it.first; + // Skip MBB which share clone from other MBBs. + if (userBlocksLiveRegs.count(MBB) == 0) + continue; + auto &usedOutput = userBlocksLiveRegs[MBB]; + auto copySet = buildCloneSet(dag, dagBottoms, usedOutput); + // Clone to MBB. + // Create new regs first. + DenseMap RegMap; + auto insertPtr = MBB->getFirstNonPHI(); + // If Exp has scc read/write, make sure MBB not have scc in liveins. + if (bModifiesScc && llvm::IsSccLiveAt(MBB, insertPtr)) + continue; + MachineFunction *MF = MBB->getParent(); + for (auto it = Exp.SUnits.begin(); it != Exp.SUnits.end(); it++) { + MachineInstr *DefMI = *it; + // Not clone if already in MBB. + if (DefMI->getParent() == MBB) + continue; + // Not clone if not used for MBB. + if (copySet.count(DefMI) == 0) + continue; + + auto ClonedMI = + BuildMI(*MBB, insertPtr, DefMI->getDebugLoc(), DefMI->getDesc()); + + for (MachineOperand &Def : DefMI->defs()) { + Register Reg = Def.getReg(); + if (Reg.isPhysical()) { + if (Def.isImplicit()) + continue; + ClonedMI.addDef(Reg, 0, Def.getSubReg()); + } else { + unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg)); + RegMap[Reg] = NewReg; + ClonedMI.addDef(NewReg, 0, Def.getSubReg()); + } + } + + for (MachineOperand &MO : DefMI->uses()) { + if (MO.isReg()) { + Register Reg = MO.getReg(); + if (Reg.isPhysical()) { + if (MO.isImplicit()) + continue; + ClonedMI.addReg(Reg, 0, MO.getSubReg()); + } else { + auto it = RegMap.find(Reg); + if (it == RegMap.end()) { + ClonedMI.addReg(Reg, 0, MO.getSubReg()); + } else { + ClonedMI.addReg(it->second, 0, MO.getSubReg()); + } + } + } else { + ClonedMI.add(MO); + } + } + + MachineInstr *NewDef = ClonedMI.getInstr(); + slotIndexes->insertMachineInstrInMaps(*NewDef); + // Set mem operand + for (MachineMemOperand *MO : DefMI->memoperands()) { + NewDef->addMemOperand(*MF, MO); + } + } + + // update users in MBB. + SmallVector &userMIs = it.second; + updateUsers(userMIs, RegMap); + + // update users in dom MBBs. + auto domMapIt = DomMap.find(MBB); + if (domMapIt != DomMap.end()) { + for (MachineBasicBlock *UpdateMBB : domMapIt->second) { + SmallVector &userMIs = userBlocks[UpdateMBB]; + updateUsers(userMIs, RegMap); + } + } + } +} + + +void ApplySubExpCloneNearUserInBlock( + SubExp &Exp, + DenseMap &inBlockHotVInstMap, + DenseMap &inBlockHotSInstMap, + MachineRegisterInfo &MRI, SlotIndexes *slotIndexes, const SIInstrInfo *SIII, + const SIRegisterInfo *SIRI) { + MachineBasicBlock *MBB = Exp.FromBB; + MachineFunction *MF = MBB->getParent(); + MachineInstr *hotVMI = inBlockHotVInstMap[MBB]; + MachineInstr *hotSMI = inBlockHotSInstMap[MBB]; + // Exp is build with hotVMI or hotSMI, cannot mix. 
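+  // Clones are placed right before the first use that follows the hot slot,
+  // so the original values can die before the pressure peak.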
+ assert(!(hotVMI && hotSMI) && "cannot mix hot MI"); + MachineInstr *hotMI = hotVMI; + if (!hotMI) { + hotMI = hotSMI; + } + + SlotIndex hotSlot = slotIndexes->getInstructionIndex(*hotMI).getBaseIndex(); + const bool bModifiesScc = Exp.modifiesRegister(AMDGPU::SCC, SIRI); + + for (unsigned Reg : Exp.BottomRegs) { + + SmallVector useMIs; + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + MachineBasicBlock *UserBB = UseMI.getParent(); + // Skip current BB. + if (UserBB != Exp.FromBB) + continue; + // Skip inst in Exp. + if (Exp.BottomRoots.find(&UseMI) != Exp.BottomRoots.end()) + continue; + SlotIndex useSlot = + slotIndexes->getInstructionIndex(UseMI).getBaseIndex(); + // Only clone for use after hot slot. + if (useSlot < hotSlot) + continue; + + // Do not overwrite a live scc. + if (bModifiesScc && llvm::IsSccLiveAt(UserBB, &UseMI)) + continue; + + useMIs.emplace_back(&UseMI); + } + if (useMIs.empty()) + continue; + DenseMap RegMap; + + std::sort(useMIs.begin(), useMIs.end(), + [&slotIndexes](const MachineInstr *MIa, const MachineInstr *MIb) { + return slotIndexes->getInstructionIndex(*MIa).getBaseIndex() < + slotIndexes->getInstructionIndex(*MIb).getBaseIndex(); + }); + auto insertPtr = useMIs.front()->getIterator(); + + for (auto it = Exp.SUnits.begin(); it != Exp.SUnits.end(); it++) { + MachineInstr *DefMI = *it; + auto ClonedMI = + BuildMI(*MBB, insertPtr, DefMI->getDebugLoc(), DefMI->getDesc()); + + for (MachineOperand &Def : DefMI->defs()) { + Register Reg = Def.getReg(); + if (Reg.isPhysical()) { + ClonedMI.addDef(Reg, 0, Def.getSubReg()); + } else { + unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg)); + RegMap[Reg] = NewReg; + ClonedMI.addDef(NewReg, 0, Def.getSubReg()); + } + } + + for (MachineOperand &MO : DefMI->uses()) { + if (MO.isReg()) { + if (MO.isImplicit()) { + continue; + } + Register Reg = MO.getReg(); + if (Reg.isPhysical()) { + ClonedMI.addReg(Reg, 0, MO.getSubReg()); + } else { + auto it = RegMap.find(Reg); + if (it == RegMap.end()) { + ClonedMI.addReg(Reg, 0, MO.getSubReg()); + } else { + ClonedMI.addReg(it->second, 0, MO.getSubReg()); + } + } + } else { + ClonedMI.add(MO); + } + } + + MachineInstr *NewDef = ClonedMI.getInstr(); + slotIndexes->insertMachineInstrInMaps(*NewDef); + // Set mem operand + for (MachineMemOperand *MO : DefMI->memoperands()) { + NewDef->addMemOperand(*MF, MO); + } + } + // TODO: only clone to cross hot range. 
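+    // Rewrite the collected uses to read the freshly cloned registers.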
+ for (MachineInstr *UseMI : useMIs) { + for (MachineOperand &MO : UseMI->uses()) { + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + auto it = RegMap.find(Reg); + if (it == RegMap.end()) + continue; + unsigned NewReg = it->second; + MO.setReg(NewReg); + } + } + } +} + +bool isInLiveSet(unsigned Reg, LaneBitmask mask, + const GCNRPTracker::LiveRegSet &live) { + auto it = live.find(Reg); + if (it == live.end()) + return false; + + LaneBitmask liveMask = it->second; + return (liveMask | mask) == liveMask; +} + +unsigned getPacifistLevel(unsigned Reg, + DenseMap &pacifistLevels, + const MachineRegisterInfo &MRI) { + unsigned level = 0; + for (MachineInstr &MI : MRI.def_instructions(Reg)) { + auto it = pacifistLevels.find(&MI); + if (it == pacifistLevels.end()) + continue; + level = it->second; + } + return level; +} + +bool hasInBlockDef(unsigned Reg, MachineBasicBlock *MBB, + const MachineRegisterInfo &MRI) { + for (MachineInstr &def : MRI.def_instructions(Reg)) { + if (def.getParent() != MBB) + continue; + return true; + } + return false; +} + +MachineInstr *getInBlockUniqueDef(unsigned Reg, MachineBasicBlock *MBB, + const GCNRPTracker::LiveRegSet &inputLive, + const GCNRPTracker::LiveRegSet &outputLive, + const MachineRegisterInfo &MRI) { + MachineInstr *DefMI = nullptr; + // If live as input for MBB, cannot be unique def. + if (inputLive.count(Reg)) + return DefMI; + for (MachineInstr &def : MRI.def_instructions(Reg)) { + if (def.getParent() != MBB) + continue; + if (DefMI) { + // Not unique. + DefMI = nullptr; + break; + } + DefMI = &def; + } + return DefMI; +} + +bool isPassThru(unsigned Reg, const GCNRPTracker::LiveRegSet &inputLive, + const GCNRPTracker::LiveRegSet &outputLive) { + return inputLive.count(Reg) && outputLive.count(Reg); +} + +// Instructions which only use imm/passThru reg/output only reg will not kill any +// live reg, so name them pacifist here. +bool collectPacifist(MachineInstr &MI, + const GCNRPTracker::LiveRegSet &inputLive, + const GCNRPTracker::LiveRegSet &outputLive, + const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI) { + // If has implicit def, not move. + if (MI.getDesc().NumImplicitDefs != 0) + return false; + + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + + Register Reg = MO.getReg(); + if (MO.isImplicit() && (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)) + continue; + if (Reg.isPhysical()) + return false; + // The def for reg must be unique def in block or pass thru which not has + // def in block. If not, it is not safe to move. + if (!(nullptr != getInBlockUniqueDef(Reg, MI.getParent(), inputLive, + outputLive, MRI) || + (isPassThru(Reg, inputLive, outputLive) && + !hasInBlockDef(Reg, MI.getParent(), MRI)))) + return false; + + LaneBitmask mask = llvm::getRegMask(MO, MRI); + + if (isInLiveSet(Reg, mask, outputLive)) + continue; + + return false; + } + bool bHasDef = false; + for (MachineOperand &MO : MI.defs()) { + Register Reg = MO.getReg(); + + if (Reg.isPhysical()) + return false; + + if (nullptr == getInBlockUniqueDef(Reg, MI.getParent(), inputLive, outputLive, MRI)) + return false; + + bHasDef = true; + } + // If no def, it will not increase pressure, don't mark it. 
+ return bHasDef; +} + +static MachineInstr* findFirstAliasingLoadOrStoreInMBB( + MachineInstr &MI, + MachineBasicBlock &MBB, + AliasAnalysis *AA +) +{ + if (MI.mayLoadOrStore()) + { + for (MachineBasicBlock::iterator I = MI.getIterator(), E = MBB.end(); I != E; ++I) + { + const bool UseTBAA = false; + if (MI.mayAlias(AA, *I, UseTBAA)) + { + return &*I; + } + } + } + + return nullptr; +} + +static MachineInstr *findPacifistInsertPoint(MachineInstr &MI, MachineBasicBlock &MBB, MachineRegisterInfo &MRI, + AliasAnalysis *AA, + SlotIndexes *slotIndexes) { + + SmallVector users; + + // We cannot move the pacifist instruction past any memory + // op with which it aliases. Find the first instruction + // that aliases the pacifist MI (if any) and add it to the list + // of users. The sort() below will select the earliest user instruction. + if (MachineInstr* AliasMI = findFirstAliasingLoadOrStoreInMBB(MI, MBB, AA)) { + users.push_back(AliasMI); + } + + for (MachineOperand &MO : MI.defs()) { + unsigned Reg = MO.getReg(); + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) + { + if (&MBB != UseMI.getParent()) + continue; + users.emplace_back(&UseMI); + } + } + if (users.empty()) + return nullptr; + + std::sort(users.begin(), users.end(), + [&slotIndexes](const MachineInstr *MIa, MachineInstr *MIb) { + // Early instr first. + return SlotIndex::isEarlierInstr( + slotIndexes->getInstructionIndex(*MIa), + slotIndexes->getInstructionIndex(*MIb)); + }); + return users.front(); +} + +// Pacifist inst will only add pressure since they don't kill. +// Try to hold them as late as possible in a MBB to help pressure. +bool tryHoldPacifist(MachineBasicBlock &MBB, LiveIntervals *LIS, + MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, AliasAnalysis *AA, + RematStatus &status) +{ + const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[&MBB]; + const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[&MBB]; + + SmallVector pacifistList; + LLVM_DEBUG(dbgs() << "pacifist begin\n"); + for (MachineInstr &MI : MBB) { + if (MI.isDebugInstr()) + continue; + if (collectPacifist(MI, inputLive, outputLive, MRI, SIRI)) { + pacifistList.emplace_back(&MI); + LLVM_DEBUG(MI.dump()); + } + } + LLVM_DEBUG(dbgs() << "pacifist end\n"); + + SlotIndexes *slotIndexes = LIS->getSlotIndexes(); + bool bUpdated = false; + + // Move pacifist to its first user. + for (MachineInstr *MI : pacifistList) { + MachineInstr *firstUser = findPacifistInsertPoint(*MI, MBB, MRI, AA, slotIndexes); + if (firstUser == MI) + continue; + if (firstUser == MI->getNextNode()) + continue; + + auto insertPoint = MBB.getFirstInstrTerminator(); + if (firstUser) { + insertPoint = firstUser->getIterator(); + } else { + // When there's no terminator. + if (insertPoint == MBB.end()) + insertPoint--; + else + // BRANCH may have exec update before it. + insertPoint--; + + insertPoint = llvm::skipDebugInstructionsBackward(insertPoint, MBB.instr_begin()); + + while ((insertPoint->definesRegister(AMDGPU::EXEC, SIRI) || + insertPoint->definesRegister(AMDGPU::EXEC_LO, SIRI)) && + insertPoint != MI->getIterator()) + { + insertPoint--; + insertPoint = llvm::skipDebugInstructionsBackward(insertPoint, MBB.instr_begin()); + } + if (insertPoint == MI->getIterator()) + continue; + } + // Do not overwrite a live scc. 
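+    // Skip the move if MI writes SCC and SCC is live at the chosen insert
+    // point.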
+ if (WillSmashSccAtLocation(MI, &MBB, insertPoint)) + continue; + MI->removeFromParent(); + MBB.insert(insertPoint, MI); + + LIS->handleMove(*MI); + bUpdated = true; + } + + return bUpdated; +} + +DenseMap +collectUniformVgprs(Remat *Remat, MachineFunction &MF, MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI) { + DenseMap UniformMap; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (MI.isDebugInstr()) + continue; + if (!Remat->TotalUniformInsts.count(&MI)) + continue; + if (MI.getNumDefs() != 1) + continue; + unsigned dstIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst); + if (dstIdx == -1) + continue; + MachineOperand &DstMO = MI.getOperand(dstIdx); + if (DstMO.getSubReg() != 0) + continue; + if (DstMO.isTied()) + continue; + unsigned Reg = DstMO.getReg(); + if (MRI.getUniqueVRegDef(Reg) == nullptr) + continue; + + auto *VRC = SIRI->getRegClassForReg(MRI, Reg); + if (SIRI->isSGPRClass(VRC)) + continue; + // TODO: Support more reg class. + if (VRC != &AMDGPU::VGPR_32RegClass) + continue; + + UniformMap[Reg] = &MI; + } + } + return UniformMap; +} + +// Try insert readfirstlane on uniform vgpr to turn it in sgpr and save vgpr +// pressure. +bool collectVToSCrossHotSpot( + MachineBasicBlock &MBB, RematStatus &status, + DenseMap &UniformMap, + SmallMapVector &VToSMap, LiveIntervals *LIS, + MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII) { + unsigned VLimit = status.TargetVLimit; + unsigned SLimit = status.TargetSLimit; + auto& ST = MBB.getParent()->getSubtarget(); + + GCNDownwardRPTracker Tracker(*LIS); + + bool bUpdated = false; + const auto inputLive = status.MBBInputLiveMap[&MBB]; + Tracker.reset(*MBB.begin(), &inputLive); + for (MachineInstr &MI : MBB) { + if (MI.isDebugInstr()) { + continue; + } + + unsigned VPressure = Tracker.getPressure().getVGPRNum(ST.hasGFX90AInsts()); + unsigned SPressure = Tracker.getPressure().getMaxSGPR(); + + SPressure += RegForVCC; + + Tracker.advance(); + // Sgpr bound, vtos cannot help. + if (SPressure > SLimit) + return false; + + if (VPressure <= VLimit) { + continue; + } + + // Try to make all possible vtos to reduce vpressure. + int VExtra = VPressure - VLimit; + + const GCNRPTracker::LiveRegSet &CurLives = Tracker.getLiveRegs(); + for (auto it : CurLives) { + unsigned Reg = it.first; + auto UniformIt = UniformMap.find(Reg); + if (UniformIt == UniformMap.end()) + continue; + VToSMap[UniformIt->first] = UniformIt->second; + VExtra--; + bUpdated = true; + } + + } + return bUpdated; +} + +// Return true if the user is outside of the def's loop. 
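+// Used to reject V->S rematerialization across a loop boundary, where branch
+// divergence can make a value that is uniform inside the loop non-uniform at
+// uses outside of it.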
+static bool IsCrossLoopUse(MachineInstr *Def, MachineInstr *User, MachineLoopInfo *MLI)
+{
+  MachineLoop *L = MLI->getLoopFor(Def->getParent());
+  return L && !L->contains(User->getParent());
+}
+
+bool rematUniformVgprToSgpr(
+    Remat *Remat,
+    MachineFunction &MF, RematStatus &status,
+    DenseMap &MBBPressureMap,
+    std::vector<HotBlock> &hotBlocks, LiveIntervals *LIS, MachineRegisterInfo &MRI,
+    const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, MachineLoopInfo *MLI) {
+  DenseMap<unsigned, MachineInstr *> UniformVgprMap =
+      collectUniformVgprs(Remat, MF, MRI, SIRI);
+
+  SmallMapVector VToSMap;
+
+  for (auto &hotBlock : hotBlocks) {
+    MachineBasicBlock &MBB = *hotBlock.MBB;
+    collectVToSCrossHotSpot(MBB, status, UniformVgprMap, VToSMap, LIS, MRI,
+                            SIRI, SIII);
+  }
+
+  if (VToSMap.empty())
+    return false;
+  SlotIndexes *slotIndexes = LIS->getSlotIndexes();
+  const MCInstrDesc &ReadFirstLaneDesc = SIII->get(AMDGPU::V_READFIRSTLANE_B32);
+  for (auto it : VToSMap) {
+    unsigned Reg = it.first;
+    MachineInstr *MI = it.second;
+
+    auto *VRC = SIRI->getRegClassForReg(MRI, Reg);
+    // TODO: support bigger vgpr to sgpr.
+    if (VRC != &AMDGPU::VGPR_32RegClass)
+      continue;
+    auto *NewRC = SIRI->getEquivalentSGPRClass(VRC);
+    unsigned newDst = MRI.createVirtualRegister(NewRC);
+
+    auto ReadFirstLane =
+        BuildMI(MF, MI->getDebugLoc(), ReadFirstLaneDesc, newDst);
+    SmallVector<MachineInstr *> userMIs;
+    for (MachineInstr &userMI : MRI.use_nodbg_instructions(Reg)) {
+      // Do not replace v->s across loops. Even if the value is uniform,
+      // branch divergence can cause a uniform value in a loop to become
+      // non-uniform when used outside the loop.
+      if (IsSafeRematCandidateUser(&userMI, SIII) && !IsCrossLoopUse(MI, &userMI, MLI))
+        userMIs.emplace_back(&userMI);
+    }
+
+    // Finish the readfirstlane and insert it right after the defining MI.
+    ReadFirstLane.addReg(Reg);
+    MachineInstr *VToSMI = ReadFirstLane.getInstr();
+    Remat->TotalUniformInsts.insert(VToSMI);
+    Remat->SafeToRemoveInsts.insert(VToSMI);
+    MachineBasicBlock *MBB = MI->getParent();
+    MBB->insertAfter(MI->getIterator(), VToSMI);
+    slotIndexes->insertMachineInstrInMaps(*VToSMI);
+
+    for (MachineInstr *userMI : userMIs) {
+      const auto &Desc = userMI->getDesc();
+      bool bIllegal = false;
+      for (unsigned i = 0; i < userMI->getNumOperands(); i++) {
+        MachineOperand &MO = userMI->getOperand(i);
+        if (!MO.isReg())
+          continue;
+        if (MO.isDef())
+          continue;
+        if (MO.getReg() != Reg)
+          continue;
+        if (i >= Desc.getNumOperands()) {
+          bIllegal = true;
+          break;
+        }
+
+        MO.setReg(newDst);
+        if (userMI->getDesc().operands()[i].RegClass != -1) {
+          if (!SIII->isOperandLegal(*userMI, i, &MO)) {
+            SIII->legalizeOperands(*userMI);
+            // In case legalizeOperands did not help, legalize with a mov.
+            if (userMI->getDesc().operands()[i].RegClass != -1 &&
+                !SIII->isOperandLegal(*userMI, i)) {
+              SIII->legalizeOpWithMove(*userMI, i);
+            }
+          }
+        } else {
+          // No register class constraint on this operand; nothing to legalize.
+        }
+      }
+      if (bIllegal)
+        continue;
+
+      auto rit = userMI->getReverseIterator();
+      rit++;
+      auto endIt = userMI->getParent()->rend();
+      while (rit != endIt && !rit->isDebugInstr() && !slotIndexes->hasIndex(*rit))
+        slotIndexes->insertMachineInstrInMaps(*(rit++));
+    }
+  }
+
+  return true;
+}
+
+bool collectRematableHotReg(
+    MachineInstr &MI, const GCNRPTracker::LiveRegSet &hotLive,
+    GCNRPTracker::LiveRegSet &pureHotRematSet,
+    DenseMap<MachineInstr *, unsigned> &pureHotRematLevels, unsigned &DefReg,
+    const GCNRPTracker::LiveRegSet &inputLive,
+    const GCNRPTracker::LiveRegSet &outputLive, const MachineRegisterInfo &MRI,
+    const SIRegisterInfo *SIRI) {
+  // Ignore instructions that have no def or more than one def.
+ if (MI.getDesc().getNumDefs() != 1) + return false; + + DefReg = MI.defs().begin()->getReg(); + + unsigned level = 0; + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + + Register Reg = MO.getReg(); + + // If user is in same MI like + // %4:vgpr_32 = V_MAD_LEGACY_F32 %2:vgpr_32, %3:vgpr_32, %4:vgpr_32 + // remat it will not help. + if (Reg == DefReg) { + return false; + } + + if (MO.isImplicit() && (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)) + continue; + if (Reg.isPhysical()) + return false; + + if (nullptr == + getInBlockUniqueDef(Reg, MI.getParent(), inputLive, outputLive, MRI)) + return false; + + LaneBitmask mask = llvm::getRegMask(MO, MRI); + + if (isInLiveSet(Reg, mask, hotLive)) + continue; + + if (isInLiveSet(Reg, mask, pureHotRematSet)) { + unsigned regLevel = getPacifistLevel(Reg, pureHotRematLevels, MRI); + level = std::max(level, regLevel); + continue; + } + + return false; + } + + for (MachineOperand &MO : MI.defs()) { + Register Reg = MO.getReg(); + + if (Reg.isPhysical()) + return false; + + if (nullptr == + getInBlockUniqueDef(Reg, MI.getParent(), inputLive, outputLive, MRI)) + return false; + + LaneBitmask mask = llvm::getRegMask(MO, MRI); + pureHotRematSet[Reg] |= mask; + } + + pureHotRematLevels[&MI] = level + 1; + // If no def, it will not increase pressure, don't mark it. + return true; +} + +bool tryRemat(MachineBasicBlock &MBB, MachineInstr *hotMI, + std::vector &inBlockCloneSubExps, bool bVGPR, + const GCNRPTracker::LiveRegSet &inputLive, + const GCNRPTracker::LiveRegSet &outputLive, + DenseSet &hotSet, int vDistance, int sDistance, + unsigned VLimit, unsigned SLimit, + const DenseSet &MemWriteMBBSet, + LiveIntervals *LIS, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII) { + auto &ST = MBB.getParent()->getSubtarget(); + const auto &SI = LIS->getInstructionIndex(*hotMI).getBaseIndex(); + const auto LISLR = llvm::getLiveRegs(SI, *LIS, MRI); + + GCNRPTracker::LiveRegSet hotLive = LISLR; + + GCNRPTracker::LiveRegSet pureHotRematSet; + std::vector pureHotRematList; + DenseMap pureHotRematLevels; + + GCNRPTracker::LiveRegSet outputSet; + LLVM_DEBUG(dbgs() << "pure hot remat begin\n"); + // Find reg which could remat from other reg in liveSet. + const unsigned kMaxRematLevel = 6; + GCNDownwardRPTracker Tracker(*LIS); + Tracker.reset(*MBB.begin(), &inputLive); + for (auto it = MBB.begin(); it != MBB.end(); it++) { + MachineInstr &MI = *it; + const GCNRegPressure &RP = Tracker.getPressure(); + + if (MI.isDebugInstr()) + continue; + + // Igonre inst in hot range. + if (RP.getVGPRNum(ST.hasGFX90AInsts()) > VLimit || RP.getMaxSGPR() > SLimit) { + Tracker.advance(); + continue; + } + + // Stop at hotMI. + if (&MI == hotMI) + break; + + Tracker.advance(); + + unsigned DefReg = 0; + if (collectRematableHotReg(MI, hotLive, pureHotRematSet, pureHotRematLevels, + DefReg, inputLive, outputLive, MRI, SIRI)) { + unsigned level = pureHotRematLevels[&MI]; + if (level >= kMaxRematLevel) + continue; + + // If the def reg is in hot reg. + // Add to output. + if (hotLive.find(DefReg) != hotLive.end()) { + bool bUserIsHot = false; + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(DefReg)) { + if (UseMI.getParent() != &MBB) + continue; + if (0 == hotSet.count(&UseMI)) + continue; + + const auto &useSI = LIS->getInstructionIndex(UseMI).getBaseIndex(); + // When has a hot user after hotMI, remat it may not help. 
+ if (useSI > SI) { + bUserIsHot = true; + break; + } + } + + if (bUserIsHot) + continue; + outputSet[DefReg]; + LLVM_DEBUG(dbgs() << "hotRemat:"); + LLVM_DEBUG(MI.getOperand(0).dump()); + // remove it from hotLive to avoid it as input when build dag. + hotLive.erase(DefReg); + } + pureHotRematList.emplace_back(&MI); + LLVM_DEBUG(dbgs() << "level:" << level); + LLVM_DEBUG(MI.dump()); + } + } + + LLVM_DEBUG(dbgs() << "pure hot remat end\n"); + + // Create input/output for pure hot remat. + // Input is things hot reg in level 1 and output is things level > 1. + // Build SubExp with pureHotRematList as Nodes, hotLive as input + // rematHot as output. + // Not join input when build ExpDag to get small subExps. + ExpDag dag(MRI, SIRI, SIII, /*bJoinInput*/ false); + dag.build(hotLive, outputSet, pureHotRematList); + // Find best subExp add to inBlockCloneSubExps. + // Sort by size of subExp. + std::sort(dag.SubExps.begin(), dag.SubExps.end(), + [](const SubExp &a, const SubExp &b) { + return a.SUnits.size() < b.SUnits.size(); + }); + std::vector cloneSubExps; + int distance = bVGPR ? vDistance : sDistance; + for (SubExp &subExp : dag.SubExps) { + if (subExp.bNotSafeToCopy) + continue; + if (bVGPR) { + if (subExp.vOutputSize == 0) + continue; + } else { + if (subExp.sOutputSize == 0) + continue; + } + if (!subExp.isSafeToMove(MRI, /*bMoveUp*/ false)) + continue; + // Not clone big subExp. + if (subExp.SUnits.size() > 10) + continue; + // Do not allow remat in the block when the expression has a memory op and + // the block has a write. We could allow this in some cases with better + // analysis. + if (subExp.bHasMemInst && MemWriteMBBSet.count(&MBB)) + continue; + if (bVGPR) { + distance -= subExp.vOutputSize; + } else { + distance -= subExp.sOutputSize; + } + cloneSubExps.emplace_back(subExp); + if (distance <= 0) + break; + } + if (distance <= 0) { + inBlockCloneSubExps.insert(inBlockCloneSubExps.end(), cloneSubExps.begin(), + cloneSubExps.end()); + } + return distance <= 0; +} + +// Try to remat live reg in hot spot from other live reg in hot spot. +// +bool tryRematInHotSpot( + MachineBasicBlock &MBB, RematStatus &status, int vDistance, int sDistance, + int vSaved, int sSaved, std::vector &inBlockCloneSubExps, + DenseMap &inBlockHotVInstMap, + DenseMap &inBlockHotSInstMap, + LiveIntervals *LIS, const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { + unsigned VLimit = status.TargetVLimit; + unsigned SLimit = status.TargetSLimit; + + auto& ST = MBB.getParent()->getSubtarget(); + const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[&MBB]; + + const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[&MBB]; + + // Collect reg pressure. + unsigned maxLocalVPressure = 0; + unsigned maxLocalSPressure = 0; + // Build a DAG or only on demand? 
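+  // Single pass over the block with a downward RP tracker: among the
+  // instructions that exceed the target limits, remember the one with the
+  // highest VGPR pressure and the one with the highest SGPR pressure.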
+ MachineInstr *hotVMI = nullptr; + MachineInstr *hotSMI = nullptr; + DenseSet hotSet; + + GCNDownwardRPTracker Tracker(*LIS); + + Tracker.reset(*MBB.begin(), &inputLive); + for (auto it = MBB.begin(); it != MBB.end(); it++) { + MachineInstr &MI = *it; + if (MI.isDebugInstr()) { + continue; + } + + unsigned VPressure = Tracker.getPressure().getVGPRNum(ST.hasGFX90AInsts()); + unsigned SPressure = Tracker.getPressure().getMaxSGPR(); + + SPressure += RegForVCC; + + VPressure -= vSaved; + SPressure -= sSaved; + Tracker.advance(); + + if (VPressure <= VLimit && SPressure <= SLimit) { + continue; + } + hotSet.insert(&MI); + if (maxLocalVPressure < VPressure) { + maxLocalVPressure = VPressure; + hotVMI = &MI; + } + if (maxLocalSPressure < SPressure) { + maxLocalSPressure = SPressure; + hotSMI = &MI; + } + } + + inBlockHotVInstMap[&MBB] = hotVMI; + inBlockHotSInstMap[&MBB] = hotSMI; + if (vDistance > 0 && hotVMI) { + // Use hotVMI when apply. + inBlockHotSInstMap[&MBB] = nullptr; + if (tryRemat(MBB, hotVMI, inBlockCloneSubExps, /*bVGPR*/ true, inputLive, + outputLive, hotSet, vDistance, sDistance, VLimit, SLimit, + status.MemWriteMBBSet, + LIS, MRI, SIRI, SIII)) + return true; + } + + if (sDistance > 0 && hotSMI) { + // Use hotSMI when apply. + inBlockHotSInstMap[&MBB] = hotSMI; + inBlockHotVInstMap[&MBB] = nullptr; + return tryRemat(MBB, hotSMI, inBlockCloneSubExps, /*bVGPR*/ false, + inputLive, outputLive, hotSet, vDistance, sDistance, VLimit, + SLimit, status.MemWriteMBBSet, + LIS, MRI, SIRI, SIII); + } + return false; +} +// Sort subExpCandidates to make sure deeper subExp apply first. +// If subExp0 use result of subExp1, subExp0 is deeper than subExp1. +// When apply subExp1 before subExp0, new clone of subExp0 which use result of +// subExp1 will have old reg of subExp1. And reg pressure will not be reduced. +void sortSubExpCandidates(std::vector &subExpCandidates) { + MapVector> inputMap; + MapVector> outputMap; + struct SortNode { + SubExp Exp; + unsigned Depth; + bool bDepthDirty; + SmallDenseSet Preds; + SmallDenseSet Succs; + }; + + { + SmallVector RegSortStorage; + for (SubExp &Exp : subExpCandidates) { + RegSortStorage.assign(Exp.TopRegs.begin(), Exp.TopRegs.end()); + std::sort(RegSortStorage.begin(), RegSortStorage.end()); + for (auto it : RegSortStorage) { + unsigned Reg = it; + inputMap[Reg].insert(&Exp); + } + + RegSortStorage.assign(Exp.BottomRegs.begin(), Exp.BottomRegs.end()); + std::sort(RegSortStorage.begin(), RegSortStorage.end()); + for (auto it : RegSortStorage) { + unsigned Reg = it; + outputMap[Reg].insert(&Exp); + } + } + } + + MapVector sortMap; + for (auto it : inputMap) { + unsigned Reg = it.first; + auto outIt = outputMap.find(Reg); + if (outIt == outputMap.end()) + continue; + auto &inExps = it.second; + auto &outExps = outIt->second; + for (SubExp *inExp : inExps) { + for (SubExp *outExp : outExps) { + if (inExp->bHoist != outExp->bHoist) { + // Different direction. + // If output (def) move up, input(use) move down, nothing happens. + if (outExp->bHoist) + continue; + // Canot input(use) move up, output(def) move down. + // Choose the exp which save more. + int inExpGain = inExp->vOutputSize - inExp->vInputSize; + int outExpGain = outExp->vInputSize - inExp->vOutputSize; + if (inExpGain >= outExpGain) { + outExp->SUnits.clear(); + } else { + inExp->SUnits.clear(); + } + continue; + } + // Link outExp to inExp. 
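+        // The edge direction makes the expression that must be applied first
+        // the deeper node: the consumer when sinking, the producer when
+        // hoisting. E.g. if exp A consumes a register produced by exp B and
+        // both sink, B gets depth 0, A gets depth 1, and A is applied first.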
+ if (inExp->bHoist) { + sortMap[outExp].Preds.insert(inExp); + sortMap[inExp].Succs.insert(outExp); + } else { + sortMap[inExp].Preds.insert(outExp); + sortMap[outExp].Succs.insert(inExp); + } + } + } + } + + if (sortMap.empty()) + return; + + SmallVector WorkList; + for (SubExp &Exp : subExpCandidates) { + SortNode &Node = sortMap[&Exp]; + Node.Depth = 0; + Node.Exp = Exp; + Node.bDepthDirty = !Node.Preds.empty(); + if (!Node.bDepthDirty) + WorkList.emplace_back(&Exp); + } + // Calc depth. + while (!WorkList.empty()) { + SubExp *Exp = WorkList.pop_back_val(); + SortNode &Node = sortMap[Exp]; + for (SubExp *Succ : Node.Succs) { + SortNode &SuccNode = sortMap[Succ]; + SuccNode.Depth = std::max(SuccNode.Depth, Node.Depth + 1); + bool bAllPrevClean = true; + for (SubExp *Prev : SuccNode.Preds) { + SortNode &PrevNode = sortMap[Prev]; + if (PrevNode.bDepthDirty) { + bAllPrevClean = false; + break; + } + } + if (bAllPrevClean) { + SuccNode.bDepthDirty = false; + WorkList.push_back(Succ); + } + } + } + + std::vector nodes; + for (auto &it : sortMap) { + SortNode &node = it.second; + nodes.emplace_back(&node); + } + + struct sorter { + bool operator()(const SortNode *a, const SortNode *b) { + return a->Depth > b->Depth; + } + }; + + // subExp deeper should be apply first. + std::sort(nodes.begin(), nodes.end(), sorter()); + + subExpCandidates.clear(); + for (auto &node : nodes) { + subExpCandidates.emplace_back(node->Exp); + } +} + +// Compare pressure, return ture if maxV0/maxS0 pressure is higher than maxV1/maxS1. +bool pressureHigher(unsigned maxV0, unsigned maxS0, unsigned maxV1, + unsigned maxS1, const GCNSubtarget *ST) { + unsigned VTgtOcc0 = ST->getOccupancyWithNumVGPRs(maxV0); + unsigned VTgtOcc1 = ST->getOccupancyWithNumVGPRs(maxV1); + unsigned STgtOcc0 = ST->getOccupancyWithNumSGPRs(maxS0); + unsigned STgtOcc1 = ST->getOccupancyWithNumSGPRs(maxS1); + unsigned Occ0 = std::min(VTgtOcc0, STgtOcc0); + unsigned Occ1 = std::min(VTgtOcc1, STgtOcc1); + // big occupancy is low pressure. + if (Occ0 > Occ1) + return false; + if (Occ0 < Occ1) + return true; + // When sgpr bound, big sgpr is high pressure. + if (VTgtOcc0 > STgtOcc0 && VTgtOcc1 > STgtOcc1) { + return maxS0 > maxS1; + } + // When vgpr bound or mix, vgpr higher is higher pressure. + return maxV0 > maxV1; +} + +// Return true if the subExp can help pressure for passThrus. +bool canHelpPressureWhenSink(SubExp &subExp, const GCNRPTracker::LiveRegSet &passThrus, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, const MachineLoopInfo *MLI, + MachineDominatorTree *pDT, bool bCanClone,bool bSgprBound) { + LLVM_DEBUG(subExp.dump(MRI, SIRI)); + if (!subExp.isSafeToMove(MRI, /*bMoveUp*/ false)) + return false; + + // Update input size to ignore lives in which already in + // passThrus. + for (auto it : subExp.inputLive) { + unsigned Reg = it.first; + if (passThrus.count(Reg) == 0) + continue; + unsigned Size = getRegSize(Reg, it.second, MRI, SIRI); + if (SIRI->isVGPR(MRI, Reg)) { + subExp.vInputSize -= Size; + } else { + subExp.sInputSize -= Size; + } + } + + if (subExp.vInputSize > subExp.vOutputSize) + return false; + + if (subExp.sInputSize > subExp.sOutputSize && bSgprBound) + return false; + + if (subExp.sInputSize >= subExp.sOutputSize && + subExp.vInputSize == subExp.vOutputSize) + return false; + + // Try to find a Insert Block. + // Skip multi def output sub exp. + // Collect user blocks, find common dom. 
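+  // The sink target is the nearest common dominator of all out-of-block
+  // users, and it must itself be dominated by FromBB so the def still reaches
+  // every user.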
+ BlockSet userBlocks; + for (unsigned Reg : subExp.BottomRegs) { + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + MachineBasicBlock *UserBB = UseMI.getParent(); + // Skip current BB. + if (UserBB != subExp.FromBB) + userBlocks.insert(UserBB); + } + } + if (userBlocks.empty()) + return false; + MachineBasicBlock *userBlock = nearest_common_dominator(pDT, userBlocks); + if (!pDT->dominates(subExp.FromBB, userBlock)) { + return false; + } + if (userBlock == subExp.FromBB && + // When allow clone, could go clone path if cannot move subExp. + !bCanClone) + return false; + + subExp.ToBB = userBlock; + if (auto *toLoop = MLI->getLoopFor(userBlock)) { + auto *fromLoop = MLI->getLoopFor(subExp.FromBB); + if (!fromLoop || fromLoop->getLoopDepth() < toLoop->getLoopDepth()) + subExp.bMoveIntoLoop = true; + } else if (auto *fromLoop = MLI->getLoopFor(subExp.FromBB)) { + auto *toLoop = MLI->getLoopFor(userBlock); + // not safe to move out of loop. + if (!toLoop || fromLoop->getLoopDepth() > toLoop->getLoopDepth() || + toLoop != fromLoop) + return false; + } + return true; +} + +bool canHelpPressureWhenHoist(SubExp &subExp, const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, + const MachineLoopInfo *MLI, bool bSgprBound) { + if (!subExp.isSafeToMove(MRI, /*bMoveUp*/ true)) + return false; + if (subExp.vInputSize < subExp.vOutputSize) + return false; + if (subExp.sInputSize < subExp.sOutputSize && bSgprBound) + return false; + + if (subExp.sInputSize <= subExp.sOutputSize && + subExp.vInputSize == subExp.vOutputSize) + return false; + + // Try to find a Insert Block. + // Skip multi def output sub exp. + // Collect user blocks, find common dom. + BlockSet defBlocks; + for (unsigned Reg : subExp.TopRegs) { + MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); + if (!DefMI) + continue; + defBlocks.insert(DefMI->getParent()); + } + if (defBlocks.size() != 1) + return false; + MachineBasicBlock *defBlock = *defBlocks.begin(); + subExp.ToBB = defBlock; + // Not do same block hoist. + if (subExp.ToBB == subExp.FromBB) + return false; + + if (auto *toLoop = MLI->getLoopFor(defBlock)) { + auto *fromLoop = MLI->getLoopFor(subExp.FromBB); + // TODO: enable move into loop when hoist. + if (!fromLoop || fromLoop->getLoopDepth() < toLoop->getLoopDepth()) + return false; + } else if (auto *fromLoop = MLI->getLoopFor(subExp.FromBB)) { + auto *toLoop = MLI->getLoopFor(defBlock); + // not safe to move out of loop. + if (!toLoop || fromLoop->getLoopDepth() > toLoop->getLoopDepth() || + toLoop != fromLoop) + return false; + } + return true; +} + +SmallVector> +groupPassThruByDefBlock(Remat *Remat, + const GCNRPTracker::LiveRegSet &passThrus, + GCNRPTracker::LiveRegSet &usedPassThrus, + MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII) { + MapVector Candidates; + + // Group safe candidates by define block. + for (auto it : passThrus) { + unsigned Reg = it.first; + // Skip used pass thru reg to avoid count it twice for different hot block. + if (usedPassThrus.count(Reg)) + continue; + LLVM_DEBUG(print_vreg(Reg, MRI)); + LLVM_DEBUG(if (SIRI->isSGPRReg(MRI, Reg)) dbgs() << " sgpr "; + else dbgs() << " vgpr ";); + if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*bSink*/ true)) { + LLVM_DEBUG(dbgs() << " is not safe\n"); + continue; + } + LLVM_DEBUG(dbgs() << " is safe\n"); + // DefMI is already checked in isSafeCandidate. 
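+    // That check guarantees getUniqueVRegDef does not return null here.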
+ MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); + + GCNRPTracker::LiveRegSet &DefInMBB = Candidates[DefMI->getParent()]; + DefInMBB[Reg] = it.second; + } + + llvm::SmallVector> result = Candidates.takeVector(); + + LLVM_DEBUG(llvm::dbgs() << "Before sort candidates\n"; for (auto it + : result) { + MachineBasicBlock *MBB = it.first; + auto &defInMBB = it.second; + MBB->dump(); + llvm::dumpLiveSet(defInMBB, SIRI); + } llvm::dbgs() << "end of candidates\n";); + + std::sort(result.begin(), result.end(), + [](std::pair &it0, + std::pair &it1) { + return it0.first->getNumber() < it1.first->getNumber(); + }); + + LLVM_DEBUG(llvm::dbgs() << "After sort candidates\n"; for (auto it + : result) { + MachineBasicBlock *MBB = it.first; + auto &defInMBB = it.second; + MBB->dump(); + llvm::dumpLiveSet(defInMBB, SIRI); + } llvm::dbgs() << "end of candidates\n";); + + return result; +} + +// collect pass thru regs of MBB. +GCNRPTracker::LiveRegSet +collectPassThrus(MachineBasicBlock *MBB, + const GCNRPTracker::LiveRegSet &inputLive, + const GCNRPTracker::LiveRegSet &outputLive, + const GCNRPTracker::LiveRegSet &usedPassThrus, + const GCNRPTracker::LiveRegSet &liveRegCandidates, + MachineRegisterInfo &MRI, bool bCanClone) { + GCNRPTracker::LiveRegSet passThrus; + llvm::mergeLiveRegSet(passThrus, inputLive); + llvm::andLiveRegSet(passThrus, outputLive); + + // Remove reg which not in liveRegCandidates. + GCNRPTracker::LiveRegSet tmpPassThrus = passThrus; + for (auto it : tmpPassThrus) { + unsigned Reg = it.first; + if (!liveRegCandidates.count(Reg)) { + passThrus.erase(Reg); + } + } + tmpPassThrus = passThrus; + // Remove reg which has read/write in MBB. + for (auto it : tmpPassThrus) { + unsigned Reg = it.first; + DenseSet DefMBBs; + for (MachineInstr &DefMI : MRI.def_instructions(Reg)) { + MachineBasicBlock *MBB = DefMI.getParent(); + DefMBBs.insert(MBB); + } + DenseSet UseMBBs; + // Allow use for pass thru if clone is OK. + if (!bCanClone) { + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + MachineBasicBlock *UserMBB = UseMI.getParent(); + UseMBBs.insert(UserMBB); + } + } + bool bW = DefMBBs.count(MBB) > 0; + bool bR = UseMBBs.count(MBB) > 0; + + bool bPassThru = !bW && !bR; + if (!bPassThru) + passThrus.erase(Reg); + } + return passThrus; +} +// Try to build a free subExp which all input is passThrus. +SubExp buildFreeSubExp(Remat *Remat, SubExp &subExp, GCNRPTracker::LiveRegSet &passThrus, + MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) { + SubExp freeExp; + // Try to split the subExp to find a help case. + // Scan all inst in subExp, propagate free inst which input is from + // passThrus. + SmallDenseSet freeRegs; + SmallDenseSet freeInstUseRegs; + SmallVector freeInsts; + for (MachineInstr *MI : subExp.SUnits) { + bool bIsFree = true; + // Check all use regs are free. + for (MachineOperand &MO : MI->uses()) { + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (MO.isImplicit() && Reg == AMDGPU::EXEC) + continue; + if (MRI.getUniqueVRegDef(Reg) == nullptr) { + bIsFree = false; + break; + } + // Skip local pass thrus unless it is free. + if (passThrus.count(Reg) && subExp.TopRegs.count(Reg)) + continue; + if (freeRegs.count(Reg)) + continue; + bIsFree = false; + break; + } + // Check def is unique. + for (MachineOperand &MO : MI->defs()) { + unsigned Reg = MO.getReg(); + if (MRI.getUniqueVRegDef(Reg) == nullptr) { + bIsFree = false; + break; + } + } + if (!bIsFree) + continue; + // Save inst as free inst. + freeInsts.emplace_back(MI); + // Save def as free reg. 
+ for (MachineOperand &MO : MI->defs()) { + unsigned Reg = MO.getReg(); + freeRegs.insert(Reg); + } + // Save use regs as free use reg. + for (MachineOperand &MO : MI->uses()) { + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + + freeInstUseRegs.insert(Reg); + } + } + // Then remove local inst has no output use. + for (MachineInstr *MI : freeInsts) { + bool bIsFreeUsed = false; + for (MachineOperand &MO : MI->defs()) { + unsigned Reg = MO.getReg(); + // Used as freeInst or output. + bIsFreeUsed |= + freeInstUseRegs.count(Reg) > 0 || subExp.BottomRegs.count(Reg); + } + if (!bIsFreeUsed) + continue; + freeExp.SUnits.emplace_back(MI); + } + if (freeExp.SUnits.empty()) { + // mark has terminator to make it unsafe. + freeExp.bHasTerminatorInst = true; + return freeExp; + } + // Build BottomRegs and TopRegs for freeExp. + // BottomRegs is freeRegs in subExp.BottomRegs. + for (unsigned freeReg : freeRegs) { + if (subExp.BottomRegs.count(freeReg)) + freeExp.BottomRegs.insert(freeReg); + } + // TopRegs is freeInstUseRegs in subExp.TopRegs. + for (unsigned freeInstUseReg : freeInstUseRegs) { + if (subExp.TopRegs.count(freeInstUseReg)) + freeExp.TopRegs.insert(freeInstUseReg); + } + freeExp.FromBB = subExp.FromBB; + freeExp.ToBB = subExp.ToBB; + // must be clone since is partial of subExp. + freeExp.bCloneOnly = true; + + // Calc reg for freeExp. + for (unsigned Reg : freeExp.TopRegs) { + freeExp.inputLive[Reg]; + } + + for (unsigned Reg : freeExp.BottomRegs) { + freeExp.outputLive[Reg]; + } + + CollectLiveSetPressure(freeExp.inputLive, MRI, SIRI, freeExp.vInputSize, + freeExp.sInputSize); + CollectLiveSetPressure(freeExp.outputLive, MRI, SIRI, freeExp.vOutputSize, + freeExp.sOutputSize); + return freeExp; +} + +std::vector buildSubExpCandidates( + Remat *Remat, + SmallVector> + &Candidates, + GCNRPTracker::LiveRegSet &passThrus, MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, + const MachineLoopInfo *MLI, SlotIndexes *slotIndexes, + MachineDominatorTree *pDT, bool bCanClone, bool bSgprBound, + GCNRPTracker::LiveRegSet &unUsedPassThrus, + DenseSet &MemWriteMBBSet, + bool bAllowPartialUseInSubExp) { + std::vector subExpCandidates; + // Build exp dag on define blocks. + // Save profit candidates into list. + for (auto &it : Candidates) { + MachineBasicBlock *DefMBB = it.first; + // Try to remove out reg def sub exp from DefMBB. + GCNRPTracker::LiveRegSet &DefInMBB = it.second; + // Go up on the dag until reach share node. + auto subExps = + buildSubExpFromCandidates(Remat, DefInMBB, DefMBB, SIRI, SIII, MRI, + slotIndexes, unUsedPassThrus, bAllowPartialUseInSubExp); + for (SubExp &subExp : subExps) { + if (subExp.bHasMemInst) { + // Skip when memory ld/st inst need to cross MBB which write memory. + // TODO: check all MBBs in between FromBB and ToBB not write memory. + // Currently just skip when any memory write exist. 
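+        // The dominance checks below only allow the move when every writing
+        // block either dominates FromBB or is dominated by ToBB, i.e. no
+        // writer can sit between the old and the new position of the defs.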
+ if (!MemWriteMBBSet.empty()) { + MachineBasicBlock *FromBB = subExp.FromBB; + MachineBasicBlock *ToBB = subExp.ToBB; + if (subExp.bHoist) { + FromBB = subExp.ToBB; + ToBB = subExp.FromBB; + } + bool bCrossMemWriteMBB = false; + for (MachineBasicBlock *MemMBB : MemWriteMBBSet) { + if (pDT->dominates(ToBB, MemMBB)) + continue; + if (pDT->dominates(MemMBB, FromBB)) + continue; + bCrossMemWriteMBB = true; + break; + } + if (bCrossMemWriteMBB) + continue; + } + } + if (!canHelpPressureWhenSink(subExp, passThrus, MRI, SIRI, SIII, MLI, pDT, + bCanClone, bSgprBound)) { + if (bAllowPartialUseInSubExp && subExp.isSafeToMove(MRI, /*bMoveUp*/ false)) { + SubExp freeSubExp = buildFreeSubExp(Remat, subExp, passThrus, MRI, SIRI); + if (canHelpPressureWhenSink(freeSubExp, passThrus, MRI, SIRI, SIII, MLI, pDT, + bCanClone, bSgprBound)) { + subExpCandidates.emplace_back(freeSubExp); + } + } + continue; + } + + subExpCandidates.emplace_back(subExp); + } + } + return subExpCandidates; +} + +std::pair +calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates, + GCNRPTracker::LiveRegSet &inputLive, + GCNRPTracker::LiveRegSet &outputLive, bool bVOutBound, + bool bSOutBound, bool bCanClone, MachineDominatorTree *pDT, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) { + int vgpr = 0; + int sgpr = 0; + MachineBasicBlock *MBB = hotBB.MBB; + // Sink saving. + for (SubExp &Exp : subExpCandidates) { + if (Exp.bHoist) { + // ToMBB -> MBB -> FromMBB. + // If ToMBB not dom hot block, reg will not live in MBB. + if (!pDT->dominates(Exp.ToBB, MBB)) + continue; + } else { + // If FromBB not dom hot block, reg will not live in MBB. + if (!pDT->dominates(Exp.FromBB, MBB)) + continue; + // When subExp is from hotBB, check output instead of input. + if (Exp.FromBB == MBB) { + if (bVOutBound && Exp.vOutputSize < Exp.vInputSize) + continue; + if (bSOutBound && Exp.sOutputSize < Exp.sInputSize) + continue; + vgpr += Exp.vInputSize; + vgpr -= Exp.vOutputSize; + sgpr += Exp.sInputSize; + sgpr -= Exp.sOutputSize; + continue; + } + } + int vgprDiff = 0; + int sgprDiff = 0; + MachineBasicBlock *ToMBB = Exp.ToBB; + // If subExp is to hotBB, it is crossing output instead of input. + GCNRPTracker::LiveRegSet &crossLive = MBB == ToMBB ? outputLive : inputLive; + + bool bClone = false; + GCNRPTracker::LiveRegSet newInput; + if (!Exp.bMoveIntoLoop) { + if (Exp.bHoist) { + // If FromBB dom hot block, it will not change live for MBB. + if (Exp.FromBB != MBB && pDT->dominates(Exp.FromBB, MBB)) + continue; + } else { + // If ToBB dom hot block, it will not change live for MBB. + if (ToMBB != MBB && pDT->dominates(ToMBB, MBB)) { + if (bCanClone && !Exp.bNotSafeToCopy) { + bClone = true; + } else { + continue; + } + } + } + + for (auto outIt : Exp.outputLive) { + unsigned Reg = outIt.first; + LaneBitmask outMask = outIt.second; + LaneBitmask MBBBeginMask; + if (crossLive.find(Reg) != crossLive.end()) + MBBBeginMask = crossLive[Reg]; + // Check mask which live in both BeginSlot and exp output when sink to + // kill the output. Check mask which not live in BeginSlot but live in + // exp output when hoist to live the output. + LaneBitmask profitMask = + Exp.bHoist ? (outMask & (~MBBBeginMask)) : (outMask & MBBBeginMask); + if (MBBBeginMask.any()) { + unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI); + LLVM_DEBUG(std::string movStr = + Exp.bHoist ? "output hoist:" : "output sink:"; + dbgs() << movStr << Register::virtReg2Index(Reg) + << " " << Size); + // Exp out live at block input. 
+ // It will descrease live for MBB when sink and increase when hoist. + if (SIRI->isVGPR(MRI, Reg)) { + LLVM_DEBUG(dbgs() << "v\n"); + if (Exp.bHoist) + vgprDiff += Size; + else + vgprDiff -= Size; + } else { + LLVM_DEBUG(dbgs() << "s\n"); + if (Exp.bHoist) + sgprDiff += Size; + else + sgprDiff -= Size; + } + } + } + + for (auto inIt : Exp.inputLive) { + unsigned Reg = inIt.first; + LaneBitmask inMask = inIt.second; + LaneBitmask MBBBeginMask; + if (crossLive.find(Reg) != crossLive.end()) + MBBBeginMask = crossLive[Reg]; + // Check mask which not live in BeginSlot but live in exp input when + // sink to live the input. Check mask which live in both BeginSlot and + // exp output when hoist to kill the input. + LaneBitmask profitMask = + Exp.bHoist ? (inMask & MBBBeginMask) : (inMask & (~MBBBeginMask)); + if (profitMask.any()) { + // Update input live to avoid count same input more than once. + newInput[Reg] |= inMask; + // Exp in not live at block input. + // It will increase live for MBB. + unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI); + + LLVM_DEBUG(std::string movStr = + Exp.bHoist ? "input hoist:" : "input sink:"; + dbgs() << movStr << Register::virtReg2Index(Reg) + << " " << Size); + if (SIRI->isVGPR(MRI, Reg)) { + LLVM_DEBUG(dbgs() << "v\n"); + if (Exp.bHoist) + vgprDiff -= Size; + else + vgprDiff += Size; + } else { + LLVM_DEBUG(dbgs() << "s\n"); + if (Exp.bHoist) + sgprDiff -= Size; + else + sgprDiff += Size; + } + } + } + } else { + // When sink into loop, the input will live for every block inside loop. + // The output will only lived between to blocks and the use blocks. + // If MBB dominate any user of output live reg, it will still live in + // MBB. So cannot count that output live reg as profit. + // Hoist into loop is not supported now. + for (auto outIt : Exp.outputLive) { + unsigned Reg = outIt.first; + bool bDomUser = false; + for (MachineInstr &MI : MRI.use_nodbg_instructions(Reg)) { + MachineBasicBlock *UserMBB = MI.getParent(); + if (pDT->dominates(MBB, UserMBB)) { + bDomUser = true; + break; + } + } + if (bDomUser) + continue; + + LaneBitmask outMask = outIt.second; + LaneBitmask MBBBeginMask; + if (inputLive.find(Reg) != inputLive.end()) + MBBBeginMask = inputLive[Reg]; + LaneBitmask profitMask = outMask & MBBBeginMask; + if (MBBBeginMask.any()) { + unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI); + LLVM_DEBUG(dbgs() << "move:" << Register::virtReg2Index(Reg) + << " " << Size); + // Exp out live at block input. + // It will descrease live for MBB. + if (SIRI->isVGPR(MRI, Reg)) { + LLVM_DEBUG(dbgs() << "v\n"); + vgprDiff -= Size; + } else { + LLVM_DEBUG(dbgs() << "s\n"); + sgprDiff -= Size; + } + } + } + + for (auto inIt : Exp.inputLive) { + unsigned Reg = inIt.first; + LaneBitmask inMask = inIt.second; + LaneBitmask MBBBeginMask; + if (inputLive.find(Reg) != inputLive.end()) + MBBBeginMask = inputLive[Reg]; + // Check mask which not live in BeginSlot but live in exp input. + LaneBitmask profitMask = inMask & (~MBBBeginMask); + if (profitMask.any()) { + // Update input live to avoid count same input more than once. + newInput[Reg] |= inMask; + // Exp in not live at block input. + // It will increase live for MBB. 
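+ // getRegSize converts the profiting lane mask into 32-bit register units,
+ // so the saving is counted per lane rather than per virtual register.
+ // E.g. (made-up numbers) a 128-bit VGPR tuple has 4 lanes; if only one of
+ // them is newly live here, profitMask has a single lane set and Size is 1,
+ // not 4.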
+ unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI); + + LLVM_DEBUG(dbgs() << "add:" << Register::virtReg2Index(Reg) + << " " << Size); + if (SIRI->isVGPR(MRI, Reg)) { + LLVM_DEBUG(dbgs() << "v\n"); + vgprDiff += Size; + } else { + LLVM_DEBUG(dbgs() << "s\n"); + sgprDiff += Size; + } + } + } + } + + if (bVOutBound && vgprDiff > 0) + continue; + + if (bSOutBound && sgprDiff > 0) + continue; + llvm::mergeLiveRegSet(crossLive, newInput); + vgpr += vgprDiff; + sgpr += sgprDiff; + if (bClone) + Exp.bCloneOnly = true; + } + + return std::make_pair(vgpr, sgpr); +} + +void addExpCandidates(std::vector &subExpCandidates, + std::vector &subExps, + GCNRPTracker::LiveRegSet &usedRegs) { + subExpCandidates.insert(subExpCandidates.end(), subExps.begin(), + subExps.end()); + for (auto &Exp : subExps) { + if (Exp.bHoist) { + for (auto &Reg : Exp.TopRegs) { + usedRegs[Reg]; + } + } else { + for (auto &Reg : Exp.BottomRegs) { + usedRegs[Reg]; + } + } + } +} + +bool tryToAddSubExps( + Remat *Remat, + HotBlock &hotBB, RematStatus &status, std::vector &subExpCandidates, + std::vector &inBlockCloneSubExps, + DenseMap &inBlockHotVInstMap, + DenseMap &inBlockHotSInstMap, + SmallVector> + Candidates, + int vgpr, int sgpr, const GCNRPTracker::LiveRegSet &savingInputLive, + const GCNRPTracker::LiveRegSet &savingOutputLive, + GCNRPTracker::LiveRegSet &passThrus, GCNRPTracker::LiveRegSet &usedRegs, + MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, const MachineLoopInfo *MLI, + SlotIndexes *slotIndexes, LiveIntervals *LIS, MachineDominatorTree *pDT, + bool bCanClone, bool bVOutBound, bool bSOutBound, + GCNRPTracker::LiveRegSet &unUsedPassThrus, bool bAllowPartialUseInSubExp) { + std::vector partialSubExps = buildSubExpCandidates(Remat, + Candidates, passThrus, MRI, SIRI, SIII, MLI, slotIndexes, pDT, bCanClone, + bSOutBound, unUsedPassThrus, status.MemWriteMBBSet, + bAllowPartialUseInSubExp); + + GCNRPTracker::LiveRegSet tmpSavingInputLive = savingInputLive; + GCNRPTracker::LiveRegSet tmpSavingOutputLive = savingOutputLive; + std::pair curSaving = calculateSaving( + hotBB, partialSubExps, tmpSavingInputLive, tmpSavingOutputLive, + bVOutBound, bSOutBound, bCanClone, pDT, MRI, SIRI); + const int VLimit = status.TargetVLimit; + const int SLimit = status.TargetSLimit; + + vgpr += curSaving.first; + sgpr += curSaving.second; + + if (vgpr <= VLimit && sgpr <= SLimit) { + // nrmSubExps can help reach target occupancy, add it to + // subExpCandidates. + addExpCandidates(subExpCandidates, partialSubExps, usedRegs); + return true; + } + + if (EnableSubExpAggressive) { + // Build candidates from passThrus but not used in partialSubExps. + GCNRPTracker::LiveRegSet sinkUsedRegs; + for (auto &Exp : partialSubExps) { + for (auto &Reg : Exp.BottomRegs) { + sinkUsedRegs[Reg]; + } + } + MapVector HoistCandidates; + for (auto &it : hotBB.inputLive) { + unsigned Reg = it.first; + // Skip reg which already used for sink exp. + if (sinkUsedRegs.count(Reg)) + continue; + if (usedRegs.count(Reg)) + continue; + // Skip unsafe reg. + if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*bSink*/ false)) { + LLVM_DEBUG(dbgs() << " is not safe to hoist\n"); + continue; + } + // DefMI is already checked in isSafeCandidate. + MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); + MachineBasicBlock *DefMBB = DefMI->getParent(); + DenseSet UseMBBSet; + // Make sure all uses not in Def block are in same block. 
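+ // A hoist candidate needs a unique block to pull the computation toward:
+ // below, regs whose uses outside the def block are spread over more than
+ // one block are rejected (UseMBBSet.size() != 1), and the remaining ones
+ // are grouped by that single use block so a top-down subExp can be built
+ // there.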
+ for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + MachineBasicBlock *UseMBB = UseMI.getParent(); + if (UseMBB == DefMBB) + continue; + UseMBBSet.insert(UseMBB); + } + + if (UseMBBSet.size() != 1) + continue; + MachineBasicBlock *UseMBB = *UseMBBSet.begin(); + GCNRPTracker::LiveRegSet &UseInMBB = HoistCandidates[UseMBB]; + UseInMBB[Reg] = getRegMask(DefMI->getOperand(0), MRI); + } + + SlotIndexes *slotIndexes = LIS->getSlotIndexes(); + // Build exp dag on define blocks. + std::vector hoistSubExpCandidates; + // Save profit candidates into list. + for (auto it : HoistCandidates) { + MachineBasicBlock *UseMBB = it.first; + // Try to remove out reg def sub exp from DefMBB. + GCNRPTracker::LiveRegSet &UseInMBB = it.second; + // Go up on the dag until reach share node. + auto subExps = buildSubExpFromCandidatesTopBottom(Remat, UseInMBB, UseMBB, SIRI, + SIII, MRI, slotIndexes); + for (SubExp &subExp : subExps) { + if (!canHelpPressureWhenHoist(subExp, MRI, SIRI, SIII, MLI, bSOutBound)) + continue; + subExp.bHoist = true; + hoistSubExpCandidates.emplace_back(subExp); + } + } + + std::pair hoistSaving = calculateSaving( + hotBB, hoistSubExpCandidates, tmpSavingInputLive, tmpSavingOutputLive, + bVOutBound, bSOutBound, bCanClone, pDT, MRI, SIRI); + + int hoistVgpr = vgpr + hoistSaving.first; + int hoistSgpr = sgpr + hoistSaving.second; + + if ((hoistVgpr <= VLimit && hoistSgpr <= SLimit) || + // If status not balance, do the remat even cannot reach target. + // TODO: check the result not help even one occupancy. + (!hoistSubExpCandidates.empty() && !status.bNotBalance && + TargetOccupancy != 0)) { + // nrmSubExps can help reach target occupancy, add it to + // subExpCandidates. + addExpCandidates(subExpCandidates, partialSubExps, usedRegs); + addExpCandidates(subExpCandidates, hoistSubExpCandidates, usedRegs); + + return true; + } + } + + if (EnableVmemDegree && + // Only expect vmem when last tryToAddSubExps. + // If not, bAllowPartialUseInSubExp will no chance to be true. + (bAllowPartialUseInSubExp || + !EnableSubExpAggressive)) { + // Assume vmemLdSize could be optimized by not parallel. + if (((vgpr - hotBB.vmemLdInputSize) <= VLimit || + (vgpr - hotBB.vmemLdOutputSize) <= VLimit) && + sgpr <= SLimit) { + // nrmSubExps can help reach target occupancy, add it to + // subExpCandidates. + addExpCandidates(subExpCandidates, partialSubExps, usedRegs); + return true; + } + } + + int vDistance = vgpr - (int)VLimit; + int sDistance = status.TargetOcc > 4 ? (sgpr - (int)SLimit) : 0; + int vSaved = hotBB.maxPressures.first - vgpr; + int sSaved = hotBB.maxPressures.second - sgpr; + // Try to add inBlockCloneSubExps. + if (!tryRematInHotSpot(*hotBB.MBB, status, vDistance, sDistance, vSaved, + sSaved, inBlockCloneSubExps, inBlockHotVInstMap, + inBlockHotSInstMap, LIS, MRI, SIRI, SIII)) { + // return false always when not allow partialUseInSubExp, it will try again + // with partialUseInSubExp enabled. + if (!bAllowPartialUseInSubExp) + return false; + // If status not balance, do the remat even cannot reach target. + // TODO: check the result not help even one occupancy. + if (!status.bNotBalance && TargetOccupancy == 0) + return false; + } + // nrmSubExps can help reach target occupancy, add it to + // subExpCandidates. + addExpCandidates(subExpCandidates, partialSubExps, usedRegs); + return true; +} + +// Remat passthru regs per hot block. +// Reason to do it per block is to make sure passthru reuse is precise. 
+// If try remat on all hot blocks together, the passthru might be on one block, +// but the reuse in on another block which the reg is not passthru there. +bool perBlockPassthruRemat(Remat *Remat, + std::vector &hotBlocks, + RematStatus &status, + GCNRPTracker::LiveRegSet &liveRegCandidates, + const GCNSubtarget *ST, LiveIntervals *LIS, + const MachineLoopInfo *MLI, + MachineDominatorTree *pDT, MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII) { + bool bUpdated = false; + bool bCanClone = EnableSubExpClone | + EnableSubExpAggressive; + + SlotIndexes *slotIndexes = LIS->getSlotIndexes(); + // Sort hot blocks by pressure first. + // The hot block with higher pressure is easier to fail. + // If fail, fail fast. It it works, save the subExpCandidates. The + // subExpCandidates may help other hotblocks. + std::sort(hotBlocks.begin(), hotBlocks.end(), + [&ST](const HotBlock &a, const HotBlock &b) { + return pressureHigher(a.maxPressures.first, a.maxPressures.second, + b.maxPressures.first, b.maxPressures.second, + ST); + }); + + std::vector subExpCandidates; + // For inBlock remat clone. + std::vector inBlockCloneSubExps; + DenseMap inBlockHotVInstMap; + DenseMap inBlockHotSInstMap; + + // Save used passThrus to avoid use same reg on different MBB. + GCNRPTracker::LiveRegSet usedPassThrus; + // Save moved regs to avoid use same reg hoist and sink. + GCNRPTracker::LiveRegSet usedRegs; + + const int VLimit = status.TargetVLimit; + const int SLimit = status.TargetSLimit; + // Collect passthru for hot block. + // Try remat on it. + for (auto &it : hotBlocks) { + MachineBasicBlock *MBB = it.MBB; + + const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[MBB]; + const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[MBB]; + + it.inputLive = inputLive; + + // Add pressure by 1 to consider spill to vgpr. + const int PressureDelta = -1; + int vgpr = it.maxPressures.first - PressureDelta; + int sgpr = it.maxPressures.second; + bool bVOutBound = vgpr > VLimit; + bool bSOutBound = sgpr > SLimit; + // savingInputLive is used to calculate saving which will be modified to + // avoid count same input multiple times. + GCNRPTracker::LiveRegSet savingInputLive = inputLive; + GCNRPTracker::LiveRegSet savingOutputLive = outputLive; + std::pair curSaving = + calculateSaving(it, subExpCandidates, savingInputLive, savingOutputLive, + bVOutBound, bSOutBound, bCanClone, pDT, MRI, SIRI); + + vgpr += curSaving.first; + sgpr += curSaving.second; + + if (vgpr <= VLimit && sgpr <= SLimit) + continue; + + // Collect pass thru regs. + GCNRPTracker::LiveRegSet passThrus = + collectPassThrus(MBB, inputLive, outputLive, usedPassThrus, + liveRegCandidates, MRI, bCanClone); + + // Group pass thru regs by def MBB. + SmallVector> + Candidates = + groupPassThruByDefBlock(Remat, passThrus, usedPassThrus, MRI, SIRI, SIII); + // unUsedPassThrus used to collect passThru which is skipped when build + // subExp. + GCNRPTracker::LiveRegSet unusedPassThrus; + // Build exp dag on define blocks. + bool bAllowPartialUseInSubExp = false; + if (tryToAddSubExps(Remat, it, status, subExpCandidates, inBlockCloneSubExps, + inBlockHotVInstMap, inBlockHotSInstMap, Candidates, + vgpr, sgpr, savingInputLive, savingOutputLive, + passThrus, usedRegs, MRI, SIRI, SIII, MLI, slotIndexes, + LIS, pDT, bCanClone, bVOutBound, bSOutBound, + unusedPassThrus, bAllowPartialUseInSubExp)) { + // Remove unusedPassThrus from passThrus first. 
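+ // Book-keeping for cross-block reuse: passThru regs that the subExp build
+ // skipped (unusedPassThrus) are subtracted so later hot blocks may still
+ // consider them, while everything actually taken is merged into
+ // usedPassThrus, which is fed back into the candidate collection so the
+ // same reg is not reused for a different MBB.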
+ llvm::andNotLiveRegSet(passThrus, unusedPassThrus);
+ llvm::mergeLiveRegSet(usedPassThrus, passThrus);
+ continue;
+ }
+ // If cloning is not possible, there is no point trying partialUseInSubExp,
+ // which requires cloning.
+ if (!bCanClone)
+ return false;
+
+ // A partial-use subExp may result in a large ALU count because of cloning.
+ // Only try it when aggressive remat is enabled.
+ if (!EnableSubExpAggressive)
+ return false;
+
+ bAllowPartialUseInSubExp = true;
+ if (!tryToAddSubExps(Remat, it, status, subExpCandidates, inBlockCloneSubExps,
+ inBlockHotVInstMap, inBlockHotSInstMap, Candidates,
+ vgpr, sgpr, savingInputLive, savingOutputLive,
+ passThrus, usedRegs, MRI, SIRI, SIII, MLI, slotIndexes,
+ LIS, pDT, bCanClone, bVOutBound, bSOutBound,
+ unusedPassThrus, bAllowPartialUseInSubExp)) {
+ return false;
+ }
+ // Merge all passThrus once tryToAddSubExps is allowed to use
+ // partialUseInSubExp.
+ llvm::mergeLiveRegSet(usedPassThrus, passThrus);
+ }
+
+ // Apply changes.
+ {
+ // Sort subExpCandidates to make sure input uses are applied before output
+ // uses when a reg is both an input and an output of subExps.
+ LLVM_DEBUG(for (SubExp &Exp : subExpCandidates) { Exp.dump(MRI, SIRI); });
+ sortSubExpCandidates(subExpCandidates);
+
+ for (SubExp &Exp : subExpCandidates) {
+ // Skip exps which were cleared during sorting because of a hoist/sink
+ // conflict.
+ if (Exp.SUnits.empty())
+ continue;
+ LLVM_DEBUG(Exp.dump(MRI, SIRI));
+ if (Exp.bHoist) {
+ ApplySubExpMoveNearDefine(Exp, MRI, pDT, slotIndexes, SIII, SIRI);
+ } else {
+ if (Exp.bCloneOnly)
+ ApplySubExpCloneNearUser(Exp, hotBlocks, pDT, MRI, slotIndexes, SIII,
+ SIRI);
+ else
+ ApplySubExpMoveNearUser(Exp, MRI, pDT, slotIndexes, SIII, SIRI);
+ }
+ }
+
+ for (SubExp &Exp : inBlockCloneSubExps) {
+ ApplySubExpCloneNearUserInBlock(Exp, inBlockHotVInstMap,
+ inBlockHotSInstMap, MRI, slotIndexes,
+ SIII, SIRI);
+ }
+ // Try to see what occupancy could be reached, then decide a target.
+ // Apply remat.
+ bUpdated = !subExpCandidates.empty();
+ }
+
+ return bUpdated;
+}
+
+int getVMemLdSize(MachineBasicBlock &MBB, const SIInstrInfo *SIII,
+ const SIRegisterInfo *SIRI, const MachineRegisterInfo &MRI) {
+ int vmemLdSize = 0;
+ // Collect VMEM load size when split is enabled.
+ for (MachineInstr &MI : MBB) {
+ bool bIsHighLatency = SIII->isHighLatencyInstruction(MI);
+ if (!bIsHighLatency)
+ continue;
+ if (!(MI.mayLoad() &&
+ // Skip cases like atomics which do not return a value.
+ MI.getNumDefs() > 0))
+ continue;
+ // A VMEM load.
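+ // The destination operand's lane mask gives the load's VGPR footprint in
+ // 32-bit units, and the sizes are summed over the whole block. E.g. a
+ // BUFFER_LOAD_DWORDX4 with a full mask contributes 4 to vmemLdSize.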
+ MachineOperand &Dst = MI.getOperand(0); + LaneBitmask mask = llvm::getRegMask(Dst, MRI); + unsigned size = llvm::getRegSize(Dst.getReg(), mask, MRI, SIRI); + vmemLdSize += size; + } + return vmemLdSize; +} + +} // namespace + +bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS, + MachineDominatorTree *pDT, MachinePostDominatorTree *pPDT, + AliasAnalysis *AA) +{ + if (MF.size() < 2) + return false; + const GCNSubtarget *ST = &MF.getSubtarget(); + + const SIInstrInfo *SIII = ST->getInstrInfo(); + const SIRegisterInfo *SIRI = ST->getRegisterInfo(); + + auto &MRI = MF.getRegInfo(); + + RematStatus status = GetRematStatus(MF, MLI, LIS, MRI, ST); + + const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second; + if (status.TargetOcc >= MaxOcc) + return false; + + unsigned VLimit = status.TargetVLimit; + unsigned SLimit = status.TargetSLimit; + + int rematVCnt = status.MaxVPressure - VLimit; + int rematSCnt = status.MaxSPressure - SLimit; + + bool bSGPRSpill = false; + if (rematSCnt > 0) { + bSGPRSpill = nearSgprSpill(status.MaxSPressure, ST, MF); + } + + // If bound by lds, skip. + if ((status.TargetOcc + 1) > ST->getOccupancyWithLocalMemSize(MF) && + !bSGPRSpill) + return false; + + bool bBothOutLimit = rematVCnt > 0 && rematSCnt > 0; + // TODO: use check wqm and support vreg remat. + bool bCheckWQM = MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS; + rematVCnt = bCheckWQM & false; + + // Remat on every hot block. + + // Collect all hot blocks. + std::vector hotBlocks; + for (MachineBasicBlock &MBB : MF) { + // Collect reg pressure. + auto &RP = status.MBBPressureMap[&MBB]; + unsigned maxLocalVPressure = RP.getVGPRNum(ST->hasGFX90AInsts()); + unsigned maxLocalSPressure = RP.getMaxSGPR(); + + maxLocalSPressure += RegForVCC; + + if (!EnableInBlockRemat) { + if (maxLocalVPressure <= VLimit && maxLocalSPressure <= SLimit) + continue; + } + + // Move inst which input is imm/pass thru reg/out reg to help pressure. + if (tryHoldPacifist(MBB, LIS, MRI, SIRI, SIII, AA, status)) { + maxLocalVPressure = 0; + maxLocalSPressure = 0; + CollectMBBPressure(MBB, LIS, MRI, ST, maxLocalVPressure, + maxLocalSPressure, status); + + maxLocalSPressure += RegForVCC; + + } + if (maxLocalVPressure <= VLimit && maxLocalSPressure <= SLimit) + continue; + + // When both vgpr sgpr out limit, only help vgpr. + if (bBothOutLimit && maxLocalVPressure <= VLimit) + continue; + GCNRPTracker::LiveRegSet liveSet; + hotBlocks.push_back({ &MBB, liveSet,std::make_pair(maxLocalVPressure, maxLocalSPressure), 0, 0 }); + } + // Collect vmemLdInput/OutputSize. + if (EnableVmemDegree) { + DenseMap outputVMemLdSizeMap; + for (auto it : hotBlocks) { + MachineBasicBlock *MBB = it.MBB; + // Collect vmemLd when enable split. 
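+ // A sketch of the model used here: vmemLdOutputSize is the footprint of
+ // high-latency loads issued in the block itself, while vmemLdInputSize is
+ // inherited from the block's single predecessor, whose loads are presumably
+ // still in flight on entry. tryToAddSubExps later treats that size as
+ // pressure that could be recovered by serializing the loads instead of
+ // keeping them all live at once.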
+ int vmemLdSize = getVMemLdSize(*MBB, SIII, SIRI, MRI); + if (vmemLdSize) { + outputVMemLdSizeMap[MBB] = vmemLdSize; + } + } + for (auto &it : hotBlocks) { + MachineBasicBlock *MBB = it.MBB; + + auto oit = outputVMemLdSizeMap.find(MBB); + if (oit != outputVMemLdSizeMap.end()) + it.vmemLdOutputSize = oit->second; + + if (MBB->pred_size() != 1) + continue; + + MachineBasicBlock *Pred = *MBB->pred_begin(); + oit = outputVMemLdSizeMap.find(Pred); + if (oit != outputVMemLdSizeMap.end()) { + it.vmemLdInputSize = oit->second; + } else { + if (Pred->getFirstTerminator() != Pred->end()) + continue; + if (Pred->empty()) + continue; + bool bIsHighLatency = SIII->isHighLatencyInstruction(Pred->back()); + if (!bIsHighLatency) + continue; + int vmemLdSize = getVMemLdSize(*Pred, SIII, SIRI, MRI); + it.vmemLdInputSize = vmemLdSize; + } + } + } + + if (EnableUniformVectorToScalar) { + if (rematUniformVgprToSgpr(Remat, MF, status, status.MBBPressureMap, hotBlocks, LIS, MRI, + SIRI, SIII, MLI)) { + // Rebuild LIS. + LIS->reanalyze(MF); + status = GetRematStatus(MF, MLI, LIS, MRI, ST); + bool bSgprSpilled = nearSgprSpill(status.MaxSPressure, ST, MF); + if (bSgprSpilled) { + bool bNearTarget = false; + hotBlockRemat(Remat, MF, MLI, LIS, pDT, pPDT, bNearTarget); + // Rebuild LIS. + LIS->reanalyze(MF); + status = GetRematStatus(MF, MLI, LIS, MRI, ST); + } + + for (auto &it : hotBlocks) { + MachineBasicBlock *MBB = it.MBB; + + // Update pressure. + auto &RP = status.MBBPressureMap[MBB]; + unsigned maxLocalVPressure = RP.getVGPRNum(ST->hasGFX90AInsts()); + unsigned maxLocalSPressure = RP.getMaxSGPR(); + + maxLocalSPressure += RegForVCC; + it.maxPressures.first = maxLocalVPressure; + it.maxPressures.second = maxLocalSPressure; + } + } + } + + // Collect all live reg which cross hot blocks. + GCNRPTracker::LiveRegSet liveRegCandidates; + for (auto it : hotBlocks) { + MachineBasicBlock *MBB = it.MBB; + + const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[MBB]; + + const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[MBB]; + + llvm::mergeLiveRegSet(liveRegCandidates, inputLive); + llvm::mergeLiveRegSet(liveRegCandidates, outputLive); + } + + // Check min VGPR bound. 
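+ // A rough sketch of this optional step: llvm::getRegBound computes, per
+ // block, a bound on the pressure reachable by rescheduling alone. Blocks
+ // already under the target limits are set aside in PressureUnderLimitSet;
+ // for the rest, maxPressures is lowered to the bound where the bound is
+ // smaller, so the remat decisions below work from the tighter number.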
+ BlockSet PressureUnderLimitSet; + if (EnableSubExpMinReg) { + for (auto &it : hotBlocks) { + MachineBasicBlock *MBB = it.MBB; + unsigned MaxLocalVGPR = 0; + unsigned MaxLocalSGPR = 0; + llvm::getRegBound(MBB, MRI, SIRI, SIII, LIS, MaxLocalVGPR, MaxLocalSGPR); + + if (MaxLocalVGPR < VLimit && MaxLocalSGPR < SLimit) { + PressureUnderLimitSet.insert(MBB); + } else { + if (MaxLocalVGPR < it.maxPressures.first) + it.maxPressures = std::make_pair(MaxLocalVGPR, it.maxPressures.second); + if (MaxLocalSGPR < it.maxPressures.second) + it.maxPressures = std::make_pair(it.maxPressures.first, MaxLocalSGPR); + } + } + } + + bool bUpdated = perBlockPassthruRemat(Remat, hotBlocks, status, liveRegCandidates, + ST, LIS, MLI, pDT, MRI, SIRI, SIII); + + return bUpdated; +} + +bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) { + if (MF.size() < 2) + return false; + LiveIntervals *LIS = &getAnalysis().getLIS(); + MachineDominatorTree *DT = &getAnalysis().getDomTree(); + MachinePostDominatorTree *PDT = &getAnalysis().getPostDomTree(); + MachineLoopInfo *MLI = &getAnalysis().getLI(); + AliasAnalysis *AA = &getAnalysis().getAAResults(); + + { + llvm::MirGPUDivergenceAnalysis DA(MF, *DT, *PDT, *MLI); + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (DA.isUniform(&MI)) { + TotalUniformInsts.insert(&MI); + } + } + } + } + + //LLVM_DEBUG(pressure::write_pressure(MF, LIS, R"(D:\Temp\d.json)")); + // For non-cs/ps, set target occ as 4. + bool bNearTarget = false; + bool bFinalUpdated = false; + bool bUpdated = hotBlockRemat(this, MF, MLI, LIS, DT, PDT, bNearTarget); + bFinalUpdated |= bUpdated; + if (EnableSubExp) { + if (bUpdated) { + // Rebuild LIS. + LIS->reanalyze(MF); + } + + bUpdated = GroupRemat(this, MF, MLI, LIS, DT, PDT, AA); + + bFinalUpdated |= bUpdated; + } + return bFinalUpdated; +} + +INITIALIZE_PASS_BEGIN(AMDGPUHotBlockRematerialize, DEBUG_TYPE, + "AMDGPU rematerialize", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) +INITIALIZE_PASS_END(AMDGPUHotBlockRematerialize, DEBUG_TYPE, "AMDGPU rematerialize", + false, false) + +char AMDGPUHotBlockRematerialize::ID = 0; +char &llvm::AMDGPUHotBlockRematerializeID = AMDGPUHotBlockRematerialize::ID; + +FunctionPass *llvm::createAMDGPUHotBlockRematerializePass() { + return new AMDGPUHotBlockRematerialize(); +} + diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp new file mode 100644 index 0000000000000..6f44fec08239c --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp @@ -0,0 +1,2241 @@ +/////////////////////////////////////////////////////////////////////////////// +// // +// AMDGPUMIRUtils.cpp // +// Copyright (C) Microsoft Corporation. All rights reserved. // +// This file is distributed under the University of Illinois Open Source // +// License. See LICENSE.TXT for details. // +// // +// Util functions for llvm MIR Passes. 
// +// // +/////////////////////////////////////////////////////////////////////////////// + +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "SIMachineFunctionInfo.h" + +//#include "dxc/DXIL/DxilMetadataHelper.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/raw_ostream.h" + +#include "llvm/ADT/IntEqClasses.h" +#include "llvm/Support/GraphWriter.h" + +#include "llvm/Support/Debug.h" + +#include "GCNRegPressure.h" +#include "AMDGPUMIRUtils.h" +#include "AMDGPUSubExpDag.h" +#include + +#define DEBUG_TYPE "xb-mir-util" +using namespace llvm; +namespace { +class CFGWithPhi { +public: + CFGWithPhi(MachineFunction &F) : F(F) { + // Collect phi and phi related insts. + MachineRegisterInfo &MRI = F.getRegInfo(); + + for (MachineBasicBlock &BB : F) { + auto &phiInsts = blockToPhiInstsMap[&BB]; + for (MachineInstr &I : BB) { + if (!I.isPHI()) + break; + phiInsts.insert(&I); + unsigned Reg = I.getOperand(0).getReg(); + // Add incoming values. + for (unsigned i=1;igetParent()].insert(DefMI); + } + // Add users. + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + blockToPhiInstsMap[UseMI.getParent()].insert(&UseMI); + } + } + } + } /// Adds custom features for a visualization of the ScheduleDAG. + void addCustomGraphFeatures(llvm::GraphWriter &) const {} + MachineFunction &F; + DenseMap> blockToPhiInstsMap; + void dump(); +}; + +void CFGWithPhi::dump() { +#ifdef DBG + for (MachineBasicBlock &BB : F) { + dbgs() << BB.getName() << "\n"; + auto &phiInsts = blockToPhiInstsMap[&BB]; + for (MachineInstr *I : phiInsts) { + if (!I->isPHI()) + continue; + I->dump(); + } + for (MachineInstr *I : phiInsts) { + if (I->isPHI()) + continue; + I->dump(); + } + } +#endif +} + +} // namespace + +// CFGWithPhi dump. +namespace llvm { + +template <> struct DOTGraphTraits : public DefaultDOTGraphTraits { + + DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} + + static std::string getGraphName(const CFGWithPhi *G) { + return "CFG with Phi graph"; + } + + static std::string getNodeIdentifierLabel(const MachineBasicBlock *Node, + const CFGWithPhi *Graph) { + std::string R; + raw_string_ostream OS(R); + OS << static_cast(Node); + return R; + } + + static std::string getNodeLabel(const MachineBasicBlock *BB, const CFGWithPhi *G) { + enum { MaxColumns = 8000 }; + std::string Str; + raw_string_ostream OS(Str); + + OS << "BB:" << BB->getName(); + auto it = G->blockToPhiInstsMap.find(BB); + if (it != G->blockToPhiInstsMap.end()) { + + auto &phiInsts = it->second; + for (MachineInstr *I : phiInsts) { + if (!I->isPHI()) + continue; + I->print(OS); + OS << "\n"; + } + for (MachineInstr *I : phiInsts) { + if (I->isPHI()) + continue; + I->print(OS); + OS << "\n"; + } + } + std::string OutStr = OS.str(); + if (OutStr[0] == '\n') + OutStr.erase(OutStr.begin()); + + // Process string output to make it nicer... + unsigned ColNum = 0; + unsigned LastSpace = 0; + for (unsigned i = 0; i != OutStr.length(); ++i) { + if (OutStr[i] == '\n') { // Left justify + OutStr[i] = '\\'; + OutStr.insert(OutStr.begin() + i + 1, 'l'); + ColNum = 0; + LastSpace = 0; + } else if (OutStr[i] == ';') { // Delete comments! + unsigned Idx = OutStr.find('\n', i + 1); // Find end of line + OutStr.erase(OutStr.begin() + i, OutStr.begin() + Idx); + --i; + } else if (ColNum == MaxColumns) { // Wrap lines. 
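+ // "\l..." is inserted as a DOT left-justified line break, preferably at the
+ // last seen space so identifiers are not split mid-token.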
+ // Wrap very long names even though we can't find a space. + if (!LastSpace) + LastSpace = i; + OutStr.insert(LastSpace, "\\l..."); + ColNum = i - LastSpace; + LastSpace = 0; + i += 3; // The loop will advance 'i' again. + } else + ++ColNum; + if (OutStr[i] == ' ') + LastSpace = i; + } + return OutStr; + } + static std::string getNodeDescription(const MachineBasicBlock *SU, + const CFGWithPhi *G) { + return SU->getName().str(); + } + + static void addCustomGraphFeatures(CFGWithPhi *G, + GraphWriter &GW) { + return G->addCustomGraphFeatures(GW); + } +}; + +template <> struct GraphTraits { + using NodeRef = MachineBasicBlock *; + using ChildIteratorType = MachineBasicBlock::succ_iterator; + using nodes_iterator = pointer_iterator; + + // static NodeRef getEntryNode(const CFGWithPhi *G) { + // return G->F.getFunctionEntry(); + //} + + static ChildIteratorType child_begin(const NodeRef N) { + return N->succ_begin(); + } + + static ChildIteratorType child_end(const NodeRef N) { return N->succ_end(); } + + static nodes_iterator nodes_begin(const CFGWithPhi *G) { + return nodes_iterator(G->F.begin()); + } + + static nodes_iterator nodes_end(const CFGWithPhi *G) { + return nodes_iterator(G->F.end()); + } +}; + +} // namespace llvm + +namespace llvm { + +unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask, + const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI) { + unsigned Size = SIRI->getRegSizeInBits(*MRI.getRegClass(Reg)); + Size >>= 5; + LaneBitmask mask = Mask; + if (mask.any()) { + if (unsigned maskSize = mask.getNumLanes()) { + if (maskSize < Size) + Size = maskSize; + } + } + return Size; +} + +void CollectLiveSetPressure(const LiveSet &liveSet, + const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, unsigned &VPressure, + unsigned &SPressure) { + VPressure = 0; + SPressure = 0; + for (auto liveIt : liveSet) { + unsigned Reg = liveIt.first; + unsigned Size = getRegSize(Reg, liveIt.second, MRI, SIRI); + if (SIRI->isVGPR(MRI, Reg)) { + VPressure += Size; + } else { + SPressure += Size; + } + } +} + +bool isExecUpdateForControlFlow(llvm::MachineInstr &MI) { + bool isExecUpdate = false; + unsigned opcode = MI.getOpcode(); + if (opcode == AMDGPU::S_MOV_B64 || opcode == AMDGPU::S_MOV_B32 || + opcode == AMDGPU::S_OR_B64_term || opcode == AMDGPU::S_OR_B32_term || + opcode == AMDGPU::S_OR_SAVEEXEC_B64 || + opcode == AMDGPU::S_OR_SAVEEXEC_B32 || opcode == AMDGPU::S_AND_B64 || + opcode == AMDGPU::S_AND_B32 || opcode == AMDGPU::S_ANDN2_B64 || + opcode == AMDGPU::S_ANDN2_B32) { + MachineOperand &Dst = MI.getOperand(0); + if (Dst.getReg() == AMDGPU::EXEC || Dst.getReg() == AMDGPU::EXEC_LO) { + isExecUpdate = true; + } + } + return isExecUpdate; +} + +bool IsSub0Sub1SingleDef(unsigned Reg, const MachineRegisterInfo &MRI) { + // Support multi def for pattern of pointer: + // undef %808.sub0:sgpr_64 = COPY killed %795:sgpr_32 + // %808.sub1:sgpr_64 = S_MOV_B32 0 + bool bHasSub0 = false; + bool bHasSub1 = false; + for (MachineOperand &UserDefMO : MRI.def_operands(Reg)) { + if (unsigned SubReg = UserDefMO.getSubReg()) { + bool bSingleSubReg = false; + switch (SubReg) { + default: + break; + case AMDGPU::sub0: + if (!bHasSub0) { + bHasSub0 = true; + bSingleSubReg = true; + } + break; + case AMDGPU::sub1: + if (!bHasSub1) { + bHasSub1 = true; + bSingleSubReg = true; + } + break; + } + if (!bSingleSubReg) { + bHasSub0 = false; + break; + } + } else { + bHasSub0 = false; + break; + } + } + + return (bHasSub0 && bHasSub1); +} + +LaneBitmask getRegMask(const MachineOperand &MO, 
+ const MachineRegisterInfo &MRI) { + // We don't rely on read-undef flag because in case of tentative schedule + // tracking it isn't set correctly yet. This works correctly however since + // use mask has been tracked before using LIS. + return MO.getSubReg() == 0 + ? MRI.getMaxLaneMaskForVReg(MO.getReg()) + : MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask( + MO.getSubReg()); +} + +void mergeLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet) { + for (auto Reg : inputSet) { + unsigned reg = Reg.first; + LaneBitmask mask = Reg.second; + auto targetReg = targetSet.find(reg); + if (targetReg != targetSet.end()) { + LaneBitmask targetMask = targetReg->second; + mask |= targetMask; + } + targetSet[reg] = mask; + } +} + +void andLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet) { + GCNRPTracker::LiveRegSet AndSet; + for (auto Reg : inputSet) { + unsigned reg = Reg.first; + LaneBitmask mask = Reg.second; + auto targetReg = targetSet.find(reg); + if (targetReg != targetSet.end()) { + LaneBitmask targetMask = targetReg->second; + mask &= targetMask; + AndSet[reg] = mask; + } + } + + targetSet = AndSet; +} + +void andNotLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet) { + for (auto Reg : inputSet) { + unsigned reg = Reg.first; + LaneBitmask mask = Reg.second; + auto targetReg = targetSet.find(reg); + if (targetReg != targetSet.end()) { + LaneBitmask targetMask = targetReg->second; + if ((targetMask | mask) == mask) + targetSet.erase(reg); + else + targetSet[reg] = targetMask & (~mask); + } + } +} + +MachineBasicBlock *split(MachineInstr *Inst) { + + // Create the fall-through block. + MachineBasicBlock *MBB = Inst->getParent(); + MachineFunction *MF = MBB->getParent(); + MachineBasicBlock *SuccMBB = MF->CreateMachineBasicBlock(); + auto MBBIter = ++(MBB->getIterator()); + MF->insert(MBBIter, SuccMBB); + SuccMBB->transferSuccessorsAndUpdatePHIs(MBB); + MBB->addSuccessor(SuccMBB); + + // Splice the code over. + SuccMBB->splice(SuccMBB->end(), MBB, ++Inst->getIterator(), MBB->end()); + + return SuccMBB; +} + +struct Piece { + unsigned Reg; + unsigned offset; + unsigned size; + static SmallVector split(std::bitset<32> mask) { + + SmallVector pieces; + Piece piece = {0, 0, 0}; + for (unsigned i = 0; i < 32; i++) { + if (mask.test(i)) { + if (piece.size == 0) + piece.offset = i; + + piece.size++; + // Make sure no piece bigger than 8. + if (piece.size == 8) { + pieces.emplace_back(piece); + piece.size = 0; + } + } else { + if (piece.size == 0) { + continue; + } + pieces.emplace_back(piece); + piece.size = 0; + } + } + return pieces; + } +}; + +void updateSubReg(MachineOperand &UseMO, const llvm::TargetRegisterClass *NewRC, + unsigned offset, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII) { + unsigned size = NewRC->getLaneMask().getNumLanes(); + if (size == 1) { + UseMO.setSubReg(0); + } else { + const uint32_t SubReg = UseMO.getSubReg(); + LaneBitmask Mask = SIRI->getSubRegIndexLaneMask(SubReg); + + unsigned mask = Mask.getAsInteger() >> offset; + + unsigned NewSubReg = SIRI->getMinimalSpanningSubRegIdxSetForLaneMask( + NewRC, LaneBitmask(mask)) + .front(); + + UseMO.setSubReg(NewSubReg); + } +} + +bool reduceChannel(unsigned offset, MachineInstr &MI, const MCInstrDesc &desc, + MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, SlotIndexes *SlotIndexes) { + MachineOperand &DstMO = MI.getOperand(0); + // Skip case when dst subReg not 0. 
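+ // (The rewrite below replaces the full definition, so a def that already
+ // writes a subreg is not handled.) Overall sketch of reduceChannel, based
+ // on how removeUnusedLanes calls it: the wide S_BUFFER_LOAD is switched to
+ // the narrower desc, the offset is advanced by offset * 4 bytes, users'
+ // subregs are remapped relative to the new base, and the dst reg class is
+ // shrunk. E.g. (made-up case) an S_BUFFER_LOAD_DWORDX4_SGPR whose users
+ // only read sub2/sub3 becomes an S_BUFFER_LOAD_DWORDX2_SGPR with soffset
+ // increased by 8, and the users read sub0/sub1 of the new 64-bit value.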
+ if (DstMO.getSubReg()) { + return false; + } + unsigned Reg = DstMO.getReg(); + + SmallVector UseMOs; + for (MachineOperand &UseMO : MRI.use_nodbg_operands(Reg)) { + UseMOs.emplace_back(&UseMO); + } + + const llvm::TargetRegisterClass *NewRC = + SIRI->getRegClass(desc.operands().front().RegClass); + unsigned size = NewRC->getLaneMask().getNumLanes(); + if (offset > 0) { + // Update offset operand in MI. + MachineOperand *OffsetOp = + SIII->getNamedOperand(MI, AMDGPU::OpName::offset); + + const uint32_t LaneSize = sizeof(uint32_t); + if (OffsetOp) { + if (OffsetOp->isImm()) { + assert(OffsetOp != nullptr); + int64_t Offset = OffsetOp->getImm(); + Offset += offset * LaneSize; + if (!SIII->isLegalMUBUFImmOffset(Offset)) { + return false; + } + OffsetOp->setImm(Offset); + } else { + return false; + } + } else { + OffsetOp = SIII->getNamedOperand(MI, AMDGPU::OpName::soffset); + if (OffsetOp) { + unsigned NewOffsetReg = + MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + auto OffsetAdd = BuildMI(*MI.getParent()->getParent(), MI.getDebugLoc(), + SIII->get(AMDGPU::S_ADD_U32)) + .addDef(NewOffsetReg) + .add(*OffsetOp) + .addImm(offset * LaneSize); + MachineInstr *OffsetAddMI = OffsetAdd.getInstr(); + MachineBasicBlock::iterator InsertPoint = + llvm::FindOrCreateInsertionPointForSccDef( + MI.getParent(), MI, SIRI, SIII, &MRI + ); + MI.getParent()->insert(InsertPoint, OffsetAddMI); + SIII->legalizeOperands(*OffsetAddMI); + OffsetOp->setReg(NewOffsetReg); + OffsetOp->setSubReg(0); + if (SlotIndexes) + SlotIndexes->insertMachineInstrInMaps(*OffsetAddMI); + } else { + return false; + } + } + // Update subReg for users. + for (MachineOperand *UseMO : UseMOs) { + updateSubReg(*UseMO, NewRC, offset, SIRI, SIII); + } + } else if (size == 1) { + // Clear subReg when size is 1. + for (MachineOperand *UseMO : UseMOs) { + UseMO->setSubReg(0); + } + } + + MI.setDesc(desc); + // Mutate reg class of Reg. + MRI.setRegClass(Reg, NewRC); + return true; +} + +bool removeUnusedLanes(llvm::MachineInstr &MI, MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, + SlotIndexes *SlotIndexes) { + bool bImm = false; + switch (MI.getOpcode()) { + default: + break; + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX16_IMM: + bImm = true; + case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR: + case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR: + case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: { + unsigned Reg = MI.getOperand(0).getReg(); + if (!MRI.getUniqueVRegDef(Reg)) + return false; + LaneBitmask dstMask = getRegMask(MI.getOperand(0), MRI); + LaneBitmask UseMask; + for (MachineOperand &MO : MRI.use_operands(Reg)) { + UseMask |= llvm::getRegMask(MO, MRI); + } + + const unsigned fullMask = dstMask.getAsInteger(); + unsigned mask = UseMask.getAsInteger(); + if (mask == fullMask) + return false; + // Split mask when there's gap. Then group mask to 2/4/8. + auto pieces = Piece::split(std::bitset<32>(mask)); + // Now only support 1 piece. + if (pieces.size() != 1) + return false; + auto piece = pieces[0]; + if (piece.size > 8) + return false; + + // TODO: enable offset support when bImm is true. + // Now if break different test when mul LaneSize or not mul for the offset. + if (bImm && piece.offset != 0) + return false; + + switch (piece.size) { + default: + return false; + case 1: + return reduceChannel(piece.offset, MI, + SIII->get(bImm ? 
AMDGPU::S_BUFFER_LOAD_DWORD_IMM + : AMDGPU::S_BUFFER_LOAD_DWORD_SGPR), + MRI, SIRI, SIII, SlotIndexes); + case 2: + return reduceChannel(piece.offset, MI, + SIII->get(bImm ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM + : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR), + MRI, SIRI, SIII, SlotIndexes); + case 3: + if (fullMask == 0xf) + return false; + case 4: + return reduceChannel(piece.offset, MI, + SIII->get(bImm ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM + : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR), + MRI, SIRI, SIII, SlotIndexes); + case 5: + case 6: + case 7: + if (fullMask == 0xff) + return false; + case 8: + return reduceChannel(piece.offset, MI, + SIII->get(bImm ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM + : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR), + MRI, SIRI, SIII, SlotIndexes); + } + + } break; + } + return false; +} + +// LoopInfo contains a mapping from basic block to the innermost loop. Find +// the outermost loop in the loop nest that contains BB. +const MachineLoop *getOutermostLoop(const MachineLoopInfo *LI, + const MachineBasicBlock *BB) { + const MachineLoop *L = LI->getLoopFor(BB); + if (L) { + while (const MachineLoop *Parent = L->getParentLoop()) + L = Parent; + } + return L; +} + +// True if there is a loop which contains both BB1 and BB2. +bool loopContainsBoth(const MachineLoopInfo *LI, const MachineBasicBlock *BB1, + const MachineBasicBlock *BB2) { + const MachineLoop *L1 = getOutermostLoop(LI, BB1); + const MachineLoop *L2 = getOutermostLoop(LI, BB2); + return L1 != nullptr && L1 == L2; +} + +bool reach_block(MachineBasicBlock *FromBB, MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, MachineLoopInfo *LI, + MachineBasicBlock *ToBB) { + if (FromBB == ToBB) { + return true; + } + + if (DT->dominates(FromBB, ToBB)) { + return true; + } + + if (PDT->dominates(ToBB, FromBB)) { + return true; + } + + if (loopContainsBoth(LI, ToBB, FromBB)) { + return true; + } + // TODO: cover case hotBB in loop, + // one block in that loop dom BB or + // BB post dom one block in that loop. + return false; +} + +// If BB can reach hotMBBs. +bool reach_blocks(MachineBasicBlock *BB, MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, MachineLoopInfo *LI, + DenseSet &hotMBBs) { + bool bCross = false; + for (MachineBasicBlock *hotBB : hotMBBs) { + if (reach_block(BB, DT, PDT, LI, hotBB)) { + bCross = true; + break; + } + } + return bCross; +} + +} + +namespace llvm { +void viewCFGWithPhi(llvm::MachineFunction &F) { +#ifdef DBG + CFGWithPhi G(F); + ViewGraph(const_cast(&G), F.getName(), false, F.getName()); + G.dump(); +#endif +} +} // namespace llvm + +namespace llvm { +bool GetNonDebugMBBEnd(MachineBasicBlock::reverse_iterator &BBEnd, + MachineBasicBlock &MBB) { + // R.End doesn't point to the boundary instruction. + // Skip Debug instr. + while (BBEnd != MBB.rend() && BBEnd->isDebugInstr()) + BBEnd++; + return BBEnd != MBB.rend(); +} +} // namespace llvm + +// Helper functions to write jason. 
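+// A rough shape of the document these helpers emit, pieced together from
+// write_function/write_block below (field order and trailing commas follow
+// the helpers; "..." marks elided entries):
+// {
+//  "name": "...",
+//  "blocks": [ { "name": "...", "id": 0, "begin_slot": "...", "end_slot": "...",
+//                "instructions": [...], "input_nodes": [...], "inst_nodes": [...],
+//                "preds": [...], "succs": [...] }, ],
+//  "defines": [...], "uses": [...], ...
+// }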
+namespace { +void json_name(StringRef Val, raw_ostream &os) { os << "\"" << Val << "\":"; } + +template +void json_pair(StringRef Val, write_fn &fn, raw_ostream &os) { + json_name(Val, os); + os << "\""; + fn(); + os << "\""; +} + +template +void json_obj_pair(StringRef Val, write_fn &fn, raw_ostream &os) { + json_name(Val, os); + + fn(); +} + +template +void json_array(StringRef Val, write_fn &fn, raw_ostream &os) { + json_name(Val, os); + os << "["; + fn(); + os << "]"; +} +} // namespace + +namespace llvm { +namespace pressure { + +void write_inst(MachineInstr &MI, const SlotIndexes *SlotIndexes, + const SIInstrInfo *SIII, raw_ostream &os) { + os << "{"; + SlotIndex Slot = SlotIndexes->getInstructionIndex(MI); + auto writeSlot = [&Slot, &os]() { Slot.print(os); }; + + json_pair("slot_index", writeSlot, os); + + os << ","; + + auto writeOpcode = [&MI, &SIII, &os]() { + os << SIII->getName(MI.getOpcode()); + }; + + json_pair("opcode", writeOpcode, os); + + os << ","; + + auto writeAsm = [&MI, &SIII, &os]() { + MI.print(os, /*IsStandalone*/ true, /*SkipOpers*/ false, + /*SkipDebugLoc*/ true, /*AddNewLine*/ false, SIII); + }; + json_pair("asm", writeAsm, os); + + os << "}"; +} + +void print_reg(Register Reg, const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, raw_ostream &os) { + if (Reg.isVirtual()) { + StringRef Name = MRI.getVRegName(Reg); + if (Name != "") { + os << '%' << Name; + } else { + os << '%' << Register::virtReg2Index(Reg); + } + } else if (Reg < SIRI->getNumRegs()) { + os << '$'; + printLowerCase(SIRI->getName(Reg), os); + } else { + llvm_unreachable("invalid reg"); + } +} + +void write_reg(unsigned Reg, unsigned SubReg, const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, raw_ostream &os) { + os << "{"; + + auto writeReg = [&MRI, &SIRI, &Reg, &os]() { print_reg(Reg, MRI, SIRI, os); }; + json_pair("reg", writeReg, os); + + os << ","; + + auto writeSubReg = [&SubReg, &os]() { os << SubReg; }; + + json_pair("sub_reg", writeSubReg, os); + + os << ","; + auto writeIsSgpr = [&Reg, &MRI, &SIRI, &os]() { + if (SIRI->isSGPRReg(MRI, Reg)) + os << "true"; + else + os << "false"; + }; + json_obj_pair("is_sgpr", writeIsSgpr, os); + os << "}"; +} + +unsigned get_reg_size(unsigned Reg, const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI) { + return SIRI->getRegClassForReg(MRI, Reg)->getLaneMask().getNumLanes(); +} + +void write_live(unsigned Reg, LaneBitmask Mask, const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, raw_ostream &os) { + if (Mask.none()) { + unsigned size = get_reg_size(Reg, MRI, SIRI); + Mask = LaneBitmask((1 << size) - 1); + } + unsigned mask = Mask.getAsInteger(); + for (unsigned i = 0; i <= Mask.getHighestLane(); i++) { + if (mask & (1 << i)) { + write_reg(Reg, i, MRI, SIRI, os); + os << ",\n"; + } + } +} + +void write_dag_input_node(unsigned ID, unsigned reg, unsigned mask, + const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, raw_ostream &os) { + os << "{"; + auto writeID = [&ID, &os]() { os << ID; }; + + json_pair("ID", writeID, os); + + os << ","; + + auto writeReg = [®, &MRI, &SIRI, &os]() { print_reg(reg, MRI, SIRI, os); }; + + json_pair("reg", writeReg, os); + + os << ","; + + auto writeMask = [&mask, &os]() { os << mask; }; + + json_pair("mask", writeMask, os); + + os << "},\n"; +} + +void write_dag_inst_node(unsigned ID, SlotIndex Slot, + GCNRPTracker::LiveRegSet LiveReg, + const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, SUnit *SU, + raw_ostream &os) { + os << "{"; + auto writeID = [&ID, &os]() { os 
<< ID; }; + + json_pair("ID", writeID, os); + + os << ","; + + auto writeSlot = [&Slot, &os]() { Slot.print(os); }; + + json_pair("slot_index", writeSlot, os); + + os << ","; + + auto writeRegs = [&LiveReg, &MRI, &SIRI, &os]() { + for (auto it : LiveReg) { + unsigned Reg = it.first; + LaneBitmask Mask = it.second; + write_live(Reg, Mask, MRI, SIRI, os); + } + }; + json_array("regs", writeRegs, os); + + os << ","; + + auto writePreds = [&SU, &os]() { + for (auto &Pred : SU->Preds) { + + os << Pred.getSUnit()->NodeNum << ","; + } + }; + + json_array("preds", writePreds, os); + + os << "},\n"; +} + +void write_block(MachineBasicBlock &Blk, LiveIntervals *LIS, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, raw_ostream &os) { + os << "{\n"; + auto writeName = [&Blk, &os]() { os << Blk.getName(); }; + json_pair("name", writeName, os); + + os << ","; + + auto writeIndex = [&Blk, &os]() { os << Blk.getNumber(); }; + json_pair("id", writeIndex, os); + + os << ","; + + const SlotIndexes *SlotIndexes = LIS->getSlotIndexes(); + + SlotIndex BeginSlot = SlotIndexes->getMBBStartIdx(&Blk); + auto writeSlot = [&BeginSlot, &os]() { BeginSlot.print(os); }; + json_pair("begin_slot", writeSlot, os); + + os << ","; + + SlotIndex EndSlot = SlotIndexes->getMBBEndIdx(&Blk); + auto writeEndSlot = [&EndSlot, &os]() { EndSlot.print(os); }; + json_pair("end_slot", writeEndSlot, os); + + os << ","; + + auto writeInsts = [&Blk, &SlotIndexes, &SIII, &os]() { + for (MachineInstr &MI : Blk) { + if (MI.isDebugInstr()) + continue; + write_inst(MI, SlotIndexes, SIII, os); + os << ",\n"; + } + }; + + json_array("instructions", writeInsts, os); + + os << ","; + + BlockExpDag dag(&Blk, LIS, MRI, SIRI, SIII); + dag.buildWithPressure(); + + const auto StartLiveReg = llvm::getLiveRegs(BeginSlot, *dag.LIS, dag.MRI); + auto writeInputs = [&StartLiveReg, &dag, &os]() { + for (auto it : StartLiveReg) { + unsigned Reg = it.first; + LaneBitmask mask = it.second; + SUnit *SU = dag.InputSUnitMap[Reg]; + // Write Reg and mask to the nodes. + write_dag_input_node(SU->NodeNum, Reg, mask.getAsInteger(), dag.MRI, + dag.SIRI, os); + } + }; + + json_array("input_nodes", writeInputs, os); + + os << ","; + + auto writeNodes = [&SlotIndexes, &dag, &os]() { + for (auto it : dag.MISUnitMap) { + MachineInstr *MI = it.first; + SUnit *SU = it.second; + // Use SlotIndex of MI. + SlotIndex SlotIndex; + if (!MI->isDebugInstr()) + SlotIndex = SlotIndexes->getInstructionIndex(*MI); + GCNRPTracker::LiveRegSet LiveReg = dag.DagPressureMap[SU]; + // Write slot, live to the nodes. 
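+ // Each DAG node record carries the node id, the slot of its MI (left empty
+ // for debug instructions), the live set the pressure DAG computed at that
+ // node, and the ids of its predecessor SUnits, roughly:
+ // {"ID":12,"slot_index":"20r","regs":[...],"preds":[3,7,]}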
+ write_dag_inst_node(SU->NodeNum, SlotIndex, LiveReg, dag.MRI, dag.SIRI, + SU, os); + } + }; + + json_array("inst_nodes", writeNodes, os); + + os << ","; + + auto writePreds = [&Blk, &os]() { + for (MachineBasicBlock *Pred : Blk.predecessors()) { + os << Pred->getNumber() << ","; + } + }; + + json_array("preds", writePreds, os); + + os << ","; + + auto writeSuccs = [&Blk, &os]() { + for (MachineBasicBlock *Succ : Blk.successors()) { + os << Succ->getNumber() << ","; + } + }; + + json_array("succs", writeSuccs, os); + + os << "}"; +} + +void write_define(SlotIndex &Slot, unsigned Reg, unsigned SubReg, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + raw_ostream &os) { + os << "{"; + auto writeSlot = [&Slot, &os]() { Slot.print(os); }; + + json_pair("slot_index", writeSlot, os); + + os << ","; + + auto writeReg = [&MRI, &SIRI, &Reg, &SubReg, &os]() { + write_reg(Reg, SubReg, MRI, SIRI, os); + }; + json_obj_pair("reg", writeReg, os); + + os << "}\n"; + + os << ","; +} + +void write_define(MachineOperand &MO, const SlotIndexes *SlotIndexes, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + raw_ostream &os) { + // Split subReg? MO.getSubReg(); + unsigned Reg = MO.getReg(); + unsigned SubReg = MO.getSubReg(); + MachineInstr *MI = MO.getParent(); + SlotIndex Slot = SlotIndexes->getInstructionIndex(*MI); + if (SubReg == 0) { + unsigned size = get_reg_size(Reg, MRI, SIRI); + for (unsigned i = 0; i < size; i++) { + write_define(Slot, Reg, i, MRI, SIRI, os); + } + } else { + switch (SubReg) { + default: + assert(0 && "SubReg not supported yet."); + write_define(Slot, Reg, SubReg, MRI, SIRI, os); + break; + case AMDGPU::sub0: + write_define(Slot, Reg, 0, MRI, SIRI, os); + break; + case AMDGPU::sub1: + write_define(Slot, Reg, 1, MRI, SIRI, os); + break; + case AMDGPU::sub2: + write_define(Slot, Reg, 2, MRI, SIRI, os); + break; + case AMDGPU::sub3: + write_define(Slot, Reg, 3, MRI, SIRI, os); + break; + case AMDGPU::sub4: + write_define(Slot, Reg, 4, MRI, SIRI, os); + break; + case AMDGPU::sub5: + write_define(Slot, Reg, 5, MRI, SIRI, os); + break; + case AMDGPU::sub6: + write_define(Slot, Reg, 6, MRI, SIRI, os); + break; + case AMDGPU::sub7: + write_define(Slot, Reg, 7, MRI, SIRI, os); + break; + case AMDGPU::sub8: + write_define(Slot, Reg, 8, MRI, SIRI, os); + break; + case AMDGPU::sub9: + write_define(Slot, Reg, 9, MRI, SIRI, os); + break; + case AMDGPU::sub10: + write_define(Slot, Reg, 10, MRI, SIRI, os); + break; + case AMDGPU::sub11: + write_define(Slot, Reg, 11, MRI, SIRI, os); + break; + case AMDGPU::sub12: + write_define(Slot, Reg, 12, MRI, SIRI, os); + break; + case AMDGPU::sub13: + write_define(Slot, Reg, 13, MRI, SIRI, os); + break; + case AMDGPU::sub14: + write_define(Slot, Reg, 14, MRI, SIRI, os); + break; + case AMDGPU::sub15: + write_define(Slot, Reg, 15, MRI, SIRI, os); + break; + case AMDGPU::sub0_sub1: + write_define(Slot, Reg, 0, MRI, SIRI, os); + write_define(Slot, Reg, 1, MRI, SIRI, os); + break; + case AMDGPU::sub2_sub3: + write_define(Slot, Reg, 2, MRI, SIRI, os); + write_define(Slot, Reg, 3, MRI, SIRI, os); + break; + case AMDGPU::sub4_sub5: + write_define(Slot, Reg, 4, MRI, SIRI, os); + write_define(Slot, Reg, 5, MRI, SIRI, os); + break; + case AMDGPU::sub1_sub2: + write_define(Slot, Reg, 1, MRI, SIRI, os); + write_define(Slot, Reg, 2, MRI, SIRI, os); + break; + case AMDGPU::sub0_sub1_sub2: + write_define(Slot, Reg, 0, MRI, SIRI, os); + write_define(Slot, Reg, 1, MRI, SIRI, os); + write_define(Slot, Reg, 2, MRI, SIRI, os); + break; + case 
AMDGPU::sub0_sub1_sub2_sub3: + write_define(Slot, Reg, 0, MRI, SIRI, os); + write_define(Slot, Reg, 1, MRI, SIRI, os); + write_define(Slot, Reg, 2, MRI, SIRI, os); + write_define(Slot, Reg, 3, MRI, SIRI, os); + break; + case AMDGPU::sub2_sub3_sub4_sub5: + write_define(Slot, Reg, 2, MRI, SIRI, os); + write_define(Slot, Reg, 3, MRI, SIRI, os); + write_define(Slot, Reg, 4, MRI, SIRI, os); + write_define(Slot, Reg, 5, MRI, SIRI, os); + break; + case AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7: + write_define(Slot, Reg, 0, MRI, SIRI, os); + write_define(Slot, Reg, 1, MRI, SIRI, os); + write_define(Slot, Reg, 2, MRI, SIRI, os); + write_define(Slot, Reg, 3, MRI, SIRI, os); + write_define(Slot, Reg, 4, MRI, SIRI, os); + write_define(Slot, Reg, 5, MRI, SIRI, os); + write_define(Slot, Reg, 6, MRI, SIRI, os); + write_define(Slot, Reg, 7, MRI, SIRI, os); + break; + } + } +} + +void write_defines(MachineFunction &MF, const SlotIndexes *SlotIndexes, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + raw_ostream &os) { + + for (unsigned i = 0; i < MRI.getNumVirtRegs(); i++) { + auto Reg = Register::index2VirtReg(i); + + for (MachineOperand &MO : MRI.def_operands(Reg)) { + write_define(MO, SlotIndexes, MRI, SIRI, os); + } + } +} + +void write_uses(MachineFunction &MF, const SlotIndexes *SlotIndexes, + + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + raw_ostream &os) { + + for (unsigned i = 0; i < MRI.getNumVirtRegs(); i++) { + auto Reg = Register::index2VirtReg(i); + + for (MachineOperand &MO : MRI.use_nodbg_operands(Reg)) { + // TODO: create write_use if use has more info. + write_define(MO, SlotIndexes, MRI, SIRI, os); + } + } +} + +void write_liveness(SlotIndex Slot, GCNRPTracker::LiveRegSet &LiveSet, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + raw_ostream &os) { + os << "{"; + auto writeSlot = [&Slot, &os]() { Slot.print(os); }; + + json_pair("slot_index", writeSlot, os); + + os << ","; + + auto writeRegs = [&LiveSet, &MRI, &SIRI, &os]() { + for (auto it : LiveSet) { + unsigned Reg = it.first; + LaneBitmask Mask = it.second; + write_live(Reg, Mask, MRI, SIRI, os); + } + }; + json_array("regs", writeRegs, os); + os << "\n},\n"; +} + +void write_segment(const LiveInterval::Segment &S, raw_ostream &os) { + os << "{"; + auto writeBegin = [&S, &os]() { S.start.print(os); }; + + json_pair("begin", writeBegin, os); + + os << ","; + + auto writeEnd = [&S, &os]() { S.end.print(os); }; + + json_pair("end", writeEnd, os); + + os << ","; + + auto writeValNum = [&S, &os]() { + if (S.valno) + os << S.valno->id; + else + os << 0xFFFFFFFF; + }; + + json_pair("val_num", writeValNum, os); + + os << "},\n"; +} + +void write_subrange(const LiveInterval::SubRange &SR, raw_ostream &os) { + os << "{\n"; + auto writeMask = [&SR, &os]() { os << SR.LaneMask.getAsInteger(); }; + + json_pair("mask", writeMask, os); + + os << ","; + + // Segments. 
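+ // Each subrange is emitted as its lane mask plus the segments that lane
+ // subset is live for; write_segment above prints one object per segment,
+ // roughly {"begin":"...","end":"...","val_num":0}.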
+ auto writeSegments = [&SR, &os]() { + for (auto &S : SR.segments) { + write_segment(S, os); + } + }; + + json_array("segments", writeSegments, os); + + os << "\n},\n"; +} + +void write_live_interval(LiveInterval &LI, const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, raw_ostream &os) { + os << "{\n"; + + auto writeReg = [&LI, &MRI, &SIRI, &os]() { + write_reg(LI.reg(), 0, MRI, SIRI, os); + }; + + json_obj_pair("reg", writeReg, os); + + os << ","; + + auto writeSegments = [&LI, &os]() { + for (auto &S : LI.segments) { + write_segment(S, os); + } + }; + + json_array("segments", writeSegments, os); + + os << ","; + + auto writeSubRanges = [&LI, &os]() { + for (auto &SR : LI.subranges()) { + write_subrange(SR, os); + } + }; + + json_array("subranges", writeSubRanges, os); + + os << "},\n"; +} + +std::string get_legal_str(const MDString *MDStr) { + std::string str; + raw_string_ostream Stream(str); + MDStr->print(Stream); + Stream.flush(); + // Remove !. + str = str.substr(1); + // Remove "" + str = str.substr(1); + str.pop_back(); + std::replace(str.begin(), str.end(), '\\', '#'); + return str; +} + +void write_file(const MDNode *FileNode, raw_ostream &os) { + const MDString *FileName = cast(FileNode->getOperand(0).get()); + StringRef fileNameStr = FileName->getString(); + if (fileNameStr.find("__AMDGPU_GPUMAP_") == 0) + return; + if (fileNameStr.find("__AMDGPU_DWARF_") == 0) + return; + + os << "{"; + + std::string str0 = get_legal_str(FileName); + auto writeName = [&str0, &os]() { os << str0; }; + json_pair("filename", writeName, os); + + os << ",\n"; + + const MDString *Content = cast(FileNode->getOperand(1).get()); + std::string str = get_legal_str(Content); + auto writeContent = [&str, &os]() { os << str; }; + json_pair("content", writeContent, os); + os << "\n},\n"; +} + +void write_DIFile(const DIFile *File, raw_ostream &os) { + if (File) { + std::string name = get_legal_str(File->getRawFilename()); + std::string dir = ""; + if (MDString *MDDir = File->getRawDirectory()) + dir = get_legal_str(MDDir); + os << dir << name; + } else { + os << "ArtificialFile"; + } +} + +void write_line_mapping(SlotIndex Slot, DebugLoc DL, raw_ostream &os) { + os << "{"; + + auto writeSlot = [&Slot, &os]() { Slot.print(os); }; + + json_pair("slot_index", writeSlot, os); + + os << ",\n"; + + MDNode *Scope = DL.getScope(); + unsigned line = DL.getLine(); + unsigned col = DL.getCol(); + + auto writeLine = [&line, &os]() { os << line; }; + json_pair("line", writeLine, os); + + os << ",\n"; + + auto writeCol = [&col, &os]() { os << col; }; + json_pair("col", writeCol, os); + + os << ",\n"; + + auto writeFile = [&Scope, &os]() { + const DIFile *File = cast(Scope)->getFile(); + write_DIFile(File, os); + }; + json_pair("file", writeFile, os); + + if (DILocation *inlineDL = DL.getInlinedAt()) { + os << ",\n"; + unsigned inlineLine = inlineDL->getLine(); + auto writeLine = [&inlineLine, &os]() { os << inlineLine; }; + json_pair("inline_line", writeLine, os); + + os << ",\n"; + + unsigned inlineCol = inlineDL->getColumn(); + auto writeCol = [&inlineCol, &os]() { os << inlineCol; }; + json_pair("inline_col", writeCol, os); + + os << ",\n"; + + const MDNode *InlineScope = DL.getInlinedAtScope(); + auto writeFile = [&InlineScope, &os]() { + const DIFile *File = cast(InlineScope)->getFile(); + write_DIFile(File, os); + }; + json_pair("inline_file", writeFile, os); + } + + os << "\n},\n"; +} + +void write_dbg_val(unsigned Reg, const DIVariable *V, const DIExpression *Exp, + const MachineRegisterInfo &MRI, 
const SIRegisterInfo *SIRI, + raw_ostream &os) { + os << "{"; + + auto writeReg = [&MRI, &SIRI, &Reg, &os]() { + const unsigned SubReg = 0; + write_reg(Reg, SubReg, MRI, SIRI, os); + }; + json_obj_pair("reg", writeReg, os); + + os << ",\n"; + + if (V) { + auto writeName = [&V, &os]() { os << V->getName(); }; + json_pair("debug_val_name", writeName, os); + os << ",\n"; + + auto writeFile = [&V, &os]() { + const DIFile *File = V->getFile(); + write_DIFile(File, os); + }; + json_pair("debug_val_file", writeFile, os); + os << ",\n"; + + auto writeLine = [&V, &os]() { os << V->getLine(); }; + json_pair("debug_val_line", writeLine, os); + } + + if (Exp->isValid() && Exp->getNumElements()) { + os << ",\n"; + auto writeV = [&Exp, &os]() { + os << '['; + bool NeedSep = false; + for (auto Op : Exp->expr_ops()) { + if (NeedSep) + os << ", "; + else + NeedSep = true; + os << dwarf::OperationEncodingString(Op.getOp()); + for (unsigned I = 0; I < Op.getNumArgs(); ++I) + os << ' ' << Op.getArg(I); + } + os << "] "; + }; + json_pair("debug_exp", writeV, os); + } + os << "\n},\n"; +} + +void write_dbg_info(MachineFunction &MF, LiveIntervals *LIS, + const MachineRegisterInfo &MRI, const SIInstrInfo *SIII, + const SIRegisterInfo *SIRI, const SlotIndexes *SlotIndexes, + const NamedMDNode *SourceMD, raw_ostream &os) { + os << ",\n"; + + auto writeFiles = [&SourceMD, &os]() { + for (const MDNode *FileNode : SourceMD->operands()) { + write_file(FileNode, os); + } + }; + + json_array("files", writeFiles, os); + + os << ",\n"; + + auto writeLineMapping = [&MF, &SlotIndexes, &os]() { + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (MI.isDebugInstr()) { + continue; + } + const DebugLoc DL = MI.getDebugLoc(); + if (!DL) + continue; + SlotIndex Slot = SlotIndexes->getInstructionIndex(MI); + write_line_mapping(Slot, DL, os); + } + } + }; + + json_array("line_mapping", writeLineMapping, os); + + os << ",\n"; + + auto writeDebugVals = [&MF, &MRI, &SIRI, &os]() { + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (!MI.isDebugValue()) + continue; + + MachineOperand &Reg = MI.getOperand(0); + if (!Reg.isReg()) + continue; + + if (Reg.getReg() == 0) + continue; + + const DIVariable *V = MI.getDebugVariable(); + const DIExpression *Exp = MI.getDebugExpression(); + write_dbg_val(Reg.getReg(), V, Exp, MRI, SIRI, os); + } + } + }; + + json_array("debug_vals", writeDebugVals, os); +} + +void write_function(MachineFunction &MF, LiveIntervals *LIS, + const MachineRegisterInfo &MRI, const SIInstrInfo *SIII, + const SIRegisterInfo *SIRI, raw_ostream &os) { + const SlotIndexes *SlotIndexes = LIS->getSlotIndexes(); + + os << "{\n"; + auto writeName = [&MF, &os]() { os << MF.getName(); }; + json_pair("name", writeName, os); + + os << ",\n"; + + auto writeBlocks = [&MF, &SlotIndexes, &LIS, &MRI, &SIRI, &SIII, &os]() { + for (MachineBasicBlock &MBB : MF) { + write_block(MBB, LIS, MRI, SIRI, SIII, os); + os << ",\n"; + } + }; + + json_array("blocks", writeBlocks, os); + + os << ",\n"; + + auto writeDefines = [&MF, &SlotIndexes, &MRI, &SIRI, &os]() { + write_defines(MF, SlotIndexes, MRI, SIRI, os); + }; + + json_array("defines", writeDefines, os); + + os << ",\n"; + + auto writeUses = [&MF, &SlotIndexes, &MRI, &SIRI, &os]() { + write_uses(MF, SlotIndexes, MRI, SIRI, os); + }; + + json_array("uses", writeUses, os); + + os << ",\n"; + + auto writeLiveness = [&MF, &LIS, &MRI, &SIRI, &os]() { + for (MachineBasicBlock &MBB : MF) + for (MachineInstr &MI : MBB) { + if (MI.isDebugInstr()) + 
continue; + const SlotIndex &SI = LIS->getInstructionIndex(MI).getBaseIndex(); + GCNRPTracker::LiveRegSet LISLR = llvm::getLiveRegs(SI, *LIS, MRI); + write_liveness(SI, LISLR, MRI, SIRI, os); + } + }; + + json_array("liveness", writeLiveness, os); + + os << ",\n"; + + auto writeLiveIntervals = [&MRI, &SIRI, &LIS, &os]() { + for (unsigned i = 0; i < MRI.getNumVirtRegs(); i++) { + auto Reg = Register::index2VirtReg(i); + if (!LIS->hasInterval(Reg)) + continue; + auto &LI = LIS->getInterval(Reg); + write_live_interval(LI, MRI, SIRI, os); + } + }; + + json_array("live_intervals", writeLiveIntervals, os); + +#if 0 // TODO: Do we need this? + // Check debug info. + const Function &F = MF.getFunction(); + const Module *M = F.getParent(); + const NamedMDNode *SourceMD = + M->getNamedMetadata(hlsl::DxilMDHelper::kDxilSourceContentsMDName); + if (SourceMD) { + write_dbg_info(MF, LIS, MRI, SIII, SIRI, SlotIndexes, SourceMD, os); + } +#endif + + os << "\n}"; +} + +void write_pressure(MachineFunction &MF, LiveIntervals *LIS, + const char *Filename) { + int FD = -1; + SmallString<128> TmpFilename(Filename); + std::error_code EC = sys::fs::createUniqueFile(TmpFilename, FD, TmpFilename); + if (EC) { + errs() << "Error: " << EC.message() << "\n"; + return; + } + + raw_fd_ostream O(FD, /*shouldClose=*/true); + + const GCNSubtarget *ST = &MF.getSubtarget(); + const auto *SIII = ST->getInstrInfo(); + const auto *SIRI = ST->getRegisterInfo(); + auto &MRI = MF.getRegInfo(); + write_function(MF, LIS, MRI, SIII, SIRI, O); + O.flush(); + O.close(); +} + +void write_pressure(MachineFunction &MF, LiveIntervals *LIS, raw_ostream &os) { + const GCNSubtarget *ST = &MF.getSubtarget(); + const auto *SIII = ST->getInstrInfo(); + const auto *SIRI = ST->getRegisterInfo(); + auto &MRI = MF.getRegInfo(); + write_function(MF, LIS, MRI, SIII, SIRI, os); + os.flush(); +} + +} // namespace pressure +}// namespace llvm + +namespace { +class ContributionList { +public: + ContributionList(MachineFunction &MF) : MF(MF){}; + void build(); + bool propagateContribution(); + MachineFunction &MF; + DenseMap MIIndexMap; + // Set of inst which contribute to build the key MachineInstr. + DenseMap> MIContributorMap; + // Set of inst which been contributed by the key MachineInstr. 
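+  // (i.e. the direct users of the values defined by the key MachineInstr).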
+ DenseMap> MIContributedToMap; + void writeInst(MachineInstr &MI, const SIInstrInfo *SIII, raw_ostream &os); + void writeBlock(MachineBasicBlock &MBB, const SIInstrInfo *SIII, + raw_ostream &os); + void write(raw_ostream &os); +}; + +void buildMIContribution(MachineInstr &MI, + DenseSet &ContributorSet, + DenseSet &ContributedSet, + const SIRegisterInfo &SIRI, MachineRegisterInfo &MRI) { + for (MachineOperand &UseMO : MI.uses()) { + if (!UseMO.isReg()) + continue; + Register Reg = UseMO.getReg(); + if (Reg.isPhysical()) + continue; + if (UseMO.isImplicit()) { + // if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO || + // Reg == AMDGPU::SCC) + continue; + } + for (MachineInstr &DefMI : MRI.def_instructions(Reg)) { + ContributorSet.insert(&DefMI); + } + } + + for (MachineOperand &DstMO : MI.defs()) { + if (!DstMO.isReg()) + continue; + if (DstMO.isImplicit()) + continue; + Register Reg = DstMO.getReg(); + if (Reg.isPhysical()) + continue; + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + ContributedSet.insert(&UseMI); + } + } +} + +bool ContributionList::propagateContribution() { + bool bUpdated = false; + ReversePostOrderTraversal RPOT(&MF); + for (auto *MBB : RPOT) { + for (auto &MI : *MBB) { + auto &contributors = MIContributorMap[&MI]; + unsigned size = contributors.size(); + DenseSet parentContributors; + for (auto *CMI : contributors) { + auto &pContributors = MIContributorMap[CMI]; + parentContributors.insert(pContributors.begin(), pContributors.end()); + } + contributors.insert(parentContributors.begin(), parentContributors.end()); + bUpdated |= size < contributors.size(); + } + } + return bUpdated; +} + +void ContributionList::build() { + // Build contribution. + auto &MRI = MF.getRegInfo(); + const GCNSubtarget *ST = &MF.getSubtarget(); + const auto *SIRI = ST->getRegisterInfo(); + for (auto &MBB : MF) { + for (auto &MI : MBB) { + auto &contributors = MIContributorMap[&MI]; + auto &contributed = MIContributedToMap[&MI]; + buildMIContribution(MI, contributors, contributed, *SIRI, MRI); + } + } + // propagate contribution. 
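+  // Iterate to a fixed point: every round of propagateContribution() unions a
+  // contributor's own contributors into each user's set, so on convergence
+  // MIContributorMap holds the transitive closure over the def-use chains.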
+ bool bUpdated = true; + while (bUpdated) { + bUpdated = propagateContribution(); + } +} + +void ContributionList::writeInst(MachineInstr &MI, const SIInstrInfo *SIII, + raw_ostream &os) { + os << "\n{\n"; + unsigned ID = MIIndexMap[&MI]; + auto writeSlot = [&ID, &os]() { os << ID; }; + + json_pair("ID", writeSlot, os); + + os << ","; + + auto writeAsm = [&MI, &SIII, &os]() { + MI.print(os, /*IsStandalone*/ true, /*SkipOpers*/ false, + /*SkipDebugLoc*/ true, /*AddNewLine*/ false, SIII); + }; + json_pair("asm", writeAsm, os); + + os << ",\n"; + + auto &contributors = MIContributorMap[&MI]; + auto writeContributor = [&contributors, this, &os]() { + for (auto *MI : contributors) { + unsigned ID = MIIndexMap[MI]; + os << ID << ","; + } + }; + + json_array("contributors", writeContributor, os); + os << ",\n"; + + auto &contributeds = MIContributedToMap[&MI]; + auto writeContributed = [&contributeds, this, &os]() { + for (auto *MI : contributeds) { + unsigned ID = MIIndexMap[MI]; + os << ID << ","; + } + }; + + json_array("contributed", writeContributed, os); + os << "\n}\n"; +} + +void ContributionList::writeBlock(MachineBasicBlock &MBB, + const SIInstrInfo *SIII, raw_ostream &os) { + os << "{\n"; + auto writeName = [&MBB, &os]() { os << MBB.getName(); }; + json_pair("name", writeName, os); + + os << ","; + + auto writeIndex = [&MBB, &os]() { os << MBB.getNumber(); }; + json_pair("id", writeIndex, os); + + os << ",\n"; + + auto writeInsts = [this, &MBB, &SIII, &os]() { + for (MachineInstr &MI : MBB) { + if (MI.isDebugInstr()) + continue; + writeInst(MI, SIII, os); + os << ",\n"; + } + }; + + json_array("instructions", writeInsts, os); + + os << ",\n"; + + auto writePreds = [&MBB, &os]() { + for (MachineBasicBlock *Pred : MBB.predecessors()) { + os << Pred->getNumber() << ","; + } + }; + + json_array("preds", writePreds, os); + + os << ","; + + auto writeSuccs = [&MBB, &os]() { + for (MachineBasicBlock *Succ : MBB.successors()) { + os << Succ->getNumber() << ","; + } + }; + + json_array("succs", writeSuccs, os); + + os << "}"; +} + +void ContributionList::write(raw_ostream &os) { + unsigned ID = 0; + // Build ID for write. + ReversePostOrderTraversal RPOT(&MF); + for (auto *MBB : RPOT) { + for (auto &MI : *MBB) { + MIIndexMap[&MI] = ID++; + } + } + + const GCNSubtarget *ST = &MF.getSubtarget(); + const auto *SIII = ST->getInstrInfo(); + + os << "{\n"; + auto writeName = [this, &os]() { os << MF.getName(); }; + json_pair("name", writeName, os); + + os << ",\n"; + + auto writeBlocks = [this, &SIII, &RPOT, &os]() { + for (auto *MBB : RPOT) { + writeBlock(*MBB, SIII, os); + os << ",\n"; + } + }; + + json_array("blocks", writeBlocks, os); + + os << "\n}"; +} +} // namespace + +namespace llvm { + +void write_contribution_list(llvm::MachineFunction &MF, const char *Filename) { + int FD = -1; + SmallString<128> TmpFilename(Filename); + std::error_code EC = sys::fs::createUniqueFile(TmpFilename, FD, TmpFilename); + if (EC) { + errs() << "Error: " << EC.message() << "\n"; + return; + } + + raw_fd_ostream O(FD, /*shouldClose=*/true); + ContributionList CL(MF); + CL.build(); + + CL.write(O); + + O.flush(); + O.close(); +} +} // namespace llvm + +static bool IsPhysReg(const MachineOperand &Op) +{ + return Op.isReg() && Op.getReg().isPhysical(); +} + +// Sometimes split bb uses physical registers defined in BB, have to add them to +// live-in or the ir is malformed. 
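+
+// Illustrative-only call-site sketch (editorial example, not part of this
+// patch): after splitting a block, the physreg live-ins of the new tail block
+// are rebuilt with the helper defined below. llvm::split() is assumed here to
+// split the block at SplitMI and return the new tail block.
+#if 0
+static void ExampleFixupAfterSplit(MachineInstr *SplitMI)
+{
+    MachineFunction &MF = *SplitMI->getMF();
+    MachineBasicBlock *Tail = llvm::split(SplitMI);
+    // Any physical register used in Tail before being redefined there must be
+    // added to Tail's live-ins, or the MIR verifier rejects the function.
+    llvm::UpdatePhysRegLiveInForBlock(Tail, &MF.getRegInfo());
+}
+#endif
+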
+void llvm::UpdatePhysRegLiveInForBlock(MachineBasicBlock *NewBB, const MachineRegisterInfo *MRI) +{ + // Initialize with current set of liveins. For new blocks this will be empty. + SmallDenseSet DefSet; + for (const MachineBasicBlock::RegisterMaskPair &P : NewBB->liveins()) + { + DefSet.insert(P.PhysReg); + } + + for (auto &MI : *NewBB) + { + // Add all undefined physical registers to the live in set. + for (MachineOperand &Use : MI.operands()) + { + // Only process physreg uses. + if (!IsPhysReg(Use) || !Use.isUse()) continue; + + // Reserved regs do not need to be tracked through live-in sets. + unsigned Reg = Use.getReg(); + if (Use.isImplicit() && MRI && MRI->isReserved(Reg)) continue; + + if (!DefSet.count(Reg)) + NewBB->addLiveIn(Reg); + } + + // Add all physical register defs (exlicit+implicit) to the def register set. + for (MachineOperand &Def : MI.operands()) + { + // Only process physreg defs. + if (!IsPhysReg(Def) || !Def.isDef()) continue; + DefSet.insert(Def.getReg()); + } + } +} + +void llvm::BuildPhysRegLiveInForBlock(MachineBasicBlock *NewBB, + SmallDenseSet &LiveOutSet, + const MachineRegisterInfo *MRI) { + for (auto rit = NewBB->rbegin(); rit != NewBB->rend(); rit++) { + auto &MI = *rit; + // Add all physical register defs (exlicit+implicit) to the def register + // set. + for (MachineOperand &Def : MI.operands()) { + // Only process physreg defs. + if (!IsPhysReg(Def) || !Def.isDef()) + continue; + LiveOutSet.erase(Def.getReg()); + } + // Add all undefined physical registers to the live in set. + for (MachineOperand &Use : MI.operands()) { + // Only process physreg uses. + if (!IsPhysReg(Use) || !Use.isUse()) + continue; + + // Reserved regs do not need to be tracked through live-in sets. + unsigned Reg = Use.getReg(); + if (Use.isImplicit() && MRI && MRI->isReserved(Reg)) + continue; + + if (!LiveOutSet.count(Reg)) + LiveOutSet.insert(Reg); + } + } + for (unsigned Reg : LiveOutSet) { + NewBB->addLiveIn(Reg); + } +} + +MachineReg llvm::CreateVirtualRegForOperand( + MachineOpcode Opcode, + unsigned OpNum, + MachineFunction &MF +) +{ + const TargetSubtargetInfo &ST = MF.getSubtarget(); + const TargetRegisterInfo *TRI = ST.getRegisterInfo(); + const TargetInstrInfo *TII = ST.getInstrInfo(); + const MCInstrDesc &Desc = TII->get(Opcode); + const TargetRegisterClass *RC = TII->getRegClass(Desc, OpNum, TRI, MF); + if (!RC) + { + llvm::report_fatal_error("Unable to create virtual reg for instruction operand"); + } + + MachineRegisterInfo &MRI = MF.getRegInfo(); + return MRI.createVirtualRegister(RC); +} + +MachineReg llvm::CreateVirtualDstReg( + MachineOpcode Opcode, + MachineFunction &MF +) +{ + return llvm::CreateVirtualRegForOperand(Opcode, 0, MF); +} + +// Return true if the MI is a copy of exec. +// If true then sets pDst to the destination register. +bool llvm::IsExecCopy(const MachineInstr &MI, MachineReg Exec, MachineReg *pDst) +{ + enum {DST=0, SRC=1}; + bool FoundCopy = false; + if (MI.getOpcode() == AMDGPU::COPY + || MI.getOpcode() == AMDGPU::S_MOV_B32 + || MI.getOpcode() == AMDGPU::S_MOV_B64) + { + const MachineOperand &Src = MI.getOperand(SRC); + if (Src.isReg() && Src.getReg() == Exec) + { + FoundCopy = true; + } + } +#if 0 // TODO: Delete this. 
+ else if (MI.getOpcode() == AMDGPU::AMDGPU_GET_ENTRY_ACTIVE_MASK_PSEUDO || + MI.getOpcode() == AMDGPU::AMDGPU_GET_ENTRY_ACTIVE_MASK_PSEUDO_32) + { + FoundCopy = true; + } +#endif + + if (FoundCopy) + { + *pDst = MI.getOperand(DST).getReg(); + } + + return FoundCopy; +} + +llvm::MachineRegWithSubReg llvm::GetWqmEntryActiveMask(MachineFunction &MF) +{ + llvm::MachineRegWithSubReg LiveLaneMask = {AMDGPU::NoRegister, AMDGPU::NoSubRegister}; + if (MachineInstr* MI = GetWqmEntryActiveMaskInst(MF)) + { + LiveLaneMask.Reg = MI->getOperand(0).getReg(); + LiveLaneMask.SubReg = MI->getOperand(0).getSubReg(); + } + + return LiveLaneMask; +} + +MachineInstr* llvm::GetWqmEntryActiveMaskInst(MachineFunction &MF) +{ +#if 0 // TODO: Get rid of this + // Look forward in the entry block for the SET_LIVE_LANE_MASK instruction. + // This instruction is added by the SIWholeQuadMode pass. + MachineBasicBlock &MBB = MF.front(); + for (MachineInstr &MI : MBB) + { + if (MI.getOpcode() == AMDGPU::AMDGPU_SET_LIVE_LANE_MASK || + MI.getOpcode() == AMDGPU::AMDGPU_SET_LIVE_LANE_MASK_32) + { + return &MI; + } + } +#endif + + return nullptr; +} + +bool llvm::IsFetchShaderCall(const MachineInstr *MI) +{ +#if 0 // TODO: Get rid of this. + return + MI->getOpcode() == AMDGPU::AMDGPU_CALL_FETCH_SHADER || + MI->getAMDGPUFlag(MachineInstr::AMDGPUMIFlag::FetchShaderCall); +#else + return false; +#endif +} + +bool llvm::IsSccLiveAt(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator MI) { + const TargetRegisterInfo* TRI = MBB->getParent()->getRegInfo().getTargetRegisterInfo(); + for (auto it = MI; it != MBB->end(); ++it) { + const MachineInstr &CurMI = *it; + // Hit use of scc, it is live. + if (CurMI.readsRegister(AMDGPU::SCC, TRI)) + return true; + // Hit def of scc first, not live. + if (CurMI.definesRegister(AMDGPU::SCC, TRI)) + return false; + } + // Reach the end of MBB, check live-ins of MBB successors. + for (const MachineBasicBlock *Succ : MBB->successors()) { + if (Succ->isLiveIn(AMDGPU::SCC)) + return true; + } + return false; +} + +// +// This function is useful for when we need to insert a new +// instruction that defines scc in a block and we need to find +// a location that will not smash the existing value. +// +// Starting at `BeforeInst` it will look backwards to try to find +// a place in the block where scc is dead so we can insert our new +// def there. If no location can be found it will save and restore +// scc around BeforeInst. This way BeforeInst can safely be used +// as the new insert location. +// +MachineBasicBlock::iterator llvm::FindOrCreateInsertionPointForSccDef( + MachineBasicBlock *MBB, + MachineBasicBlock::iterator MI, + const TargetRegisterInfo* TRI, + const SIInstrInfo* TII, + MachineRegisterInfo* MRI, + SccDefInsertPointConstraintFlags Constraints +) +{ + // If SCC is dead at MI when we can use MI as the insert point. + if (!llvm::IsSccLiveAt(MBB, MI)) + { + return MI; + } + + const bool CheckForExecWrite = + Constraints & SccDefInsertPointConstraintFlags::NoExecWrite; + + // Get the starting reverse iterator taking care to handle the MBB->end() case. + MachineBasicBlock::reverse_iterator Start; + if (MI == MBB->end()) + { + Start = MBB->rbegin(); + } + else + { + Start = MI.getReverse(); + } + + // Otherwise, walk backwards through the block looking for a location where + // SCC is dead. 
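+  // If an instruction defines SCC without also reading it, the SCC value just
+  // before that instruction is already dead, so a new def can safely be
+  // inserted right in front of it.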
+ for (MachineBasicBlock::reverse_iterator It = Start, End = MBB->rend(); It != End; ++It) + { + // If the instruction modifies exec then we cannot use it as + // an insertion point (if that is a constraint from the caller). + // The check for EXEC works for both wave64 and wave32 because + // it will also catch writes to the subregisters (e.g. exec_lo). + if (CheckForExecWrite && It->modifiesRegister(AMDGPU::EXEC, TRI)) + { + break; + } + + if (It->modifiesRegister(AMDGPU::SCC, TRI) + && !It->readsRegister(AMDGPU::SCC, TRI)) + { + return It->getIterator(); + } + } + + // If no safe location can be found in the block we can save and restore + // SCC around MI. There is no way to directly read or write SCC so we use + // s_cselect to read the current value of SCC and s_cmp to write the saved + // value back to SCC. + // + // The generated code will look like this; + // + // S_CSELECT_B32 %SavedSCC, -1, 0 # Save SCC + // <----- Newly created safe insert point. + // MI + // S_CMP_LG_U32 %SavedSCC, 0 # Restore SCC + // + unsigned int TmpScc = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + DebugLoc DL = MI->getDebugLoc(); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), TmpScc) + .addImm(-1) + .addImm(0); + BuildMI(*MBB, std::next(MI->getIterator()), DL, TII->get(AMDGPU::S_CMP_LG_U32)) + .addReg(TmpScc, RegState::Kill) + .addImm(0); + + return MI; +} + + +namespace { +bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes, + SmallDenseSet &touchedMBBSet) { + MachineInstr *startMI = Indexes->getInstructionFromIndex(Seg->start); + MachineInstr *endMI = Indexes->getInstructionFromIndex(Seg->end); + // Treat non inst as not local. + if (!startMI || !endMI) + return false; + // is local when parent MBB the same. + bool bSameMBB = startMI->getParent() == endMI->getParent(); + if (!bSameMBB) + return false; + // Collect touched MBB. + MachineBasicBlock *MBB = startMI->getParent(); + touchedMBBSet.insert(MBB); + return true; +} + +bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes, + SmallDenseSet &touchedMBBSet) { + for (const LiveRange::Segment &Seg : Range->segments) { + if (!isLocalSegment(&Seg, Indexes, touchedMBBSet)) + return false; + } + return true; +} + +bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes) { + MachineInstr *startMI = Indexes->getInstructionFromIndex(Seg->start); + MachineInstr *endMI = Indexes->getInstructionFromIndex(Seg->end); + // Treat non inst as not local. + if (!startMI || !endMI) + return false; + // is local when parent MBB the same. + return startMI->getParent() == endMI->getParent(); +} + +bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes) { + for (const LiveRange::Segment &Seg : Range->segments) { + if (!isLocalSegment(&Seg, Indexes)) + return false; + } + return true; +} + +} // namespace + +// In case like float4 v, v.x used and defined in one block, v.y used and define +// in another block, one live interval could touch more than one MBB. +// touchedMBBSet is used for scheduling where local live interval could cross +// multiple regions, need to calculate livereg for each region inside touched +// MBB. 
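+
+// An illustrative-only sketch of that scheduling use (editorial example, not
+// part of this patch), driving the overload defined just below. LIS is assumed
+// to be the pass's LiveIntervals analysis, and the exact SmallDenseSet
+// instantiation is an assumption.
+#if 0
+static void ExampleSkipLocalIntervals(LiveIntervals *LIS,
+                                      const MachineRegisterInfo &MRI) {
+  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+    Register Reg = Register::index2VirtReg(I);
+    if (!LIS->hasInterval(Reg))
+      continue;
+    SmallDenseSet<MachineBasicBlock *, 2> TouchedMBBs;
+    if (llvm::isLocalLiveInterval(LIS->getInterval(Reg), LIS->getSlotIndexes(),
+                                  TouchedMBBs)) {
+      // The interval never crosses a block boundary, so only the regions
+      // inside TouchedMBBs need their live sets recomputed; it contributes
+      // nothing to block live-in/live-out.
+    }
+  }
+}
+#endif
+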
+bool llvm::isLocalLiveInterval( + const LiveInterval &LI, SlotIndexes *Indexes, + SmallDenseSet &touchedMBBSet) { + if (LI.hasSubRanges()) { + for (const auto &S : LI.subranges()) { + if (!isLocalLiveRange(&S, Indexes, touchedMBBSet)) + return false; + } + } + return isLocalLiveRange(&LI, Indexes, touchedMBBSet); +} + + +bool llvm::isLocalLiveInterval( + const LiveInterval &LI, SlotIndexes *Indexes) { + if (LI.hasSubRanges()) { + for (const auto &S : LI.subranges()) { + if (!isLocalLiveRange(&S, Indexes)) + return false; + } + } + return isLocalLiveRange(&LI, Indexes); +} + +// This is used to speed up reg pressure calculation. +// If instruction is moved, the cached liveset will be out of date. +// Before instruction is moved, the value will be correct. +void llvm::buildEndLiveMap( + llvm::LiveIntervals *LIS, llvm::MachineFunction &MF, + const llvm::MachineRegisterInfo &MRI, + llvm::DenseMap + &MBBLiveMap, bool After) { + // When only have one block, end live reg must be empty. + if (MF.size() == 1) + return; + auto *SlotIndexes = LIS->getSlotIndexes(); + DenseMap MBBOutputSlotMap; + for (MachineBasicBlock &MBB : MF) { + auto BBEnd = MBB.rbegin(); + + // R.End doesn't point to the boundary instruction. + // Skip Debug instr. + if (llvm::GetNonDebugMBBEnd(BBEnd, MBB)) { + auto SI = SlotIndexes->getInstructionIndex(*BBEnd); + MBBOutputSlotMap[&MBB] = After ? SI.getDeadSlot() : SI.getBaseIndex(); + } + } + + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + auto Reg = Register::index2VirtReg(I); + if (!LIS->hasInterval(Reg)) + continue; + + LaneBitmask LiveMask; + const auto &LI = LIS->getInterval(Reg); + + // Skip local live interval to make live input/ouput faster. + if (llvm::isLocalLiveInterval(LI, SlotIndexes)) + continue; + + for (auto outputIt : MBBOutputSlotMap) { + MachineBasicBlock *MBB = outputIt.first; + auto SI = outputIt.second; + + auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI); + if (LiveMask.any()) + MBBLiveMap[MBB][Reg] = LiveMask; + } + } +} + +unsigned llvm::GetCurrentVGPRCount(llvm::MachineFunction &MF, const SIRegisterInfo *SIRI) { + auto &MRI = MF.getRegInfo(); + for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + return SIRI->getHWRegIndex(Reg) - SIRI->getHWRegIndex(AMDGPU::VGPR0) + 1; + } + } + return 0; +} + +unsigned llvm::GetCurrentSGPRCount(llvm::MachineFunction &MF, const SIRegisterInfo *SIRI) { + const SIMachineFunctionInfo *MFI = MF.getInfo(); + unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + unsigned MaxSGPR = 0; + for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + // Skip scratch reserved reg, which is a big register that don't really contribute to this stat. + if (ScratchRSrcReg != 0) { + if (SIRI->isSubRegister(ScratchRSrcReg, Reg)) + continue; + } + MaxSGPR = SIRI->getHWRegIndex(Reg) - SIRI->getHWRegIndex(AMDGPU::SGPR0); + break; + } + } + return 1 + llvm::RegForVCC + MaxSGPR; +} + +void llvm::dumpLiveSet(const LiveSet &LiveSet, + const SIRegisterInfo *SIRI) { + + dbgs() << "\n live set: \n"; + for (auto it : LiveSet) { + int Reg = it.first; + dbgs() << printReg(Reg, SIRI); + if (it.second.any()) { + dbgs() << " mask:" << it.second.getAsInteger(); + } + dbgs() << "\n"; + } +} + +// Test if all fast math flags of this Machine Instr are set. This allows +// all non-strict floating-point transforms. 
+bool llvm::isFastMathInst(llvm::MachineInstr &MI) { + // Follow the checks in isFast() in SelectionDAGNodes.h + return MI.getFlag(llvm::MachineInstr::MIFlag::FmNsz) && + MI.getFlag(llvm::MachineInstr::MIFlag::FmArcp) && + MI.getFlag(llvm::MachineInstr::MIFlag::FmNoNans) && + MI.getFlag(llvm::MachineInstr::MIFlag::FmNoInfs) && + MI.getFlag(llvm::MachineInstr::MIFlag::FmContract) && + MI.getFlag(llvm::MachineInstr::MIFlag::FmAfn) && + MI.getFlag(llvm::MachineInstr::MIFlag::FmReassoc); +} +#if 0 +bool llvm::IsLdsSpillSupportedForHwStage(xmd::HwStage Stage) +{ + switch (Stage) + { + case xmd::HwStage::PS: + case xmd::HwStage::CS: + return true; + default: + return false; + } +} +#endif + +MachineBasicBlock::succ_iterator llvm::FindSuccessor(llvm::MachineBasicBlock* MBB, llvm::MachineBasicBlock* Succ) +{ + for (MachineBasicBlock::succ_iterator It = MBB->succ_begin(), End = MBB->succ_end(); It != End; ++It) + { + if (*It == Succ) + { + return It; + } + } + + return MBB->succ_end(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h new file mode 100644 index 0000000000000..16b55c5c94583 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h @@ -0,0 +1,217 @@ +#pragma once + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/MC/LaneBitmask.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/CodeGen/MachineBasicBlock.h" + +namespace llvm { + +class MachineFunction; +class LiveIntervals; +class LiveInterval; +class MachineRegisterInfo; +class SIRegisterInfo; +class SIInstrInfo; +class MachineInstr; +class MachinePostDominatorTree; +class MachineLoopInfo; +class MachineDominatorTree; +class raw_ostream; +class TargetInstrInfo; +class TargetRegisterInfo; + +typedef unsigned MachineReg; +typedef unsigned MachineOpcode; + +constexpr unsigned RegForVCC = 2; +constexpr unsigned VGPR_LIMIT = 256; +// Post RA remat only try to help case when pressue is OK before RA but RA +// result is higher. The diff should not be too much. So just use 4 as threshold +// here. +constexpr unsigned PostRARematThreshHold = 4; + +using LiveSet = llvm::DenseMap; + +unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask, + const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI); +void CollectLiveSetPressure( + const LiveSet &liveSet, + const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI, + unsigned &VPressure, unsigned &SPressure); + +bool isExecUpdateForControlFlow(llvm::MachineInstr &MI); + +bool IsSub0Sub1SingleDef(unsigned Reg, const llvm::MachineRegisterInfo &MRI); + +llvm::LaneBitmask getRegMask(const llvm::MachineOperand &MO, + const llvm::MachineRegisterInfo &MRI); +void andLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet); +void andNotLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet); +void mergeLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet); +llvm::MachineBasicBlock *split(llvm::MachineInstr *I); + +// For inst like S_BUFFER_LOAD_DWORDX16, change to S_BUFFER_LOAD_DWORDX4 if only +// used 4 lanes. 
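+// Illustrative-only driver sketch (editorial example, not part of this patch)
+// for the shrink declared just below; MRI, TRI, TII and the slot indexes are
+// assumed to come from the pass invoking it. make_early_inc_range keeps the
+// iteration valid if an instruction is rewritten in place.
+#if 0
+static bool ExampleShrinkWideLoads(llvm::MachineFunction &MF,
+                                   llvm::MachineRegisterInfo &MRI,
+                                   const llvm::SIRegisterInfo *TRI,
+                                   const llvm::SIInstrInfo *TII,
+                                   llvm::SlotIndexes *SlotIndexes) {
+  bool Changed = false;
+  for (llvm::MachineBasicBlock &MBB : MF)
+    for (llvm::MachineInstr &MI : llvm::make_early_inc_range(MBB))
+      Changed |= llvm::removeUnusedLanes(MI, MRI, TRI, TII, SlotIndexes);
+  return Changed;
+}
+#endif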
+bool removeUnusedLanes(llvm::MachineInstr &MI, llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *TRI, + const llvm::SIInstrInfo *TII, + llvm::SlotIndexes *SlotIndexes); + +bool reach_block(llvm::MachineBasicBlock *FromBB, llvm::MachineDominatorTree *DT, + llvm::MachinePostDominatorTree *PDT, llvm::MachineLoopInfo *LI, + llvm::MachineBasicBlock *ToBB); + + +void viewCFGWithPhi(llvm::MachineFunction &MF); +void write_contribution_list(llvm::MachineFunction &MF, const char *Filename); + +llvm::MachineBasicBlock *CreateNullExportBlock(llvm::MachineFunction &MF, const llvm::SIInstrInfo *TII); + +bool GetNonDebugMBBEnd(llvm::MachineBasicBlock::reverse_iterator &BBEnd, + llvm::MachineBasicBlock &MBB); + +void UpdatePhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB, const llvm::MachineRegisterInfo *MRI); + +void BuildPhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB, + llvm::SmallDenseSet &LiveOutSet, + const llvm::MachineRegisterInfo *MRI); + +MachineReg CreateVirtualRegForOperand( + MachineOpcode Opcode, + unsigned Operand, + llvm::MachineFunction &MF +); + +MachineReg CreateVirtualDstReg( + MachineOpcode Opcode, + llvm::MachineFunction &MF +); + +bool IsExecCopy(const llvm::MachineInstr &MI, MachineReg Exec, MachineReg *pDst); +struct MachineRegWithSubReg { + MachineReg Reg = AMDGPU::NoRegister; + unsigned SubReg = AMDGPU::NoSubRegister; +}; +MachineRegWithSubReg GetWqmEntryActiveMask(llvm::MachineFunction &MF); +llvm::MachineInstr *GetWqmEntryActiveMaskInst(llvm::MachineFunction &MF); + +// Return true if this machine instruction represents a call to the fetch shader. +// We curently have two mechanisims for calling fetch shader: +// 1. The AMDGPU_CALL_FETCH_SHADER pseudo-instruction +// 2. A CALL instruction with the `FetchShaderCall` flag set to true. +bool IsFetchShaderCall(const llvm::MachineInstr* MI); + +bool IsSccLiveAt(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator MI); + + +// An enum used to pass additional constraints to +// `FindOrCreateInsertionPointForSccDef()`. This will further +// constrain the location where the scc def can be inserted. +enum SccDefInsertPointConstraintFlags +{ + None = 0, // No additional constraints. + NoExecWrite = 1, // Should be no modification of exec between BeforeInst and insert point. +}; + +// Look for a safe place to insert an instruction that defines scc. +// +// +// This function is useful for when we need to insert a new +// instruction that defines scc in a block and we need to find +// a location that will not smash the existing value. +// +// Starting at `BeforeInst` it will look backwards to try to find +// a place in the block where scc is dead so we can insert our new +// def there. If no location can be found it will save and restore +// scc around BeforeInst. This way BeforeInst can safely be used +// as the new insert location. +// +llvm::MachineBasicBlock::iterator FindOrCreateInsertionPointForSccDef( + llvm::MachineBasicBlock* MBB, + llvm::MachineBasicBlock::iterator BeforeInst, + const llvm::TargetRegisterInfo* TRI, + const llvm::SIInstrInfo* TII, + llvm::MachineRegisterInfo* MRI, + SccDefInsertPointConstraintFlags Constraints = SccDefInsertPointConstraintFlags::None +); + +// Check if LI live cross basic blocks, save all touched basic block if is +// local. 
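+// Before the liveness helpers below, an illustrative-only call-site sketch for
+// FindOrCreateInsertionPointForSccDef declared above (editorial example, not
+// part of this patch). S_AND_B64 is used because it implicitly defines SCC;
+// the register operands are assumptions.
+#if 0
+static void ExampleInsertSccClobber(llvm::MachineBasicBlock *MBB,
+                                    llvm::MachineBasicBlock::iterator BeforeInst,
+                                    const llvm::TargetRegisterInfo *TRI,
+                                    const llvm::SIInstrInfo *TII,
+                                    llvm::MachineRegisterInfo *MRI,
+                                    llvm::Register Dst, llvm::Register Src0,
+                                    llvm::Register Src1) {
+  // Either returns a point where SCC is already dead, or wraps BeforeInst in
+  // an SCC save/restore so that BeforeInst itself is safe to use.
+  llvm::MachineBasicBlock::iterator InsertPt =
+      llvm::FindOrCreateInsertionPointForSccDef(MBB, BeforeInst, TRI, TII, MRI);
+  llvm::BuildMI(*MBB, InsertPt, BeforeInst->getDebugLoc(),
+                TII->get(llvm::AMDGPU::S_AND_B64), Dst)
+      .addReg(Src0)
+      .addReg(Src1);
+}
+#endif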
+bool isLocalLiveInterval( + const llvm::LiveInterval &LI, llvm::SlotIndexes *Indexes, + llvm::SmallDenseSet &touchedMBBSet); +bool isLocalLiveInterval( + const llvm::LiveInterval &LI, llvm::SlotIndexes *Indexes); + +// build liveRegSet at end of each MBB. +void buildEndLiveMap( + llvm::LiveIntervals *LIS, llvm::MachineFunction &MF, + const llvm::MachineRegisterInfo &MRI, + llvm::DenseMap + &MBBLiveMap, bool After); + +void dumpLiveSet(const LiveSet &LiveSet, + const llvm::SIRegisterInfo *SIRI); + +unsigned GetCurrentVGPRCount(llvm::MachineFunction &MF, const llvm::SIRegisterInfo *SIRI); +unsigned GetCurrentSGPRCount(llvm::MachineFunction &MF, const llvm::SIRegisterInfo *SIRI); + +bool isFastMathInst(llvm::MachineInstr &MI); + +namespace pressure { +void print_reg(llvm::Register Reg, const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI, + llvm::raw_ostream &os); +void write_pressure(llvm::MachineFunction &MF, llvm::LiveIntervals *LIS, + const char *Filename); +void write_pressure(llvm::MachineFunction &MF, llvm::LiveIntervals *LIS, + llvm::raw_ostream &os); +} +// bool IsLdsSpillSupportedForHwStage(xmd::HwStage Stage); + +// Look for the successor `Succ` of the given `MBB`. +// Returns MBB->succ_end() if `Succ` is not a successor of MBB. +llvm::MachineBasicBlock::succ_iterator FindSuccessor(llvm::MachineBasicBlock* MBB, llvm::MachineBasicBlock* Succ); + +// The enum and helper function for v_perm selection mask. +// +// The input byte layout of v_perm is as below: +// +// BYTE in[8] +// in[0] = $src1_BYTE0; +// in[1] = $src1_BYTE1; +// in[2] = $src1_BYTE2; +// in[3] = $src1_BYTE3; +// in[4] = $src0_BYTE0; +// in[5] = $src0_BYTE1; +// in[6] = $src0_BYTE2; +// in[7] = $src0_BYTE3; +// +enum class V_PERM_IN_BYTE_POS { + src1_BYTE0 = 0, + src1_BYTE1, + src1_BYTE2, + src1_BYTE3, + src0_BYTE0, + src0_BYTE1, + src0_BYTE2, + src0_BYTE3 +}; + +// The 4 arguments specify which input byte will be output +// out[0] = Sel_0; +// out[1] = Sel_1; +// out[2] = Sel_2; +// out[3] = Sel_3; +// +constexpr int buildVPermSelectMask(V_PERM_IN_BYTE_POS Sel_0, + V_PERM_IN_BYTE_POS Sel_1, + V_PERM_IN_BYTE_POS Sel_2, + V_PERM_IN_BYTE_POS Sel_3) { + return (((int)Sel_3 << 24) | ((int)Sel_2 << 16) | + ((int)Sel_1 << 8) | (int)Sel_0); +} +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp new file mode 100644 index 0000000000000..ceb22b5ff9243 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp @@ -0,0 +1,2767 @@ +//===- MirDivergenceAnalysis.cpp -- Mir Divergence Analysis Implementation -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is based on Analysis/DivergenceAnalysis.cpp, +// The most important difference is +// introduction of the idea of "Bit-Divergence". +// +// The way booleans are represented in in AMD GPU is a 64-bit uint in a pair of +// scalar registers, where each bit represents a boolean value for one lane. If +// all active lanes have the same bool value (all 1's or all 0's), then we can +// generate a scalar branch, otherwise we must use exec mask to selectively +// execute lanes based on the boolean mask. 
When all values in a boolean mask +// are the same for all active lanes, we call that mask "bit-uniform", +// otherwise we call it "bit-divergent". This differs from the normal concept +// of "uniform" and "divergent", which represents whether the value may be +// different across the 64 lanes. A "bit-divergent" value is still "uniform" in +// the sense that it is the same 64-bit value from the perspective of all the +// lanes, but when used as branch condition, will cause the branch to be +// divergent, which will cause the uses of any values outside of the control +// flow region to be divergent. +// +// The original DA marks everything including bools as divergent or uniform +// based on the propagation of divergent sources. However, booleans in AMDGPU +// are in fact never "divergent". Comparison operations that receive divergent +// operands instead produce "bit-divergent" or "bit-uniform" 64-bit booleans. +// Between the definition of any boolean mask and its use (particularly in +// branches, cndmasks, or anything that specifially consumes booleans), there +// can be any arbitrary number and types of operations performed on it, +// including combining it with other boolean masks via bit operations. +// +// The XDA algorithm is a modified version of the original DA algorithm to +// simultaneously propagate regular divergence and bit-divergence. +// +// First off, XDA identifies all sources of divergence as well as +// bit-divergence and adds them to the worklist. Then, just like with LLVM DA, +// it pops values off of the worklist to propagate (bit-)divergence to all its +// users, unless the user is always (bit-)uniform when given (bit-)divergent +// operand. It's possible for a value to be marked as both divergent and +// bit-divergent, in which case the regular divergence will trump +// bit-divergence. +// +// The important difference in this propagation step is that there are special +// instructions that when given bit-divergent operands, produce divergent +// values and vice versa. +// +// An example is comparison: +// +// v0 = interp ... ; divergent +// v1 = interp ... ; divergent +// s[0:1] = v_cmp v0, v1 ; bit-divergent +// +// v0 and v1 are both divergent, but when propagating them, the v_cmp (and its +// result) is bit-divergent value instead of divergent. +// +// +// An example of the reverse: +// +// v0 = ... ; uniform +// s[0:1] = v_cmp v0, v1 ; bit-divergent +// ... +// branch s[0:1], label ; divergent! +// ... +// v1 = ... ; uniform +// ... +// +// label: +// v3 = phi v0, v1 ; divergent! because of divergent branch. +// +// The boolean value is bit-divergent. When passed to the branch as an operand, +// the branch becomes divergent, whose sync dependency will be computed as +// normal to mark the appropriate values divergent (see description in normal +// DA on how this works). +// +// Another difference is in MIR, some branch will be changed into exec update, +// so only propagate control flow divergent on branch inst will not cover exec +// control flow. +// For case like +// %163:sreg_64_xexec = S_MOV_B64 $exec +//bb.1: +//; predecessors: %bb.1, %bb.0 +// successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), %bb.2(50.00%) +// %162:vreg_512 = PHI %41:vreg_512, %bb.0, %40:vreg_512, %bb.1 +// %167:sgpr_32 = V_READFIRSTLANE_B32 %17:vgpr_32, implicit $exec +// %168:sreg_64 = V_CMP_EQ_U32_e64 %167:sgpr_32, %17:vgpr_32, implicit $exec +// %166:sreg_64 = S_AND_SAVEEXEC_B64 %168:sreg_64, implicit-def $exec, implicit-def $scc, implicit $exec +//... 
+// $exec = S_XOR_B64_term $exec, %166:sreg_64, implicit-def $scc +// S_CBRANCH_EXECNZ %bb.1, implicit $exec +// The ... code after SAVEEXEC will be divergent if %168 is divergent. +// The PHI should be divergent when %40 is inside the ... +// To propagate divergent from %168 to the PHI, need to start the propagate from +// SAVEEXEC which is the control flow by update exec. +// +// +// Original: +// This file implements a general divergence analysis for loop vectorization +// and GPU programs. It determines which branches and values in a loop or GPU +// program are divergent. It can help branch optimizations such as jump +// threading and loop unswitching to make better decisions. +// +// GPU programs typically use the SIMD execution model, where multiple threads +// in the same execution group have to execute in lock-step. Therefore, if the +// code contains divergent branches (i.e., threads in a group do not agree on +// which path of the branch to take), the group of threads has to execute all +// the paths from that branch with different subsets of threads enabled until +// they re-converge. +// +// Due to this execution model, some optimizations such as jump +// threading and loop unswitching can interfere with thread re-convergence. +// Therefore, an analysis that computes which branches in a GPU program are +// divergent can help the compiler to selectively run these optimizations. +// +// This implementation is derived from the Vectorization Analysis of the +// Region Vectorizer (RV). That implementation in turn is based on the approach +// described in +// +// Improving Performance of OpenCL on CPUs +// Ralf Karrenberg and Sebastian Hack +// CC '12 +// +// This DivergenceAnalysis implementation is generic in the sense that it does +// not itself identify original sources of divergence. +// Instead specialized adapter classes, (LoopDivergenceAnalysis) for loops and +// (GPUDivergenceAnalysis) for GPU programs, identify the sources of divergence +// (e.g., special variables that hold the thread ID or the iteration variable). +// +// The generic implementation propagates divergence to variables that are data +// or sync dependent on a source of divergence. +// +// While data dependency is a well-known concept, the notion of sync dependency +// is worth more explanation. Sync dependence characterizes the control flow +// aspect of the propagation of branch divergence. For example, +// +// %cond = icmp slt i32 %tid, 10 +// br i1 %cond, label %then, label %else +// then: +// br label %merge +// else: +// br label %merge +// merge: +// %a = phi i32 [ 0, %then ], [ 1, %else ] +// +// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid +// because %tid is not on its use-def chains, %a is sync dependent on %tid +// because the branch "br i1 %cond" depends on %tid and affects which value %a +// is assigned to. +// +// The sync dependence detection (which branch induces divergence in which join +// points) is implemented in the SyncDependenceAnalysis. +// +// The current DivergenceAnalysis implementation has the following limitations: +// 1. intra-procedural. It conservatively considers the arguments of a +// non-kernel-entry function and the return value of a function call as +// divergent. +// 2. memory as black box. It conservatively considers values loaded from +// generic or local address as divergent. This can be improved by leveraging +// pointer analysis and/or by modelling non-escaping memory objects in SSA +// as done in RV. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUMirDivergenceAnalysis.h" +#include "GCNSubtarget.h" +#include "AMDGPUSubtarget.h" +#include "Utils/AMDGPUAsmUtils.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "TargetInfo/AMDGPUTargetInfo.h" +#include "SIInstrInfo.h" +//#include "llvm/Analysis/Passes.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/Support/Debug.h" +//#include "newbe/cli/newbe_opts.h" // AMDGPU change. +#include "llvm/Support/raw_ostream.h" +#include + +using namespace llvm; + +#define DEBUG_TYPE "mir-divergence-analysis" + +namespace llvm { +bool isAMDGPUOpcodeDivergent(class MachineInstr *MI); +} + +// +// TODO: TableGen these +// +bool llvm::isAMDGPUOpcodeDivergent(class MachineInstr *MI) { + switch (MI->getOpcode()) { + // case R600::INTERP_LOAD_P0: + // case R600::INTERP_PAIR_XY: + // case R600::INTERP_PAIR_ZW: + // case R600::INTERP_VEC_LOAD: + // case R600::INTERP_XY: + // case R600::INTERP_ZW: + case AMDGPU::V_WRITELANE_B32: + + case AMDGPU::V_INTERP_MOV_F32: + case AMDGPU::V_INTERP_MOV_F32_e64: + case AMDGPU::V_INTERP_MOV_F32_e64_vi: + case AMDGPU::V_INTERP_MOV_F32_si: + case AMDGPU::V_INTERP_MOV_F32_vi: + case AMDGPU::V_INTERP_P1LL_F16: + case AMDGPU::V_INTERP_P1LL_F16_vi: + case AMDGPU::V_INTERP_P1LV_F16: + case AMDGPU::V_INTERP_P1LV_F16_vi: + case AMDGPU::V_INTERP_P1_F32: + case AMDGPU::V_INTERP_P1_F32_16bank: + case AMDGPU::V_INTERP_P1_F32_16bank_si: + case AMDGPU::V_INTERP_P1_F32_16bank_vi: + case AMDGPU::V_INTERP_P1_F32_e64: + case AMDGPU::V_INTERP_P1_F32_e64_vi: + case AMDGPU::V_INTERP_P1_F32_si: + case AMDGPU::V_INTERP_P1_F32_vi: + case AMDGPU::V_INTERP_P2_F16: + case AMDGPU::V_INTERP_P2_F16_vi: + case AMDGPU::V_INTERP_P2_F32: + case AMDGPU::V_INTERP_P2_F32_e64: + case AMDGPU::V_INTERP_P2_F32_e64_vi: + case AMDGPU::V_INTERP_P2_F32_si: + case AMDGPU::V_INTERP_P2_F32_vi: + + case AMDGPU::V_MBCNT_HI_U32_B32_e32: + case AMDGPU::V_MBCNT_HI_U32_B32_e32_gfx6_gfx7: + case AMDGPU::V_MBCNT_HI_U32_B32_e64: + case AMDGPU::V_MBCNT_HI_U32_B32_e64_gfx10: + case AMDGPU::V_MBCNT_HI_U32_B32_e64_gfx6_gfx7: + case AMDGPU::V_MBCNT_HI_U32_B32_e64_vi: + case AMDGPU::V_MBCNT_HI_U32_B32_sdwa: + case AMDGPU::V_MBCNT_LO_U32_B32_e32: + case AMDGPU::V_MBCNT_LO_U32_B32_e32_gfx6_gfx7: + case AMDGPU::V_MBCNT_LO_U32_B32_e64: + case AMDGPU::V_MBCNT_LO_U32_B32_e64_gfx10: + case AMDGPU::V_MBCNT_LO_U32_B32_e64_gfx6_gfx7: + case AMDGPU::V_MBCNT_LO_U32_B32_e64_vi: + case AMDGPU::V_MBCNT_LO_U32_B32_sdwa: + + case AMDGPU::BUFFER_ATOMIC_ADD_ADDR64: + case AMDGPU::BUFFER_ATOMIC_ADD_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_ADD_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN: + case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_gfx6_gfx7: + case 
AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_ADDR64: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_AND_ADDR64: + case AMDGPU::BUFFER_ATOMIC_AND_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_AND_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_AND_IDXEN: + case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_vi: + 
case AMDGPU::BUFFER_ATOMIC_AND_OFFEN: + case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_AND_OFFSET: + case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_AND_X2_ADDR64: + case AMDGPU::BUFFER_ATOMIC_AND_X2_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_AND_X2_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_X2_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN: + case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_gfx6_gfx7: + case 
AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_ADDR64: + case AMDGPU::BUFFER_ATOMIC_DEC_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_DEC_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN: + case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_RTN_gfx10: + case 
AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_ADDR64: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_INC_ADDR64: + case AMDGPU::BUFFER_ATOMIC_INC_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_INC_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_INC_IDXEN: + case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_RTN_gfx10: + case 
AMDGPU::BUFFER_ATOMIC_INC_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_INC_OFFEN: + case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_INC_OFFSET: + case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_INC_X2_ADDR64: + case AMDGPU::BUFFER_ATOMIC_INC_X2_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_INC_X2_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_X2_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN: + case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_OR_ADDR64: + case AMDGPU::BUFFER_ATOMIC_OR_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_OR_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_OR_IDXEN: + case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_RTN_gfx6_gfx7: + case 
AMDGPU::BUFFER_ATOMIC_OR_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_OR_OFFEN: + case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_OR_OFFSET: + case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_OR_X2_ADDR64: + case AMDGPU::BUFFER_ATOMIC_OR_X2_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_OR_X2_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_X2_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN: + case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_ADDR64: + case AMDGPU::BUFFER_ATOMIC_SMAX_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_SMAX_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN: + case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_RTN_vi: + case 
AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_ADDR64: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_ADDR64: + case AMDGPU::BUFFER_ATOMIC_SMIN_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_SMIN_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN: + case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_RTN_gfx6_gfx7: + case 
AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_ADDR64: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_ADDR64: + case AMDGPU::BUFFER_ATOMIC_SUB_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_SUB_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN: + case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_RTN_gfx10: + case 
AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_ADDR64: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_ADDR64: + case AMDGPU::BUFFER_ATOMIC_SWAP_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_SWAP_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN: + case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_RTN_gfx10: + case 
AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_ADDR64: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_ADDR64: + case AMDGPU::BUFFER_ATOMIC_UMAX_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_UMAX_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN: + case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_RTN: + case 
AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_ADDR64: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_ADDR64: + case AMDGPU::BUFFER_ATOMIC_UMIN_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_UMIN_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN: + case 
AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_ADDR64: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_ADDR64: + case AMDGPU::BUFFER_ATOMIC_XOR_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_XOR_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_vi: + case 
AMDGPU::BUFFER_ATOMIC_XOR_IDXEN: + case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_ADDR64: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_ADDR64_RTN: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_ADDR64_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_ADDR64_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_RTN: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_RTN: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_RTN: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_RTN: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_RTN_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_RTN_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_RTN_vi: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_gfx10: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_gfx6_gfx7: + case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_vi: + + case AMDGPU::IMAGE_ATOMIC_ADD_V1_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_ADD_V1_V1_si: + case AMDGPU::IMAGE_ATOMIC_ADD_V1_V1_vi: + case AMDGPU::IMAGE_ATOMIC_ADD_V2_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_ADD_V2_V1_si: + case AMDGPU::IMAGE_ATOMIC_ADD_V2_V1_vi: + case AMDGPU::IMAGE_ATOMIC_ADD_V1_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_ADD_V1_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_ADD_V1_V2_si: + case AMDGPU::IMAGE_ATOMIC_ADD_V1_V2_vi: + case AMDGPU::IMAGE_ATOMIC_ADD_V2_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_ADD_V2_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_ADD_V2_V2_si: + case AMDGPU::IMAGE_ATOMIC_ADD_V2_V2_vi: + 
case AMDGPU::IMAGE_ATOMIC_ADD_V1_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_ADD_V1_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_ADD_V1_V4_si: + case AMDGPU::IMAGE_ATOMIC_ADD_V1_V4_vi: + case AMDGPU::IMAGE_ATOMIC_ADD_V2_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_ADD_V2_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_ADD_V2_V4_si: + case AMDGPU::IMAGE_ATOMIC_ADD_V2_V4_vi: + case AMDGPU::IMAGE_ATOMIC_AND_V1_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_AND_V1_V1_si: + case AMDGPU::IMAGE_ATOMIC_AND_V1_V1_vi: + case AMDGPU::IMAGE_ATOMIC_AND_V2_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_AND_V2_V1_si: + case AMDGPU::IMAGE_ATOMIC_AND_V2_V1_vi: + case AMDGPU::IMAGE_ATOMIC_AND_V1_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_AND_V1_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_AND_V1_V2_si: + case AMDGPU::IMAGE_ATOMIC_AND_V1_V2_vi: + case AMDGPU::IMAGE_ATOMIC_AND_V2_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_AND_V2_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_AND_V2_V2_si: + case AMDGPU::IMAGE_ATOMIC_AND_V2_V2_vi: + case AMDGPU::IMAGE_ATOMIC_AND_V1_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_AND_V1_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_AND_V1_V4_si: + case AMDGPU::IMAGE_ATOMIC_AND_V1_V4_vi: + case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_si: + case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_vi: + //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_gfx10: + //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_si: + //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_vi: + case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_si: + case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_vi: + //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_gfx10: + //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_nsa_gfx10: + //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_si: + //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_vi: + case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_si: + case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_vi: + //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_gfx10: + //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_nsa_gfx10: + //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_si: + //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_vi: + case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_si: + case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_vi: + case AMDGPU::IMAGE_ATOMIC_DEC_V1_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_DEC_V1_V1_si: + case AMDGPU::IMAGE_ATOMIC_DEC_V1_V1_vi: + case AMDGPU::IMAGE_ATOMIC_DEC_V2_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_DEC_V2_V1_si: + case AMDGPU::IMAGE_ATOMIC_DEC_V2_V1_vi: + case AMDGPU::IMAGE_ATOMIC_DEC_V1_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_DEC_V1_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_DEC_V1_V2_si: + case AMDGPU::IMAGE_ATOMIC_DEC_V1_V2_vi: + case AMDGPU::IMAGE_ATOMIC_DEC_V2_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_DEC_V2_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_DEC_V2_V2_si: + case AMDGPU::IMAGE_ATOMIC_DEC_V2_V2_vi: + case AMDGPU::IMAGE_ATOMIC_DEC_V1_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_DEC_V1_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_DEC_V1_V4_si: + case AMDGPU::IMAGE_ATOMIC_DEC_V1_V4_vi: + case AMDGPU::IMAGE_ATOMIC_DEC_V2_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_DEC_V2_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_DEC_V2_V4_si: + case AMDGPU::IMAGE_ATOMIC_DEC_V2_V4_vi: + case AMDGPU::IMAGE_ATOMIC_INC_V1_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_INC_V1_V1_si: + case AMDGPU::IMAGE_ATOMIC_INC_V1_V1_vi: + case AMDGPU::IMAGE_ATOMIC_INC_V2_V1_gfx10: + case 
AMDGPU::IMAGE_ATOMIC_INC_V2_V1_si: + case AMDGPU::IMAGE_ATOMIC_INC_V2_V1_vi: + case AMDGPU::IMAGE_ATOMIC_INC_V1_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_INC_V1_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_INC_V1_V2_si: + case AMDGPU::IMAGE_ATOMIC_INC_V1_V2_vi: + case AMDGPU::IMAGE_ATOMIC_INC_V2_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_INC_V2_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_INC_V2_V2_si: + case AMDGPU::IMAGE_ATOMIC_INC_V2_V2_vi: + case AMDGPU::IMAGE_ATOMIC_INC_V1_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_INC_V1_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_INC_V1_V4_si: + case AMDGPU::IMAGE_ATOMIC_INC_V1_V4_vi: + case AMDGPU::IMAGE_ATOMIC_INC_V2_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_INC_V2_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_INC_V2_V4_si: + case AMDGPU::IMAGE_ATOMIC_INC_V2_V4_vi: + case AMDGPU::IMAGE_ATOMIC_OR_V1_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_OR_V1_V1_si: + case AMDGPU::IMAGE_ATOMIC_OR_V1_V1_vi: + case AMDGPU::IMAGE_ATOMIC_OR_V2_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_OR_V2_V1_si: + case AMDGPU::IMAGE_ATOMIC_OR_V2_V1_vi: + case AMDGPU::IMAGE_ATOMIC_OR_V1_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_OR_V1_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_OR_V1_V2_si: + case AMDGPU::IMAGE_ATOMIC_OR_V1_V2_vi: + case AMDGPU::IMAGE_ATOMIC_OR_V2_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_OR_V2_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_OR_V2_V2_si: + case AMDGPU::IMAGE_ATOMIC_OR_V2_V2_vi: + case AMDGPU::IMAGE_ATOMIC_OR_V1_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_OR_V1_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_OR_V1_V4_si: + case AMDGPU::IMAGE_ATOMIC_OR_V1_V4_vi: + case AMDGPU::IMAGE_ATOMIC_OR_V2_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_OR_V2_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_OR_V2_V4_si: + case AMDGPU::IMAGE_ATOMIC_OR_V2_V4_vi: + case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V1_si: + case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V1_vi: + case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V1_si: + case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V1_vi: + case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V2_si: + case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V2_vi: + case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V2_si: + case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V2_vi: + case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V4_si: + case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V4_vi: + case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V4_si: + case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V4_vi: + case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V1_si: + case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V1_vi: + case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V1_si: + case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V1_vi: + case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V2_si: + case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V2_vi: + case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V2_si: + case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V2_vi: + case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V4_si: + case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V4_vi: + case 
AMDGPU::IMAGE_ATOMIC_SMIN_V2_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V4_si: + case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V4_vi: + case AMDGPU::IMAGE_ATOMIC_SUB_V1_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_SUB_V1_V1_si: + case AMDGPU::IMAGE_ATOMIC_SUB_V1_V1_vi: + case AMDGPU::IMAGE_ATOMIC_SUB_V2_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_SUB_V2_V1_si: + case AMDGPU::IMAGE_ATOMIC_SUB_V2_V1_vi: + case AMDGPU::IMAGE_ATOMIC_SUB_V1_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_SUB_V1_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SUB_V1_V2_si: + case AMDGPU::IMAGE_ATOMIC_SUB_V1_V2_vi: + case AMDGPU::IMAGE_ATOMIC_SUB_V2_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_SUB_V2_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SUB_V2_V2_si: + case AMDGPU::IMAGE_ATOMIC_SUB_V2_V2_vi: + case AMDGPU::IMAGE_ATOMIC_SUB_V1_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_SUB_V1_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SUB_V1_V4_si: + case AMDGPU::IMAGE_ATOMIC_SUB_V1_V4_vi: + case AMDGPU::IMAGE_ATOMIC_SUB_V2_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_SUB_V2_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SUB_V2_V4_si: + case AMDGPU::IMAGE_ATOMIC_SUB_V2_V4_vi: + case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V1_si: + case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V1_vi: + case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V1_si: + case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V1_vi: + case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V2_si: + case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V2_vi: + case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V2_si: + case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V2_vi: + case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V4_si: + case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V4_vi: + case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V4_si: + case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V4_vi: + case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V1_si: + case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V1_vi: + case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V1_si: + case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V1_vi: + case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V2_si: + case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V2_vi: + case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V2_si: + case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V2_vi: + case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V4_si: + case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V4_vi: + case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V4_si: + case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V4_vi: + case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V1_si: + case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V1_vi: + case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V1_si: + case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V1_vi: + case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V2_si: + case 
AMDGPU::IMAGE_ATOMIC_UMIN_V1_V2_vi: + case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V2_si: + case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V2_vi: + case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V4_si: + case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V4_vi: + case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V4_si: + case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V4_vi: + case AMDGPU::IMAGE_ATOMIC_XOR_V1_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_XOR_V1_V1_si: + case AMDGPU::IMAGE_ATOMIC_XOR_V1_V1_vi: + case AMDGPU::IMAGE_ATOMIC_XOR_V2_V1_gfx10: + case AMDGPU::IMAGE_ATOMIC_XOR_V2_V1_si: + case AMDGPU::IMAGE_ATOMIC_XOR_V2_V1_vi: + case AMDGPU::IMAGE_ATOMIC_XOR_V1_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_XOR_V1_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_XOR_V1_V2_si: + case AMDGPU::IMAGE_ATOMIC_XOR_V1_V2_vi: + case AMDGPU::IMAGE_ATOMIC_XOR_V2_V2_gfx10: + case AMDGPU::IMAGE_ATOMIC_XOR_V2_V2_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_XOR_V2_V2_si: + case AMDGPU::IMAGE_ATOMIC_XOR_V2_V2_vi: + case AMDGPU::IMAGE_ATOMIC_XOR_V1_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_XOR_V1_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_XOR_V1_V4_si: + case AMDGPU::IMAGE_ATOMIC_XOR_V1_V4_vi: + case AMDGPU::IMAGE_ATOMIC_XOR_V2_V4_gfx10: + case AMDGPU::IMAGE_ATOMIC_XOR_V2_V4_nsa_gfx10: + case AMDGPU::IMAGE_ATOMIC_XOR_V2_V4_si: + case AMDGPU::IMAGE_ATOMIC_XOR_V2_V4_vi: + + case AMDGPU::SI_PS_LIVE: + + case AMDGPU::DS_SWIZZLE_B32: + case AMDGPU::DS_SWIZZLE_B32_gfx10: + case AMDGPU::DS_SWIZZLE_B32_gfx6_gfx7: + case AMDGPU::DS_SWIZZLE_B32_vi: + + return true; + + default: + break; + } + return false; +} + +namespace { +bool hasImmOperandWithVal(const MachineInstr *MI, uint16_t srcNameIdx, + uint16_t srcModNameIdx, uint64_t Val) { + unsigned Op = MI->getOpcode(); + unsigned srcIdx = AMDGPU::getNamedOperandIdx(Op, srcNameIdx); + if (srcIdx == -1) + return false; + const MachineOperand &srcMO = MI->getOperand(srcIdx); + if (srcMO.isImm() && srcMO.getImm() == Val) { + + unsigned modIdx = AMDGPU::getNamedOperandIdx(Op, srcModNameIdx); + if (modIdx == -1) + return true; + + const MachineOperand &modMO = MI->getOperand(modIdx); + if (modMO.getImm() == 0) + return true; + } + return false; +} + +bool isConstant(const MachineInstr *MI) { + unsigned Op = MI->getOpcode(); + switch (Op) { + default: + break; + case AMDGPU::V_OR_B32_e32: + case AMDGPU::V_OR_B32_e64: { + // Check special case or -1, which will get result -1. + const uint64_t kImm = -1; + if (hasImmOperandWithVal(MI, AMDGPU::OpName::src0, + AMDGPU::OpName::src0_modifiers, kImm)) + return true; + if (hasImmOperandWithVal(MI, AMDGPU::OpName::src1, + AMDGPU::OpName::src1_modifiers, kImm)) + return true; + } break; + case AMDGPU::S_OR_B32: + case AMDGPU::S_OR_B64: { + // Check special case or -1, which will get result -1. + const uint64_t kImm = -1; + if (hasImmOperandWithVal(MI, AMDGPU::OpName::src0, + AMDGPU::OpName::src0_modifiers, kImm)) + return true; + if (hasImmOperandWithVal(MI, AMDGPU::OpName::src1, + AMDGPU::OpName::src1_modifiers, kImm)) + return true; + } break; + case AMDGPU::S_AND_B32: + case AMDGPU::S_AND_B64: + case AMDGPU::V_AND_B32_e32: + case AMDGPU::V_AND_B32_e64: { + // Check special case and 0, which will get result 0. 
+    const uint64_t kImm = 0;
+    if (hasImmOperandWithVal(MI, AMDGPU::OpName::src0,
+                             AMDGPU::OpName::src0_modifiers, kImm))
+      return true;
+    if (hasImmOperandWithVal(MI, AMDGPU::OpName::src1,
+                             AMDGPU::OpName::src1_modifiers, kImm))
+      return true;
+  } break;
+  }
+  return false;
+}
+
+bool writeBoolDst(const MachineInstr *MI, const SIRegisterInfo *SIRI,
+                  const MachineRegisterInfo &MRI) {
+  const auto *BoolRC = SIRI->getBoolRC();
+  for (const MachineOperand &MO : MI->operands()) {
+    if (!MO.isReg())
+      continue;
+    if (MO.isUse())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO ||
+        Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO)
+      return true;
+
+    // Check if the written register class overlaps the bool register class.
+    //
+    // Note that this check is insufficient to catch all of the cases where
+    // a "bool" value could be created (for example writing to a register
+    // pair s[0:1], then using s0 as a bool value in wave32).
+    //
+    // The underlying problem is that we have two notions of divergence
+    // (bit divergence and wave divergence) but the algorithm only propagates
+    // wave divergence. Bit divergence matters for bools because it determines
+    // whether a branch is uniform or not (and thus catches cases where a
+    // uniform value is used outside of a divergent control flow region). For
+    // bool values the algorithm treats normally uniform values (i.e. scalar
+    // registers) as divergent in order to try and propagate bit divergence.
+    //
+    // To fix all of the possible bugs here we would need to actually propagate
+    // bit divergence as well as wave divergence. That is a bigger fix; this
+    // check should cover most cases of treating a bool value as divergent.
+    const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg);
+    if (SIRI->getCommonSubClass(BoolRC, RC))
+      return true;
+  }
+  return false;
+}
+
+bool isAlwaysUniformMI(const MachineInstr *MI, const SIInstrInfo *SIII,
+                       const SIRegisterInfo *SIRI,
+                       const MachineRegisterInfo &MRI) {
+  unsigned Op = MI->getOpcode();
+  switch (Op) {
+  default:
+    // Mark all scalar instructions as always uniform unless they write a bool
+    // dst. This does not mean the result is bit uniform; branch/exec region
+    // checks use isBitUniform for that. A bool may live in an sreg and still
+    // be divergent, since it simply packs one bit per lane into a single
+    // 32/64-bit sreg.
+    if (SIII->isScalarUnit(*MI) && !writeBoolDst(MI, SIRI, MRI) &&
+        !MI->isTerminator())
+      return true;
+    break;
+  //case AMDGPU::AMDGPU_MAKE_UNIFORM:
+  //case AMDGPU::AMDGPU_WAVE_READ_LANE_FIRST:
+  case AMDGPU::V_READFIRSTLANE_B32:
+  case AMDGPU::V_READLANE_B32:
+  //case AMDGPU::AMDGPU_WAVE_ACTIVE_BALLOT_W32:
+  //case AMDGPU::AMDGPU_WAVE_ACTIVE_BALLOT_W64:
+    // A bool produced by readfirstlane is a single bit, which means it is
+    // bit uniform.
+    return true;
+  case AMDGPU::S_OR_B32:
+  case AMDGPU::S_OR_B64: {
+    // Check special case or -1, which will get result -1.
+    if (isConstant(MI))
+      return true;
+
+    return !writeBoolDst(MI, SIRI, MRI);
+  } break;
+  case AMDGPU::V_OR_B32_e32:
+  case AMDGPU::V_OR_B32_e64: {
+    // Check special case or -1, which will get result -1.
+    if (isConstant(MI))
+      return true;
+  } break;
+  case AMDGPU::S_AND_B32:
+  case AMDGPU::S_AND_B64: {
+    // Check special case and 0, which will get result 0.
+    if (isConstant(MI))
+      return true;
+
+    return !writeBoolDst(MI, SIRI, MRI);
+  } break;
+  case AMDGPU::V_AND_B32_e32:
+  case AMDGPU::V_AND_B32_e64: {
+    // Check special case and 0, which will get result 0.
+    if (isConstant(MI))
+      return true;
+  } break;
+  }
+  return false;
+}
+
+bool isPhysicalReg(MachineRegisterInfo &MRI, Register reg) {
+  return reg.isPhysical();
+}
+
+bool isRegClass(MachineRegisterInfo &MRI, unsigned reg, unsigned regClassID) {
+  return MRI.getRegClass(reg)->getID() == regClassID;
+}
+
+// Input registers of the MachineFunction that live in vector registers are
+// divergent.
+bool isDivergentInputReg(unsigned Reg, MachineRegisterInfo &MRI,
+                         const SIRegisterInfo *SIRI) {
+  if (isPhysicalReg(MRI, Reg)) {
+    unsigned vir_reg = MRI.getLiveInVirtReg(Reg);
+    if (SIRI->isVGPR(MRI, vir_reg))
+      return true;
+  } else {
+    if (SIRI->isVGPR(MRI, Reg))
+      return true;
+  }
+  return false;
+}
+
+bool isSourceOfDivergence(MachineInstr *MI, MachineRegisterInfo &MRI,
+                          const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
+  //if (MI->getAMDGPUFlag(MachineInstr::AMDGPUMIFlag::IsDivergent))
+  //  return true;
+  if (isAMDGPUOpcodeDivergent(MI))
+    return true;
+
+  if (isAlwaysUniformMI(MI, SIII, SIRI, MRI))
+    return false;
+
+  // If the instruction is neither guaranteed to be uniform nor guaranteed to
+  // be divergent, check whether any of its operands are passed in to the
+  // shader as arguments through vector registers.
+  //
+  // Such operands make it divergent.
+  for (MachineOperand &op : MI->operands()) {
+    if (!op.isReg())
+      continue;
+    if (op.isDef())
+      continue;
+    unsigned reg = op.getReg();
+    if (MRI.isLiveIn(reg)) {
+      if (isDivergentInputReg(reg, MRI, SIRI))
+        return true;
+    }
+  }
+
+  return false;
+}
+
+// For VCC, try to find the nearest define inside the same MBB.
+const MachineInstr *findPhysicalDefineInSameMBB(const MachineInstr *MI,
+                                                unsigned PhyReg) {
+  const MachineBasicBlock *MBB = MI->getParent();
+  auto it = MI->getReverseIterator();
+  for (it++; it != MBB->rend(); it++) {
+    const MachineInstr &TmpMI = *it;
+    for (const MachineOperand &DefMO : TmpMI.operands()) {
+      if (!DefMO.isReg())
+        continue;
+      if (DefMO.isUse())
+        continue;
+      if (DefMO.getReg() == PhyReg)
+        return &TmpMI;
+    }
+  }
+  return nullptr;
+}
+
+bool isWriteExec(const MachineInstr *MI) {
+  for (const MachineOperand &MO : MI->operands()) {
+    if (!MO.isReg())
+      continue;
+    if (MO.isUse())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == AMDGPU::EXEC ||
+        Reg == AMDGPU::EXEC_LO)
+      return true;
+  }
+  return false;
+}
+
+bool isVCndMask(unsigned Opcode) {
+  switch (Opcode) {
+  default:
+    return false;
+  case AMDGPU::V_CNDMASK_B32_e32:
+  case AMDGPU::V_CNDMASK_B32_e64:
+  case AMDGPU::V_CNDMASK_B32_dpp:
+  case AMDGPU::V_CNDMASK_B32_sdwa:
+  case AMDGPU::V_CNDMASK_B64_PSEUDO:
+    return true;
+  }
+}
+
+bool isExecRegionOp(unsigned Op) {
+  switch (Op) {
+  default:
+    return false;
+  case AMDGPU::COPY:
+  case AMDGPU::S_MOV_B32:
+  case AMDGPU::S_MOV_B64:
+    return true;
+  }
+}
+
+bool isRestoreExec(const MachineInstr *MI) {
+  unsigned Op = MI->getOpcode();
+  if (!isExecRegionOp(Op))
+    return false;
+
+  return isWriteExec(MI);
+}
+
+const MachineInstr *
+findExecRegionBeginFromRegionEnd(const MachineInstr *MI,
+                                 const MachineRegisterInfo &MRI) {
+  const MachineOperand &MO = MI->getOperand(1);
+  if (!MO.isReg())
+    return nullptr;
+  unsigned Reg = MO.getReg();
+  const MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
+  if (!Def)
+    return nullptr;
+  // Make sure the def is S_MOV Reg, Exec.
+ if (!isExecRegionOp(Def->getOpcode())) + return nullptr; + const MachineOperand &ExecMO = Def->getOperand(1); + if (!ExecMO.isReg()) + return nullptr; + unsigned ExecReg = ExecMO.getReg(); + if (ExecReg == AMDGPU::EXEC || ExecReg == AMDGPU::EXEC_LO) + return Def; + else + return nullptr; +} + +bool isInsideExecRegion(const MachineInstr &MI, const MachineInstr &RegionBegin, + const MachineInstr &RegionEnd, + const MachineDominatorTree &DT, + const MachinePostDominatorTree &PDT) { + if (!DT.dominates(&RegionBegin, &MI)) + return false; + + const MachineBasicBlock *MBB = MI.getParent(); + const MachineBasicBlock *RegionEndMBB = RegionEnd.getParent(); + if (MBB != RegionEndMBB) { + return PDT.dominates(RegionEndMBB, MBB); + } else { + // MachineLoop through the basic block until we find A or B. + MachineBasicBlock::const_iterator I = MBB->begin(); + for (; I != MI.getIterator() && I != RegionEnd.getIterator(); ++I) + /*empty*/; + + // RegionEnd post-dominates MI if MI is found first in the basic block. + return I == MI.getIterator(); + } +} + +bool isInsideExecRegion(const MachineBasicBlock &MBB, + const MachineInstr &RegionBegin, + const MachineInstr &RegionEnd, + const MachineDominatorTree &DT, + const MachinePostDominatorTree &PDT) { + const MachineBasicBlock *RegionBeginMBB = RegionBegin.getParent(); + const MachineBasicBlock *RegionEndMBB = RegionEnd.getParent(); + if (!DT.dominates(RegionBeginMBB, &MBB)) + return false; + return PDT.dominates(RegionEndMBB, &MBB); +} + +// Map from BB to nearest Exec Region. How to build? Add every MBB unless already has smaller region? +// Then when hit saveExec, propagate leaked users of define inside the exec region. + +} // namespace + +namespace llvm { +// class DivergenceAnalysis +DivergenceAnalysis::DivergenceAnalysis( + const MachineFunction &F, const MachineLoop *RegionLoop, const MachineDominatorTree &DT, + const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI, + SyncDependenceAnalysis &SDA, bool IsLCSSAForm, + // AMDGPU change begin. + DivergentJoinMapTy &JoinMap + // AMDGPU change end. + ) + : F(F), MRI(F.getRegInfo()), RegionLoop(RegionLoop), DT(DT), PDT(PDT), + LI(LI), SDA(SDA), DivergentJoinMap(JoinMap), // AMDGPU change + IsLCSSAForm(IsLCSSAForm) { + const GCNSubtarget *ST = &F.getSubtarget(); + SIRI = ST->getRegisterInfo(); + SIII = ST->getInstrInfo(); +} + +void DivergenceAnalysis::markDivergent(const ValueTy DivVal) { + assert(!isAlwaysUniform(DivVal) && "cannot be a divergent"); + // AMDGPU change begin. + LLVM_DEBUG(const GCNSubtarget *ST = &F.getSubtarget(); + const SIRegisterInfo *SIRI = ST->getRegisterInfo(); + dbgs() << "\t MarkDivergent :"; printReg(DivVal, SIRI);); + //AMDGPU change end. + DivergentValues.insert(DivVal); +} + +// Mir change. +void DivergenceAnalysis::markDivergent(const MachineInstr &I) { + for (const MachineOperand &DstMO : I.defs()) { + unsigned Reg = DstMO.getReg(); + markDivergent(Reg); + } + DivergentInsts.insert(&I); +} + +void DivergenceAnalysis::addUniformOverride(const ValueTy UniVal) { + // TODO: support uniform multi-def. 
+ if (MRI.getUniqueVRegDef(UniVal) == nullptr) + return; + + UniformOverrides.insert(UniVal); +} + +void DivergenceAnalysis::addUniformOverride(const MachineInstr &I) { + for (const MachineOperand &DstMO : I.defs()) { + unsigned Reg = DstMO.getReg(); + addUniformOverride(Reg); + } + UniformOverridesInsts.insert(&I); +} + +bool DivergenceAnalysis::isBitUniform( + const MachineInstr &I, const llvm::MachineOperand &UseMO, + llvm::DenseMap &Processed) const { + if (UseMO.isImm()) { + uint64_t val = UseMO.getImm(); + // 0 and -1 are OK since all lanes are still the same. + if (val == 0 || val == -1) + return true; + else + return false; + } + if (!UseMO.isReg()) + return true; + unsigned Reg = UseMO.getReg(); + // Exec is always bituniform, because all active lanes are 1. + if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO || + // SCC only has 1 bit. Always bituniform. + Reg == AMDGPU::SCC) + return true; + + const MachineInstr *UseMI = nullptr; + if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO) { + // Try to find define of this VCC. + UseMI = findPhysicalDefineInSameMBB(&I, Reg); + } else { + UseMI = MRI.getUniqueVRegDef(Reg); + } + if (!UseMI) { + return false; + } + + bool bResult = isBitUniform(*UseMI, Processed); + Processed[UseMI] = bResult; + return bResult; +} + +bool DivergenceAnalysis::isBitUniform( + const MachineInstr &I, + llvm::DenseMap &Processed) const { + auto it = Processed.find(&I); + if (it != Processed.end()) + return it->second; + // For branch on MIR, need to make sure all activi lanes are the same. + // cmp of uniform value will make sure all active lanes are the same. + // Imm is also the same for all active lanes. + if (isDivergent(I)) + return false; + // Uniform cmp is bit uniform. + if (I.isCompare()) + return true; + if (isConstant(&I)) + return true; + + // Conservatively consider bituniform to be false. + Processed[&I] = false; + + // If all operand is bit uniform, then result is bit uniform. + bool bAllOperandBitUniform = true; + for (const MachineOperand &UseMO : I.uses()) { + if (isBitUniform(I, UseMO, Processed)) + continue; + bAllOperandBitUniform = false; + break; + } + return bAllOperandBitUniform; +} + +bool DivergenceAnalysis::updateTerminator(const MachineInstr &Term) const { + if (Term.getParent()->succ_size() <= 1) + return false; + switch (Term.getOpcode()) { + default: { + if (updateNormalInstruction(Term)) + return true; + llvm::DenseMap Processed; + // Check bit uniform here if not divergent. + return !isBitUniform(Term, Processed); + } + //case AMDGPU::AMDGPU_CALL_INDIRECT: + case AMDGPU::SI_CALL: + return true; + } +} + +bool DivergenceAnalysis::updateNormalInstruction(const MachineInstr &I) const { + // TODO function calls with side effects, etc + if (UniformOverridesInsts.find(&I) != UniformOverridesInsts.end()) + return false; + if (DivergentInsts.find(&I) != DivergentInsts.end()) + return true; + for (const auto &Op : I.uses()) { + if (!Op.isReg()) + continue; + Register Reg = Op.getReg(); + if (Reg.isPhysical()) { + if (Reg == AMDGPU::EXEC || + Reg == AMDGPU::EXEC_LO || + Reg == AMDGPU::SCC) + continue; + else + if (const MachineInstr *DefMI = + findPhysicalDefineInSameMBB(Op.getParent(), Reg)) { + if (isDivergent(*DefMI)) + return true; + } else { + // If cannot find def in same MBB, just treat it as divergent. 
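+        // (e.g. VCC defined in a predecessor block;
+        // findPhysicalDefineInSameMBB only scans backwards within this MBB).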
+ return true; + } + } else { + if (isDivergent(Op.getReg())) + return true; + } + } + return false; +} + +bool DivergenceAnalysis::isTemporalDivergent(const MachineBasicBlock &ObservingBlock, + const ValueTy Val, + const MachineBasicBlock &IncomingBlock) const { // AMDGPU change + const MachineBasicBlock *DefBlock = &IncomingBlock; // AMDGPU change: Take def point as incoming block for constants. + const auto *Inst = MRI.getUniqueVRegDef(Val); + if (Inst == nullptr) + return true; + if (Inst) + DefBlock = Inst->getParent(); + + // check whether any divergent loop carrying Val terminates before control + // proceeds to ObservingBlock + for (const auto *MachineLoop = LI.getLoopFor(DefBlock); // AMDGPU change + MachineLoop != RegionLoop && !MachineLoop->contains(&ObservingBlock); + MachineLoop = MachineLoop->getParentLoop()) { + if (DivergentLoops.find(MachineLoop) != DivergentLoops.end()) + return true; + } + + return false; +} + +// AMDGPU CHANGE BEGIN +static bool HasIncomingUndefValue(const PHINode_ *Phi) { + for (unsigned I = 1, E = Phi->getNumOperands(); I != E; I += 2) { + const MachineOperand &Op = Phi->getOperand(I); + if (Op.isUndef()) + return true; + } + return false; +} + +// For case like +// %163:sreg_64_xexec = S_MOV_B64 $exec +//bb.1: +//; predecessors: %bb.1, %bb.0 +// successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), %bb.2(50.00%) +// %162:vreg_512 = PHI %41:vreg_512, %bb.0, %40:vreg_512, %bb.1 +// %167:sgpr_32 = V_READFIRSTLANE_B32 %17:vgpr_32, implicit $exec +// %168:sreg_64 = V_CMP_EQ_U32_e64 %167:sgpr_32, %17:vgpr_32, implicit $exec +// %166:sreg_64 = S_AND_SAVEEXEC_B64 %168:sreg_64, implicit-def $exec, implicit-def $scc, implicit $exec +//... +// $exec = S_XOR_B64_term $exec, %166:sreg_64, implicit-def $scc +// S_CBRANCH_EXECNZ %bb.1, implicit $exec +// The ... code after SAVEEXEC will be divergent if %168 is divergent. +// Return the SaveExec which affect MI. +// If not exist, return nullptr. +static const MachineInstr * +findSaveExec(const MachineInstr *MI, + const SmallVector &SaveExecs) { + // No save exec. + if (SaveExecs.empty()) + return nullptr; + if (SaveExecs.size() > 1) + llvm::report_fatal_error( + "Not support case where, MBB has more than one SaveExec"); + const MachineInstr *SaveExec = SaveExecs.front(); + const MachineBasicBlock *MBB = SaveExec->getParent(); + // Make sure MI is after SaveExec by check it is not before SaveExec. + // Assume MBB.begin to SaveExec is short here. + bool bIsAfterSaveExec = true; + for (auto it = MBB->begin(); it != SaveExec->getIterator(); it++) { + if (MI == it) { + bIsAfterSaveExec = false; + break; + } + } + // Not affect by save exec. + if (!bIsAfterSaveExec) + return nullptr; + + return SaveExec; +} + +// When a Phi's parent isJoinDivergent,the case make phi divergent is that 2 +// incoming values merged from different path of a divergent branch. +// isJoinDivergentOnlyOnSameIncomingValue will check for all +// combinations of incoming values except the BB with same incoming value, +// because if values are same then even divergent branch is not divergent. +// For example phi a:A, b:B, a:C. +// It will check (A,B) (B,C) but not (A, C) Because A +// and C has same value a. +// If only (A,C) is sharing divergent branch, +// then phi a:A, b:B, a:C is still uniform. +// DivergentJoinMap saving MachineBasicBlock pairs which on different path of a +// divergent branch and joined at one block. 
+// For example, +// A +// / \ +// | \ +// | \ +// B / +// | \ / +// | \ / +// C D +// | / +// \ / +// E +// If A is uniform branch, B is divergent branch. Then only (C, D) will be saved +// in DivergentJoinMap. +// DivergentJoinMap is build with updateDisjointMap in +// SyncDependenceAnalysis.cpp when SyncDependenceAnalysis::join_block is called. +// It will only run on divergent branch, so (A, B) is not in +// DivergentDisjointMap when A is uniform. +static bool isJoinDivergentOnlyOnSameIncomingValue( + const PHINode_ &Phi, const DivergenceAnalysis *pDA, const MachineDominatorTree &DT, + DivergentJoinMapTy &DivergentJoinMap) { + // for phi which join divergent, if the incoming values from divergent + // branch are the same, the phi is still uniform. + // A + // | \ + // | \ + // B \ + // |\ \ + // | \ | + // C D E + // | / / + // \/ / + // \ / + // F + // for phi in F like. + // phi (a:C, a:D, b:E) + // If A is uniform branch, B is non-uniform branch, phi is uniform. + SmallDenseSet ValueToBlockMap; + for (unsigned I = 1, E = Phi.getNumOperands(); I != E; I += 2) { + const MachineOperand &Op = Phi.getOperand(I); + if (!Op.isReg()) + continue; + unsigned Reg = Op.getReg(); + if (pDA->isDivergent(Reg)) + return false; + + ValueToBlockMap.insert(Reg); + } + unsigned NumIncoming = (Phi.getNumOperands() - 1) / 2; + // When there's same incoming value from different incoming block. + // If divergent select is only on same value, then it is still uniform. + if (ValueToBlockMap.size() != NumIncoming) { + // When a phi is on divergent join block, there is incoming block which is + // comeing from different path of a divergent branch. + // Check all combination here. + for (unsigned i = 0; i < NumIncoming; i++) { + MachineBasicBlock *BB0 = Phi.getOperand(2 + 2 * i).getMBB(); + const MachineOperand &MO0 = Phi.getOperand(1 + 2 * i); + for (unsigned j = i + 1; j < NumIncoming; j++) { + MachineBasicBlock *BB1 = Phi.getOperand(2 + 2 * j).getMBB(); + const MachineOperand &MO1 = Phi.getOperand(1 + 2 * j); + // If value match, no divergent. + if (MO0.isImm() && MO1.isImm() && MO0.getImm() == MO1.getImm()) + continue; + if (MO0.isReg() && MO1.isReg() && MO0.getReg() == MO1.getReg() && + MO0.getSubReg() == MO1.getSubReg()) + continue; + + // If BB and BB2 is from divergent disjoint, then they will + // divergent join on phi. + // This is for case like + // A + // / \ + // | \ + // | \ + // B / + // | \ / + // | \ / + // C D + // | / + // \ / + // E + // + // phi(a:C, b:D) + // When nearestCommonDominator is A, but B also can be divergent + // disjoint for C and D. + if (DivergentJoinMap[BB0].count(BB1)) + return false; + } + } + return true; + } else { + return false; + } +} +// AMDGPU CHANGE END + +bool DivergenceAnalysis::updatePHINode(const PHINode_ &Phi) const { + // AMDGPU CHANGE BEGIN + // Do not mark phis with undef as incoming values as uniform. + // When promoting to scalar we will readfirstlane on + // the phi output. If some of the inputs are undef then + // this could replace a well defined vector value with an + // undefined scalar value. + if (HasIncomingUndefValue(&Phi)) + return true; + // AMDGPU CHANGE END + + // joining divergent disjoint path in Phi parent block + if (isJoinDivergent(*Phi.getParent())) { + // AMDGPU CHANGE BEGIN + if (true/*TODO: ENABLE_AGGRESSIVE_UNIFORM_ANALYSIS*/) { + // Continue if the divergent join only on same incoming value. 
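+      // For example, "%p = PHI %a, %bb.C, %a, %bb.D" stays uniform even when
+      // C and D rejoin a divergent branch, since every active lane selects %a.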
+ if (!isJoinDivergentOnlyOnSameIncomingValue(Phi, this, DT, + DivergentJoinMap)) + return true; + } else + // AMDGPU CHANGE END + return true; + } + + // An incoming value could be divergent by itself. + // Otherwise, an incoming value could be uniform within the loop + // that carries its definition but it may appear divergent + // from outside the loop. This happens when divergent loop exits + // drop definitions of that uniform value in different iterations. + // + // for (int i = 0; i < n; ++i) { // 'i' is uniform inside the loop + // if (i % thread_id == 0) break; // divergent loop exit + // } + // int divI = i; // divI is divergent + for (unsigned I = 1, E = Phi.getNumOperands(); I != E; I += 2) { + const MachineOperand &Op = Phi.getOperand(I); + if (!Op.isReg()) + continue; + + unsigned Reg = Op.getReg(); + const MachineOperand &BB = Phi.getOperand(I + 1); + if (isDivergent(Reg) || + isTemporalDivergent(*Phi.getParent(), Reg, *BB.getMBB())) + return true; + + } + + return false; +} + +bool DivergenceAnalysis::updateVCndMask(const MachineInstr &VCndMask) const { + // VCndMask require the Cond bituniform to be uniform. + unsigned Op = VCndMask.getOpcode(); + unsigned src0Idx = AMDGPU::getNamedOperandIdx(Op, AMDGPU::OpName::src0); + unsigned src1Idx = AMDGPU::getNamedOperandIdx(Op, AMDGPU::OpName::src1); + unsigned src2Idx = AMDGPU::getNamedOperandIdx(Op, AMDGPU::OpName::src2); + + const MachineOperand &src0 = VCndMask.getOperand(src0Idx); + const MachineOperand &src1 = VCndMask.getOperand(src1Idx); + + const MachineOperand &cond = VCndMask.getOperand(src2Idx); + + if (isDivergent(src0)) + return true; + + // If src0 == src1, then return src0 divergent. + if (src0.isReg() && src1.isReg() && src0.getReg() == src1.getReg()) { + if (src0.getSubReg() == src1.getSubReg() && + SIII->hasModifiersSet(VCndMask, AMDGPU::OpName::src0_modifiers) == + SIII->hasModifiersSet(VCndMask, AMDGPU::OpName::src1_modifiers)) + return false; + } + + if (isDivergent(src1)) + return true; + + llvm::DenseMap Processed; + return !isBitUniform(VCndMask, cond, Processed); +} + +bool DivergenceAnalysis::inRegion(const MachineInstr &I) const { + return I.getParent() && inRegion(*I.getParent()); +} + +bool DivergenceAnalysis::inRegion(const MachineBasicBlock &BB) const { + return (!RegionLoop && BB.getParent() == &F) || RegionLoop->contains(&BB); +} + +// marks all users of loop-carried values of the loop headed by LoopHeader as +// divergent +void DivergenceAnalysis::taintLoopLiveOuts(const MachineBasicBlock &LoopHeader) { + auto *DivLoop = LI.getLoopFor(&LoopHeader); + assert(DivLoop && "loopHeader is not actually part of a loop"); + + SmallVector TaintStack; + DivLoop->getExitBlocks(TaintStack); + + // Otherwise potential users of loop-carried values could be anywhere in the + // dominance region of DivLoop (including its fringes for phi nodes) + DenseSet Visited; + for (auto *Block : TaintStack) { + Visited.insert(Block); + } + Visited.insert(&LoopHeader); + + while (!TaintStack.empty()) { + auto *UserBlock = TaintStack.back(); + TaintStack.pop_back(); + + // don't spread divergence beyond the region + if (!inRegion(*UserBlock)) + continue; + + assert(!DivLoop->contains(UserBlock) && + "irreducible control flow detected"); + + // phi nodes at the fringes of the dominance region + if (!DT.dominates(&LoopHeader, UserBlock)) { + // all PHI nodes of UserBlock become divergent + pushPHINodes(*UserBlock); + continue; + } + + // taint outside users of values carried by DivLoop + for (auto &I : *UserBlock) { + if 
(isAlwaysUniformMI(&I, SIII, SIRI, MRI)) + continue; + if (isDivergent(I)) + continue; + + for (auto &Op : I.uses()) { + if (!Op.isReg()) + continue; + unsigned OpReg = Op.getReg(); + MachineInstr *OpInst = MRI.getUniqueVRegDef(OpReg); + if (!OpInst) + continue; + if (DivLoop->contains(OpInst->getParent())) { + markDivergent(I); + pushUsers(I); + break; + } + } + } + + // visit all blocks in the dominance region + for (auto *SuccBlock : UserBlock->successors()) { + if (!Visited.insert(SuccBlock).second) { + continue; + } + TaintStack.push_back(SuccBlock); + } + } +} + +void DivergenceAnalysis::pushInstruction(const MachineInstr &I) { + Worklist.push_back(&I); +} +void DivergenceAnalysis::pushPHINodes(const MachineBasicBlock &Block) { + for (const auto &Phi : Block.phis()) { + if (isDivergent(Phi)) + continue; + pushInstruction(Phi); + } +} + +void DivergenceAnalysis::pushUsers(const ValueTy V) { + for (const auto &UserInst : MRI.use_nodbg_instructions(V)) { + + if (isDivergent(UserInst)) + continue; + + // only compute divergent inside loop + if (!inRegion(UserInst)) + continue; + + Worklist.push_back(&UserInst); + } +} +void DivergenceAnalysis::pushUsers(const MachineInstr &I) { + for (const auto &DstMO : I.defs()) { + unsigned Reg = DstMO.getReg(); + pushUsers(Reg); + } +} + +bool DivergenceAnalysis::propagateJoinDivergence(const MachineBasicBlock &JoinBlock, + const MachineLoop *BranchLoop) { + LLVM_DEBUG(dbgs() << "\tpropJoinDiv " << JoinBlock.getName() << "\n"); + + // ignore divergence outside the region + if (!inRegion(JoinBlock)) { + return false; + } + + // push non-divergent phi nodes in JoinBlock to the worklist + pushPHINodes(JoinBlock); + + // JoinBlock is a divergent loop exit + if (BranchLoop && !BranchLoop->contains(&JoinBlock)) { + return true; + } + + // disjoint-paths divergent at JoinBlock + markBlockJoinDivergent(JoinBlock); + return false; +} + +void DivergenceAnalysis::propagateBranchDivergence(const MachineInstr &Term) { + LLVM_DEBUG(dbgs() << "propBranchDiv " << Term.getParent()->getName() << "\n"); + + markDivergent(Term); + + const auto *BranchLoop = LI.getLoopFor(Term.getParent()); + + // whether there is a divergent loop exit from BranchLoop (if any) + bool IsBranchLoopDivergent = false; + + // iterate over all blocks reachable by disjoint from Term within the loop + // also iterates over loop exits that become divergent due to Term. + for (const auto *JoinBlock : SDA.join_blocks(Term)) { + IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop); + } + + // Branch loop is a divergent loop due to the divergent branch in Term + if (IsBranchLoopDivergent) { + assert(BranchLoop); + if (!DivergentLoops.insert(BranchLoop).second) { + return; + } + propagateLoopDivergence(*BranchLoop); + } +} + +void DivergenceAnalysis::propagateLoopDivergence(const MachineLoop &ExitingLoop) { + LLVM_DEBUG(dbgs() << "propLoopDiv " << ExitingLoop.getHeader()->getNumber() << "\n"); + + // don't propagate beyond region + if (!inRegion(*ExitingLoop.getHeader())) + return; + + const auto *BranchLoop = ExitingLoop.getParentLoop(); + + // Uses of loop-carried values could occur anywhere + // within the dominance region of the definition. All loop-carried + // definitions are dominated by the loop header (reducible control). + // Thus all users have to be in the dominance region of the loop header, + // except PHI nodes that can also live at the fringes of the dom region + // (incoming defining value). 
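+  // In LCSSA form every such use goes through an exit phi that the join-block
+  // propagation below already handles, so the explicit taint is only needed
+  // for non-LCSSA input.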
+  if (!IsLCSSAForm)
+    taintLoopLiveOuts(*ExitingLoop.getHeader());
+
+  // whether there is a divergent loop exit from BranchLoop (if any)
+  bool IsBranchLoopDivergent = false;
+
+  // Iterate over all blocks reachable by disjoint paths from exits of
+  // ExitingLoop; this also iterates over loop exits (of BranchLoop) that in
+  // turn become divergent.
+  for (const auto *JoinBlock : SDA.join_blocks(ExitingLoop)) {
+    IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop);
+  }
+
+  // BranchLoop is divergent due to a divergent loop exit in ExitingLoop.
+  if (IsBranchLoopDivergent) {
+    assert(BranchLoop);
+    if (!DivergentLoops.insert(BranchLoop).second) {
+      return;
+    }
+    propagateLoopDivergence(*BranchLoop);
+  }
+}
+
+// For a case like
+// %149:sreg_64_xexec = S_MOV_B64 $exec
+//
+//bb.3:
+//; predecessors: %bb.3, %bb.2
+// successors: %bb.3(0x40000000), %bb.4(0x40000000); %bb.3(50.00%), %bb.4(50.00%)
+//
+// %148:vreg_512 = PHI %56:vreg_512, %bb.2, %55:vreg_512, %bb.3
+// %153:sgpr_32 = V_READFIRSTLANE_B32 %36:vgpr_32, implicit $exec
+// %154:sreg_64 = V_CMP_EQ_U32_e64 %153:sgpr_32, %36:vgpr_32, implicit $exec
+// %152:sreg_64 = S_AND_SAVEEXEC_B64 %154:sreg_64, implicit-def $exec, implicit-def $scc, implicit $exec
+// $m0 = S_MOV_B32 %153:sgpr_32
+// %55:vreg_512 = V_MOVRELD_B32_V16 %148:vreg_512(tied-def 0), -2, 0, implicit $m0, implicit $exec
+// $exec = S_XOR_B64_term $exec, %152:sreg_64, implicit-def $scc
+// S_CBRANCH_EXECNZ %bb.3, implicit $exec
+//
+//bb.4:
+//; predecessors: %bb.3
+// successors: %bb.5(0x80000000); %bb.5(100.00%)
+//
+// $exec = S_MOV_B64 %149:sreg_64_xexec
+//
+// bb.3 is inside the exec region whose exec mask is saved in %149.
+// "%152:sreg_64 = S_AND_SAVEEXEC_B64" updates exec, which causes divergence
+// when %154 is not bituniform. Everything inside the exec region needs to be
+// scanned. Uses outside the region, and phi uses, should be marked as
+// divergent and their users added to the worklist.
+void DivergenceAnalysis::propagateExecControlFlowDivergence(
+    const MachineInstr &SaveExec) {
+  const MachineBasicBlock *MBB = SaveExec.getParent();
+  auto it = ExecRegionMap.find(MBB);
+  if (it == ExecRegionMap.end())
+    return;
+  ExecRegion &Region = *it->second;
+  // Each region only needs to be propagated once.
+  if (Region.bPropagated)
+    return;
+  Region.bPropagated = true;
+  // Scan all MIs in the region. Mark out-of-region and phi uses as divergent
+  // and add their users to the worklist.
+  auto propagateExecDivergence = [this, &Region](const MachineInstr *MI) {
+    for (const auto &DstMO : MI->defs()) {
+      Register Reg = DstMO.getReg();
+      // The only physical defs here are VCC/EXEC/M0. EXEC is always uniform;
+      // assume VCC and M0 do not cross the region.
+      if (Reg.isPhysical())
+        continue;
+      for (const auto &UserInst : MRI.use_nodbg_instructions(Reg)) {
+        if (isDivergent(UserInst))
+          continue;
+
+        // Only propagate to users outside of the region, or to phis, which are
+        // not guarded by the save-exec.
+        if (UserInst.getOpcode() != AMDGPU::PHI &&
+            isInsideExecRegion(UserInst, *Region.begin, *Region.end, DT, PDT)) {
+          continue;
+        }
+        // Writing exec is not divergent.
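+        // (e.g. "$exec = S_XOR_B64_term $exec, %152" in the example above uses
+        // the save-exec result but only updates the lane mask, not a data
+        // value).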
+ if (isWriteExec(&UserInst)) + continue; + + markDivergent(UserInst); + pushUsers(UserInst); + } + } + }; + const MachineBasicBlock *RegionBeginMBB = Region.begin->getParent(); + const MachineBasicBlock *RegionEndMBB = Region.end->getParent(); + if (RegionBeginMBB != RegionEndMBB) { + auto it = Region.begin->getIterator(); + for (it++; it != RegionBeginMBB->end(); it++) { + const MachineInstr &MI = *it; + propagateExecDivergence(&MI); + } + + // All blocks between RegionBeginMBB and RegionEndMBB. + for (const MachineBasicBlock *MBB : Region.blocks) { + for (const MachineInstr &MI : *MBB) { + propagateExecDivergence(&MI); + } + } + + for (auto it = RegionEndMBB->begin(); it != Region.end->getIterator(); + it++) { + const MachineInstr &MI = *it; + propagateExecDivergence(&MI); + } + + } else { + auto it = Region.begin->getIterator(); + for (it++; it != Region.end->getIterator(); it++) { + const MachineInstr &MI = *it; + propagateExecDivergence(&MI); + } + } +} + +void DivergenceAnalysis::compute() { + SmallVector ExecRegions; + // Build exec regions. + // Add VCndMask for non-bituniform caused by input sreg. + for (const MachineBasicBlock &MBB : F) { + for (const MachineInstr &Term : MBB.terminators()) { + if (updateTerminator(Term)) + pushInstruction(Term); + } + + for (const MachineInstr &I : MBB) { + unsigned Opcode = I.getOpcode(); + if (isVCndMask(Opcode)) { + // Cond for CndMask needs bit uniform check. + // Add it to worklist to check bit uniform from input. + pushInstruction(I); + } else if (isRestoreExec(&I)) { + const MachineInstr *RegionBegin = + findExecRegionBeginFromRegionEnd(&I, MRI); + if (RegionBegin) { + ExecRegions.emplace_back(ExecRegion(RegionBegin, &I)); + } + } + } + } + + // Build exec region map. + for (const MachineBasicBlock &MBB : F) { + for (ExecRegion &Region : ExecRegions) { + if (isInsideExecRegion(MBB, *Region.begin, *Region.end, DT, PDT)) { + // Add block to region. + if (&MBB != Region.begin->getParent() && + &MBB != Region.end->getParent()) + Region.blocks.emplace_back(&MBB); + // Update ExecRegionMap. + auto it = ExecRegionMap.find(&MBB); + if (it == ExecRegionMap.end()) { + ExecRegionMap[&MBB] = &Region; + } else { + // When MBB inside multiple regions, save the smallest one. + if (isInsideExecRegion(*Region.begin, *it->second->begin, + *it->second->end, DT, PDT)) { + ExecRegionMap[&MBB] = &Region; + } + } + } + } + } + + for (auto DivVal : DivergentValues) { + LLVM_DEBUG(dbgs() << "\t sourceOfDivergence :"; printReg(DivVal, SIRI); + dbgs() << "\n";); + pushUsers(DivVal); + } + + // propagate divergence + while (!Worklist.empty()) { + const MachineInstr *I= Worklist.back(); + Worklist.pop_back(); + + // maintain uniformity of overrides + if (isAlwaysUniformMI(I, SIII, SIRI, MRI)) { + // If used by terminators, and not bit uniform. + // Add terminator. + SmallVector TermUsers; + for (const auto &DstMO : I->defs()) { + unsigned Reg = DstMO.getReg(); + for (const auto &UserInst : MRI.use_nodbg_instructions(Reg)) { + + if (isDivergent(UserInst)) + continue; + // Only check terminator here. 
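+          // Terminators branch on per-lane mask bits, so even an
+          // always-uniform def can make a branch divergent when its bits
+          // differ between active lanes.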
+          if (!UserInst.isTerminator())
+            continue;
+
+          // only compute divergence inside the region
+          if (!inRegion(UserInst))
+            continue;
+
+          TermUsers.emplace_back(&UserInst);
+        }
+      }
+
+      if (!TermUsers.empty()) {
+        llvm::DenseMap<const MachineInstr *, bool> Processed;
+        if (!isBitUniform(*I, Processed)) {
+          for (const MachineInstr *Term : TermUsers) {
+            Worklist.emplace_back(Term);
+          }
+        }
+      }
+
+      continue;
+    }
+
+    bool WasDivergent = isDivergent(*I);
+    if (WasDivergent)
+      continue;
+
+    // propagate divergence caused by terminator
+    if (I->isTerminator()) {
+      if (updateTerminator(*I)) {
+        // propagate control divergence to affected instructions
+        propagateBranchDivergence(*I);
+        continue;
+      }
+    }
+
+    // update divergence of I due to divergent operands
+    bool DivergentUpd = false;
+    unsigned Opcode = I->getOpcode();
+    switch (Opcode) {
+    default:
+      if (isVCndMask(Opcode)) {
+        DivergentUpd = updateVCndMask(*I);
+      } else {
+        DivergentUpd = updateNormalInstruction(*I);
+        llvm::DenseMap<const MachineInstr *, bool> Processed;
+        if ((DivergentUpd || !isBitUniform(*I, Processed)) && isWriteExec(I)) {
+          // propagate exec control divergence to affected instructions.
+          propagateExecControlFlowDivergence(*I);
+        }
+      }
+      break;
+    case AMDGPU::PHI:
+      DivergentUpd = updatePHINode(*I);
+      break;
+    }
+
+    // propagate value divergence to users
+    if (DivergentUpd) {
+      markDivergent(*I);
+      pushUsers(*I);
+    }
+  }
+}
+
+bool DivergenceAnalysis::isAlwaysUniform(const ValueTy V) const {
+  return UniformOverrides.find(V) != UniformOverrides.end();
+}
+
+bool DivergenceAnalysis::isDivergent(const ValueTy V) const {
+  return DivergentValues.find(V) != DivergentValues.end();
+}
+
+bool DivergenceAnalysis::isDivergent(const MachineOperand &MO) const {
+  if (!MO.isReg())
+    return false;
+  Register Reg = MO.getReg();
+  if (Reg.isPhysical()) {
+    const MachineInstr *MI = MO.getParent();
+    if (MI)
+      return isDivergent(*MI);
+
+  } else {
+    return isDivergent(Reg);
+  }
+  return true;
+}
+
+bool DivergenceAnalysis::isDivergent(const MachineInstr &I) const {
+  if (UniformOverridesInsts.find(&I) != UniformOverridesInsts.end())
+    return false;
+  if (DivergentInsts.find(&I) != DivergentInsts.end())
+    return true;
+  for (const MachineOperand &DstMO : I.defs()) {
+    unsigned Reg = DstMO.getReg();
+    if (isDivergent(Reg))
+      return true;
+  }
+  return false;
+}
+
+void DivergenceAnalysis::print(raw_ostream &OS, const Module_ *) const {
+  // Iterate over all instructions in block order so the output is
+  // deterministic.
+ for (auto &MBB : F) + for (auto &I : MBB) { + if (isDivergent(I)) + OS << "DIVERGENT:" << I ; + // AMDGPU changes begin + else + OS << "UNIFORM:" << I ; + // AMDGPU changes end + } +} + +// class GPUDivergenceAnalysis +MirGPUDivergenceAnalysis::MirGPUDivergenceAnalysis(MachineFunction &F, + const MachineDominatorTree &DT, + const MachinePostDominatorTree &PDT, + const MachineLoopInfo &LI) + : SDA(DT, PDT, LI, /*AMDGPU change*/DivergentJoinMap), + DA(F, nullptr, DT, PDT, LI, SDA, false, /*AMDGPU change*/DivergentJoinMap) { + MachineRegisterInfo &MRI = F.getRegInfo(); + const GCNSubtarget *ST = &F.getSubtarget(); + const SIRegisterInfo *SIRI = ST->getRegisterInfo(); + const SIInstrInfo *SIII = ST->getInstrInfo(); + for (auto &MBB : F) + for (auto &I : MBB) { + if (isSourceOfDivergence(&I, MRI, SIRI, SIII)) { + DA.markDivergent(I); + } else if (isAlwaysUniformMI(&I, SIII, SIRI, MRI)) { + DA.addUniformOverride(I); + } + } + for (auto &ArgIt : F.getRegInfo().liveins()) { + unsigned Reg = ArgIt.first; + if (isDivergentInputReg(Reg, MRI, SIRI)) { + DA.markDivergent(Reg); + } + } + + DA.compute(); +} + +bool MirGPUDivergenceAnalysis::isDivergent(const MachineInstr *I) const { + return DA.isDivergent(*I); +} + +void MirGPUDivergenceAnalysis::print(raw_ostream &OS, const Module_ *mod) const { + OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n"; + DA.print(OS, mod); + OS << "}\n"; +} + +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h new file mode 100644 index 0000000000000..edcf96ec44a4d --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h @@ -0,0 +1,281 @@ +//===- AMDGPUMirDivergenceAnalysis.h - Mir Divergence Analysis -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// \file +// The divergence analysis determines which instructions and branches are +// divergent given a set of divergent source instructions. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "AMDGPUMirSyncDependenceAnalysis.h" +#include "llvm/Pass.h" +#include + +namespace llvm { +class raw_ostream; +class TargetTransformInfo; +class MachineRegisterInfo; +class SIInstrInfo; +class SIRegisterInfo; +class MachineOperand; +class MachineBasicBlock; + +using Module_ = void; +class TargetTransformInfo; +using ValueTy = unsigned; +using PHINode_ = MachineInstr; + +/// \brief Generic divergence analysis for reducible CFGs. +/// +/// This analysis propagates divergence in a data-parallel context from sources +/// of divergence to all users. It requires reducible CFGs. All assignments +/// should be in SSA form. +class DivergenceAnalysis { +public: + /// \brief This instance will analyze the whole function \p F or the loop \p + /// RegionLoop. + /// + /// \param RegionLoop if non-null the analysis is restricted to \p RegionLoop. + /// Otherwise the whole function is analyzed. + /// \param IsLCSSAForm whether the analysis may assume that the IR in the + /// region in in LCSSA form. 
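+  /// \param JoinMap (AMDGPU change) filled by SyncDependenceAnalysis; records
+  /// which pairs of blocks join divergent disjoint paths, used when updating
+  /// PHI nodes.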
+ DivergenceAnalysis(const llvm::MachineFunction &F, const MachineLoop *RegionLoop, + const MachineDominatorTree &DT, const MachinePostDominatorTree &PDT, + const MachineLoopInfo &LI, SyncDependenceAnalysis &SDA, + bool IsLCSSAForm, + // AMDGPU change begin. + DivergentJoinMapTy &JoinMap + // AMDGPU change end. + ); + + /// \brief The loop that defines the analyzed region (if any). + const MachineLoop *getRegionLoop() const { return RegionLoop; } + const llvm::MachineFunction &getFunction() const { return F; } + + /// \brief Whether \p BB is part of the region. + bool inRegion(const MachineBasicBlock &BB) const; + /// \brief Whether \p I is part of the region. + bool inRegion(const MachineInstr &I) const; + + /// \brief Mark \p UniVal as a value that is always uniform. + void addUniformOverride(const ValueTy UniVal); + void addUniformOverride(const MachineInstr &I); + + /// \brief Mark \p DivVal as a value that is always divergent. + void markDivergent(const ValueTy DivVal); + void markDivergent(const MachineInstr &I); + + /// \brief Propagate divergence to all instructions in the region. + /// Divergence is seeded by calls to \p markDivergent. + void compute(); + + /// \brief Whether any value was marked or analyzed to be divergent. + bool hasDetectedDivergence() const { return !DivergentValues.empty(); } + + /// \brief Whether \p Val will always return a uniform value regardless of its + /// operands + bool isAlwaysUniform(const ValueTy Val) const; + + /// \brief Whether \p Val is a divergent value + bool isDivergent(const ValueTy Val) const; + bool isDivergent(const MachineInstr &I) const; + + void print(llvm::raw_ostream &OS, const Module_ *) const; + +private: + bool isDivergent(const llvm::MachineOperand &MO) const; + bool updateTerminator(const MachineInstr &Term) const; + bool updatePHINode(const PHINode_ &Phi) const; + bool updateVCndMask(const MachineInstr &VCndMask) const; + bool isBitUniform(const MachineInstr &I, + llvm::DenseMap &Processed) const; + bool isBitUniform(const MachineInstr &I, const llvm::MachineOperand &UseMO, + llvm::DenseMap &Processed) const; + + /// \brief Computes whether \p Inst is divergent based on the + /// divergence of its operands. + /// + /// \returns Whether \p Inst is divergent. + /// + /// This should only be called for non-phi, non-terminator instructions. + bool updateNormalInstruction(const MachineInstr &Inst) const; + + /// \brief Mark users of live-out users as divergent. + /// + /// \param LoopHeader the header of the divergent loop. + /// + /// Marks all users of live-out values of the loop headed by \p LoopHeader + /// as divergent and puts them on the worklist. + void taintLoopLiveOuts(const MachineBasicBlock &LoopHeader); + + /// \brief Push all users of \p Val (in the region) to the worklist + void pushUsers(const ValueTy I); + void pushUsers(const MachineInstr &I); + + void pushInstruction(const MachineInstr &I); + /// \brief Push all phi nodes in @block to the worklist + void pushPHINodes(const MachineBasicBlock &Block); + + /// \brief Mark \p Block as join divergent + /// + /// A block is join divergent if two threads may reach it from different + /// incoming blocks at the same time. + void markBlockJoinDivergent(const MachineBasicBlock &Block) { + DivergentJoinBlocks.insert(&Block); + } + + /// \brief Whether \p Val is divergent when read in \p ObservingBlock. 
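+  /// (AMDGPU change) \p incomingBlock is the predecessor that supplies \p Val
+  /// to the observing phi; it is taken as the definition point for constants.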
+ bool isTemporalDivergent(const MachineBasicBlock &ObservingBlock, + const ValueTy Val, + const MachineBasicBlock &incomingBlock) const; // AMDGPU change + + /// \brief Whether \p Block is join divergent + /// + /// (see markBlockJoinDivergent). + bool isJoinDivergent(const MachineBasicBlock &Block) const { + return DivergentJoinBlocks.find(&Block) != DivergentJoinBlocks.end(); + } + + /// \brief Propagate control-induced divergence to users (phi nodes and + /// instructions). + // + // \param JoinBlock is a divergent loop exit or join point of two disjoint + // paths. + // \returns Whether \p JoinBlock is a divergent loop exit of \p TermLoop. + bool propagateJoinDivergence(const MachineBasicBlock &JoinBlock, + const MachineLoop *TermLoop); + + /// \brief Propagate induced value divergence due to control divergence in \p + /// Term. + void propagateBranchDivergence(const MachineInstr &Term); + + /// \brief Propagate induced value divergence due to exec update caused by \p + /// SaveExec. + void propagateExecControlFlowDivergence(const MachineInstr &SaveExec); + + /// \brief Propagate divergent caused by a divergent loop exit. + /// + /// \param ExitingLoop is a divergent loop. + void propagateLoopDivergence(const MachineLoop &ExitingLoop); + +private: + const llvm::MachineFunction &F; + const llvm::MachineRegisterInfo &MRI; + const llvm::SIRegisterInfo *SIRI; + const llvm::SIInstrInfo *SIII; + // If regionLoop != nullptr, analysis is only performed within \p RegionLoop. + // Otw, analyze the whole function + const MachineLoop *RegionLoop; + + const MachineDominatorTree &DT; + const MachinePostDominatorTree &PDT; + const MachineLoopInfo &LI; + + // Recognized divergent loops + llvm::DenseSet DivergentLoops; + + // AMDGPU change begin + // Save block pair which divergent disjoint. + // A + // | \ + // | \ + // B C + // | / + // D + // When A is divergent branch, B and C are divergent join at D. + // Then DivergentJoinMap[B].count(C) > 0 and + // DivergentJoinMap[C].count(B) > 0. + DivergentJoinMapTy &DivergentJoinMap; + // AMDGPU change end + + // The SDA links divergent branches to divergent control-flow joins. + SyncDependenceAnalysis &SDA; + + // Use simplified code path for LCSSA form. + bool IsLCSSAForm; + + // Set of known-uniform values. + llvm::DenseSet UniformOverrides; + llvm::DenseSet UniformOverridesInsts; + + // Blocks with joining divergent control from different predecessors. + llvm::DenseSet DivergentJoinBlocks; + + // Detected/marked divergent values. + llvm::DenseSet DivergentValues; + llvm::DenseSet DivergentInsts; + + // Mir change for EXEC control flow. + // Map from MBB to the exec region it belongs too. + // A exec region is begin with + // S_MOV_B64 sreg, exec + // end with + // S_MOV_B64 exec, sreg + // Inside the region, exec might be updated to make control flow with exec. + struct ExecRegion { + const llvm::MachineInstr *begin; + const llvm::MachineInstr *end; + std::vector blocks; + bool bPropagated = false; + ExecRegion(const llvm::MachineInstr *b, + const llvm::MachineInstr *e) + : begin(b), end(e), bPropagated(false) {} + }; + llvm::DenseMap ExecRegionMap; + + // Internal worklist for divergence propagation. + std::vector Worklist; +}; + +/// \brief Divergence analysis frontend for GPU kernels. +class MirGPUDivergenceAnalysis { + // AMDGPU change begin + // Save block pair which divergent disjoint. + // A + // | \ + // | \ + // B C + // | / + // D + // When A is divergent branch, B and C are divergent join at D. 
+ // Then DivergentJoinMap[B].count(C) > 0 and + // DivergentJoinMap[C].count(B) > 0. + DivergentJoinMapTy DivergentJoinMap; + // AMDGPU change end + SyncDependenceAnalysis SDA; + DivergenceAnalysis DA; + +public: + /// Runs the divergence analysis on @F, a GPU kernel + MirGPUDivergenceAnalysis(llvm::MachineFunction &F, const MachineDominatorTree &DT, + const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI); + + /// Whether any divergence was detected. + bool hasDivergence() const { return DA.hasDetectedDivergence(); } + + /// The GPU kernel this analysis result is for + const llvm::MachineFunction &getFunction() const { return DA.getFunction(); } + + /// Whether \p I is divergent. + bool isDivergent(const MachineInstr *I) const; + + /// Whether \p I is uniform/non-divergent + bool isUniform(const MachineInstr *I) const { return !isDivergent(I); } + + /// Print all divergent values in the kernel. + void print(llvm::raw_ostream &OS, const Module_ *) const; +}; + +} // namespace llvm + diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp new file mode 100644 index 0000000000000..7213f7b4b11b4 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp @@ -0,0 +1,511 @@ +//===- MirSyncDependenceAnalysis.cpp - Mir Divergent Branch Dependence Calculation +//--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is based on Analysis/MirSyncDependenceAnalysis.cpp, just change +// MachineBasicBlock to MachineBasicBlock. +// This file implements an algorithm that returns for a divergent branch +// the set of basic blocks whose phi nodes become divergent due to divergent +// control. These are the blocks that are reachable by two disjoint paths from +// the branch or loop exits that have a reaching path that is disjoint from a +// path to the loop latch. +// +// The SyncDependenceAnalysis is used in the DivergenceAnalysis to model +// control-induced divergence in phi nodes. +// +// -- Summary -- +// The SyncDependenceAnalysis lazily computes sync dependences [3]. +// The analysis evaluates the disjoint path criterion [2] by a reduction +// to SSA construction. The SSA construction algorithm is implemented as +// a simple data-flow analysis [1]. +// +// [1] "A Simple, Fast Dominance Algorithm", SPI '01, Cooper, Harvey and Kennedy +// [2] "Efficiently Computing Static Single Assignment Form +// and the Control Dependence Graph", TOPLAS '91, +// Cytron, Ferrante, Rosen, Wegman and Zadeck +// [3] "Improving Performance of OpenCL on CPUs", CC '12, Karrenberg and Hack +// [4] "Divergence Analysis", TOPLAS '13, Sampaio, Souza, Collange and Pereira +// +// -- Sync dependence -- +// Sync dependence [4] characterizes the control flow aspect of the +// propagation of branch divergence. For example, +// +// %cond = icmp slt i32 %tid, 10 +// br i1 %cond, label %then, label %else +// then: +// br label %merge +// else: +// br label %merge +// merge: +// %a = phi i32 [ 0, %then ], [ 1, %else ] +// +// Suppose %tid holds the thread ID. 
Although %a is not data dependent on %tid +// because %tid is not on its use-def chains, %a is sync dependent on %tid +// because the branch "br i1 %cond" depends on %tid and affects which value %a +// is assigned to. +// +// -- Reduction to SSA construction -- +// There are two disjoint paths from A to X, if a certain variant of SSA +// construction places a phi node in X under the following set-up scheme [2]. +// +// This variant of SSA construction ignores incoming undef values. +// That is paths from the entry without a definition do not result in +// phi nodes. +// +// entry +// / \ +// A \ +// / \ Y +// B C / +// \ / \ / +// D E +// \ / +// F +// Assume that A contains a divergent branch. We are interested +// in the set of all blocks where each block is reachable from A +// via two disjoint paths. This would be the set {D, F} in this +// case. +// To generally reduce this query to SSA construction we introduce +// a virtual variable x and assign to x different values in each +// successor block of A. +// entry +// / \ +// A \ +// / \ Y +// x = 0 x = 1 / +// \ / \ / +// D E +// \ / +// F +// Our flavor of SSA construction for x will construct the following +// entry +// / \ +// A \ +// / \ Y +// x0 = 0 x1 = 1 / +// \ / \ / +// x2=phi E +// \ / +// x3=phi +// The blocks D and F contain phi nodes and are thus each reachable +// by two disjoins paths from A. +// +// -- Remarks -- +// In case of loop exits we need to check the disjoint path criterion for loops +// [2]. To this end, we check whether the definition of x differs between the +// loop exit and the loop header (_after_ SSA construction). +// +//===----------------------------------------------------------------------===// +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "AMDGPUMirSyncDependenceAnalysis.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineLoopInfo.h" + +#include +#include + +#define DEBUG_TYPE "sync-dependence" + +namespace llvm { + +ConstBlockSet SyncDependenceAnalysis::EmptyBlockSet; + +SyncDependenceAnalysis::SyncDependenceAnalysis(const MachineDominatorTree &DT, + const MachinePostDominatorTree &PDT, + const MachineLoopInfo &LI, + // AMDGPU change begin. + DivergentJoinMapTy &JoinMap + // AMDGPU change end. + ) + : FuncRPOT(DT.getRoot()->getParent()), DT(DT), PDT(PDT), LI(LI), + // AMDGPU change begin. + DivergentJoinMap(JoinMap) + // AMDGPU change end. +{ +} + +SyncDependenceAnalysis::~SyncDependenceAnalysis() {} + +using FunctionRPOT = ReversePostOrderTraversal; + +// divergence propagator for reducible CFGs +struct DivergencePropagator { + const FunctionRPOT &FuncRPOT; + const MachineDominatorTree &DT; + const MachinePostDominatorTree &PDT; + const MachineLoopInfo &LI; + + // identified join points + std::unique_ptr JoinBlocks; + + // reached loop exits (by a path disjoint to a path to the loop header) + SmallPtrSet ReachedLoopExits; + + // if DefMap[B] == C then C is the dominating definition at block B + // if DefMap[B] ~ undef then we haven't seen B yet + // if DefMap[B] == B then B is a join point of disjoint paths from X or B is + // an immediate successor of X (initial value). 
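+  // In the example from the file header, propagation ends with
+  // DefMap[D] == D and DefMap[F] == F (the joins), while B, C and E keep the
+  // branch-successor definition that first reached them.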
+ using DefiningBlockMap = std::map; + DefiningBlockMap DefMap; + + // all blocks with pending visits + std::unordered_set PendingUpdates; + + DivergencePropagator(const FunctionRPOT &FuncRPOT, const MachineDominatorTree &DT, + const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI) + : FuncRPOT(FuncRPOT), DT(DT), PDT(PDT), LI(LI), + JoinBlocks(new ConstBlockSet) {} + + // set the definition at @block and mark @block as pending for a visit + void addPending(const MachineBasicBlock &Block, const MachineBasicBlock &DefBlock) { + bool WasAdded = DefMap.emplace(&Block, &DefBlock).second; + if (WasAdded) + PendingUpdates.insert(&Block); + } + + void printDefs(raw_ostream &Out) { + Out << "Propagator::DefMap {\n"; + for (const auto *Block : FuncRPOT) { + auto It = DefMap.find(Block); + Out << Block->getName() << " : "; + if (It == DefMap.end()) { + Out << "\n"; + } else { + const auto *DefBlock = It->second; + Out << (DefBlock ? DefBlock->getName() : "") << "\n"; + } + } + Out << "}\n"; + } + + // process @succBlock with reaching definition @defBlock + // the original divergent branch was in @parentLoop (if any) + void visitSuccessor(const MachineBasicBlock &SuccBlock, const MachineLoop *ParentLoop, + const MachineBasicBlock &DefBlock) { + + // @succBlock is a loop exit + if (ParentLoop && !ParentLoop->contains(&SuccBlock)) { + DefMap.emplace(&SuccBlock, &DefBlock); + ReachedLoopExits.insert(&SuccBlock); + return; + } + + // first reaching def? + auto ItLastDef = DefMap.find(&SuccBlock); + if (ItLastDef == DefMap.end()) { + addPending(SuccBlock, DefBlock); + return; + } + + // a join of at least two definitions + if (ItLastDef->second != &DefBlock) { + // do we know this join already? + if (!JoinBlocks->insert(&SuccBlock).second) + return; + + // update the definition + addPending(SuccBlock, SuccBlock); + } + } + + // find all blocks reachable by two disjoint paths from @rootTerm. + // This method works for both divergent terminators and loops with + // divergent exits. + // @rootBlock is either the block containing the branch or the header of the + // divergent loop. + // @nodeSuccessors is the set of successors of the node (MachineLoop or Terminator) + // headed by @rootBlock. + // @parentLoop is the parent loop of the MachineLoop or the loop that contains the + // Terminator. + template + std::unique_ptr + computeJoinPoints(const MachineBasicBlock &RootBlock, + SuccessorIterable NodeSuccessors, const MachineLoop *ParentLoop, const MachineBasicBlock * PdBoundBlock) { + assert(JoinBlocks); + + // bootstrap with branch targets + for (const auto *SuccBlock : NodeSuccessors) { + DefMap.emplace(SuccBlock, SuccBlock); + + if (ParentLoop && !ParentLoop->contains(SuccBlock)) { + // immediate loop exit from node. 
+ ReachedLoopExits.insert(SuccBlock); + continue; + } else { + // regular successor + PendingUpdates.insert(SuccBlock); + } + } + + auto ItBeginRPO = FuncRPOT.begin(); + + // skip until term (TODO RPOT won't let us start at @term directly) + for (; *ItBeginRPO != &RootBlock; ++ItBeginRPO) {} + + auto ItEndRPO = FuncRPOT.end(); + assert(ItBeginRPO != ItEndRPO); + + // propagate definitions at the immediate successors of the node in RPO + auto ItBlockRPO = ItBeginRPO; + while (++ItBlockRPO != ItEndRPO && *ItBlockRPO != PdBoundBlock) { + const auto *Block = *ItBlockRPO; + + // skip @block if not pending update + auto ItPending = PendingUpdates.find(Block); + if (ItPending == PendingUpdates.end()) + continue; + PendingUpdates.erase(ItPending); + + // propagate definition at @block to its successors + auto ItDef = DefMap.find(Block); + const auto *DefBlock = ItDef->second; + assert(DefBlock); + + auto *BlockLoop = LI.getLoopFor(Block); + if (ParentLoop && + (ParentLoop != BlockLoop && ParentLoop->contains(BlockLoop))) { + // if the successor is the header of a nested loop pretend its a + // single node with the loop's exits as successors + SmallVector BlockLoopExits; + BlockLoop->getExitBlocks(BlockLoopExits); + for (const auto *BlockLoopExit : BlockLoopExits) { + visitSuccessor(*BlockLoopExit, ParentLoop, *DefBlock); + } + + } else { + // the successors are either on the same loop level or loop exits + for (const auto *SuccBlock : Block->successors()) { + visitSuccessor(*SuccBlock, ParentLoop, *DefBlock); + } + } + } + + // We need to know the definition at the parent loop header to decide + // whether the definition at the header is different from the definition at + // the loop exits, which would indicate a divergent loop exits. + // + // A // loop header + // | + // B // nested loop header + // | + // C -> X (exit from B loop) -..-> (A latch) + // | + // D -> back to B (B latch) + // | + // proper exit from both loops + // + // D post-dominates B as it is the only proper exit from the "A loop". + // If C has a divergent branch, propagation will therefore stop at D. + // That implies that B will never receive a definition. + // But that definition can only be the same as at D (D itself in thise case) + // because all paths to anywhere have to pass through D. + // + const MachineBasicBlock *ParentLoopHeader = + ParentLoop ? ParentLoop->getHeader() : nullptr; + if (ParentLoop && ParentLoop->contains(PdBoundBlock)) { + DefMap[ParentLoopHeader] = DefMap[PdBoundBlock]; + } + + // analyze reached loop exits + if (!ReachedLoopExits.empty()) { + assert(ParentLoop); + const auto *HeaderDefBlock = DefMap[ParentLoopHeader]; + LLVM_DEBUG(printDefs(dbgs())); + + // AMDGPU CHANGE: Allow null HeaderDefBlock + // Because of the way they walk the blocks (a reverse post order traversal + // stopping at the immediate post dominator) it is possible that + // they will reach a loop exit, but not the loop header. + // + // We conservatively mark the exit blocks as divergent join points + // in this case. + // + // Problem CFG is below: + // + // +--> A + // | / \ + // | B C + // | | / | + // +--L P + // + // In this cfg, C is the RootBlock and P is C's post-dominator. + // It will only visit L and P and then stop because it hits the + // post dominator. Most loops do not hit this case because the + // loop exiting block (C) will branch directly back to the loop + // header. 
+ // + if (HeaderDefBlock) + { + for (const auto *ExitBlock : ReachedLoopExits) { + auto ItExitDef = DefMap.find(ExitBlock); + assert((ItExitDef != DefMap.end()) && + "no reaching def at reachable loop exit"); + if (ItExitDef->second != HeaderDefBlock) { + JoinBlocks->insert(ExitBlock); + } + } + } + else + { + for (const auto *ExitBlock : ReachedLoopExits) + { + JoinBlocks->insert(ExitBlock); + } + } + } + + return std::move(JoinBlocks); + } +}; + +// AMDGPU change begin. +// For all join blocks caused by divergent RootBlock, the prevs of a join block +// which are in DefMap or the RootBlock are divergent join each other on the join block because +// of divergent RootBlock. +static void updateJoinMap( + const MachineBasicBlock *RootBlock, + DenseMap> &JoinMap, + DivergencePropagator::DefiningBlockMap &DefMap, ConstBlockSet &JoinBlocks) { + for (const MachineBasicBlock *JoinBB : JoinBlocks) { + // makr divergent join for all pred pair which in DefMap. + for (auto predIt = JoinBB->pred_begin(); predIt != JoinBB->pred_end(); + predIt++) { + auto predIt2 = predIt; + const MachineBasicBlock *pred = *predIt; + if (DefMap.count(pred) == 0 && pred != RootBlock) + continue; + + for (predIt2++; predIt2 != JoinBB->pred_end(); predIt2++) { + const MachineBasicBlock *pred2 = *predIt2; + if (DefMap.count(pred2) == 0 && pred2 != RootBlock) + continue; + + JoinMap[pred].insert(pred2); + JoinMap[pred2].insert(pred); + LLVM_DEBUG(dbgs() << "joint_bb0: " << pred->getName() + << " joint_bb1: " << pred2->getName() << "\n";); + } + } + } +} +// AMDGPU change end. + +const ConstBlockSet &SyncDependenceAnalysis::join_blocks(const MachineLoop &MachineLoop) { + using LoopExitVec = SmallVector; + LoopExitVec LoopExits; + MachineLoop.getExitBlocks(LoopExits); + if (LoopExits.size() < 1) { + return EmptyBlockSet; + } + + // already available in cache? + auto ItCached = CachedLoopExitJoins.find(&MachineLoop); + if (ItCached != CachedLoopExitJoins.end()) { + return *ItCached->second; + } + + // dont propagte beyond the immediate post dom of the loop + const auto *PdNode = PDT.getNode(const_cast(MachineLoop.getHeader())); + const auto *IpdNode = PdNode->getIDom(); + const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; + while (PdBoundBlock && MachineLoop.contains(PdBoundBlock)) { + IpdNode = IpdNode->getIDom(); + PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; + } + + // compute all join points + DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI}; + auto JoinBlocks = Propagator.computeJoinPoints( + *MachineLoop.getHeader(), LoopExits, MachineLoop.getParentLoop(), PdBoundBlock); + + // AMDGPU change begin. + // Save divergent join pairs. + updateJoinMap(MachineLoop.getHeader(), DivergentJoinMap, Propagator.DefMap, + *JoinBlocks.get()); + // AMDGPU change end. + + auto ItInserted = CachedLoopExitJoins.emplace(&MachineLoop, std::move(JoinBlocks)); + assert(ItInserted.second); + return *ItInserted.first->second; +} + +const ConstBlockSet & +SyncDependenceAnalysis::join_blocks(const MachineInstr &Term) { + // trivial case + if (Term.getParent()->succ_size() < 1) { + return EmptyBlockSet; + } + + // already available in cache? + auto ItCached = CachedBranchJoins.find(&Term); + if (ItCached != CachedBranchJoins.end()) + return *ItCached->second; + + // dont propagate beyond the immediate post dominator of the branch + const auto *PdNode = PDT.getNode(const_cast(Term.getParent())); + const auto *IpdNode = PdNode->getIDom(); + const auto *PdBoundBlock = IpdNode ? 
IpdNode->getBlock() : nullptr; + + + // compute all join points + DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI}; + const auto &TermBlock = *Term.getParent(); + + // AMDGPU CHANGE + // Make sure the post-dominator is outside the loop for the loop header. + // Otherwise, we may not find all the join blocks in the loop + // because the search stops too early. Some join points can be reached + // after the post-dominator! + // + // Problem CFG is below: + // + // +--> A + // | / \ + // | B P + // | | / | + // +--L X + // + // In this cfg, A is the loop header and P is A's post-dominator. + // The algorithm to mark join points does an Reverse Post Order walk + // from A and stops when it reaches the post dominator. It would not + // mark the phi node in L as divergent even when A had a divergent branch. + // The fix we made was to make the join point search continue all the way + // to the loops post dominator (which is X in this example). + // + // NOTE: They already made this change for the loop case above, but for + // a different bug apparently. See SyncDependenceAnalysis::join_blocks(MachineLoop&) + // + const MachineLoop *MachineLoop = LI.getLoopFor(&TermBlock); + if (MachineLoop && (MachineLoop->getHeader() == &TermBlock)) + { + while (PdBoundBlock && MachineLoop->contains(PdBoundBlock)) { + IpdNode = IpdNode->getIDom(); + PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; + } + } + + auto JoinBlocks = Propagator.computeJoinPoints( + TermBlock, Term.getParent()->successors(), MachineLoop, PdBoundBlock); + + // AMDGPU change begin. + // Save divergent join pairs. + updateJoinMap(&TermBlock, DivergentJoinMap, Propagator.DefMap, + *JoinBlocks.get()); + // AMDGPU change end. + + auto ItInserted = CachedBranchJoins.emplace(&Term, std::move(JoinBlocks)); + assert(ItInserted.second); + return *ItInserted.first->second; +} + +} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h new file mode 100644 index 0000000000000..a52bcc7bc9e7c --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h @@ -0,0 +1,98 @@ +//===- MirSyncDependenceAnalysis.h - MirDivergent Branch Dependence -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// \file +// This file defines the SyncDependenceAnalysis class, which computes for +// every divergent branch the set of phi nodes that the branch will make +// divergent. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include +#include + +namespace llvm { +class MachineBasicBlock; +class MachineDominatorTree; +class MachineLoop; +class MachinePostDominatorTree; +class MachineLoopInfo; +class MachineFunction; +class MachineInstr; + +using DivergentJoinMapTy = + llvm::DenseMap>; + +using ConstBlockSet = llvm::SmallPtrSet; + +/// \brief Relates points of divergent control to join points in +/// reducible CFGs. +/// +/// This analysis relates points of divergent control to points of converging +/// divergent control. 
The analysis requires all loops to be reducible. +class SyncDependenceAnalysis { + void visitSuccessor(const MachineBasicBlock &succBlock, const MachineLoop *termLoop, + const MachineBasicBlock *defBlock); + +public: + bool inRegion(const MachineBasicBlock &BB) const; + + ~SyncDependenceAnalysis(); + SyncDependenceAnalysis(const MachineDominatorTree &DT, const MachinePostDominatorTree &PDT, + const MachineLoopInfo &LI, + // AMDGPU change begin + DivergentJoinMapTy &JoinMap + // AMDGPU change end + ); + + /// \brief Computes divergent join points and loop exits caused by branch + /// divergence in \p Term. + /// + /// The set of blocks which are reachable by disjoint paths from \p Term. + /// The set also contains loop exits if there two disjoint paths: + /// one from \p Term to the loop exit and another from \p Term to the loop + /// header. Those exit blocks are added to the returned set. + /// If L is the parent loop of \p Term and an exit of L is in the returned + /// set then L is a divergent loop. + const ConstBlockSet &join_blocks(const MachineInstr &Term); + + /// \brief Computes divergent join points and loop exits (in the surrounding + /// loop) caused by the divergent loop exits of\p MachineLoop. + /// + /// The set of blocks which are reachable by disjoint paths from the + /// loop exits of \p MachineLoop. + /// This treats the loop as a single node in \p MachineLoop's parent loop. + /// The returned set has the same properties as for join_blocks(TermInst&). + const ConstBlockSet &join_blocks(const MachineLoop &MachineLoop); + +private: + static ConstBlockSet EmptyBlockSet; + + llvm::ReversePostOrderTraversal FuncRPOT; + const MachineDominatorTree &DT; + const MachinePostDominatorTree &PDT; + const MachineLoopInfo &LI; + // AMDGPU change begin. + DivergentJoinMapTy &DivergentJoinMap; + // AMDGPU change end. + std::map> CachedLoopExitJoins; + std::map> + CachedBranchJoins; +}; + +} // namespace llvm + + diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp new file mode 100644 index 0000000000000..648df7f724617 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp @@ -0,0 +1,188 @@ +//===-- AMDGPUOccupancyAndLatencyHelper - Helper functions for occupancy and latency --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===--------------------------------------------------------------------------------===// +// +/// \file +/// \brief Helper functions for occupancy and latency. +// +//===--------------------------------------------------------------------------------===// + +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "AMDGPUSubtarget.h" +#include "GCNSubtarget.h" +#include "AMDGPUOccupancyAndLatencyHelper.h" + +#include "llvm/CodeGen/MachineLoopInfo.h" + +namespace llvm { + +// Other info which can help compare schedule result. +float SchedScore::computeScore() const { + // Occupancy 1 cannot mix alu. + unsigned MixHidenAlu = Alu - MixAlu; + if (Occupancy == 1) + MixHidenAlu = 0; + return ((float)MemLatency - (float)MixHidenAlu) / (float)Occupancy - + LatencyHide; +} +float SchedScore::computeScore2() const { + float cycles = 0; + cycles = (MixAlu * Occupancy + MemLatency); + cycles /= Occupancy; + return cycles; +} + +void SchedScore::sum(const SchedScore &s, unsigned loopDepth) { + unsigned loopCount = loopDepth > 0 ? 
std::pow(3, loopDepth) : 1; + LatencyHide += loopCount * s.LatencyHide; + MemLatency += loopCount * s.MemLatency; + MixAlu += loopCount * s.MixAlu; + Alu += loopCount * s.Alu; + Lds += loopCount * s.Lds; + SgprSpill |= s.SgprSpill; +} +bool SchedScore::isBetter(const SchedScore &s) const { + float score = computeScore(); + float newScore = s.computeScore(); + bool spillBetter = !SgprSpill && s.SgprSpill; + return spillBetter ? true : newScore >= score; +} +// Does more occupancy give more perf. +bool SchedScore::isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc) const { + unsigned gain = latencyGain(TargetOccupancy, ExtraOcc); + // 10% is good enough. + if ((10*gain) >= Alu) + return true; + else + return false; +} + +unsigned SchedScore::latencyGain(unsigned TgtOcc, unsigned ExtraOcc) const { + unsigned latency = MemLatency; + return (latency / (TgtOcc))- (latency / (TgtOcc + ExtraOcc)); +} + +// AMDGPULatencyTracker +AMDGPULatencyTracker::AMDGPULatencyTracker(const GCNSubtarget &ST) + : SIII(ST.getInstrInfo()), ItinerayData(ST.getInstrItineraryData()) {} + +void AMDGPULatencyTracker::scan(const MachineInstr &MI) { + if (MI.isDebugInstr()) return; + int latency = SIII->getInstrLatency(ItinerayData, MI); + // If inside latency hide. + if (!LatencyMIs.empty()) { + bool bWaitCnt = false; + for (auto &MO : MI.operands()) { + if (MO.isReg()) { + unsigned reg = MO.getReg(); + auto it = LatencyMIs.find(reg); + if (it != LatencyMIs.end()) { + bWaitCnt = true; + // If MI use mem result, update latency to mem latency. + int cycle = it->second; + if (cycle > latency) + latency = cycle; + } + } + } + // Update latency for each mem latency inst. + for (auto it = LatencyMIs.begin(); it != LatencyMIs.end();) { + auto prev = it; + auto l = (it++); + int cycle = l->second; + if (cycle <= latency) { + // Only left cycles. + // Remove the reg. + LatencyMIs.erase(prev); + if (bWaitCnt && cycle == latency) { + score.MemLatency += cycle; + // Only count memLatency once, the rest is hide. + bWaitCnt = false; + } else { + // Hide cycle or count mem latency? + score.LatencyHide += cycle; + } + } else { + l->second -= latency; + // Hide latency. + score.LatencyHide += latency; + } + } + + } else { + // TODO: check branch/lds? + // TODO: check prevVAlu? + auto getAluStatus = [](const MachineInstr &MI, + const llvm::SIInstrInfo *SIII) { + AluStatus status = AluStatus::Nothing; + if (SIII->isVALU(MI.getOpcode())) { + status = AluStatus::Vector; + } else if (SIII->isSALU(MI.getOpcode())) { + status = AluStatus::Scalar; + } + return status; + }; + AluStatus status = getAluStatus(MI, SIII); + + switch (prevStatus) { + case AluStatus::Nothing: { + score.Alu += latency; + score.MixAlu += latency; + prevStatus = status; + } break; + case AluStatus::Vector: + case AluStatus::Scalar: { + score.Alu += latency; + // Ignore mix alu. + if (prevStatus != status) { + prevStatus = AluStatus::Nothing; + } else { + score.MixAlu += latency; + } + } break; + } + } + // Update latency inst. + if (SIII->isHighLatencyInstruction(MI) && MI.mayLoad()) { + unsigned reg = MI.getOperand(0).getReg(); + // TODO: get correct latency. + // SIII->getInstrLatency(ItinerayData, MI); + constexpr unsigned kHighLetency = 180; + LatencyMIs[reg] = kHighLetency; + } else if (SIII->isLowLatencyInstruction(MI) && MI.mayLoad()) { + unsigned reg = MI.getOperand(0).getReg(); + // TODO: get correct latency. 
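+    // kLowLetency below is a coarse placeholder, matching kHighLetency above;
+    // ideally this would be queried from the scheduling model per instruction.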
+ // SIII->getInstrLatency(ItinerayData, MI); + constexpr unsigned kLowLetency = 35; + LatencyMIs[reg] = kLowLetency; + } +} + +SchedScore CollectLatency(MachineFunction &MF, const llvm::GCNSubtarget &ST, + const llvm::MachineLoopInfo *MLI) { + SchedScore totalScore; + for (auto &MFI : MF) { + MachineBasicBlock &MBB = MFI; + MachineBasicBlock::iterator Next; + AMDGPULatencyTracker latencyTracker(ST); + for (auto &MI : MBB) { + latencyTracker.scan(MI); + } + unsigned loopDepth = 0; + if (MLI) { + loopDepth = MLI->getLoopDepth(&MBB); + } + totalScore.sum(latencyTracker.score, loopDepth); + } + return totalScore; +} + +} // namespace llvm + + diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h new file mode 100644 index 0000000000000..f108bab24bd39 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h @@ -0,0 +1,74 @@ +//===-- AMDGPUOccupancyAndLatencyHelper - Helper functions for occupancy and latency --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===--------------------------------------------------------------------------------===// +// +/// \file +/// \brief Helper functions for occupancy and latency. +// +//===--------------------------------------------------------------------------------===// + +#include "llvm/ADT/DenseMap.h" + +namespace llvm { + +class MachineFunction; +class GCNSubtarget; +class MachineInstr; +class SIInstrInfo; +class MachineLoopInfo; + +struct SchedScore { + // Score for this Sched result. + unsigned Occupancy = 0; + bool SgprSpill = false; + unsigned LatencyHide = 0; // Only latency hide will split 2 load into 2 pass? + unsigned MemLatency = 0; // Only save mem latency. + // We want mem latency small and hide big. Compare + // memLatency - hide * Occ, smaller is better. + unsigned MixAlu = 0; // VAlu and SAlu can running parallel if Occ > 1. + unsigned Alu = 0; // avoid sequence of s_alu inst count less then occupancy. + unsigned Lds = 0; // Todo: count lds. + SchedScore() {} + + // Other info which can help compare schedule result. + float computeScore() const; + float computeScore2() const; + + void sum(const SchedScore &s, unsigned loopDepth=0); + bool isBetter(const SchedScore &s) const; + bool isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc=1) const; + // More latency can be hiden with ExtraOcc. + unsigned latencyGain(unsigned TargetOccupancy, unsigned ExtraOcc) const; +}; + +struct AMDGPULatencyTracker { + AMDGPULatencyTracker(const llvm::GCNSubtarget &ST); + const llvm::SIInstrInfo *SIII; + const llvm::InstrItineraryData *ItinerayData; + // Latency MI dst reg to cycle map. + llvm::DenseMap LatencyMIs; + SchedScore score; + // Low latency MI not wait. + unsigned hideLatency = 0; + unsigned memLatency = 0; + // For simple, only consider mixture as one valu one salu. + // Not group now. 
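+
+  // Typical use (mirrors CollectLatency below): feed every instruction of a
+  // block to scan() in program order, then read the accumulated score:
+  //   AMDGPULatencyTracker Tracker(ST);
+  //   for (MachineInstr &MI : MBB)
+  //     Tracker.scan(MI);
+  //   SchedScore Score = Tracker.score;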
+ unsigned prevSAlu = 0; + unsigned prevVAlu = 0; + enum class AluStatus { + Nothing, + Vector, + Scalar, + } prevStatus = AluStatus::Nothing; + void scan(const llvm::MachineInstr &MI); +}; + +SchedScore CollectLatency(llvm::MachineFunction &MF, + const llvm::GCNSubtarget &ST, + const llvm::MachineLoopInfo *MLI = nullptr); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp new file mode 100644 index 0000000000000..a0f2a5d4dc121 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp @@ -0,0 +1,1790 @@ +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" + +//#include "dxc/DXIL/DxilMetadataHelper.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/raw_ostream.h" + +#include "llvm/ADT/IntEqClasses.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Support/GraphWriter.h" + +#include "llvm/Support/Debug.h" + +#include "GCNRegPressure.h" +#include "AMDGPUMIRUtils.h" +#include "AMDGPUSubExpDag.h" +#include + +#define DEBUG_TYPE "xb-sub-exp-dag" +using namespace llvm; + +namespace llvm { + +// Expression Dag. + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void SubExp::dump(const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) const { + dbgs() << "\nSubExp:\n"; + dbgs() << "input regs:\n"; + for (auto &input : inputLive) { + pressure::print_reg(input.first, MRI, SIRI, llvm::dbgs()); + dbgs() << "\n"; + } + dbgs() << "output regs:\n"; + for (auto &output : outputLive) { + pressure::print_reg(output.first, MRI, SIRI, llvm::dbgs()); + dbgs() << "\n"; + } + + for (MachineInstr *MI : SUnits) { + MI->dump(); + } + dbgs() << "End of SubExp\n"; +} +#endif + +bool SubExp::modifiesRegister(unsigned Reg, const SIRegisterInfo* SIRI) const +{ + for (const MachineInstr *MI : SUnits) + { + if (MI->modifiesRegister(Reg, SIRI)) + { + return true; + } + } + + return false; +} + +void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI) { + sMaxSize = std::max(sInputSize, sOutputSize); + vMaxSize = std::max(vInputSize, vOutputSize); + + DenseMap LiveRegs; + GCNRegPressure CurPressure; + + // Add output to pressure. + for (MachineInstr *MI : BottomRoots) { + for (MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + if (!MO.isDef()) + continue; + Register Reg = MO.getReg(); + if (!Reg.isVirtual()) + continue; + LaneBitmask mask = getRegMask(MO, MRI); + auto it = LiveRegs.find(Reg); + if (it != LiveRegs.end()) { + LiveRegs[Reg] = mask | it->second; + } else { + LiveRegs[Reg] = mask; + } + } + } + + for (auto it : LiveRegs) { + LaneBitmask emptyMask; + CurPressure.inc(it.first, emptyMask, it.second, MRI); + } + + for (auto it = SUnits.rbegin(); it != SUnits.rend(); it++) { + MachineInstr *MI = *it; + auto *ST = &MI->getMF()->getSubtarget(); // TODO: Better way to get this. 
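+    // Walk the operands and update the upward live set: defs clear their
+    // lanes, uses add them, and the pressure maximum is re-sampled after
+    // this operand loop.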
+ for (MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + Register Reg = MO.getReg(); + if (!Reg.isVirtual()) { + if (Reg == AMDGPU::SCC) + bTouchSCC = true; + continue; + } + + LaneBitmask LiveMask = getRegMask(MO, MRI); + LaneBitmask PrevMask; + auto liveIt = LiveRegs.find(Reg); + if (liveIt != LiveRegs.end()) { + PrevMask = liveIt->second; + } + + if (MO.isDef()) { + LiveMask = PrevMask & (~(LiveMask)); + } else { + LiveMask = PrevMask | LiveMask; + } + + CurPressure.inc(Reg, PrevMask, LiveMask, MRI); + LiveRegs[Reg] = LiveMask; + } + + unsigned sSize = CurPressure.getSGPRNum(); + unsigned vSize = CurPressure.getVGPRNum(ST->hasGFX90AInsts()); + if (sSize > sMaxSize) + sMaxSize = sSize; + if (vSize > vMaxSize) + vMaxSize = vSize; + } +} + +bool SubExp::isSafeToMove(const MachineRegisterInfo &MRI, bool bMoveUp) const { + if (bMultiDefOutput) + return false; + if (bHasTerminatorInst) + return false; + if (bUseIncomingReg) + return false; + + // Input should be single def. + for (unsigned Reg : TopRegs) { + if (!MRI.hasOneDef(Reg) && !llvm::IsSub0Sub1SingleDef(Reg, MRI)) + return false; + } + return true; +} + +ExpDag::ExpDag(const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, const bool bJoinInput) + : MRI(MRI), SIRI(SIRI), SIII(SIII), bJoinInputToSubExp(bJoinInput) {} + +template +void ExpDag::initNodes(const LiveSet &InputLiveReg, T &insts) { + unsigned NodeSize = InputLiveReg.size() + insts.size(); + SUnits.reserve(NodeSize); + + for (MachineInstr *MI : insts) { + if (MI->isDebugInstr()) + continue; + SUnits.emplace_back(MI, SUnits.size()); + SUnit *SU = &SUnits.back(); + SUnitMIMap[SU] = MI; + MISUnitMap[MI] = SU; + } + + for (auto it : InputLiveReg) { + unsigned Reg = it.first; + SUnits.emplace_back(); + SUnit *SU = &SUnits.back(); + SU->NodeNum = SUnits.size() - 1; + SUnitInputMap[SU] = Reg; + InputSUnitMap[Reg] = SU; + } +} + +template void ExpDag::initNodes>( + const LiveSet &InputLiveReg, DenseSet &instRange); + +template void ExpDag::initNodes>( + const LiveSet &InputLiveReg, std::vector &instRange); + +template +void ExpDag::build(const LiveSet &InputLiveReg, const LiveSet &OutputLiveReg, + T &insts) { + initNodes(InputLiveReg, insts); + addDataDep(SIRI); + addCtrlDep(); + buildSubExp(InputLiveReg, OutputLiveReg, SIRI, SIII); +} + +template void +ExpDag::build>(const LiveSet &InputLiveReg, + const LiveSet &OutputLiveReg, + DenseSet &instRange); +template void ExpDag::build>(const LiveSet &InputLiveReg, + const LiveSet &OutputLiveReg, + std::vector &instRange); + +void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { + IntEqClasses SubtreeClasses(SUnits.size()); + std::vector passThruInputs; + for (SUnit &SU : SUnits) { + if (SU.NumPredsLeft == 0 && SU.NumSuccsLeft == 0) { + passThruInputs.emplace_back(SU.NodeNum); + continue; + } + if (!bJoinInputToSubExp && !SU.isInstr()) + continue; + // Join prev. + for (SDep &PreDep : SU.Preds) { + SUnit *PreSU = PreDep.getSUnit(); + if (!bJoinInputToSubExp && !PreSU->isInstr()) + continue; + SubtreeClasses.join(SU.NodeNum, PreSU->NodeNum); + } + // Join succ. + for (SDep &SucDep : SU.Succs) { + SUnit *SucSU = SucDep.getSUnit(); + SubtreeClasses.join(SU.NodeNum, SucSU->NodeNum); + } + } + SubtreeClasses.compress(); + + unsigned NumSubExps = SubtreeClasses.getNumClasses(); + // Not count passThruInputs for subExps since they're exp with only 1 SU. 
+ // SubExpIndexMap is used to pack SubIdx within updated NumSubExps. + NumSubExps -= passThruInputs.size(); + SubExps.resize(NumSubExps); + DenseMap SubExpIndexMap; + + // Add SU to sub exp. + for (SUnit &SU : SUnits) { + if (SU.NumPredsLeft == 0 && SU.NumSuccsLeft == 0) { + continue; + } + unsigned SubIdx = SubtreeClasses[SU.NodeNum]; + unsigned OriginSubIdx = SubIdx; + // Pack subidx. + if (SubExpIndexMap.count(SubIdx) == 0) { + unsigned count = SubExpIndexMap.size(); + SubExpIndexMap.insert(std::make_pair(SubIdx, count)); + } + SubIdx = SubExpIndexMap[SubIdx]; + // Use NodeQueueId as SubIdx. We don't do schedule on ExpDag. + SU.NodeQueueId = SubIdx; + + SubExp &Exp = SubExps[SubIdx]; + auto it = SUnitInputMap.find(&SU); + if (it != SUnitInputMap.end()) { + // Input. + unsigned Reg = it->second; + Exp.TopRegs.insert(Reg); + } else { + MachineInstr *MI = SU.getInstr(); + MachineBasicBlock *MBB = MI->getParent(); + Exp.FromBB = MBB; + for (MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + if (!MO.isUse()) + continue; + unsigned Reg = MO.getReg(); + if (MRI.getLiveInPhysReg(Reg) || MRI.getLiveInVirtReg(Reg)) { + Exp.bUseIncomingReg = true; + } + } + + Exp.SUnits.emplace_back(MI); + if (SU.NumSuccsLeft == 0) { + Exp.BottomRoots.insert(MI); + if (MI->isTerminator()) + Exp.bHasTerminatorInst = true; + } + if (MI->isNotDuplicable()) + Exp.bNotSafeToCopy = true; + // Skip Scalar mem access since no scalar store. + if (MI->mayLoadOrStore() && !SIII->isSMRD(*MI)) { + Exp.bHasMemInst = true; + } + // Add bottom regs. + for (MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + if (!MO.isDef()) + continue; + Register Reg = MO.getReg(); + // physical reg is not in live reg. + if (!Reg.isVirtual()) + continue; + if (SU.NumSuccsLeft) { + // For SU which has used in current blk. + // Check if used in other blks or subExps. + bool bUsedInOtherBlk = false; + for (auto &UserMI : MRI.use_nodbg_instructions(Reg)) { + if (UserMI.getParent() != MBB) { + bUsedInOtherBlk = true; + break; + } + auto suIt = MISUnitMap.find(&UserMI); + // When UserMI is not in dag, treat it as other block. + if (suIt == MISUnitMap.end()) { + bUsedInOtherBlk = true; + break; + } + SUnit *UseSU = suIt->second; + // UserMI should always be in same subExp. + unsigned UseSubIdx = SubtreeClasses[UseSU->NodeNum]; + if (UseSubIdx != OriginSubIdx) { + // When reg has multiple def, it is possible for user def in different subExp. + if (MRI.getUniqueVRegDef(Reg)) + llvm::report_fatal_error("user and def in different subExp"); + break; + } + } + if (!bUsedInOtherBlk) + continue; + } + Exp.BottomRegs.insert(Reg); + if (!MRI.getUniqueVRegDef(Reg)) { + Exp.bMultiDefOutput = true; + } + } + } + } + // Calc reg for SubExp. + // Get block live in and live out. + // Only reg will miss live mask. + for (SubExp &Exp : SubExps) { + for (unsigned Reg : Exp.TopRegs) { + auto it = StartLiveReg.find(Reg); + assert(it != StartLiveReg.end() && + "cannot find input reg in block start live"); + Exp.inputLive[Reg] |= it->second; + } + + for (unsigned Reg : Exp.BottomRegs) { + auto it = EndLiveReg.find(Reg); + if (it == EndLiveReg.end()) { + //"cannot find output reg in block end live"); + // Bottom reg is killed inside current block, did not get out of the + // block. + // Or the bottom reg is not treat as output in this dag, not save to + // outputLive which will affect profit count. 
+ continue; + } + Exp.outputLive[Reg] |= it->second; + } + + CollectLiveSetPressure(Exp.inputLive, MRI, SIRI, Exp.vInputSize, + Exp.sInputSize); + CollectLiveSetPressure(Exp.outputLive, MRI, SIRI, Exp.vOutputSize, + Exp.sOutputSize); + } +} + +void ExpDag::addDataDep(const SIRegisterInfo *SIRI) { + DenseMap curDefMI; + + for (SUnit &SU : SUnits) { + if (!SU.isInstr()) + continue; + MachineInstr *MI = SU.getInstr(); + + // Link use to the def. + for (MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + + Register Reg = MO.getReg(); + SUnit *DefSU = nullptr; + + auto curDefIt = curDefMI.find(Reg); + // Check def inst first. + if (curDefIt != curDefMI.end()) { + MachineInstr *curDef = curDefIt->second; + DefSU = MISUnitMap[curDef]; + } else { + // physical reg is not in live reg. + if (!Reg.isVirtual()) + continue; + if (MO.isUndef()) + continue; + // Is it OK for degbug instr MO cannot find def? + if (MI->isDebugInstr()) + continue; + // Should be an input. + assert(InputSUnitMap.count(Reg) > 0 && "cannot find def"); + DefSU = InputSUnitMap[Reg]; + } + SU.addPred(SDep(DefSU, SDep::Data, Reg)); + } + + // Add def to curDefMI; + for (MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + if (!MO.isDef()) + continue; + unsigned Reg = MO.getReg(); + + // For case like: + // undef %808.sub0:sgpr_64 = COPY killed %795:sgpr_32 + // %808.sub1:sgpr_64 = S_MOV_B32 0 + // When partially write, link MI to previous def. + if (MO.getSubReg() != 0) { + SUnit *DefSU = nullptr; + auto curDefIt = curDefMI.find(Reg); + // Check def inst first. + if (curDefIt != curDefMI.end()) { + MachineInstr *curDef = curDefIt->second; + DefSU = MISUnitMap[curDef]; + // Add link between different defs. + SU.addPred(SDep(DefSU, SDep::Data, Reg)); + } + } + + curDefMI[Reg] = MI; + } + } +} + +void ExpDag::addCtrlDep() { + // TODO: add depend for memory, barrier. +} + +BlockExpDag::BlockExpDag(llvm::MachineBasicBlock *B, llvm::LiveIntervals *LIS, + const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI, + const llvm::SIInstrInfo *SIII) + : ExpDag(MRI, SIRI, SIII, /*bJoinInput*/ true), LIS(LIS), MBB(B) {} + +void BlockExpDag::build() { + auto *SlotIndexes = LIS->getSlotIndexes(); + const auto StartIdx = SlotIndexes->getMBBStartIdx(MBB); + const auto StartLiveReg = llvm::getLiveRegs(StartIdx, *LIS, MRI); + + const auto EndIdx = SlotIndexes->getMBBEndIdx(MBB); + const auto EndLiveReg = llvm::getLiveRegs(EndIdx, *LIS, MRI); + + std::vector insts; + for (MachineInstr &MI : *MBB) { + insts.emplace_back(&MI); + } + + ExpDag::build(StartLiveReg, EndLiveReg, insts); +} + +void BlockExpDag::buildWithPressure() { + auto *SlotIndexes = LIS->getSlotIndexes(); + const auto StartIdx = SlotIndexes->getMBBStartIdx(MBB); + const auto StartLiveReg = llvm::getLiveRegs(StartIdx, *LIS, MRI); + + const auto EndIdx = SlotIndexes->getMBBEndIdx(MBB); + const auto EndLiveReg = llvm::getLiveRegs(EndIdx, *LIS, MRI); + + std::vector insts; + for (MachineInstr &MI : *MBB) { + insts.emplace_back(&MI); + } + + ExpDag::build(StartLiveReg, EndLiveReg, insts); + // Build pressure. + buildPressure(StartLiveReg, EndLiveReg); +} + +void BlockExpDag::buildAvail( + const LiveSet &passThruSet, + DenseMap &DagAvailRegMap) { + DenseSet Processed; + + DenseSet WorkList; + MachineInstr &BeginMI = MBB->instr_front(); + + // Calc avaialbe for each node, live is avail & sum(input of success). + // If a reg is avaiable from the node, then success node can use it from this + // node. 
For dag live, pred output don't need to have all input a node needs. + // As long as all pred outputs can cover inputs, it is OK. + for (SUnit &SU : SUnits) { + if (SU.NumPredsLeft == 0) { + GCNDownwardRPTracker RP(*LIS); + RP.reset(BeginMI, &passThruSet); + MachineInstr *MI = SU.getInstr(); + if (MI) { + RP.reset(*MI, &passThruSet); + RP.advance(); + } + DagAvailRegMap[&SU] = RP.getLiveRegs(); + + // Add succ to work list. + for (auto &Succ : SU.Succs) { + SUnit *SuccSU = Succ.getSUnit(); + if (SuccSU->NumPredsLeft > 0) + SuccSU->NumPredsLeft--; + WorkList.insert(SuccSU); + } + } + } + while (!WorkList.empty()) { + bool bUpdated = false; + SmallVector ReadyNodes; + for (SUnit *SU : WorkList) { + if (SU->NumPredsLeft > 0) + continue; + ReadyNodes.emplace_back(SU); + // Ready, move it to Processed. + Processed.insert(SU); + bUpdated = true; + // Only update 1 node once. + // Order of schedle here should not affect pressure. + break; + } + + for (SUnit *SU : ReadyNodes) { + // Remove SU from worklist. + WorkList.erase(SU); + + MachineInstr *MI = SU->getInstr(); + // Calc pressure based on pred nodes. + GCNRPTracker::LiveRegSet dagLive; + for (auto &Pred : SU->Preds) { + SUnit *PredSU = Pred.getSUnit(); + GCNRPTracker::LiveRegSet PredLive = DagAvailRegMap[PredSU]; + + GCNDownwardRPTracker RP(*LIS); + RP.reset(BeginMI, &PredLive); + if (MI) { + RP.reset(*MI, &PredLive); + // Update PredLive based on MI. + RP.advance(); + } + llvm::mergeLiveRegSet(dagLive, RP.getLiveRegs()); + } + DagAvailRegMap[SU] = dagLive; + + // Add succ to work list. + for (auto &Succ : SU->Succs) { + SUnit *SuccSU = Succ.getSUnit(); + if (SuccSU->NumPredsLeft > 0) + SuccSU->NumPredsLeft--; + WorkList.insert(SuccSU); + } + } + + // Skip dead loop + if (ReadyNodes.empty()) { + printf("dead loop when build dag pressure"); + break; + } + } + + assert(WorkList.empty() && "schedule failed for available reg"); +} + +void BlockExpDag::buildPressure(const LiveSet &StartLiveReg, + const LiveSet &EndLiveReg) { + if (MBB->empty()) + return; + DenseMap DagAvailRegMap; + GCNRPTracker::LiveRegSet passThruSet; + for (auto Reg : StartLiveReg) { + unsigned reg = Reg.first; + auto EndReg = EndLiveReg.find(reg); + if (EndReg == EndLiveReg.end()) + continue; + + LaneBitmask mask = Reg.second; + LaneBitmask endMask = EndReg->second; + mask &= endMask; + if (mask.getAsInteger() == 0) + continue; + passThruSet[reg] = mask; + } + + // Build avial for each nodes. + buildAvail(passThruSet, DagAvailRegMap); + + // Calc avaialbe for each node, live is avail & sum(input of success). + // If a reg is avaiable from the node, then success node can use it from this + // node. For dag live, pred output don't need to have all input a node needs. + // As long as all pred outputs can cover inputs, it is OK. + DenseSet Processed; + + DenseSet WorkList; + MachineInstr &BeginMI = MBB->instr_front(); + + for (SUnit &SU : SUnits) { + if (SU.NumSuccsLeft == 0) { + // Calc pressure based on pass thru. + // Using pass thru as base because output of current SU should not + // affect other output SUs. + GCNUpwardRPTracker RP(*LIS); + RP.reset(BeginMI, &passThruSet, /*After*/true); + MachineInstr *MI = SU.getInstr(); + if (MI) { + RP.reset(*MI, &passThruSet, /*After*/true); + RP.recede(*MI); + } + DagPressureMap[&SU] = RP.getLiveRegs(); + // Add pred to work list. 
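+      // NumSuccsLeft acts as a countdown here: a pred only becomes ready for
+      // the worklist once all of its successors have been processed.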
+ for (auto &Pred : SU.Preds) { + SUnit *PredSU = Pred.getSUnit(); + PredSU->NumSuccsLeft--; + WorkList.insert(PredSU); + } + } + } + + while (!WorkList.empty()) { + bool bUpdated = false; + SmallVector ReadyNodes; + for (SUnit *SU : WorkList) { + if (SU->NumSuccsLeft > 0) + continue; + ReadyNodes.emplace_back(SU); + // Ready, move it to Processed. + Processed.insert(SU); + bUpdated = true; + // Only update 1 node once. + // Order of schedle here should not affect pressure. + break; + } + + for (SUnit *SU : ReadyNodes) { + // Remove SU from worklist. + WorkList.erase(SU); + + MachineInstr *MI = SU->getInstr(); + // Calc pressure based on succ nodes. + GCNRPTracker::LiveRegSet dagLive; + for (auto &Succ : SU->Succs) { + SUnit *SuccSU = Succ.getSUnit(); + GCNRPTracker::LiveRegSet SuccLive = DagPressureMap[SuccSU]; + + GCNUpwardRPTracker RP(*LIS); + RP.reset(BeginMI, &SuccLive, /*After*/true); + if (MI) { + RP.reset(*MI, &SuccLive, /*After*/true); + // Update SuccLive based on MI. + RP.recede(*MI); + } + llvm::mergeLiveRegSet(dagLive, RP.getLiveRegs()); + } + // Remove live which not avail in SU. + GCNRPTracker::LiveRegSet availLive = DagAvailRegMap[SU]; + llvm::andLiveRegSet(dagLive, availLive); + DagPressureMap[SU] = dagLive; + + // Add pred to work list. + for (auto &Pred : SU->Preds) { + SUnit *PredSU = Pred.getSUnit(); + PredSU->NumSuccsLeft--; + WorkList.insert(PredSU); + } + } + + // Skip dead loop + if (ReadyNodes.empty()) { + printf("dead loop when build dag pressure"); + break; + } + } +} + +// dump functions. + +std::string ExpDag::getGraphNodeLabel(const SUnit *SU) const { + std::string s; + raw_string_ostream oss(s); + auto it = SUnitInputMap.find(SU); + if (it != SUnitInputMap.end()) { + oss << "second) << ">"; + } else { + SU->getInstr()->print(oss, /*SkipOpers=*/true); + } + + return oss.str(); +} + +/// Return the label. +std::string ExpDag::getDAGName() const { + return "dag.exp"; +} + +/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG +/// rendered using 'dot'. +/// +void ExpDag::viewGraph(const Twine &Name, const Twine &Title) const { +#if 0 // TODO: Re-enable this + // This code is only for debugging! +#ifndef NDEBUG + ViewGraph(const_cast(this), Name, false, Title); +#else + errs() << "BlockExpDag::viewGraph is only available in debug builds on " + << "systems with Graphviz or gv!\n"; +#endif // NDEBUG +#endif +} + +void ExpDag::dump() { + viewGraph(getDAGName(), "Exp Dag Graph for " + getDAGName()); +} + +} + +// Expression Dag dump. +namespace llvm { + +static DenseSet ViewNodes; + +template <> +struct DOTGraphTraits : public DefaultDOTGraphTraits { + + DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} + + static std::string getGraphName(const llvm::ExpDag *G) { + return "ExpDag graph"; + } + + static bool renderGraphFromBottomUp() { return true; } + + static bool isNodeHidden(const SUnit *Node) { + if (ViewNodes.empty()) + return false; + + return ViewNodes.count(Node) == 0; + } + + static std::string getNodeIdentifierLabel(const SUnit *Node, + const llvm::ExpDag *Graph) { + std::string R; + raw_string_ostream OS(R); + OS << static_cast(Node); + return R; + } + + /// If you want to override the dot attributes printed for a particular + /// edge, override this method. 
+ static std::string getEdgeAttributes(const SUnit *Node, SUnitIterator EI, + const llvm::ExpDag *Graph) { + if (EI.isArtificialDep()) + return "color=cyan,style=dashed"; + if (EI.isCtrlDep()) + return "color=blue,style=dashed"; + return ""; + } + + static std::string getNodeLabel(const SUnit *SU, const llvm::ExpDag *Graph) { + std::string Str; + raw_string_ostream SS(Str); + SS << "SU:" << SU->NodeNum; + return SS.str(); + } + static std::string getNodeDescription(const SUnit *SU, const llvm::ExpDag *G) { + return G->getGraphNodeLabel(SU); + } + static std::string getNodeAttributes(const SUnit *N, + const llvm::ExpDag *Graph) { + std::string Str("shape=Mrecord"); + + Str += ",style=filled,fillcolor=\"#"; + // Use NodeQueueId as SubIdx for ExpDag. + Str += DOT::getColorString(N->NodeQueueId); + Str += '"'; + + return Str; + } + + static void addCustomGraphFeatures(llvm::ExpDag *G, + GraphWriter &GW) { + return G->addCustomGraphFeatures(GW); + } +}; + +template <> struct GraphTraits : public GraphTraits { + using nodes_iterator = pointer_iterator::iterator>; + static nodes_iterator nodes_begin(llvm::ExpDag *G) { + return nodes_iterator(G->SUnits.begin()); + } + static nodes_iterator nodes_end(llvm::ExpDag *G) { + return nodes_iterator(G->SUnits.end()); + } +}; + +} // namespace llvm + +namespace llvm { +void getRegBound(llvm::MachineBasicBlock *MBB, + const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI, const SIInstrInfo *SIII, + llvm::LiveIntervals *LIS, unsigned &MaxVGPR, + unsigned &MaxSGPR) { + // TODO: calc real reg bound. + MaxVGPR = AMDGPU::VGPR255 - AMDGPU::VGPR0; + MaxSGPR = AMDGPU::SGPR104 - AMDGPU::SGPR0; + + const auto &EndSlot = LIS->getMBBEndIdx(MBB); + const GCNRPTracker::LiveRegSet outputLive = + llvm::getLiveRegs(EndSlot, *LIS, MRI); + + auto* ST = &MBB->getParent()->getSubtarget(); // TODO: Better way to get this. + if (MBB->empty()) { + GCNRegPressure MaxPressure = getRegPressure(MRI, outputLive); + MaxSGPR = MaxPressure.getSGPRNum(); + MaxVGPR = MaxPressure.getVGPRNum(ST->hasGFX90AInsts()); + return; + } + + BlockExpDag dag(MBB, LIS, MRI, SIRI, SIII); + dag.build(); + + std::vector &SUnits = dag.SUnits; + // Remove input nodes. 
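+  // The input SUnits only stand for block live-ins; strip their edges (and
+  // later the nodes themselves) so the HRB scheduler below runs purely on
+  // instruction nodes.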
+ for (SUnit &SU : SUnits) { + if (!SU.isInstr()) + continue; + std::vector inputDeps; + for (SDep &Dep : SU.Preds) { + SUnit *Pred = Dep.getSUnit(); + if (Pred->isInstr()) + continue; + inputDeps.emplace_back(Dep); + } + for (SDep &Dep : inputDeps) { + SU.removePred(Dep); + } + } + + unsigned inputSize = dag.InputSUnitMap.size(); + unsigned instNodeSize = SUnits.size() - inputSize; + SUnits.erase(SUnits.begin() + instNodeSize, SUnits.end()); + + std::vector BotRoots; + for (SUnit &SU : SUnits) { + if (SU.NumSuccsLeft == 0) + BotRoots.emplace_back(&SU); + } + + auto SchedResult = hrbSched(SUnits, BotRoots, MRI, SIRI); + + GCNUpwardRPTracker RPTracker(*LIS); + RPTracker.reset(MBB->front(), &outputLive, /*After*/true); + for (auto it = SchedResult.rbegin(); it != SchedResult.rend(); it++) { + const SUnit *SU = *it; + if (!SU->isInstr()) + continue; + MachineInstr *MI = SU->getInstr(); + RPTracker.recede(*MI); + } + + GCNRegPressure MaxPressure = RPTracker.getMaxPressureAndReset(); + MaxSGPR = MaxPressure.getSGPRNum(); + MaxVGPR = MaxPressure.getVGPRNum(ST->hasGFX90AInsts()); +} +} // namespace llvm + +// HRB +namespace { + +std::vector buildWorkList(std::vector &SUnits) { + std::vector resultList; + resultList.reserve(SUnits.size()); + for (SUnit &SU : SUnits) { + resultList.emplace_back(&SU); + } + return resultList; +} + +void sortByHeight(std::vector &workList) { + std::sort(workList.begin(), workList.end(), + [](const SUnit *a, const SUnit *b) { + // Lowest height first. + if (a->getHeight() < b->getHeight()) + return true; + // If height the same, NodeNum big first. + if (a->getHeight() == b->getHeight()) + return a->NodeNum > b->NodeNum; + return false; + }); +} + +void sortByInChain(std::vector &workList, DenseSet &Chained) { + // In chain nodes at end. + std::sort(workList.begin(), workList.end(), + [&Chained](const SUnit *a, const SUnit *b) { + return Chained.count(a) < Chained.count(b); + }); +} + +const TargetRegisterClass *getRegClass(SUnit *SU, + const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI) { + if (!SU->isInstr()) + return nullptr; + MachineInstr *MI = SU->getInstr(); + if (MI->getNumDefs() == 0) + return nullptr; + + // For MI has more than one dst, always use first dst. + MachineOperand *MO = MI->defs().begin(); + if (!MO->isReg()) + return nullptr; + unsigned Reg = MO->getReg(); + return SIRI->getRegClassForReg(MRI, Reg); +} + +unsigned getVGPRSize(const TargetRegisterClass *RC, + const SIRegisterInfo *SIRI) { + if (!RC) + return 0; + if (SIRI->isSGPRClass(RC)) + return 0; + return RC->getLaneMask().getNumLanes(); +} +unsigned getSGPRSize(const TargetRegisterClass *RC, + const SIRegisterInfo *SIRI) { + if (!RC) + return 0; + if (!SIRI->isSGPRClass(RC)) + return 0; + return RC->getLaneMask().getNumLanes(); +} + +void collectSameHeightBackNodes(SUnit *SU, SmallDenseSet &backNodes, + unsigned NodeNum, + SmallDenseSet &visitedNodes) { + if (visitedNodes.count(SU)) + return; + visitedNodes.insert(SU); + + for (SDep &Dep : SU->Succs) { + if (Dep.isWeak()) + continue; + if (Dep.getLatency() > 0) + continue; + + SUnit *Succ = Dep.getSUnit(); /* + if (Succ->NodeNum >= NodeNum) + continue;*/ + + backNodes.insert(Succ); + collectSameHeightBackNodes(Succ, backNodes, NodeNum, visitedNodes); + } +} + +} // namespace + +namespace llvm { + +void HRB::Lineage::addNode(llvm::SUnit *SU) { Nodes.emplace_back(SU); } +unsigned HRB::Lineage::getSize() const { + return RC ? 
RC->getLaneMask().getNumLanes() : 0; +} +unsigned HRB::Lineage::length() const { return Nodes.size(); } + +SUnit *HRB::Lineage::getHead() const { return Nodes.front(); } +SUnit *HRB::Lineage::getTail() const { return Nodes.back(); } + +void HRB::buildLinear(std::vector &SUnits) { + // Working list from TopRoots. + std::vector workList = buildWorkList(SUnits); + IntEqClasses EqClasses(SUnits.size()); + + while (!workList.empty()) { + sortByHeight(workList); + // Highest SU. + SUnit *SU = workList.back(); + workList.pop_back(); + if (!SU->isInstr()) + continue; + if (ChainedNodes.count(SU) > 0) + continue; + bRecomputeHeight = false; + Lineage lineage = buildChain(SU, SUnits); + + // Remove chained nodes from worklist. + sortByInChain(workList, ChainedNodes); + while (!workList.empty()) { + SUnit *back = workList.back(); + if (ChainedNodes.count(back)) + workList.pop_back(); + else + break; + } + + Lineages.emplace_back(lineage); + + if (bRecomputeHeight) { + // Update height from tail. + SUnit *tail = lineage.Nodes.back(); + tail->setDepthDirty(); + tail->getHeight(); + } + } + + DenseSet tailSet; + for (Lineage &L : Lineages) { + if (L.Nodes.size() < 2) + continue; + auto it = L.Nodes.rbegin(); + it++; + SUnit *tail = L.Nodes.back(); + // If already as tail for other lineage, start from next. + if (tailSet.count(tail) > 0) { + tail = *it; + it++; + } else { + tailSet.insert(tail); + } + for (; it != L.Nodes.rend(); it++) { + SUnit *SU = *it; + if (tail->NodeNum == -1) + continue; + EqClasses.join(SU->NodeNum, tail->NodeNum); + } + } + + EqClasses.compress(); + // TODO: assign sub class to node. + for (Lineage &L : Lineages) { + for (SUnit *SU : L.Nodes) { + if (SU->NodeNum == -1) + continue; + unsigned SubIdx = EqClasses[SU->NodeNum]; + //// Pack subidx. + // if (EqClasses.count(SubIdx) == 0) + // EqClasses[SubIdx] = EqClasses.size(); + SubIdx = EqClasses[SubIdx]; + // Use NodeQueueId as SubIdx. We don't do schedule on ExpDag. + SU->NodeQueueId = SubIdx; + } + } + + LLVM_DEBUG( + dbgs() << "Chained Nodes:"; for (SUnit *SU + : ChainedNodes) { + dbgs() << " " << SU->NodeNum << "\n"; + } for (int i = 0; i < Lineages.size(); i++) { + dbgs() << "Lineage" << i << ":"; + Lineage &L = Lineages[i]; + for (SUnit *SU : L.Nodes) { + dbgs() << " " << SU->NodeNum; + } + dbgs() << "\n"; + }); +} + +SUnit *HRB::findHeir(SUnit *SU, std::vector &SUnits) { + std::vector Candidates; + for (SDep &Dep : SU->Succs) { + // Only check data dep. + if (Dep.getKind() != SDep::Data) + continue; + + SUnit *Succ = Dep.getSUnit(); + Candidates.emplace_back(Succ); + } + + if (Candidates.empty()) + return nullptr; + + if (Candidates.size() == 1) + return Candidates.front(); + + sortByHeight(Candidates); + // Lowest height. + SUnit *Heir = Candidates.front(); + SmallVector SameHeightCandidate; + for (SUnit *SU : Candidates) { + if (Heir->getHeight() != SU->getHeight()) + break; + SameHeightCandidate.emplace_back(SU); + } + // Make sure choose lowest dependence between SameHeightCandidate. + if (SameHeightCandidate.size() > 1) { + for (int i = 1; i < SameHeightCandidate.size(); i++) { + SUnit *SU = SameHeightCandidate[i]; + // If Heir is pred of SU, use SU. + if (canReach(SU, Heir)) + Heir = SU; + } + } + + unsigned HeriHeight = Heir->getHeight(); + + // if lowest node is in ChainedNodes, try to find same height nodes? + + for (SDep &Dep : SU->Succs) { + // Only check data dep. + if (Dep.getKind() != SDep::Data) + continue; + SUnit *Succ = Dep.getSUnit(); + if (Succ == Heir) + continue; + // Avoid cycle in DAG. 
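+    // If Heir already reaches Succ, the artificial Succ -> Heir edge added
+    // below would close a cycle, so give up on this heir.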
+ if (canReach(Heir, Succ)) + return nullptr; + // Make sure Succ is before Heir. + Heir->addPred(SDep(Succ, SDep::Artificial)); + updateReachForEdge(Succ, Heir, SUnits); + LLVM_DEBUG(dbgs() << "add edge from " << Succ->NodeNum << "(" + << Succ->getHeight() << ") to " << Heir->NodeNum << "(" + << HeriHeight << ")\n"); + // Update height if need. + unsigned Height = Succ->getHeight(); + if (Height <= HeriHeight) { + bRecomputeHeight = true; + } + } + return Heir; +} + +HRB::Lineage HRB::buildChain(SUnit *Node, + std::vector &SUnits) { + HRB::Lineage chain; + chain.addNode(Node); + ChainedNodes.insert(Node); + LLVM_DEBUG(dbgs() << "start chain " << Node->NodeNum << "(" + << Node->getHeight() << ")\n"); + while (Node->NumSuccsLeft > 0) { + SUnit *Heir = findHeir(Node, SUnits); + if (!Heir) + break; + chain.addNode(Heir); + + LLVM_DEBUG(dbgs() << "add node to chain " << Heir->NodeNum << "\n"); + if (ChainedNodes.count(Heir) > 0) + break; + ChainedNodes.insert(Heir); + + Node = Heir; + } + // Find biggest vgpr RC for the chain. + // TODO: Build conflict and allocate on each edge of the chain. + const TargetRegisterClass *RC = nullptr; + unsigned maxRCSize = 0; + for (SUnit *SU : chain.Nodes) { + const TargetRegisterClass *SuRC = getRegClass(SU, MRI, SIRI); + unsigned RCSize = getVGPRSize(SuRC, SIRI); + if (RCSize > maxRCSize) { + maxRCSize = RCSize; + RC = SuRC; + } + } + if (!RC) { + // TODO: Find biggest sgpr RC. + unsigned maxRCSize = 0; + for (SUnit *SU : chain.Nodes) { + const TargetRegisterClass *SuRC = getRegClass(SU, MRI, SIRI); + unsigned RCSize = getSGPRSize(SuRC, SIRI); + if (RCSize > maxRCSize) { + maxRCSize = RCSize; + RC = SuRC; + } + } + } + chain.RC = RC; + return chain; +} + +void HRB::buildConflict() { + + for (unsigned i = 0; i < Lineages.size(); i++) { + Lineage &a = Lineages[i]; + for (unsigned j = i + 1; j < Lineages.size(); j++) { + Lineage &b = Lineages[j]; + if (isConflict(a, b)) { + Color.Conflicts[i].insert(j); + Color.Conflicts[j].insert(i); + LLVM_DEBUG(dbgs() << i << " conflict" << j << "\n"); + } + } + // SelfConflict. + Color.Conflicts[i].insert(i); + } +} + +bool HRB::canReach(llvm::SUnit *a, llvm::SUnit *b) { + auto it = ReachMap.find(a); + // If no reach info, treat as reach. + if (it == ReachMap.end()) + return true; + DenseSet &CurReach = it->second; + return CurReach.find(b) != CurReach.end(); +} + +void HRB::updateReachForEdge(llvm::SUnit *a, llvm::SUnit *b, + std::vector &SUnits) { + DenseSet &ReachA = ReachMap[a]; + ReachA.insert(b); + DenseSet &ReachB = ReachMap[b]; + ReachA.insert(ReachB.begin(), ReachB.end()); + + for (SUnit &SU : SUnits) { + if (!canReach(&SU, a)) + continue; + + DenseSet &CurReach = ReachMap[&SU]; + CurReach.insert(ReachA.begin(), ReachA.end()); + } +} + +void HRB::buildReachRelation(ArrayRef BotRoots) { + // Add fake entry to do PostOrder traversal. + // SUnit using Pred to traversal, so need to Revrese post order. + SUnit FakeEntry; + SmallVector FakeDeps; + for (SUnit *Root : BotRoots) { + SDep Dep = SDep(Root, SDep::Artificial); + FakeEntry.addPred(Dep); + FakeDeps.emplace_back(Dep); + } + + ReversePostOrderTraversal RPOT(&FakeEntry); + for (SUnit *SU : RPOT) { + // Create Reach Set first. + ReachMap[SU].clear(); + } + for (SUnit *SU : RPOT) { + DenseSet &CurReach = ReachMap[SU]; + // All Preds can reach SU and SU's reach. + for (SDep &Dep : SU->Preds) { + // Igonre week dep. 
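+      // Weak edges are only scheduling hints and must not contribute to
+      // reachability.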
+ if (Dep.isWeak()) + continue; + DenseSet &PrevReach = ReachMap[Dep.getSUnit()]; + PrevReach.insert(SU); + PrevReach.insert(CurReach.begin(), CurReach.end()); + } + assert(CurReach.count(SU) == 0 && "dead loop"); + } + // Remove fake entry. + for (SDep &Dep : FakeDeps) { + FakeEntry.removePred(Dep); + } + ReachMap.erase(&FakeEntry); + + LLVM_DEBUG(for (Lineage &L + : Lineages) { + for (SUnit *SU : L.Nodes) { + DenseSet &CurReach = ReachMap[SU]; + dbgs() << SU->NodeNum << " reach: "; + for (SUnit *R : CurReach) { + dbgs() << R->NodeNum << " "; + } + dbgs() << "\n"; + } + }); +} + +bool HRB::isConflict(const Lineage &a, const Lineage &b) { + // Make conflict between sgpr and vgpr to help group lineages when share + // colors. Keep the conflict will group lineages in avoid mix use color in + // different sub exp. + SUnit *head0 = a.getHead(); + SUnit *tail0 = a.getTail(); + SUnit *head1 = b.getHead(); + SUnit *tail1 = b.getTail(); + DenseSet &Reach0 = ReachMap[head0]; + DenseSet &Reach1 = ReachMap[head1]; + bool r01 = Reach0.count(tail1) != 0; + bool r10 = Reach1.count(tail0) != 0; + return r01 && r10; +} +bool HRB::canFuse(const Lineage &a, const Lineage &b) { + if (a.RC != b.RC) { + // no RC will not conflict with other nodes. + if (!a.RC) + return false; + if (!b.RC) + return false; + // SGRP and VGPR not conflict. + if (SIRI->isSGPRClass(a.RC) != SIRI->isSGPRClass(b.RC)) + return false; + } + // Can Fuse if a.head reach b.tail but b.head not reach a.tail and vice versa. + SUnit *head0 = a.getHead(); + SUnit *tail0 = a.getTail(); + SUnit *head1 = b.getHead(); + SUnit *tail1 = b.getTail(); + DenseSet &Reach0 = ReachMap[head0]; + DenseSet &Reach1 = ReachMap[head1]; + bool r01 = Reach0.count(tail1) != 0; + bool r10 = Reach1.count(tail0) != 0; + return r01 != r10; +} + +bool HRB::tryFuse(Lineage &a, Lineage &b, std::vector &SUnits) { + + // Can Fuse if a.head reach b.tail but b.head not reach a.tail and vice versa. + SUnit *head0 = a.getHead(); + SUnit *tail0 = a.getTail(); + SUnit *head1 = b.getHead(); + SUnit *tail1 = b.getTail(); + DenseSet &Reach0 = ReachMap[head0]; + DenseSet &Reach1 = ReachMap[head1]; + bool r01 = Reach0.count(tail1) != 0; + bool r10 = Reach1.count(tail0) != 0; + if (r01 == r10) + return false; + Lineage *newHead = &a; + Lineage *newTail = &b; + if (r01) { + // a reach b, b cannot reach a. + // link a.tail->b.head. + newHead = &a; + newTail = &b; + } else { + // b reach a, a cannot reach b. + // link b.tail->a.head. + newHead = &b; + newTail = &a; + } + + // Merge reg class. + const TargetRegisterClass *RC0 = newHead->RC; + const TargetRegisterClass *RC1 = newTail->RC; + unsigned RC0Size = getVGPRSize(RC0, SIRI); + unsigned RC1Size = getVGPRSize(RC1, SIRI); + if (RC1Size > RC0Size) + newHead->RC = RC1; + // Merge chain. + SUnit *fuseTail = newHead->getTail(); + SUnit *fuseHead = newTail->getHead(); + assert(ReachMap[fuseHead].count(fuseTail) == 0 && ""); + fuseHead->addPred(SDep(fuseTail, SDep::Artificial)); + LLVM_DEBUG(dbgs() << "fuse " << fuseTail->NodeNum << "->" << fuseHead->NodeNum + << "\n"); + // Update reach map. + updateReachForEdge(fuseTail, fuseHead, SUnits); + // Merge Nodes. + newHead->Nodes.append(newTail->Nodes.begin(), newTail->Nodes.end()); + // Clear newTail. 
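+  // The fused chain now lives entirely in newHead; empty lineages are dropped
+  // later in fusionLineages().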
+ newTail->Nodes.clear(); + newTail->RC = nullptr; + return true; +} + +void HRB::fusionLineages(std::vector &SUnits) { + if (Lineages.empty()) + return; + bool bUpdated = true; + while (bUpdated) { + bUpdated = false; + int size = Lineages.size(); + for (int i = 0; i < size; i++) { + Lineage &a = Lineages[i]; + if (a.length() == 0) + continue; + + for (int j = i + 1; j < size; j++) { + Lineage &b = Lineages[j]; + if (b.length() == 0) + continue; + if (tryFuse(a, b, SUnits)) { + bUpdated = true; + if (a.length() == 0) + break; + } + } + } + // Remove empty lineages. + std::sort(Lineages.begin(), Lineages.end(), + [](const Lineage &a, const Lineage &b) { + return a.length() > b.length(); + }); + while (Lineages.back().length() == 0) { + Lineages.pop_back(); + } + } + // Set ID after fusion. + unsigned ID = 0; + for (Lineage &L : Lineages) { + L.ID = ID++; + } +} + +unsigned HRB::colorLineages(std::vector &lineages, + DenseMap &AllocMap, + const unsigned Limit) { + // allocate long Lineage first. How about size of RC? + std::sort(lineages.begin(), lineages.end(), + [](const Lineage *a, const Lineage *b) { + // Make sure root allocate first. + return a->length() > b->length(); + }); + + unsigned maxColor = 0; + const unsigned VGPR_LIMIT = 256 * 4; + + for (Lineage *L : lineages) { + unsigned ID = L->ID; + auto &Conflict = Color.Conflicts[ID]; + std::bitset colors; + for (unsigned j : Conflict) { + Lineage *C = &Lineages[j]; + if (AllocMap.count(C) == 0) + continue; + unsigned c = AllocMap[C]; + unsigned s = C->getSize(); + for (unsigned i = 0; i < s; i++) { + unsigned pos = c + i; + colors.set(pos); + } + } + + unsigned color = Limit; + unsigned size = L->getSize(); + for (unsigned i = 0; i < Limit - size;) { + unsigned oldI = i; + for (unsigned j = 0; j < size; j++) { + unsigned pos = i + size - 1 - j; + if (colors.test(pos)) { + i = pos + 1; + break; + } + } + + if (i != oldI) + continue; + color = i; + break; + } + + AllocMap[L] = color; + color += size; + if (color > maxColor) + maxColor = color; + } + return maxColor; +} + +void HRB::ColorResult::colorSU(SUnit *SU, unsigned color) { + ColorMap[SU] = color; +} + +unsigned HRB::ColorResult::getLineage(SUnit *SU) const { + return LineageMap.find(SU)->second; +} + +bool HRB::ColorResult::isConflict(const SUnit *SU0, unsigned Lineage) const { + const unsigned L = LineageMap.find(SU0)->second; + const auto &Conflict = Conflicts.find(L)->second; + return Conflict.count(Lineage) > 0; +} + +bool HRB::ColorResult::isHead(SUnit *SU) const { return HeadSet.count(SU); } +bool HRB::ColorResult::isTail(SUnit *SU) const { return TailSet.count(SU); } + +const SUnit *HRB::ColorResult::getTail(SUnit *SU) const { + if (!isHead(SU)) + return nullptr; + auto it = HeadTailMap.find(SU); + return it->second; +} + +unsigned HRB::ColorResult::getColor(const llvm::SUnit *SU) const { + auto it = ColorMap.find(SU); + return it->second; +} + +unsigned HRB::ColorResult::getSize(const llvm::SUnit *SU) const { + auto it = SizeMap.find(SU); + return it->second; +} + +HRB::ColorResult &HRB::coloring() { + // Collect VGPR lineages. + std::vector vgprLineages; + for (Lineage &L : Lineages) { + auto RC = L.RC; + if (!RC) + continue; + if (SIRI->isSGPRClass(RC)) + continue; + vgprLineages.emplace_back(&L); + } + + const unsigned VGPR_LIMIT = 256 * 4; + DenseMap VAllocMap; + const unsigned maxVGPR = colorLineages(vgprLineages, VAllocMap, VGPR_LIMIT); + + // Collect SGPR lineages. 
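+  // SGPR lineages get their own color range; the final color is offset by
+  // sgprBase (maxVGPR + 1) below so SGPR and VGPR colors never collide.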
+ std::vector sgprLineages; + for (Lineage &L : Lineages) { + auto RC = L.RC; + if (!RC) + continue; + if (!SIRI->isSGPRClass(RC)) + continue; + sgprLineages.emplace_back(&L); + } + + const unsigned SGPR_LIMIT = 104; + DenseMap SAllocMap; + const unsigned maxSGPR = colorLineages(sgprLineages, SAllocMap, SGPR_LIMIT); + // +1 for each type of lineages(SGPR, VGPR, no reg). + const unsigned maxReg = maxSGPR + 1 + maxVGPR + 1 + 1; + const unsigned sgprBase = maxVGPR + 1; + + for (Lineage &L : Lineages) { + // Collect HeadSet. + Color.HeadSet.insert(L.getHead()); + Color.TailSet.insert(L.getTail()); + Color.HeadTailMap[L.getHead()] = L.getTail(); + // Save color. + auto RC = L.RC; + // All no reg lineage goes to maxReg. + unsigned color = maxReg; + if (!RC) { + } else if (SIRI->isSGPRClass(RC)) { + color = SAllocMap[&L] + sgprBase; + } else { + color = VAllocMap[&L]; + } + unsigned size = L.getSize(); + for (SUnit *SU : L.Nodes) { + Color.colorSU(SU, color); + Color.SizeMap[SU] = size; + Color.LineageMap[SU] = L.ID; + } + } + Color.maxReg = maxReg; + Color.maxSGPR = maxSGPR; + Color.maxVGPR = maxVGPR; + + for (unsigned i = 0; i < Lineages.size(); i++) { + Lineage &a = Lineages[i]; + SUnit *headA = a.getHead(); + unsigned colorA = Color.getColor(headA); + unsigned sizeA = Color.getSize(headA); + for (unsigned j = i + 1; j < Lineages.size(); j++) { + Lineage &b = Lineages[j]; + + SUnit *headB = b.getHead(); + unsigned colorB = Color.getColor(headB); + unsigned sizeB = Color.getSize(headB); + + if (colorB >= (colorA + sizeA)) + continue; + if (colorA >= (colorB + sizeB)) + continue; + Color.ShareColorLineages.insert(i); + Color.ShareColorLineages.insert(j); + } + } + + return Color; +} + +void HRB::dump() { + for (int i = 0; i < Lineages.size(); i++) { + dbgs() << "Lineage" << i << ":"; + Lineage &L = Lineages[i]; + for (SUnit *SU : L.Nodes) { + dbgs() << " " << SU->NodeNum; + } + dbgs() << "\n"; + if (!Color.ColorMap.empty()) { + dbgs() << "color:" << Color.getColor(L.getHead()) + << " size: " << Color.getSize(L.getHead()) << "\n"; + } + if (!ReachMap.empty()) { + dbgs() << "conflict:"; + for (int j = 0; j < Lineages.size(); j++) { + if (i == j) + continue; + if (isConflict(L, Lineages[j])) { + dbgs() << " " << j; + } + } + dbgs() << "\n"; + } + } +} + +void HRB::dumpReachMap() { + if (!ReachMap.empty()) { + dbgs() << "reachMap:"; + for (auto it : ReachMap) { + SUnit *SU = it.first; + auto &Reach = it.second; + if (SU->isInstr()) { + MachineInstr *MI = SU->getInstr(); + MI->print(dbgs()); + } + dbgs() << SU->NodeNum << "can reach :\n"; + for (SUnit *R : Reach) { + dbgs() << R->NodeNum << " "; + } + dbgs() << "\n"; + } + dbgs() << "\n"; + } +} + +// schedule base on HRB lineages and color result. + +std::vector hrbSched(std::vector &SUnits, + std::vector &BRoots, + const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI) { + HRB hrb(MRI, SIRI); + // build reach info to avoid dead loop when build linear. + hrb.buildReachRelation(BRoots); + hrb.buildLinear(SUnits); + + std::sort(BRoots.begin(), BRoots.end(), [](const SUnit *a, const SUnit *b) { + return a->NumSuccsLeft < b->NumSuccsLeft; + }); + while (!BRoots.empty() && BRoots.back()->NumSuccsLeft > 0) { + BRoots.pop_back(); + } + + hrb.buildReachRelation(BRoots); + hrb.fusionLineages(SUnits); + hrb.buildConflict(); + const HRB::ColorResult &Color = hrb.coloring(); + + LLVM_DEBUG(hrb.dump()); + + // All lineage head which don't has Pred is TopRoots. + // Put top roots in worklist. + // while worklist not empty. 
+ // if not head or color avail + // is candidate. + // choose best candidate by height. + // update worklist. + std::vector ReadyList; + for (SUnit &SU : SUnits) { + if (SU.NumPredsLeft == 0) + ReadyList.emplace_back(&SU); //.insert(&SU); + } + // When there're more than one sub exp in the DAG, make sure not mix different + // sub exp or it will dead loop for color goes different subexp. + + std::bitset<512 * 2> colors; + auto isColorAvail = [&colors](unsigned color, unsigned size) -> bool { + for (unsigned i = 0; i < size; i++) { + unsigned pos = color + i; + if (colors.test(pos)) + return false; + } + return true; + }; + auto allocColor = [&colors](unsigned color, unsigned size) { + for (unsigned i = 0; i < size; i++) { + unsigned pos = color + i; + assert(!colors.test(pos) && "color already allocated"); + LLVM_DEBUG(dbgs() << pos << "is allocated\n"); + colors.set(pos); + } + }; + + auto freeColor = [&colors](unsigned color, unsigned size) { + for (unsigned i = 0; i < size; i++) { + unsigned pos = color + i; + assert(colors.test(pos) && "color has not been allocated"); + LLVM_DEBUG(dbgs() << pos << "is free\n"); + colors.reset(pos); + } + }; + + // Save color and size for tail to support case two lineage share tail. + // When finish a tail, free color for working lineage which end with tail. + DenseMap, 2>> + TailMap; + + // For lineages share same color, need to choose correct order. + // If l0 has color 0, l1 has color 1, l2 has color 0, l3 has color 1. + // l0 and l3 conflict, l1 and l2 conflict. + // l0 and l3 must sched together. + // If sched l0 and l1, it may dead lock for l0 wait something in l3 and l1 + // wait something in l2. + // ShareColorLineages will mark lineages which share color with other + // lineages. When sched, choose new lineages which has more conflict with + // ShareColorLineages. + const DenseSet &ShareColorLineages = Color.ShareColorLineages; + + std::vector Schedule; + DenseSet UnfinishedLineages; + while (!ReadyList.empty()) { + // Make sure node conflict with predLineage first. + std::sort(ReadyList.begin(), ReadyList.end(), + [&UnfinishedLineages, &Color](const SUnit *a, const SUnit *b) { + unsigned confA = 0; + for (unsigned L : UnfinishedLineages) { + if (Color.isConflict(a, L)) + confA++; + } + unsigned confB = 0; + for (unsigned L : UnfinishedLineages) { + if (Color.isConflict(b, L)) + confB++; + } + return confA > confB; + }); + + LLVM_DEBUG(dbgs() << "ReadyList:\n"; for (SUnit *SU + : ReadyList) { + dbgs() << " " << SU->NodeNum; + } dbgs() << "\n";); + SUnit *Candidate = nullptr; + for (auto it = ReadyList.begin(); it != ReadyList.end(); it++) { + SUnit *SU = *it; + unsigned color = Color.getColor(SU); + unsigned size = Color.getSize(SU); + // If SU is not head or color is available, SU is the candidate. + if (Color.isHead(SU)) { + if (!isColorAvail(color, size)) + continue; + // alloc color. + allocColor(color, size); + // save tail color. + const SUnit *Tail = Color.getTail(SU); + unsigned ID = Color.getLineage(SU); + SmallVector, 2> &tailColors = + TailMap[Tail]; + tailColors.emplace_back(std::make_tuple(color, size, ID)); + if (ShareColorLineages.count(ID)) + UnfinishedLineages.insert(ID); + } + + // free color for working lineage which end with SU. 
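+      // A tail may end more than one lineage, so every (color, size, ID)
+      // tuple recorded for it in TailMap is released here.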
+ if (Color.isTail(SU)) { + auto &tailColors = TailMap[SU]; + for (auto &tailTuple : tailColors) { + unsigned lineageColor, lineageSize, ID; + std::tie(lineageColor, lineageSize, ID) = tailTuple; + freeColor(lineageColor, lineageSize); + if (ShareColorLineages.count(ID)) + UnfinishedLineages.insert(ID); + } + // Clear the tail. + TailMap.erase(SU); + } + + Candidate = SU; + // Remove Candidate from ReadyList. + ReadyList.erase(it); + break; + } + + if (!Candidate) { + // In case failed to find candidate, start a lineage if there is one. + for (auto it = ReadyList.begin(); it != ReadyList.end(); it++) { + SUnit *SU = *it; + + if (!Color.isHead(SU)) { + continue; + } + Candidate = SU; + // Remove Candidate from ReadyList. + ReadyList.erase(it); + break; + } + } + assert(Candidate && "fail to find a Candidate"); + LLVM_DEBUG(dbgs() << "Sched " << Candidate->NodeNum << "\n"); + + // Add all Candidate succ which is Ready. + for (SDep &Dep : Candidate->Succs) { + if (Dep.isWeak()) + continue; + SUnit *Succ = Dep.getSUnit(); + + if (Succ->NumPredsLeft > 0) + Succ->NumPredsLeft--; + LLVM_DEBUG(dbgs() << "Succ " << Succ->NodeNum << " has " + << Succ->NumPredsLeft << " preds\n"); + if (Succ->NumPredsLeft == 0) + ReadyList.emplace_back(Succ); + } + + // Sched Candidate. + assert(Candidate->isInstr() && "Candidate must be instr Node"); + Schedule.emplace_back(Candidate); + } + assert(Schedule.size() == SUnits.size() && "SUnit size should match"); + return Schedule; +} + +} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h new file mode 100644 index 0000000000000..c234f32370793 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h @@ -0,0 +1,197 @@ +#pragma once + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/MC/LaneBitmask.h" + +#include "llvm/CodeGen/ScheduleDAG.h" // For SUnit. + +namespace llvm { +class MachineFunction; +class LiveIntervals; +class MachineRegisterInfo; +class SIRegisterInfo; +class SIInstrInfo; +class MachineInstr; +class MachineBasicBlock; +template +class GraphWriter; +class SUnit; +class IntEqClasses; +class Twine; + +using LiveSet = llvm::DenseMap; + +// SubExp and BlockExpDag. +struct SubExp { + // Keep original order for sunits. + std::vector SUnits; + llvm::DenseSet TopRegs; + llvm::DenseSet BottomRoots; + llvm::DenseSet BottomRegs; + bool bMultiDefOutput = false; + bool bHasTerminatorInst = false; + bool bUseIncomingReg = false; + bool bMoveIntoLoop = false; + bool bNotSafeToCopy = false; + bool bHasMemInst = false; + bool bHoist = false; + // If temp/out reg is used by inst not in the subExp, cannot move since not + // all users will be move. But OK to clone. 
+ bool bCloneOnly = false; + bool bTouchSCC = false; + llvm::MachineBasicBlock *FromBB; + llvm::MachineBasicBlock *ToBB; + unsigned sInputSize; + unsigned vInputSize; + unsigned sOutputSize; + unsigned vOutputSize; + unsigned sMaxSize; + unsigned vMaxSize; + LiveSet inputLive; + LiveSet outputLive; + bool isSafeToMove(const llvm::MachineRegisterInfo &MRI, bool bMoveUp) const; + void calcMaxPressure(const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI); + void dump(const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI) const; + bool modifiesRegister(unsigned Reg, const llvm::SIRegisterInfo* SIRI) const; +}; + +struct ExpDag { + ExpDag(const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI, + const llvm::SIInstrInfo *SIII, + const bool bJoinInput); + const llvm::MachineRegisterInfo &MRI; + const llvm::SIRegisterInfo *SIRI; + const llvm::SIInstrInfo *SIII; + const bool bJoinInputToSubExp; + + std::vector SUnits; ///< The scheduling units. + llvm::DenseMap MISUnitMap; + llvm::DenseMap SUnitMIMap; + llvm::DenseMap InputSUnitMap; + llvm::DenseMap SUnitInputMap; + std::vector SubExps; + template + void build(const LiveSet &InputLiveReg, const LiveSet &OutputLiveReg, + T &insts); + void dump(); + void viewGraph(const llvm::Twine &Name, const llvm::Twine &Title) const; + /// Returns a label for an SUnit node in a visualization of the ScheduleDAG. + std::string getGraphNodeLabel(const llvm::SUnit *SU) const; + std::string getDAGName() const; + /// Adds custom features for a visualization of the ScheduleDAG. + void addCustomGraphFeatures(llvm::GraphWriter &) const {} +private: + template + void initNodes(const LiveSet &InputLiveReg, T &insts); + void addDataDep(const llvm::SIRegisterInfo *SIRI); + void addCtrlDep(); + void buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, + const llvm::SIRegisterInfo *SIRI, const llvm::SIInstrInfo *SIII); +}; + +struct BlockExpDag : public ExpDag { + BlockExpDag(llvm::MachineBasicBlock *B, llvm::LiveIntervals *LIS, + const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI, const llvm::SIInstrInfo *SIII); + llvm::LiveIntervals *LIS; + llvm::MachineBasicBlock *MBB; + llvm::DenseMap DagPressureMap; + std::vector> SUnitsInSameDepth; + std::vector SubExps; + void build(); + void buildWithPressure(); +private: + void buildAvail(const LiveSet &passThruSet, + llvm::DenseMap &DagAvailRegMap); + void buildPressure(const LiveSet &StartLiveReg, + const LiveSet &EndLiveReg); +}; + +void getRegBound(llvm::MachineBasicBlock *MBB, + const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI, + const llvm::SIInstrInfo *SIII, llvm::LiveIntervals *LIS, + unsigned &MaxVGPR, unsigned &MaxSGRP); + +// Currently mix sgpr and vgpr when build lineage to avoid cycle. +// This maybe waste registers. +// Based on "Minimum Register Instruction Sequencing to Reduce Register Spills +// in Out-of-Order Issue Superscalar Architectures". 
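+// Rough flow (see hrbSched below): buildReachRelation -> buildLinear ->
+// fusionLineages -> buildConflict -> coloring, then nodes are list-scheduled
+// while lineage colors act as register slots that must be free before a new
+// lineage may start.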
+class HRB { +public: + struct Lineage { + unsigned ID = 0; + const llvm::TargetRegisterClass *RC = nullptr; + llvm::SmallVector Nodes; + llvm::SUnit *getHead() const; + llvm::SUnit *getTail() const; + void addNode(llvm::SUnit *); + unsigned getSize() const; + unsigned length() const; + }; + struct ColorResult { + llvm::DenseMap ColorMap; + llvm::DenseMap SizeMap; + llvm::DenseMap LineageMap; + llvm::DenseMap> Conflicts; + llvm::DenseSet ShareColorLineages; + llvm::DenseSet HeadSet; + llvm::DenseSet TailSet; + llvm::DenseMap HeadTailMap; + unsigned maxReg = 0; + unsigned maxVGPR = 0; + unsigned maxSGPR = 0; + void colorSU(llvm::SUnit *SU, unsigned color); + unsigned getLineage(llvm::SUnit *SU) const; + bool isConflict(const llvm::SUnit *SU0, unsigned Lineage) const; + bool isHead(llvm::SUnit *SU) const; + bool isTail(llvm::SUnit *SU) const; + const llvm::SUnit *getTail(llvm::SUnit *SU) const; + unsigned getColor(const llvm::SUnit *SU) const; + unsigned getSize(const llvm::SUnit *SU) const; + }; + HRB(const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI) + : MRI(MRI), SIRI(SIRI) {} + + void buildLinear(std::vector &SUnits); + void buildConflict(); + void buildReachRelation(llvm::ArrayRef BotRoots); + llvm::DenseMap> &getReachMap() { + return ReachMap; + } + bool canReach(llvm::SUnit *a, llvm::SUnit *b); + void updateReachForEdge(llvm::SUnit *a, llvm::SUnit *b, + std::vector &SUnits); + void fusionLineages(std::vector &SUnits); + ColorResult &coloring(); + void dump(); + void dumpReachMap(); + +private: + Lineage buildChain(llvm::SUnit *Node, std::vector &SUnits); + llvm::SUnit *findHeir(llvm::SUnit *SU, std::vector &SUnits); + bool isConflict(const Lineage &a, const Lineage &b); + bool canFuse(const Lineage &a, const Lineage &b); + bool tryFuse(Lineage &a, Lineage &b, std::vector &SUnits); + unsigned colorLineages(std::vector &lineages, + llvm::DenseMap &AllocMap, + const unsigned Limit); + + llvm::DenseSet ChainedNodes; + llvm::DenseMap> ReachMap; + bool bRecomputeHeight = false; + std::vector Lineages; + ColorResult Color; + const llvm::MachineRegisterInfo &MRI; + const llvm::SIRegisterInfo *SIRI; +}; + +std::vector hrbSched(std::vector &SUnits, + std::vector &BRoots, + const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI); + +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index d0454cce15756..564c92239acdf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -517,6 +517,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUAtomicOptimizerPass(*PR); initializeAMDGPULowerKernelArgumentsPass(*PR); initializeAMDGPUPromoteKernelArgumentsPass(*PR); + initializeAMDGPUHotBlockRematerializePass(*PR); initializeAMDGPULowerKernelAttributesPass(*PR); initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(*PR); initializeAMDGPUPostLegalizerCombinerPass(*PR); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h new file mode 100644 index 0000000000000..c9172bae2cb4a --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h @@ -0,0 +1,106 @@ +//===-- AMDGPUVMemDegreeDAG.h - Build degree about VMem on DAG --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Build a degree (depth/height) for VMem instructions to help balance
+/// latency and pressure inside a block.
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+#include <vector>
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/ScheduleDAG.h" // For SUnit.
+
+namespace llvm {
+class MachineBasicBlock;
+class SUnit;
+class SIInstrInfo;
+class MachineInstr;
+
+class SimpleDAG {
+public:
+  SimpleDAG(llvm::MachineBasicBlock &MBB, const llvm::SIInstrInfo *TII)
+      : SIII(TII), MBB(MBB) {}
+  std::vector<llvm::SUnit> SUnits;
+  // InstrInfo.
+  const llvm::SIInstrInfo *SIII;
+  llvm::DenseMap<llvm::MachineInstr *, llvm::SUnit *> MISUnitMap;
+  llvm::DenseMap<llvm::SUnit *, llvm::MachineInstr *> SUnitMIMap;
+  llvm::MachineBasicBlock &MBB;
+  void build();
+
+private:
+  void initNodes();
+  void addDependence();
+  void addCtrlDep();
+};
+
+
+// Collect a height/depth for each node with respect to high-latency memory
+// loads: the height/depth only changes when a high-latency load is crossed.
+// We call this height/depth the VMem degree. The rule is that a sample and
+// its user should get different degrees.
+// For example
+//  a = sample   // a has depth 0, height 3
+//  b = sample a // b has depth 1, height 2
+//  c = sample b // c has depth 2, height 1
+//  user of c    // user of c has depth 2, height 0
+//
+// For in-block reorder/remat, nothing is moved or cloned across the block
+// boundary. So this could run after cross-block remat, or in the middle of
+// cross-block remat to help reach the target when moving values across blocks
+// alone cannot reach it. Reordering at the very beginning is also an option,
+// but no pressure information exists at that point; once pressure is known,
+// the max pressure might need to be updated.
+
+class VMemDegreeDAG {
+public:
+  VMemDegreeDAG(std::vector<llvm::SUnit> &Units,
+                const llvm::SIInstrInfo *TII)
+      : SUnits(Units), SIII(TII) {}
+  std::vector<llvm::SUnit> &SUnits;
+  // InstrInfo.
+  const llvm::SIInstrInfo *SIII;
+  void build();
+
+
+  bool isHighLatency(const llvm::SUnit *SU) const;
+  bool isHighLatency(const llvm::MachineInstr *MI) const;
+  // Height/depth based on long-latency instructions.
+  std::vector<unsigned> VMemDataHeight;
+  std::vector<unsigned> VMemDataDepth;
+  // Full height/depth count non-data dependences too.
+  std::vector<unsigned> VMemFullHeight;
+  std::vector<unsigned> VMemFullDepth;
+  llvm::SmallVector<llvm::SUnit *> VMemSUs;
+  llvm::SmallVector<llvm::SmallVector<llvm::SUnit *>, 16> GroupedVMemSUs;
+  llvm::SmallVector<llvm::SmallVector<llvm::SUnit *>, 16> GroupedVMemSUsByDepth;
+
+
+  void dump();
+
+private:
+  static constexpr unsigned kNoReg = -1;
+
+
+  std::pair<unsigned, unsigned>
+  buildVMemDepthHeight(std::vector<unsigned> &VMemHeight,
+                       std::vector<unsigned> &VMemDepth, bool bDataOnly);
+  // Compute vmem height/depth.
+  void buildVMemDepthHeight();
+  void buildVMemDataDepthHeight();
+  void groupVmemSUnits();
+
+};
+
+
+
+// Split block based on vmem depth.
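One way to realize the height/depth rule documented in the VMem degree comment above is the recurrence below: a node's depth grows by one exactly when the node itself is a high-latency def, and height mirrors that from the bottom of the block. This is a self-contained sketch with a hypothetical ToyInst type that reproduces the a/b/c/user example; it is not the pass's buildVMemDepthHeight implementation.

#include <algorithm>
#include <cstdio>
#include <vector>

// Hypothetical stand-in for an instruction node; illustration only.
struct ToyInst {
  bool IsHighLatency = false;
  std::vector<unsigned> Preds; // indexes of defining instructions
};

// Insts must be in program (topological) order: defs come before uses.
static void computeVMemDegree(const std::vector<ToyInst> &Insts,
                              std::vector<unsigned> &Depth,
                              std::vector<unsigned> &Height) {
  unsigned N = Insts.size();
  Depth.assign(N, 0);
  Height.assign(N, 0);
  // Depth: high-latency defs on the longest path ending at the node,
  // counting the node itself when it is high latency.
  for (unsigned I = 0; I < N; ++I)
    for (unsigned P : Insts[I].Preds)
      Depth[I] = std::max(Depth[I], Depth[P] + (Insts[I].IsHighLatency ? 1 : 0));
  // Height: the same idea measured upwards from the bottom of the block.
  for (unsigned I = N; I-- > 0;)
    for (unsigned P : Insts[I].Preds)
      Height[P] = std::max(Height[P], Height[I] + (Insts[P].IsHighLatency ? 1 : 0));
}

int main() {
  // a = sample; b = sample a; c = sample b; u = user of c (not high latency).
  std::vector<ToyInst> Insts(4);
  Insts[0].IsHighLatency = true;
  Insts[1] = {true, {0}};
  Insts[2] = {true, {1}};
  Insts[3] = {false, {2}};
  std::vector<unsigned> Depth, Height;
  computeVMemDegree(Insts, Depth, Height);
  // Prints depth 0,1,2,2 and height 3,2,1,0, matching the comment above.
  for (unsigned I = 0; I < Insts.size(); ++I)
    std::printf("depth %u height %u\n", Depth[I], Height[I]);
  return 0;
}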
+void buildVMemDepth(llvm::MachineBasicBlock &MBB, llvm::VMemDegreeDAG &dag); + +} + diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 09a3096602fc3..f089b210c8849 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -59,6 +59,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUFrameLowering.cpp AMDGPUGlobalISelDivergenceLowering.cpp AMDGPUGlobalISelUtils.cpp + AMDGPUHotBlockRematerialize.cpp AMDGPUHSAMetadataStreamer.cpp AMDGPUInsertDelayAlu.cpp AMDGPUInstCombineIntrinsic.cpp @@ -81,10 +82,14 @@ add_llvm_target(AMDGPUCodeGen AMDGPUMacroFusion.cpp AMDGPUMCInstLower.cpp AMDGPUMemoryUtils.cpp + AMDGPUMIRUtils.cpp + AMDGPUMirDivergenceAnalysis.cpp + AMDGPUMirSyncDependenceAnalysis.cpp AMDGPUIGroupLP.cpp AMDGPUMCResourceInfo.cpp AMDGPUMarkLastScratchLoad.cpp AMDGPUMIRFormatter.cpp + AMDGPUOccupancyAndLatencyHelper.cpp AMDGPUPerfHintAnalysis.cpp AMDGPUPostLegalizerCombiner.cpp AMDGPUPreLegalizerCombiner.cpp @@ -106,6 +111,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUSelectionDAGInfo.cpp AMDGPUSetWavePriority.cpp AMDGPUSplitModule.cpp + AMDGPUSubExpDag.cpp AMDGPUSubtarget.cpp AMDGPUTargetMachine.cpp AMDGPUTargetObjectFile.cpp diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 7554b9f578fcb..aa4b3f948b726 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -47,6 +47,10 @@ struct GCNRegPressure { void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); } + unsigned getMaxSGPR() const { + return std::max(getSGPRNum(), getSGPRTuplesWeight()); + } + /// \returns the SGPR32 pressure unsigned getSGPRNum() const { return Value[SGPR32]; } /// \returns the aggregated ArchVGPR32, AccVGPR32 pressure dependent upon \p diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 79ef1432d512a..3c467c098a65e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1332,6 +1332,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { bool isLowLatencyInstruction(const MachineInstr &MI) const; bool isHighLatencyDef(int Opc) const override; + bool isHighLatencyInstruction(const MachineInstr& MI) const { + return isHighLatencyDef(MI.getOpcode()); + } /// Return the descriptor of the target-specific machine instruction /// that corresponds to the specified pseudo or native opcode. 
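Before the MIR tests, one more sketch ties the scheduler and the coloring together: in the hrbSched loop earlier in this patch, a lineage's color is returned to the pool once its tail is scheduled (the freeColor/TailMap handling), which amounts to reusing a bounded set of colors over lineage live ranges. The toy below uses hypothetical names and the simplifying assumption that lineages are given as [head, tail] positions in the final schedule order, sorted by head; it illustrates that reuse and is not the actual HRB::coloring heuristic.

#include <cassert>
#include <cstdio>
#include <functional>
#include <queue>
#include <utility>
#include <vector>

static std::vector<unsigned>
colorToyLineages(const std::vector<std::pair<unsigned, unsigned>> &Lineages,
                 unsigned Limit) {
  std::vector<unsigned> Color(Lineages.size(), ~0u);
  // Min-heap of (tail position, color) for lineages whose color is in use.
  std::priority_queue<std::pair<unsigned, unsigned>,
                      std::vector<std::pair<unsigned, unsigned>>,
                      std::greater<>>
      Active;
  std::vector<unsigned> FreeColors;
  for (unsigned C = 0; C < Limit; ++C)
    FreeColors.push_back(C);

  // Lineages are assumed to be sorted by head position.
  for (unsigned I = 0; I < Lineages.size(); ++I) {
    unsigned Head = Lineages[I].first;
    unsigned Tail = Lineages[I].second;
    // A lineage whose tail is already behind us gives its color back.
    while (!Active.empty() && Active.top().first < Head) {
      FreeColors.push_back(Active.top().second);
      Active.pop();
    }
    assert(!FreeColors.empty() && "out of colors; a real allocator would spill");
    Color[I] = FreeColors.back();
    FreeColors.pop_back();
    Active.push({Tail, Color[I]});
  }
  return Color;
}

int main() {
  // Three lineages over a six-slot schedule; with Limit = 2 the color of the
  // first lineage is reused after its tail at position 2 has been passed.
  std::vector<std::pair<unsigned, unsigned>> L = {{0, 2}, {1, 5}, {3, 4}};
  for (unsigned C : colorToyLineages(L, 2))
    std::printf("color %u\n", C);
  return 0;
}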
diff --git a/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir b/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir new file mode 100644 index 0000000000000..e8a66b47ac732 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir @@ -0,0 +1,405 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-sub-exp-remat-aggressive -amdgpu-remat-enable-late-float-vtos -amdgpu-remat-enable-hot-block-remat-aggressive -amdgpu-remat-enable-sub-exp-remat-aggressive -amdgpu-remat-enable-sub-exp-remat | FileCheck %s + +# DEFS +# CHECK: %[[#div00:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni00:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div00]], implicit $exec +# CHECK: %[[#div01:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni01:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div01]], implicit $exec +# CHECK: %[[#div02:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni02:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div02]], implicit $exec +# CHECK: %[[#div03:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni03:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div03]], implicit $exec +# CHECK: %[[#div04:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni04:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div04]], implicit $exec +# CHECK: %[[#div05:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni05:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div05]], implicit $exec +# CHECK: %[[#div06:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni06:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div06]], implicit $exec +# CHECK: %[[#div07:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni07:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div07]], implicit $exec +# CHECK: %[[#div08:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni08:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div08]], implicit $exec +# CHECK: %[[#div09:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni09:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div09]], implicit $exec +# CHECK: %[[#div10:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni10:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div10]], implicit $exec +# CHECK: %[[#div11:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni11:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div11]], implicit $exec +# CHECK: %[[#div12:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni12:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div12]], implicit $exec +# CHECK: %[[#div13:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni13:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div13]], implicit $exec +# CHECK: %[[#div14:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni14:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div14]], implicit $exec +# CHECK: %[[#div15:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni15:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div15]], implicit $exec +# CHECK: %[[#div16:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni16:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div16]], implicit $exec +# CHECK: %[[#div17:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni17:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div17]], implicit $exec +# CHECK: %[[#div18:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni18:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div18]], implicit $exec +# CHECK: %[[#div19:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni19:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div19]], implicit $exec +# 
CHECK: %[[#div20:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni20:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div20]], implicit $exec +# CHECK: %[[#div21:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni21:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div21]], implicit $exec +# CHECK: %[[#div22:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni22:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div22]], implicit $exec +# CHECK: %[[#div23:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni23:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div23]], implicit $exec +# CHECK: %[[#div24:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni24:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div24]], implicit $exec +# CHECK: %[[#div25:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni25:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div25]], implicit $exec +# CHECK: %[[#div26:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni26:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div26]], implicit $exec +# CHECK: %[[#div27:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni27:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div27]], implicit $exec +# CHECK: %[[#div28:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni28:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div28]], implicit $exec +# CHECK: %[[#div29:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni29:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div29]], implicit $exec +# CHECK: %[[#div30:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni30:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div30]], implicit $exec +# CHECK: %[[#div31:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni31:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div31]], implicit $exec +# CHECK: %[[#div32:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni32:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div32]], implicit $exec +# CHECK: %[[#div33:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni33:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div33]], implicit $exec +# CHECK: %[[#div34:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni34:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div34]], implicit $exec +# CHECK: %[[#div35:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni35:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div35]], implicit $exec +# CHECK: %[[#div36:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni36:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div36]], implicit $exec +# CHECK: %[[#div37:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni37:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div37]], implicit $exec +# CHECK: %[[#div38:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni38:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div38]], implicit $exec +# CHECK: %[[#div39:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni39:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div39]], implicit $exec +# CHECK: %[[#div40:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni40:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div40]], implicit $exec +# CHECK: %[[#div41:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni41:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div41]], implicit $exec +# CHECK: %[[#div42:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni42:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div42]], implicit $exec +# CHECK: %[[#div43:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni43:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div43]], implicit $exec +# CHECK: %[[#div44:]]:vgpr_32 = V_MOV_B32_e32 0, 
implicit $exec +# CHECK: %[[#uni44:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div44]], implicit $exec +# CHECK: %[[#div45:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni45:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div45]], implicit $exec +# CHECK: %[[#div46:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni46:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div46]], implicit $exec +# CHECK: %[[#div47:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni47:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div47]], implicit $exec +# CHECK: %[[#div48:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni48:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div48]], implicit $exec +# CHECK: %[[#div49:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni49:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div49]], implicit $exec +# CHECK: %[[#div50:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni50:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div50]], implicit $exec +# CHECK: %[[#div51:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni51:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div51]], implicit $exec +# CHECK: %[[#div52:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni52:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div52]], implicit $exec +# CHECK: %[[#div53:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni53:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div53]], implicit $exec +# CHECK: %[[#div54:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni54:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div54]], implicit $exec +# CHECK: %[[#div55:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni55:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div55]], implicit $exec +# CHECK: %[[#div56:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni56:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div56]], implicit $exec +# CHECK: %[[#div57:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni57:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div57]], implicit $exec +# CHECK: %[[#div58:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni58:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div58]], implicit $exec +# CHECK: %[[#div59:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#uni59:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div59]], implicit $exec + + +# USERS: +# CHECK: %[[#div_00:]]:vgpr_32 = COPY %[[#uni00]] +#CHECK: EXP 0, %[[#div_00]], +# CHECK: %[[#div_01:]]:vgpr_32 = COPY %[[#uni01]] +#CHECK: EXP 0, %[[#div_01]], +# CHECK: %[[#div_02:]]:vgpr_32 = COPY %[[#uni02]] +#CHECK: EXP 0, %[[#div_02]], +# CHECK: %[[#div_03:]]:vgpr_32 = COPY %[[#uni03]] +#CHECK: EXP 0, %[[#div_03]], +# CHECK: %[[#div_04:]]:vgpr_32 = COPY %[[#uni04]] +#CHECK: EXP 0, %[[#div_04]], +# CHECK: %[[#div_05:]]:vgpr_32 = COPY %[[#uni05]] +#CHECK: EXP 0, %[[#div_05]], +# CHECK: %[[#div_06:]]:vgpr_32 = COPY %[[#uni06]] +#CHECK: EXP 0, %[[#div_06]], +# CHECK: %[[#div_07:]]:vgpr_32 = COPY %[[#uni07]] +#CHECK: EXP 0, %[[#div_07]], +# CHECK: %[[#div_08:]]:vgpr_32 = COPY %[[#uni08]] +#CHECK: EXP 0, %[[#div_08]], +# CHECK: %[[#div_09:]]:vgpr_32 = COPY %[[#uni09]] +#CHECK: EXP 0, %[[#div_09]], +# CHECK: %[[#div_10:]]:vgpr_32 = COPY %[[#uni10]] +#CHECK: EXP 0, %[[#div_10]], +# CHECK: %[[#div_11:]]:vgpr_32 = COPY %[[#uni11]] +#CHECK: EXP 0, %[[#div_11]], +# CHECK: %[[#div_12:]]:vgpr_32 = COPY %[[#uni12]] +#CHECK: EXP 0, %[[#div_12]], +# CHECK: %[[#div_13:]]:vgpr_32 = COPY %[[#uni13]] +#CHECK: EXP 0, %[[#div_13]], +# CHECK: %[[#div_14:]]:vgpr_32 = COPY %[[#uni14]] +#CHECK: EXP 0, %[[#div_14]], +# CHECK: %[[#div_15:]]:vgpr_32 = COPY 
%[[#uni15]] +#CHECK: EXP 0, %[[#div_15]], +# CHECK: %[[#div_16:]]:vgpr_32 = COPY %[[#uni16]] +#CHECK: EXP 0, %[[#div_16]], +# CHECK: %[[#div_17:]]:vgpr_32 = COPY %[[#uni17]] +#CHECK: EXP 0, %[[#div_17]], +# CHECK: %[[#div_18:]]:vgpr_32 = COPY %[[#uni18]] +#CHECK: EXP 0, %[[#div_18]], +# CHECK: %[[#div_19:]]:vgpr_32 = COPY %[[#uni19]] +#CHECK: EXP 0, %[[#div_19]], +# CHECK: %[[#div_20:]]:vgpr_32 = COPY %[[#uni20]] +#CHECK: EXP 0, %[[#div_20]], +# CHECK: %[[#div_21:]]:vgpr_32 = COPY %[[#uni21]] +#CHECK: EXP 0, %[[#div_21]], +# CHECK: %[[#div_22:]]:vgpr_32 = COPY %[[#uni22]] +#CHECK: EXP 0, %[[#div_22]], +# CHECK: %[[#div_23:]]:vgpr_32 = COPY %[[#uni23]] +#CHECK: EXP 0, %[[#div_23]], +# CHECK: %[[#div_24:]]:vgpr_32 = COPY %[[#uni24]] +#CHECK: EXP 0, %[[#div_24]], +# CHECK: %[[#div_25:]]:vgpr_32 = COPY %[[#uni25]] +#CHECK: EXP 0, %[[#div_25]], +# CHECK: %[[#div_26:]]:vgpr_32 = COPY %[[#uni26]] +#CHECK: EXP 0, %[[#div_26]], +# CHECK: %[[#div_27:]]:vgpr_32 = COPY %[[#uni27]] +#CHECK: EXP 0, %[[#div_27]], +# CHECK: %[[#div_28:]]:vgpr_32 = COPY %[[#uni28]] +#CHECK: EXP 0, %[[#div_28]], +# CHECK: %[[#div_29:]]:vgpr_32 = COPY %[[#uni29]] +#CHECK: EXP 0, %[[#div_29]], +# CHECK: %[[#div_30:]]:vgpr_32 = COPY %[[#uni30]] +#CHECK: EXP 0, %[[#div_30]], +# CHECK: %[[#div_31:]]:vgpr_32 = COPY %[[#uni31]] +#CHECK: EXP 0, %[[#div_31]], +# CHECK: %[[#div_32:]]:vgpr_32 = COPY %[[#uni32]] +#CHECK: EXP 0, %[[#div_32]], +# CHECK: %[[#div_33:]]:vgpr_32 = COPY %[[#uni33]] +#CHECK: EXP 0, %[[#div_33]], +# CHECK: %[[#div_34:]]:vgpr_32 = COPY %[[#uni34]] +#CHECK: EXP 0, %[[#div_34]], +# CHECK: %[[#div_35:]]:vgpr_32 = COPY %[[#uni35]] +#CHECK: EXP 0, %[[#div_35]], +# CHECK: %[[#div_36:]]:vgpr_32 = COPY %[[#uni36]] +#CHECK: EXP 0, %[[#div_36]], +# CHECK: %[[#div_37:]]:vgpr_32 = COPY %[[#uni37]] +#CHECK: EXP 0, %[[#div_37]], +# CHECK: %[[#div_38:]]:vgpr_32 = COPY %[[#uni38]] +#CHECK: EXP 0, %[[#div_38]], +# CHECK: %[[#div_39:]]:vgpr_32 = COPY %[[#uni39]] +#CHECK: EXP 0, %[[#div_39]], +# CHECK: %[[#div_40:]]:vgpr_32 = COPY %[[#uni40]] +#CHECK: EXP 0, %[[#div_40]], +# CHECK: %[[#div_41:]]:vgpr_32 = COPY %[[#uni41]] +#CHECK: EXP 0, %[[#div_41]], +# CHECK: %[[#div_42:]]:vgpr_32 = COPY %[[#uni42]] +#CHECK: EXP 0, %[[#div_42]], +# CHECK: %[[#div_43:]]:vgpr_32 = COPY %[[#uni43]] +#CHECK: EXP 0, %[[#div_43]], +# CHECK: %[[#div_44:]]:vgpr_32 = COPY %[[#uni44]] +#CHECK: EXP 0, %[[#div_44]], +# CHECK: %[[#div_45:]]:vgpr_32 = COPY %[[#uni45]] +#CHECK: EXP 0, %[[#div_45]], +# CHECK: %[[#div_46:]]:vgpr_32 = COPY %[[#uni46]] +#CHECK: EXP 0, %[[#div_46]], +# CHECK: %[[#div_47:]]:vgpr_32 = COPY %[[#uni47]] +#CHECK: EXP 0, %[[#div_47]], +# CHECK: %[[#div_48:]]:vgpr_32 = COPY %[[#uni48]] +#CHECK: EXP 0, %[[#div_48]], +# CHECK: %[[#div_49:]]:vgpr_32 = COPY %[[#uni49]] +#CHECK: EXP 0, %[[#div_49]], +# CHECK: %[[#div_50:]]:vgpr_32 = COPY %[[#uni50]] +#CHECK: EXP 0, %[[#div_50]], +# CHECK: %[[#div_51:]]:vgpr_32 = COPY %[[#uni51]] +#CHECK: EXP 0, %[[#div_51]], +# CHECK: %[[#div_52:]]:vgpr_32 = COPY %[[#uni52]] +#CHECK: EXP 0, %[[#div_52]], +# CHECK: %[[#div_53:]]:vgpr_32 = COPY %[[#uni53]] +#CHECK: EXP 0, %[[#div_53]], +# CHECK: %[[#div_54:]]:vgpr_32 = COPY %[[#uni54]] +#CHECK: EXP 0, %[[#div_54]], +# CHECK: %[[#div_55:]]:vgpr_32 = COPY %[[#uni55]] +#CHECK: EXP 0, %[[#div_55]], +# CHECK: %[[#div_56:]]:vgpr_32 = COPY %[[#uni56]] +#CHECK: EXP 0, %[[#div_56]], +# CHECK: %[[#div_57:]]:vgpr_32 = COPY %[[#uni57]] +#CHECK: EXP 0, %[[#div_57]], +# CHECK: %[[#div_58:]]:vgpr_32 = COPY %[[#uni58]] +#CHECK: EXP 0, %[[#div_58]], +# CHECK: 
%[[#div_59:]]:vgpr_32 = COPY %[[#uni59]] +#CHECK: EXP 0, %[[#div_59]], + + +--- | + source_filename = ".\main.ll" + define amdgpu_ps void @main() #1 { + ret void + } + attributes #1 = { "target-cpu"="gfx1010" } + !llvm.ident = !{!0} + !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"} +... +--- +name: main +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } + - { reg: '$sgpr1' } + - { reg: '$sgpr8' } + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: $sgpr0, $sgpr1, $sgpr8, $vgpr0, $vgpr1 + + %1000:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1001:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1002:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1003:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1004:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1005:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1006:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1007:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1008:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1009:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1010:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1011:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1012:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1013:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1014:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1015:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1016:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1017:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1018:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1019:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1020:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1021:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1022:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1023:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1024:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1025:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1026:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1027:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1028:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1029:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1030:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1031:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1032:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1033:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1034:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1035:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1036:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1037:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1038:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1039:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1040:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1041:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1042:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1043:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1044:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1045:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1046:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1047:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1048:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1049:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1050:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1051:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1052:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1053:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1054:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1055:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1056:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1057:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1058:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1059:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + 
%116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %1059, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %116:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + %99:vgpr_32 = COPY %1058 + S_BRANCH %bb.2 + + bb.2: + %1:vgpr_32 = IMPLICIT_DEF + EXP 0, killed %1000, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1001, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1002, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1003, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1004, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1005, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1006, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1007, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1008, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1009, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1010, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1011, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1012, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1013, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1014, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1015, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1016, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1017, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1018, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1019, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1020, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1021, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1022, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1023, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1024, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1025, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1026, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1027, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1028, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1029, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1030, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1031, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1032, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1033, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1034, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1035, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1036, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1037, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1038, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1039, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1040, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1041, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1042, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1043, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1044, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1045, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1046, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1047, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1048, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1049, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1050, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1051, %1, %1, %1, -1, -1, 15, 
implicit $exec + EXP 0, killed %1052, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1053, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1054, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1055, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1056, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1057, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1058, %1, %1, %1, -1, -1, 15, implicit $exec + EXP 0, killed %1059, %1, %1, %1, -1, -1, 15, implicit $exec + S_ENDPGM 0 +... From 77398423b044438b3f1a1306c140908b815e244b Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Thu, 6 Feb 2025 13:52:02 -0800 Subject: [PATCH 02/25] Fixed build, and added simple tests that exercise major code paths --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 8 +- .../test/CodeGen/AMDGPU/remat/group_remat.mir | 507 ++++++++++++++ .../AMDGPU/remat/group_remat_with_uses.mir | 641 ++++++++++++++++++ .../test/CodeGen/AMDGPU/remat/simple_sgpr.mir | 450 ++++++++++++ 4 files changed, 1603 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/remat/group_remat.mir create mode 100644 llvm/test/CodeGen/AMDGPU/remat/group_remat_with_uses.mir create mode 100644 llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index 44ebaa2d51bec..8647185bf5d51 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -291,7 +291,7 @@ unsigned CollectFnPressure( MachineFunction &MF, LiveIntervals *LIS, const MachineRegisterInfo &MRI, const GCNSubtarget *ST, unsigned &maxVPressure, unsigned &maxSPressure, RematStatus &status) { - unsigned TgtOcc = ST->getOccupancyWithLocalMemSize(MF); + unsigned TgtOcc = ST->getOccupancyWithWorkGroupSizes(MF).second; // If only have one block, input/ouput virtual live set are empty. if (MF.size() > 1) { // Build input output live reg first. @@ -1351,7 +1351,7 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, bool bForceRematSgpr = bSGPRSpill | status.bNotBalance; // If bound by lds, skip. - if (status.TargetOcc > ST->getOccupancyWithLocalMemSize(MF) && + if (status.TargetOcc > ST->getOccupancyWithWorkGroupSizes(MF).second && !bForceRematSgpr) return false; @@ -1663,6 +1663,8 @@ bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI, Register OpReg = Op.getReg(); if (IsImplicitUseOfReg(Op, AMDGPU::EXEC) || IsImplicitUseOfReg(Op, AMDGPU::EXEC_LO)) continue; + if (IsImplicitUseOfReg(Op, AMDGPU::MODE)) + continue; if (IsImplicitUseOfReg(Op, AMDGPU::M0) && isPhyRegUniqueDef(OpReg, MRI)) continue; // Alow unused scc define. @@ -4454,7 +4456,7 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveInt } // If bound by lds, skip. 
- if ((status.TargetOcc + 1) > ST->getOccupancyWithLocalMemSize(MF) && + if ((status.TargetOcc + 1) > ST->getOccupancyWithWorkGroupSizes(MF).second && !bSGPRSpill) return false; diff --git a/llvm/test/CodeGen/AMDGPU/remat/group_remat.mir b/llvm/test/CodeGen/AMDGPU/remat/group_remat.mir new file mode 100644 index 0000000000000..7f3483c66a5d9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/group_remat.mir @@ -0,0 +1,507 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-sub-exp-remat | FileCheck %s + +# Check that the whole expression gets moved to uses in bb.2. +# CHECK: bb.0: +# CHECK: %[[#r500:]]:vgpr_32 = V_MOV_B32_e32 $vgpr0 +# CHECK: %[[#r501:]]:vgpr_32 = V_MOV_B32_e32 $vgpr1 +# CHECK: bb.1: +# CHECK: bb.2: +# CHECK: %[[#r502:]]:vgpr_32 = V_MUL_F32_e32 %[[#r500]], %[[#r500]] +# CHECK: %[[#r503:]]:vgpr_32 = V_MUL_F32_e32 %[[#r500]], %[[#r501]] +# CHECK: %[[#r504:]]:vgpr_32 = V_MUL_F32_e32 %[[#r501]], %[[#r501]] +# CHECK: %[[#r505:]]:vgpr_32 = V_MUL_F32_e32 %[[#r502]], %[[#r502]] +# CHECK: %[[#r506:]]:vgpr_32 = V_MUL_F32_e32 %[[#r502]], %[[#r503]] +# CHECK: %[[#r507:]]:vgpr_32 = V_MUL_F32_e32 %[[#r503]], %[[#r503]] +# CHECK: %[[#r508:]]:vgpr_32 = V_MUL_F32_e32 %[[#r503]], %[[#r504]] +# CHECK: %[[#r509:]]:vgpr_32 = V_MUL_F32_e32 %[[#r504]], %[[#r504]] +# CHECK: %[[#r5010:]]:vgpr_32 = V_MUL_F32_e32 %[[#r505]], %[[#r505]] +# CHECK: %[[#r5011:]]:vgpr_32 = V_MUL_F32_e32 %[[#r505]], %[[#r506]] +# CHECK: %[[#r5012:]]:vgpr_32 = V_MUL_F32_e32 %[[#r506]], %[[#r506]] +# CHECK: %[[#r5013:]]:vgpr_32 = V_MUL_F32_e32 %[[#r506]], %[[#r507]] +# CHECK: %[[#r5014:]]:vgpr_32 = V_MUL_F32_e32 %[[#r507]], %[[#r507]] +# CHECK: %[[#r5015:]]:vgpr_32 = V_MUL_F32_e32 %[[#r507]], %[[#r508]] +# CHECK: %[[#r5016:]]:vgpr_32 = V_MUL_F32_e32 %[[#r508]], %[[#r508]] +# CHECK: %[[#r5017:]]:vgpr_32 = V_MUL_F32_e32 %[[#r508]], %[[#r509]] +# CHECK: %[[#r5018:]]:vgpr_32 = V_MUL_F32_e32 %[[#r509]], %[[#r509]] +# CHECK: %[[#r5019:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5010]], %[[#r5010]] +# CHECK: %[[#r5020:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5010]], %[[#r5011]] +# CHECK: %[[#r5021:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5011]], %[[#r5011]] +# CHECK: %[[#r5022:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5011]], %[[#r5012]] +# CHECK: %[[#r5023:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5012]], %[[#r5012]] +# CHECK: %[[#r5024:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5012]], %[[#r5013]] +# CHECK: %[[#r5025:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5013]], %[[#r5013]] +# CHECK: %[[#r5026:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5013]], %[[#r5014]] +# CHECK: %[[#r5027:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5014]], %[[#r5014]] +# CHECK: %[[#r5028:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5014]], %[[#r5015]] +# CHECK: %[[#r5029:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5015]], %[[#r5015]] +# CHECK: %[[#r5030:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5015]], %[[#r5016]] +# CHECK: %[[#r5031:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5016]], %[[#r5016]] +# CHECK: %[[#r5032:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5016]], %[[#r5017]] +# CHECK: %[[#r5033:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5017]], %[[#r5017]] +# CHECK: %[[#r5034:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5017]], %[[#r5018]] +# CHECK: %[[#r5035:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5018]], %[[#r5018]] +# CHECK: %[[#r5036:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5019]], %[[#r5019]] +# CHECK: %[[#r5037:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5019]], %[[#r5020]] +# CHECK: %[[#r5038:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5020]], %[[#r5020]] +# CHECK: %[[#r5039:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5020]], %[[#r5021]] +# CHECK: %[[#r5040:]]:vgpr_32 = 
V_MUL_F32_e32 %[[#r5021]], %[[#r5021]] +# CHECK: %[[#r5041:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5021]], %[[#r5022]] +# CHECK: %[[#r5042:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5022]], %[[#r5022]] +# CHECK: %[[#r5043:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5022]], %[[#r5023]] +# CHECK: %[[#r5044:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5023]], %[[#r5023]] +# CHECK: %[[#r5045:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5023]], %[[#r5024]] +# CHECK: %[[#r5046:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5024]], %[[#r5024]] +# CHECK: %[[#r5047:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5024]], %[[#r5025]] +# CHECK: %[[#r5048:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5025]], %[[#r5025]] +# CHECK: %[[#r5049:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5025]], %[[#r5026]] +# CHECK: %[[#r5050:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5026]], %[[#r5026]] +# CHECK: %[[#r5051:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5026]], %[[#r5027]] +# CHECK: %[[#r5052:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5027]], %[[#r5027]] +# CHECK: %[[#r5053:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5027]], %[[#r5028]] +# CHECK: %[[#r5054:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5028]], %[[#r5028]] +# CHECK: %[[#r5055:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5028]], %[[#r5029]] +# CHECK: %[[#r5056:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5029]], %[[#r5029]] +# CHECK: %[[#r5057:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5029]], %[[#r5030]] +# CHECK: %[[#r5058:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5030]], %[[#r5030]] +# CHECK: %[[#r5059:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5030]], %[[#r5031]] +# CHECK: %[[#r5060:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5031]], %[[#r5031]] +# CHECK: %[[#r5061:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5031]], %[[#r5032]] +# CHECK: %[[#r5062:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5032]], %[[#r5032]] +# CHECK: %[[#r5063:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5032]], %[[#r5033]] +# CHECK: %[[#r5064:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5033]], %[[#r5033]] +# CHECK: %[[#r5065:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5033]], %[[#r5034]] +# CHECK: %[[#r5066:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5034]], %[[#r5034]] +# CHECK: %[[#r5067:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5034]], %[[#r5035]] +# CHECK: %[[#r5068:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5035]], %[[#r5035]] +# CHECK: %[[#r5069:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5036]], %[[#r5036]] +# CHECK: %[[#r5070:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5036]], %[[#r5037]] +# CHECK: %[[#r5071:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5037]], %[[#r5037]] +# CHECK: %[[#r5072:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5037]], %[[#r5038]] +# CHECK: %[[#r5073:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5038]], %[[#r5038]] +# CHECK: %[[#r5074:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5038]], %[[#r5039]] +# CHECK: %[[#r5075:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5039]], %[[#r5039]] +# CHECK: %[[#r5076:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5039]], %[[#r5040]] +# CHECK: %[[#r5077:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5040]], %[[#r5040]] +# CHECK: %[[#r5078:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5040]], %[[#r5041]] +# CHECK: %[[#r5079:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5041]], %[[#r5041]] +# CHECK: %[[#r5080:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5041]], %[[#r5042]] +# CHECK: %[[#r5081:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5042]], %[[#r5042]] +# CHECK: %[[#r5082:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5042]], %[[#r5043]] +# CHECK: %[[#r5083:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5043]], %[[#r5043]] +# CHECK: %[[#r5084:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5043]], %[[#r5044]] +# CHECK: %[[#r5085:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5044]], %[[#r5044]] +# CHECK: %[[#r5086:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5044]], %[[#r5045]] +# CHECK: %[[#r5087:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5045]], %[[#r5045]] +# CHECK: %[[#r5088:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5045]], %[[#r5046]] +# CHECK: %[[#r5089:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5046]], 
%[[#r5046]] +# CHECK: %[[#r5090:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5046]], %[[#r5047]] +# CHECK: %[[#r5091:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5047]], %[[#r5047]] +# CHECK: %[[#r5092:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5047]], %[[#r5048]] +# CHECK: %[[#r5093:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5048]], %[[#r5048]] +# CHECK: %[[#r5094:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5048]], %[[#r5049]] +# CHECK: %[[#r5095:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5049]], %[[#r5049]] +# CHECK: %[[#r5096:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5049]], %[[#r5050]] +# CHECK: %[[#r5097:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5050]], %[[#r5050]] +# CHECK: %[[#r5098:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5050]], %[[#r5051]] +# CHECK: %[[#r5099:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5051]], %[[#r5051]] +# CHECK: %[[#r50100:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5051]], %[[#r5052]] +# CHECK: %[[#r50101:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5052]], %[[#r5052]] +# CHECK: %[[#r50102:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5052]], %[[#r5053]] +# CHECK: %[[#r50103:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5053]], %[[#r5053]] +# CHECK: %[[#r50104:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5053]], %[[#r5054]] +# CHECK: %[[#r50105:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5054]], %[[#r5054]] +# CHECK: %[[#r50106:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5054]], %[[#r5055]] +# CHECK: %[[#r50107:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5055]], %[[#r5055]] +# CHECK: %[[#r50108:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5055]], %[[#r5056]] +# CHECK: %[[#r50109:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5056]], %[[#r5056]] +# CHECK: %[[#r50110:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5056]], %[[#r5057]] +# CHECK: %[[#r50111:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5057]], %[[#r5057]] +# CHECK: %[[#r50112:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5057]], %[[#r5058]] +# CHECK: %[[#r50113:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5058]], %[[#r5058]] +# CHECK: %[[#r50114:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5058]], %[[#r5059]] +# CHECK: %[[#r50115:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5059]], %[[#r5059]] +# CHECK: %[[#r50116:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5059]], %[[#r5060]] +# CHECK: %[[#r50117:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5060]], %[[#r5060]] +# CHECK: %[[#r50118:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5060]], %[[#r5061]] +# CHECK: %[[#r50119:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5061]], %[[#r5061]] +# CHECK: %[[#r50120:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5061]], %[[#r5062]] +# CHECK: %[[#r50121:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5062]], %[[#r5062]] +# CHECK: %[[#r50122:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5062]], %[[#r5063]] +# CHECK: %[[#r50123:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5063]], %[[#r5063]] +# CHECK: %[[#r50124:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5063]], %[[#r5064]] +# CHECK: %[[#r50125:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5064]], %[[#r5064]] +# CHECK: %[[#r50126:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5064]], %[[#r5065]] +# CHECK: %[[#r50127:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5065]], %[[#r5065]] +# CHECK: %[[#r50128:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5065]], %[[#r5066]] +# CHECK: %[[#r50129:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5066]], %[[#r5066]] +# CHECK: %[[#r50130:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5066]], %[[#r5067]] +# CHECK: %[[#r50131:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5067]], %[[#r5067]] +# CHECK: %[[#r50132:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5067]], %[[#r5068]] +# CHECK: %[[#r50133:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5068]], %[[#r5068]] + + +--- | + source_filename = ".\main.ll" + define amdgpu_ps void @main() #1 { + ret void + } + attributes #1 = { "target-cpu"="gfx1010" } + !llvm.ident = !{!0} + !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"} +... 
+--- +name: main +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } + - { reg: '$sgpr1' } + - { reg: '$sgpr2' } + - { reg: '$sgpr3' } + - { reg: '$sgpr4' } + - { reg: '$sgpr5' } + - { reg: '$sgpr6' } + - { reg: '$sgpr7' } + - { reg: '$sgpr8' } + - { reg: '$sgpr8' } + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1 + + undef %0.sub0:sgpr_64 = COPY $sgpr0 + undef %0.sub1:sgpr_64 = COPY $sgpr1 + + undef %1.sub0:sgpr_128 = COPY $sgpr4 + undef %1.sub1:sgpr_128 = COPY $sgpr5 + undef %1.sub2:sgpr_128 = COPY $sgpr6 + undef %1.sub3:sgpr_128 = COPY $sgpr7 + + + %500:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %501:vgpr_32 = V_MOV_B32_e32 $vgpr1, implicit $exec + %502:vgpr_32 = V_MUL_F32_e32 %500, %500, implicit $exec, implicit $mode + %503:vgpr_32 = V_MUL_F32_e32 %500, %501, implicit $exec, implicit $mode + %504:vgpr_32 = V_MUL_F32_e32 %501, %501, implicit $exec, implicit $mode + %505:vgpr_32 = V_MUL_F32_e32 %502, %502, implicit $exec, implicit $mode + %506:vgpr_32 = V_MUL_F32_e32 %502, %503, implicit $exec, implicit $mode + %507:vgpr_32 = V_MUL_F32_e32 %503, %503, implicit $exec, implicit $mode + %508:vgpr_32 = V_MUL_F32_e32 %503, %504, implicit $exec, implicit $mode + %509:vgpr_32 = V_MUL_F32_e32 %504, %504, implicit $exec, implicit $mode + %5010:vgpr_32 = V_MUL_F32_e32 %505, %505, implicit $exec, implicit $mode + %5011:vgpr_32 = V_MUL_F32_e32 %505, %506, implicit $exec, implicit $mode + %5012:vgpr_32 = V_MUL_F32_e32 %506, %506, implicit $exec, implicit $mode + %5013:vgpr_32 = V_MUL_F32_e32 %506, %507, implicit $exec, implicit $mode + %5014:vgpr_32 = V_MUL_F32_e32 %507, %507, implicit $exec, implicit $mode + %5015:vgpr_32 = V_MUL_F32_e32 %507, %508, implicit $exec, implicit $mode + %5016:vgpr_32 = V_MUL_F32_e32 %508, %508, implicit $exec, implicit $mode + %5017:vgpr_32 = V_MUL_F32_e32 %508, %509, implicit $exec, implicit $mode + %5018:vgpr_32 = V_MUL_F32_e32 %509, %509, implicit $exec, implicit $mode + %5019:vgpr_32 = V_MUL_F32_e32 %5010, %5010, implicit $exec, implicit $mode + %5020:vgpr_32 = V_MUL_F32_e32 %5010, %5011, implicit $exec, implicit $mode + %5021:vgpr_32 = V_MUL_F32_e32 %5011, %5011, implicit $exec, implicit $mode + %5022:vgpr_32 = V_MUL_F32_e32 %5011, %5012, implicit $exec, implicit $mode + %5023:vgpr_32 = V_MUL_F32_e32 %5012, %5012, implicit $exec, implicit $mode + %5024:vgpr_32 = V_MUL_F32_e32 %5012, %5013, implicit $exec, implicit $mode + %5025:vgpr_32 = V_MUL_F32_e32 %5013, %5013, implicit $exec, implicit $mode + %5026:vgpr_32 = V_MUL_F32_e32 %5013, %5014, implicit $exec, implicit $mode + %5027:vgpr_32 = V_MUL_F32_e32 %5014, %5014, implicit $exec, implicit $mode + %5028:vgpr_32 = V_MUL_F32_e32 %5014, %5015, implicit $exec, implicit $mode + %5029:vgpr_32 = V_MUL_F32_e32 %5015, %5015, implicit $exec, implicit $mode + %5030:vgpr_32 = V_MUL_F32_e32 %5015, %5016, implicit $exec, implicit $mode + %5031:vgpr_32 = V_MUL_F32_e32 %5016, %5016, implicit $exec, implicit $mode + %5032:vgpr_32 = V_MUL_F32_e32 %5016, %5017, implicit $exec, implicit $mode + %5033:vgpr_32 = V_MUL_F32_e32 %5017, %5017, implicit $exec, implicit $mode + %5034:vgpr_32 = V_MUL_F32_e32 %5017, %5018, implicit $exec, implicit $mode + %5035:vgpr_32 = V_MUL_F32_e32 %5018, %5018, implicit $exec, implicit $mode + %5036:vgpr_32 = V_MUL_F32_e32 %5019, %5019, implicit $exec, implicit $mode + %5037:vgpr_32 = V_MUL_F32_e32 %5019, %5020, implicit $exec, implicit $mode + %5038:vgpr_32 = 
V_MUL_F32_e32 %5020, %5020, implicit $exec, implicit $mode + %5039:vgpr_32 = V_MUL_F32_e32 %5020, %5021, implicit $exec, implicit $mode + %5040:vgpr_32 = V_MUL_F32_e32 %5021, %5021, implicit $exec, implicit $mode + %5041:vgpr_32 = V_MUL_F32_e32 %5021, %5022, implicit $exec, implicit $mode + %5042:vgpr_32 = V_MUL_F32_e32 %5022, %5022, implicit $exec, implicit $mode + %5043:vgpr_32 = V_MUL_F32_e32 %5022, %5023, implicit $exec, implicit $mode + %5044:vgpr_32 = V_MUL_F32_e32 %5023, %5023, implicit $exec, implicit $mode + %5045:vgpr_32 = V_MUL_F32_e32 %5023, %5024, implicit $exec, implicit $mode + %5046:vgpr_32 = V_MUL_F32_e32 %5024, %5024, implicit $exec, implicit $mode + %5047:vgpr_32 = V_MUL_F32_e32 %5024, %5025, implicit $exec, implicit $mode + %5048:vgpr_32 = V_MUL_F32_e32 %5025, %5025, implicit $exec, implicit $mode + %5049:vgpr_32 = V_MUL_F32_e32 %5025, %5026, implicit $exec, implicit $mode + %5050:vgpr_32 = V_MUL_F32_e32 %5026, %5026, implicit $exec, implicit $mode + %5051:vgpr_32 = V_MUL_F32_e32 %5026, %5027, implicit $exec, implicit $mode + %5052:vgpr_32 = V_MUL_F32_e32 %5027, %5027, implicit $exec, implicit $mode + %5053:vgpr_32 = V_MUL_F32_e32 %5027, %5028, implicit $exec, implicit $mode + %5054:vgpr_32 = V_MUL_F32_e32 %5028, %5028, implicit $exec, implicit $mode + %5055:vgpr_32 = V_MUL_F32_e32 %5028, %5029, implicit $exec, implicit $mode + %5056:vgpr_32 = V_MUL_F32_e32 %5029, %5029, implicit $exec, implicit $mode + %5057:vgpr_32 = V_MUL_F32_e32 %5029, %5030, implicit $exec, implicit $mode + %5058:vgpr_32 = V_MUL_F32_e32 %5030, %5030, implicit $exec, implicit $mode + %5059:vgpr_32 = V_MUL_F32_e32 %5030, %5031, implicit $exec, implicit $mode + %5060:vgpr_32 = V_MUL_F32_e32 %5031, %5031, implicit $exec, implicit $mode + %5061:vgpr_32 = V_MUL_F32_e32 %5031, %5032, implicit $exec, implicit $mode + %5062:vgpr_32 = V_MUL_F32_e32 %5032, %5032, implicit $exec, implicit $mode + %5063:vgpr_32 = V_MUL_F32_e32 %5032, %5033, implicit $exec, implicit $mode + %5064:vgpr_32 = V_MUL_F32_e32 %5033, %5033, implicit $exec, implicit $mode + %5065:vgpr_32 = V_MUL_F32_e32 %5033, %5034, implicit $exec, implicit $mode + %5066:vgpr_32 = V_MUL_F32_e32 %5034, %5034, implicit $exec, implicit $mode + %5067:vgpr_32 = V_MUL_F32_e32 %5034, %5035, implicit $exec, implicit $mode + %5068:vgpr_32 = V_MUL_F32_e32 %5035, %5035, implicit $exec, implicit $mode + %5069:vgpr_32 = V_MUL_F32_e32 %5036, %5036, implicit $exec, implicit $mode + %5070:vgpr_32 = V_MUL_F32_e32 %5036, %5037, implicit $exec, implicit $mode + %5071:vgpr_32 = V_MUL_F32_e32 %5037, %5037, implicit $exec, implicit $mode + %5072:vgpr_32 = V_MUL_F32_e32 %5037, %5038, implicit $exec, implicit $mode + %5073:vgpr_32 = V_MUL_F32_e32 %5038, %5038, implicit $exec, implicit $mode + %5074:vgpr_32 = V_MUL_F32_e32 %5038, %5039, implicit $exec, implicit $mode + %5075:vgpr_32 = V_MUL_F32_e32 %5039, %5039, implicit $exec, implicit $mode + %5076:vgpr_32 = V_MUL_F32_e32 %5039, %5040, implicit $exec, implicit $mode + %5077:vgpr_32 = V_MUL_F32_e32 %5040, %5040, implicit $exec, implicit $mode + %5078:vgpr_32 = V_MUL_F32_e32 %5040, %5041, implicit $exec, implicit $mode + %5079:vgpr_32 = V_MUL_F32_e32 %5041, %5041, implicit $exec, implicit $mode + %5080:vgpr_32 = V_MUL_F32_e32 %5041, %5042, implicit $exec, implicit $mode + %5081:vgpr_32 = V_MUL_F32_e32 %5042, %5042, implicit $exec, implicit $mode + %5082:vgpr_32 = V_MUL_F32_e32 %5042, %5043, implicit $exec, implicit $mode + %5083:vgpr_32 = V_MUL_F32_e32 %5043, %5043, implicit $exec, implicit $mode + %5084:vgpr_32 = 
V_MUL_F32_e32 %5043, %5044, implicit $exec, implicit $mode + %5085:vgpr_32 = V_MUL_F32_e32 %5044, %5044, implicit $exec, implicit $mode + %5086:vgpr_32 = V_MUL_F32_e32 %5044, %5045, implicit $exec, implicit $mode + %5087:vgpr_32 = V_MUL_F32_e32 %5045, %5045, implicit $exec, implicit $mode + %5088:vgpr_32 = V_MUL_F32_e32 %5045, %5046, implicit $exec, implicit $mode + %5089:vgpr_32 = V_MUL_F32_e32 %5046, %5046, implicit $exec, implicit $mode + %5090:vgpr_32 = V_MUL_F32_e32 %5046, %5047, implicit $exec, implicit $mode + %5091:vgpr_32 = V_MUL_F32_e32 %5047, %5047, implicit $exec, implicit $mode + %5092:vgpr_32 = V_MUL_F32_e32 %5047, %5048, implicit $exec, implicit $mode + %5093:vgpr_32 = V_MUL_F32_e32 %5048, %5048, implicit $exec, implicit $mode + %5094:vgpr_32 = V_MUL_F32_e32 %5048, %5049, implicit $exec, implicit $mode + %5095:vgpr_32 = V_MUL_F32_e32 %5049, %5049, implicit $exec, implicit $mode + %5096:vgpr_32 = V_MUL_F32_e32 %5049, %5050, implicit $exec, implicit $mode + %5097:vgpr_32 = V_MUL_F32_e32 %5050, %5050, implicit $exec, implicit $mode + %5098:vgpr_32 = V_MUL_F32_e32 %5050, %5051, implicit $exec, implicit $mode + %5099:vgpr_32 = V_MUL_F32_e32 %5051, %5051, implicit $exec, implicit $mode + %50100:vgpr_32 = V_MUL_F32_e32 %5051, %5052, implicit $exec, implicit $mode + %50101:vgpr_32 = V_MUL_F32_e32 %5052, %5052, implicit $exec, implicit $mode + %50102:vgpr_32 = V_MUL_F32_e32 %5052, %5053, implicit $exec, implicit $mode + %50103:vgpr_32 = V_MUL_F32_e32 %5053, %5053, implicit $exec, implicit $mode + %50104:vgpr_32 = V_MUL_F32_e32 %5053, %5054, implicit $exec, implicit $mode + %50105:vgpr_32 = V_MUL_F32_e32 %5054, %5054, implicit $exec, implicit $mode + %50106:vgpr_32 = V_MUL_F32_e32 %5054, %5055, implicit $exec, implicit $mode + %50107:vgpr_32 = V_MUL_F32_e32 %5055, %5055, implicit $exec, implicit $mode + %50108:vgpr_32 = V_MUL_F32_e32 %5055, %5056, implicit $exec, implicit $mode + %50109:vgpr_32 = V_MUL_F32_e32 %5056, %5056, implicit $exec, implicit $mode + %50110:vgpr_32 = V_MUL_F32_e32 %5056, %5057, implicit $exec, implicit $mode + %50111:vgpr_32 = V_MUL_F32_e32 %5057, %5057, implicit $exec, implicit $mode + %50112:vgpr_32 = V_MUL_F32_e32 %5057, %5058, implicit $exec, implicit $mode + %50113:vgpr_32 = V_MUL_F32_e32 %5058, %5058, implicit $exec, implicit $mode + %50114:vgpr_32 = V_MUL_F32_e32 %5058, %5059, implicit $exec, implicit $mode + %50115:vgpr_32 = V_MUL_F32_e32 %5059, %5059, implicit $exec, implicit $mode + %50116:vgpr_32 = V_MUL_F32_e32 %5059, %5060, implicit $exec, implicit $mode + %50117:vgpr_32 = V_MUL_F32_e32 %5060, %5060, implicit $exec, implicit $mode + %50118:vgpr_32 = V_MUL_F32_e32 %5060, %5061, implicit $exec, implicit $mode + %50119:vgpr_32 = V_MUL_F32_e32 %5061, %5061, implicit $exec, implicit $mode + %50120:vgpr_32 = V_MUL_F32_e32 %5061, %5062, implicit $exec, implicit $mode + %50121:vgpr_32 = V_MUL_F32_e32 %5062, %5062, implicit $exec, implicit $mode + %50122:vgpr_32 = V_MUL_F32_e32 %5062, %5063, implicit $exec, implicit $mode + %50123:vgpr_32 = V_MUL_F32_e32 %5063, %5063, implicit $exec, implicit $mode + %50124:vgpr_32 = V_MUL_F32_e32 %5063, %5064, implicit $exec, implicit $mode + %50125:vgpr_32 = V_MUL_F32_e32 %5064, %5064, implicit $exec, implicit $mode + %50126:vgpr_32 = V_MUL_F32_e32 %5064, %5065, implicit $exec, implicit $mode + %50127:vgpr_32 = V_MUL_F32_e32 %5065, %5065, implicit $exec, implicit $mode + %50128:vgpr_32 = V_MUL_F32_e32 %5065, %5066, implicit $exec, implicit $mode + %50129:vgpr_32 = V_MUL_F32_e32 %5066, %5066, implicit $exec, implicit $mode 
+ %50130:vgpr_32 = V_MUL_F32_e32 %5066, %5067, implicit $exec, implicit $mode + %50131:vgpr_32 = V_MUL_F32_e32 %5067, %5067, implicit $exec, implicit $mode + %50132:vgpr_32 = V_MUL_F32_e32 %5067, %5068, implicit $exec, implicit $mode + %50133:vgpr_32 = V_MUL_F32_e32 %5068, %5068, implicit $exec, implicit $mode + + + %8000:vgpr_32 = IMPLICIT_DEF + %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %116:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + + %8001:vgpr_32 = COPY %8000 + %8002:vgpr_32 = COPY %8000 + %8003:vgpr_32 = COPY %8000 + %8004:vgpr_32 = COPY %8000 + %8005:vgpr_32 = COPY %8000 + %8006:vgpr_32 = COPY %8000 + %8007:vgpr_32 = COPY %8000 + %8008:vgpr_32 = COPY %8000 + %8009:vgpr_32 = COPY %8000 + %8010:vgpr_32 = COPY %8000 + %8011:vgpr_32 = COPY %8000 + %8012:vgpr_32 = COPY %8000 + %8013:vgpr_32 = COPY %8000 + %8014:vgpr_32 = COPY %8000 + %8015:vgpr_32 = COPY %8000 + %8016:vgpr_32 = COPY %8000 + %8017:vgpr_32 = COPY %8000 + + %9001:vgpr_32 = COPY %8001 + %9002:vgpr_32 = COPY %8002 + %9003:vgpr_32 = COPY %8003 + %9004:vgpr_32 = COPY %8004 + %9005:vgpr_32 = COPY %8005 + %9006:vgpr_32 = COPY %8006 + %9007:vgpr_32 = COPY %8007 + %9008:vgpr_32 = COPY %8008 + %9009:vgpr_32 = COPY %8009 + %9010:vgpr_32 = COPY %8010 + %9011:vgpr_32 = COPY %8011 + %9012:vgpr_32 = COPY %8012 + %9013:vgpr_32 = COPY %8013 + %9014:vgpr_32 = COPY %8014 + %9015:vgpr_32 = COPY %8015 + %9016:vgpr_32 = COPY %8016 + %9017:vgpr_32 = COPY %8017 + + S_BRANCH %bb.2 + + bb.2: + + %3:vgpr_32 = IMPLICIT_DEF + + EXP 0, killed %500, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %501, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %502, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %503, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %504, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %505, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %506, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %507, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %508, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %509, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5010, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5011, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5012, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5013, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5014, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5015, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5016, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5017, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5018, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5019, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5020, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5021, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5022, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5023, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5024, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5025, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5026, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5027, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5028, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5029, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5030, %3, %3, %3, -1, -1, 15, 
implicit $exec + EXP 0, killed %5031, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5032, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5033, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5034, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5035, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5036, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5037, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5038, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5039, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5040, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5041, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5042, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5043, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5044, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5045, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5046, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5047, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5048, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5049, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5050, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5051, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5052, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5053, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5054, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5055, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5056, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5057, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5058, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5059, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5060, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5061, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5062, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5063, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5064, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5065, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5066, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5067, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5068, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5069, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5070, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5071, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5072, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5073, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5074, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5075, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5076, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5077, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5078, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5079, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5080, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5081, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5082, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5083, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5084, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5085, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5086, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5087, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 
0, killed %5088, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5089, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5090, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5091, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5092, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5093, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5094, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5095, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5096, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5097, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5098, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5099, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50100, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50101, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50102, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50103, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50104, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50105, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50106, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50107, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50108, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50109, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50110, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50111, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50112, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50113, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50114, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50115, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50116, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50117, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50118, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50119, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50120, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50121, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50122, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50123, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50124, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50125, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50126, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50127, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50128, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50129, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50130, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50131, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50132, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50133, %3, %3, %3, -1, -1, 15, implicit $exec + + + S_ENDPGM 0 +... + \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/remat/group_remat_with_uses.mir b/llvm/test/CodeGen/AMDGPU/remat/group_remat_with_uses.mir new file mode 100644 index 0000000000000..637a683bdd041 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/group_remat_with_uses.mir @@ -0,0 +1,641 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-sub-exp-remat -amdgpu-remat-enable-sub-exp-remat-aggressive | FileCheck %s + +# Check that the whole expression gets CLONED to uses in bb.2. 
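+#
+# Note: the V_MUL_F32 chain feeds EXP uses in both bb.0 and bb.2, so simply
+# sinking the defs is not possible; with the aggressive sub-expression mode
+# the expectation is that the whole chain is cloned into bb.2 next to its
+# uses there, while the two V_MOV_B32 roots stay in bb.0. Hence the multiply
+# results (%[[#r502]] onwards) are checked again under the bb.2 label below.
+#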
+# CHECK: bb.0: +# CHECK: %[[#r500:]]:vgpr_32 = V_MOV_B32_e32 $vgpr0 +# CHECK: %[[#r501:]]:vgpr_32 = V_MOV_B32_e32 $vgpr1 +# CHECK: bb.1: +# CHECK: bb.2: +# CHECK: %[[#r502:]]:vgpr_32 = V_MUL_F32_e32 %[[#r500]], %[[#r500]] +# CHECK: %[[#r503:]]:vgpr_32 = V_MUL_F32_e32 %[[#r500]], %[[#r501]] +# CHECK: %[[#r504:]]:vgpr_32 = V_MUL_F32_e32 %[[#r501]], %[[#r501]] +# CHECK: %[[#r505:]]:vgpr_32 = V_MUL_F32_e32 %[[#r502]], %[[#r502]] +# CHECK: %[[#r506:]]:vgpr_32 = V_MUL_F32_e32 %[[#r502]], %[[#r503]] +# CHECK: %[[#r507:]]:vgpr_32 = V_MUL_F32_e32 %[[#r503]], %[[#r503]] +# CHECK: %[[#r508:]]:vgpr_32 = V_MUL_F32_e32 %[[#r503]], %[[#r504]] +# CHECK: %[[#r509:]]:vgpr_32 = V_MUL_F32_e32 %[[#r504]], %[[#r504]] +# CHECK: %[[#r5010:]]:vgpr_32 = V_MUL_F32_e32 %[[#r505]], %[[#r505]] +# CHECK: %[[#r5011:]]:vgpr_32 = V_MUL_F32_e32 %[[#r505]], %[[#r506]] +# CHECK: %[[#r5012:]]:vgpr_32 = V_MUL_F32_e32 %[[#r506]], %[[#r506]] +# CHECK: %[[#r5013:]]:vgpr_32 = V_MUL_F32_e32 %[[#r506]], %[[#r507]] +# CHECK: %[[#r5014:]]:vgpr_32 = V_MUL_F32_e32 %[[#r507]], %[[#r507]] +# CHECK: %[[#r5015:]]:vgpr_32 = V_MUL_F32_e32 %[[#r507]], %[[#r508]] +# CHECK: %[[#r5016:]]:vgpr_32 = V_MUL_F32_e32 %[[#r508]], %[[#r508]] +# CHECK: %[[#r5017:]]:vgpr_32 = V_MUL_F32_e32 %[[#r508]], %[[#r509]] +# CHECK: %[[#r5018:]]:vgpr_32 = V_MUL_F32_e32 %[[#r509]], %[[#r509]] +# CHECK: %[[#r5019:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5010]], %[[#r5010]] +# CHECK: %[[#r5020:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5010]], %[[#r5011]] +# CHECK: %[[#r5021:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5011]], %[[#r5011]] +# CHECK: %[[#r5022:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5011]], %[[#r5012]] +# CHECK: %[[#r5023:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5012]], %[[#r5012]] +# CHECK: %[[#r5024:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5012]], %[[#r5013]] +# CHECK: %[[#r5025:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5013]], %[[#r5013]] +# CHECK: %[[#r5026:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5013]], %[[#r5014]] +# CHECK: %[[#r5027:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5014]], %[[#r5014]] +# CHECK: %[[#r5028:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5014]], %[[#r5015]] +# CHECK: %[[#r5029:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5015]], %[[#r5015]] +# CHECK: %[[#r5030:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5015]], %[[#r5016]] +# CHECK: %[[#r5031:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5016]], %[[#r5016]] +# CHECK: %[[#r5032:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5016]], %[[#r5017]] +# CHECK: %[[#r5033:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5017]], %[[#r5017]] +# CHECK: %[[#r5034:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5017]], %[[#r5018]] +# CHECK: %[[#r5035:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5018]], %[[#r5018]] +# CHECK: %[[#r5036:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5019]], %[[#r5019]] +# CHECK: %[[#r5037:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5019]], %[[#r5020]] +# CHECK: %[[#r5038:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5020]], %[[#r5020]] +# CHECK: %[[#r5039:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5020]], %[[#r5021]] +# CHECK: %[[#r5040:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5021]], %[[#r5021]] +# CHECK: %[[#r5041:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5021]], %[[#r5022]] +# CHECK: %[[#r5042:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5022]], %[[#r5022]] +# CHECK: %[[#r5043:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5022]], %[[#r5023]] +# CHECK: %[[#r5044:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5023]], %[[#r5023]] +# CHECK: %[[#r5045:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5023]], %[[#r5024]] +# CHECK: %[[#r5046:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5024]], %[[#r5024]] +# CHECK: %[[#r5047:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5024]], %[[#r5025]] +# CHECK: %[[#r5048:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5025]], %[[#r5025]] +# CHECK: %[[#r5049:]]:vgpr_32 = V_MUL_F32_e32 
%[[#r5025]], %[[#r5026]] +# CHECK: %[[#r5050:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5026]], %[[#r5026]] +# CHECK: %[[#r5051:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5026]], %[[#r5027]] +# CHECK: %[[#r5052:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5027]], %[[#r5027]] +# CHECK: %[[#r5053:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5027]], %[[#r5028]] +# CHECK: %[[#r5054:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5028]], %[[#r5028]] +# CHECK: %[[#r5055:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5028]], %[[#r5029]] +# CHECK: %[[#r5056:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5029]], %[[#r5029]] +# CHECK: %[[#r5057:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5029]], %[[#r5030]] +# CHECK: %[[#r5058:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5030]], %[[#r5030]] +# CHECK: %[[#r5059:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5030]], %[[#r5031]] +# CHECK: %[[#r5060:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5031]], %[[#r5031]] +# CHECK: %[[#r5061:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5031]], %[[#r5032]] +# CHECK: %[[#r5062:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5032]], %[[#r5032]] +# CHECK: %[[#r5063:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5032]], %[[#r5033]] +# CHECK: %[[#r5064:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5033]], %[[#r5033]] +# CHECK: %[[#r5065:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5033]], %[[#r5034]] +# CHECK: %[[#r5066:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5034]], %[[#r5034]] +# CHECK: %[[#r5067:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5034]], %[[#r5035]] +# CHECK: %[[#r5068:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5035]], %[[#r5035]] +# CHECK: %[[#r5069:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5036]], %[[#r5036]] +# CHECK: %[[#r5070:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5036]], %[[#r5037]] +# CHECK: %[[#r5071:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5037]], %[[#r5037]] +# CHECK: %[[#r5072:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5037]], %[[#r5038]] +# CHECK: %[[#r5073:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5038]], %[[#r5038]] +# CHECK: %[[#r5074:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5038]], %[[#r5039]] +# CHECK: %[[#r5075:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5039]], %[[#r5039]] +# CHECK: %[[#r5076:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5039]], %[[#r5040]] +# CHECK: %[[#r5077:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5040]], %[[#r5040]] +# CHECK: %[[#r5078:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5040]], %[[#r5041]] +# CHECK: %[[#r5079:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5041]], %[[#r5041]] +# CHECK: %[[#r5080:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5041]], %[[#r5042]] +# CHECK: %[[#r5081:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5042]], %[[#r5042]] +# CHECK: %[[#r5082:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5042]], %[[#r5043]] +# CHECK: %[[#r5083:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5043]], %[[#r5043]] +# CHECK: %[[#r5084:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5043]], %[[#r5044]] +# CHECK: %[[#r5085:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5044]], %[[#r5044]] +# CHECK: %[[#r5086:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5044]], %[[#r5045]] +# CHECK: %[[#r5087:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5045]], %[[#r5045]] +# CHECK: %[[#r5088:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5045]], %[[#r5046]] +# CHECK: %[[#r5089:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5046]], %[[#r5046]] +# CHECK: %[[#r5090:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5046]], %[[#r5047]] +# CHECK: %[[#r5091:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5047]], %[[#r5047]] +# CHECK: %[[#r5092:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5047]], %[[#r5048]] +# CHECK: %[[#r5093:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5048]], %[[#r5048]] +# CHECK: %[[#r5094:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5048]], %[[#r5049]] +# CHECK: %[[#r5095:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5049]], %[[#r5049]] +# CHECK: %[[#r5096:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5049]], %[[#r5050]] +# CHECK: %[[#r5097:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5050]], %[[#r5050]] +# CHECK: %[[#r5098:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5050]], %[[#r5051]] 
+# CHECK: %[[#r5099:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5051]], %[[#r5051]] +# CHECK: %[[#r50100:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5051]], %[[#r5052]] +# CHECK: %[[#r50101:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5052]], %[[#r5052]] +# CHECK: %[[#r50102:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5052]], %[[#r5053]] +# CHECK: %[[#r50103:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5053]], %[[#r5053]] +# CHECK: %[[#r50104:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5053]], %[[#r5054]] +# CHECK: %[[#r50105:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5054]], %[[#r5054]] +# CHECK: %[[#r50106:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5054]], %[[#r5055]] +# CHECK: %[[#r50107:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5055]], %[[#r5055]] +# CHECK: %[[#r50108:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5055]], %[[#r5056]] +# CHECK: %[[#r50109:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5056]], %[[#r5056]] +# CHECK: %[[#r50110:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5056]], %[[#r5057]] +# CHECK: %[[#r50111:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5057]], %[[#r5057]] +# CHECK: %[[#r50112:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5057]], %[[#r5058]] +# CHECK: %[[#r50113:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5058]], %[[#r5058]] +# CHECK: %[[#r50114:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5058]], %[[#r5059]] +# CHECK: %[[#r50115:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5059]], %[[#r5059]] +# CHECK: %[[#r50116:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5059]], %[[#r5060]] +# CHECK: %[[#r50117:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5060]], %[[#r5060]] +# CHECK: %[[#r50118:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5060]], %[[#r5061]] +# CHECK: %[[#r50119:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5061]], %[[#r5061]] +# CHECK: %[[#r50120:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5061]], %[[#r5062]] +# CHECK: %[[#r50121:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5062]], %[[#r5062]] +# CHECK: %[[#r50122:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5062]], %[[#r5063]] +# CHECK: %[[#r50123:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5063]], %[[#r5063]] +# CHECK: %[[#r50124:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5063]], %[[#r5064]] +# CHECK: %[[#r50125:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5064]], %[[#r5064]] +# CHECK: %[[#r50126:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5064]], %[[#r5065]] +# CHECK: %[[#r50127:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5065]], %[[#r5065]] +# CHECK: %[[#r50128:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5065]], %[[#r5066]] +# CHECK: %[[#r50129:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5066]], %[[#r5066]] +# CHECK: %[[#r50130:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5066]], %[[#r5067]] +# CHECK: %[[#r50131:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5067]], %[[#r5067]] +# CHECK: %[[#r50132:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5067]], %[[#r5068]] +# CHECK: %[[#r50133:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5068]], %[[#r5068]] + + +--- | + source_filename = ".\main.ll" + define amdgpu_ps void @main() #1 { + ret void + } + attributes #1 = { "target-cpu"="gfx1010" } + !llvm.ident = !{!0} + !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"} +... 
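+# Note on the function below: bb.0 builds the full V_MUL_F32 chain from
+# $vgpr0/$vgpr1 and exports every value, bb.1 only shuffles copies of an
+# IMPLICIT_DEF, and bb.2 exports every chain value again with killed uses,
+# so without rematerialization the whole chain stays live across the branch.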
+--- +name: main +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } + - { reg: '$sgpr1' } + - { reg: '$sgpr2' } + - { reg: '$sgpr3' } + - { reg: '$sgpr4' } + - { reg: '$sgpr5' } + - { reg: '$sgpr6' } + - { reg: '$sgpr7' } + - { reg: '$sgpr8' } + - { reg: '$sgpr8' } + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1 + + undef %0.sub0:sgpr_64 = COPY $sgpr0 + undef %0.sub1:sgpr_64 = COPY $sgpr1 + + undef %1.sub0:sgpr_128 = COPY $sgpr4 + undef %1.sub1:sgpr_128 = COPY $sgpr5 + undef %1.sub2:sgpr_128 = COPY $sgpr6 + undef %1.sub3:sgpr_128 = COPY $sgpr7 + + + %500:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %501:vgpr_32 = V_MOV_B32_e32 $vgpr1, implicit $exec + %502:vgpr_32 = V_MUL_F32_e32 %500, %500, implicit $exec, implicit $mode + %503:vgpr_32 = V_MUL_F32_e32 %500, %501, implicit $exec, implicit $mode + %504:vgpr_32 = V_MUL_F32_e32 %501, %501, implicit $exec, implicit $mode + %505:vgpr_32 = V_MUL_F32_e32 %502, %502, implicit $exec, implicit $mode + %506:vgpr_32 = V_MUL_F32_e32 %502, %503, implicit $exec, implicit $mode + %507:vgpr_32 = V_MUL_F32_e32 %503, %503, implicit $exec, implicit $mode + %508:vgpr_32 = V_MUL_F32_e32 %503, %504, implicit $exec, implicit $mode + %509:vgpr_32 = V_MUL_F32_e32 %504, %504, implicit $exec, implicit $mode + %5010:vgpr_32 = V_MUL_F32_e32 %505, %505, implicit $exec, implicit $mode + %5011:vgpr_32 = V_MUL_F32_e32 %505, %506, implicit $exec, implicit $mode + %5012:vgpr_32 = V_MUL_F32_e32 %506, %506, implicit $exec, implicit $mode + %5013:vgpr_32 = V_MUL_F32_e32 %506, %507, implicit $exec, implicit $mode + %5014:vgpr_32 = V_MUL_F32_e32 %507, %507, implicit $exec, implicit $mode + %5015:vgpr_32 = V_MUL_F32_e32 %507, %508, implicit $exec, implicit $mode + %5016:vgpr_32 = V_MUL_F32_e32 %508, %508, implicit $exec, implicit $mode + %5017:vgpr_32 = V_MUL_F32_e32 %508, %509, implicit $exec, implicit $mode + %5018:vgpr_32 = V_MUL_F32_e32 %509, %509, implicit $exec, implicit $mode + %5019:vgpr_32 = V_MUL_F32_e32 %5010, %5010, implicit $exec, implicit $mode + %5020:vgpr_32 = V_MUL_F32_e32 %5010, %5011, implicit $exec, implicit $mode + %5021:vgpr_32 = V_MUL_F32_e32 %5011, %5011, implicit $exec, implicit $mode + %5022:vgpr_32 = V_MUL_F32_e32 %5011, %5012, implicit $exec, implicit $mode + %5023:vgpr_32 = V_MUL_F32_e32 %5012, %5012, implicit $exec, implicit $mode + %5024:vgpr_32 = V_MUL_F32_e32 %5012, %5013, implicit $exec, implicit $mode + %5025:vgpr_32 = V_MUL_F32_e32 %5013, %5013, implicit $exec, implicit $mode + %5026:vgpr_32 = V_MUL_F32_e32 %5013, %5014, implicit $exec, implicit $mode + %5027:vgpr_32 = V_MUL_F32_e32 %5014, %5014, implicit $exec, implicit $mode + %5028:vgpr_32 = V_MUL_F32_e32 %5014, %5015, implicit $exec, implicit $mode + %5029:vgpr_32 = V_MUL_F32_e32 %5015, %5015, implicit $exec, implicit $mode + %5030:vgpr_32 = V_MUL_F32_e32 %5015, %5016, implicit $exec, implicit $mode + %5031:vgpr_32 = V_MUL_F32_e32 %5016, %5016, implicit $exec, implicit $mode + %5032:vgpr_32 = V_MUL_F32_e32 %5016, %5017, implicit $exec, implicit $mode + %5033:vgpr_32 = V_MUL_F32_e32 %5017, %5017, implicit $exec, implicit $mode + %5034:vgpr_32 = V_MUL_F32_e32 %5017, %5018, implicit $exec, implicit $mode + %5035:vgpr_32 = V_MUL_F32_e32 %5018, %5018, implicit $exec, implicit $mode + %5036:vgpr_32 = V_MUL_F32_e32 %5019, %5019, implicit $exec, implicit $mode + %5037:vgpr_32 = V_MUL_F32_e32 %5019, %5020, implicit $exec, implicit $mode + %5038:vgpr_32 = 
V_MUL_F32_e32 %5020, %5020, implicit $exec, implicit $mode + %5039:vgpr_32 = V_MUL_F32_e32 %5020, %5021, implicit $exec, implicit $mode + %5040:vgpr_32 = V_MUL_F32_e32 %5021, %5021, implicit $exec, implicit $mode + %5041:vgpr_32 = V_MUL_F32_e32 %5021, %5022, implicit $exec, implicit $mode + %5042:vgpr_32 = V_MUL_F32_e32 %5022, %5022, implicit $exec, implicit $mode + %5043:vgpr_32 = V_MUL_F32_e32 %5022, %5023, implicit $exec, implicit $mode + %5044:vgpr_32 = V_MUL_F32_e32 %5023, %5023, implicit $exec, implicit $mode + %5045:vgpr_32 = V_MUL_F32_e32 %5023, %5024, implicit $exec, implicit $mode + %5046:vgpr_32 = V_MUL_F32_e32 %5024, %5024, implicit $exec, implicit $mode + %5047:vgpr_32 = V_MUL_F32_e32 %5024, %5025, implicit $exec, implicit $mode + %5048:vgpr_32 = V_MUL_F32_e32 %5025, %5025, implicit $exec, implicit $mode + %5049:vgpr_32 = V_MUL_F32_e32 %5025, %5026, implicit $exec, implicit $mode + %5050:vgpr_32 = V_MUL_F32_e32 %5026, %5026, implicit $exec, implicit $mode + %5051:vgpr_32 = V_MUL_F32_e32 %5026, %5027, implicit $exec, implicit $mode + %5052:vgpr_32 = V_MUL_F32_e32 %5027, %5027, implicit $exec, implicit $mode + %5053:vgpr_32 = V_MUL_F32_e32 %5027, %5028, implicit $exec, implicit $mode + %5054:vgpr_32 = V_MUL_F32_e32 %5028, %5028, implicit $exec, implicit $mode + %5055:vgpr_32 = V_MUL_F32_e32 %5028, %5029, implicit $exec, implicit $mode + %5056:vgpr_32 = V_MUL_F32_e32 %5029, %5029, implicit $exec, implicit $mode + %5057:vgpr_32 = V_MUL_F32_e32 %5029, %5030, implicit $exec, implicit $mode + %5058:vgpr_32 = V_MUL_F32_e32 %5030, %5030, implicit $exec, implicit $mode + %5059:vgpr_32 = V_MUL_F32_e32 %5030, %5031, implicit $exec, implicit $mode + %5060:vgpr_32 = V_MUL_F32_e32 %5031, %5031, implicit $exec, implicit $mode + %5061:vgpr_32 = V_MUL_F32_e32 %5031, %5032, implicit $exec, implicit $mode + %5062:vgpr_32 = V_MUL_F32_e32 %5032, %5032, implicit $exec, implicit $mode + %5063:vgpr_32 = V_MUL_F32_e32 %5032, %5033, implicit $exec, implicit $mode + %5064:vgpr_32 = V_MUL_F32_e32 %5033, %5033, implicit $exec, implicit $mode + %5065:vgpr_32 = V_MUL_F32_e32 %5033, %5034, implicit $exec, implicit $mode + %5066:vgpr_32 = V_MUL_F32_e32 %5034, %5034, implicit $exec, implicit $mode + %5067:vgpr_32 = V_MUL_F32_e32 %5034, %5035, implicit $exec, implicit $mode + %5068:vgpr_32 = V_MUL_F32_e32 %5035, %5035, implicit $exec, implicit $mode + %5069:vgpr_32 = V_MUL_F32_e32 %5036, %5036, implicit $exec, implicit $mode + %5070:vgpr_32 = V_MUL_F32_e32 %5036, %5037, implicit $exec, implicit $mode + %5071:vgpr_32 = V_MUL_F32_e32 %5037, %5037, implicit $exec, implicit $mode + %5072:vgpr_32 = V_MUL_F32_e32 %5037, %5038, implicit $exec, implicit $mode + %5073:vgpr_32 = V_MUL_F32_e32 %5038, %5038, implicit $exec, implicit $mode + %5074:vgpr_32 = V_MUL_F32_e32 %5038, %5039, implicit $exec, implicit $mode + %5075:vgpr_32 = V_MUL_F32_e32 %5039, %5039, implicit $exec, implicit $mode + %5076:vgpr_32 = V_MUL_F32_e32 %5039, %5040, implicit $exec, implicit $mode + %5077:vgpr_32 = V_MUL_F32_e32 %5040, %5040, implicit $exec, implicit $mode + %5078:vgpr_32 = V_MUL_F32_e32 %5040, %5041, implicit $exec, implicit $mode + %5079:vgpr_32 = V_MUL_F32_e32 %5041, %5041, implicit $exec, implicit $mode + %5080:vgpr_32 = V_MUL_F32_e32 %5041, %5042, implicit $exec, implicit $mode + %5081:vgpr_32 = V_MUL_F32_e32 %5042, %5042, implicit $exec, implicit $mode + %5082:vgpr_32 = V_MUL_F32_e32 %5042, %5043, implicit $exec, implicit $mode + %5083:vgpr_32 = V_MUL_F32_e32 %5043, %5043, implicit $exec, implicit $mode + %5084:vgpr_32 = 
V_MUL_F32_e32 %5043, %5044, implicit $exec, implicit $mode + %5085:vgpr_32 = V_MUL_F32_e32 %5044, %5044, implicit $exec, implicit $mode + %5086:vgpr_32 = V_MUL_F32_e32 %5044, %5045, implicit $exec, implicit $mode + %5087:vgpr_32 = V_MUL_F32_e32 %5045, %5045, implicit $exec, implicit $mode + %5088:vgpr_32 = V_MUL_F32_e32 %5045, %5046, implicit $exec, implicit $mode + %5089:vgpr_32 = V_MUL_F32_e32 %5046, %5046, implicit $exec, implicit $mode + %5090:vgpr_32 = V_MUL_F32_e32 %5046, %5047, implicit $exec, implicit $mode + %5091:vgpr_32 = V_MUL_F32_e32 %5047, %5047, implicit $exec, implicit $mode + %5092:vgpr_32 = V_MUL_F32_e32 %5047, %5048, implicit $exec, implicit $mode + %5093:vgpr_32 = V_MUL_F32_e32 %5048, %5048, implicit $exec, implicit $mode + %5094:vgpr_32 = V_MUL_F32_e32 %5048, %5049, implicit $exec, implicit $mode + %5095:vgpr_32 = V_MUL_F32_e32 %5049, %5049, implicit $exec, implicit $mode + %5096:vgpr_32 = V_MUL_F32_e32 %5049, %5050, implicit $exec, implicit $mode + %5097:vgpr_32 = V_MUL_F32_e32 %5050, %5050, implicit $exec, implicit $mode + %5098:vgpr_32 = V_MUL_F32_e32 %5050, %5051, implicit $exec, implicit $mode + %5099:vgpr_32 = V_MUL_F32_e32 %5051, %5051, implicit $exec, implicit $mode + %50100:vgpr_32 = V_MUL_F32_e32 %5051, %5052, implicit $exec, implicit $mode + %50101:vgpr_32 = V_MUL_F32_e32 %5052, %5052, implicit $exec, implicit $mode + %50102:vgpr_32 = V_MUL_F32_e32 %5052, %5053, implicit $exec, implicit $mode + %50103:vgpr_32 = V_MUL_F32_e32 %5053, %5053, implicit $exec, implicit $mode + %50104:vgpr_32 = V_MUL_F32_e32 %5053, %5054, implicit $exec, implicit $mode + %50105:vgpr_32 = V_MUL_F32_e32 %5054, %5054, implicit $exec, implicit $mode + %50106:vgpr_32 = V_MUL_F32_e32 %5054, %5055, implicit $exec, implicit $mode + %50107:vgpr_32 = V_MUL_F32_e32 %5055, %5055, implicit $exec, implicit $mode + %50108:vgpr_32 = V_MUL_F32_e32 %5055, %5056, implicit $exec, implicit $mode + %50109:vgpr_32 = V_MUL_F32_e32 %5056, %5056, implicit $exec, implicit $mode + %50110:vgpr_32 = V_MUL_F32_e32 %5056, %5057, implicit $exec, implicit $mode + %50111:vgpr_32 = V_MUL_F32_e32 %5057, %5057, implicit $exec, implicit $mode + %50112:vgpr_32 = V_MUL_F32_e32 %5057, %5058, implicit $exec, implicit $mode + %50113:vgpr_32 = V_MUL_F32_e32 %5058, %5058, implicit $exec, implicit $mode + %50114:vgpr_32 = V_MUL_F32_e32 %5058, %5059, implicit $exec, implicit $mode + %50115:vgpr_32 = V_MUL_F32_e32 %5059, %5059, implicit $exec, implicit $mode + %50116:vgpr_32 = V_MUL_F32_e32 %5059, %5060, implicit $exec, implicit $mode + %50117:vgpr_32 = V_MUL_F32_e32 %5060, %5060, implicit $exec, implicit $mode + %50118:vgpr_32 = V_MUL_F32_e32 %5060, %5061, implicit $exec, implicit $mode + %50119:vgpr_32 = V_MUL_F32_e32 %5061, %5061, implicit $exec, implicit $mode + %50120:vgpr_32 = V_MUL_F32_e32 %5061, %5062, implicit $exec, implicit $mode + %50121:vgpr_32 = V_MUL_F32_e32 %5062, %5062, implicit $exec, implicit $mode + %50122:vgpr_32 = V_MUL_F32_e32 %5062, %5063, implicit $exec, implicit $mode + %50123:vgpr_32 = V_MUL_F32_e32 %5063, %5063, implicit $exec, implicit $mode + %50124:vgpr_32 = V_MUL_F32_e32 %5063, %5064, implicit $exec, implicit $mode + %50125:vgpr_32 = V_MUL_F32_e32 %5064, %5064, implicit $exec, implicit $mode + %50126:vgpr_32 = V_MUL_F32_e32 %5064, %5065, implicit $exec, implicit $mode + %50127:vgpr_32 = V_MUL_F32_e32 %5065, %5065, implicit $exec, implicit $mode + %50128:vgpr_32 = V_MUL_F32_e32 %5065, %5066, implicit $exec, implicit $mode + %50129:vgpr_32 = V_MUL_F32_e32 %5066, %5066, implicit $exec, implicit $mode 
+ %50130:vgpr_32 = V_MUL_F32_e32 %5066, %5067, implicit $exec, implicit $mode + %50131:vgpr_32 = V_MUL_F32_e32 %5067, %5067, implicit $exec, implicit $mode + %50132:vgpr_32 = V_MUL_F32_e32 %5067, %5068, implicit $exec, implicit $mode + %50133:vgpr_32 = V_MUL_F32_e32 %5068, %5068, implicit $exec, implicit $mode + EXP 0, %500, %500, %500, %500, -1, -1, 15, implicit $exec + EXP 0, %501, %501, %501, %501, -1, -1, 15, implicit $exec + EXP 0, %502, %502, %502, %502, -1, -1, 15, implicit $exec + EXP 0, %503, %503, %503, %503, -1, -1, 15, implicit $exec + EXP 0, %504, %504, %504, %504, -1, -1, 15, implicit $exec + EXP 0, %505, %505, %505, %505, -1, -1, 15, implicit $exec + EXP 0, %506, %506, %506, %506, -1, -1, 15, implicit $exec + EXP 0, %507, %507, %507, %507, -1, -1, 15, implicit $exec + EXP 0, %508, %508, %508, %508, -1, -1, 15, implicit $exec + EXP 0, %509, %509, %509, %509, -1, -1, 15, implicit $exec + EXP 0, %5010, %5010, %5010, %5010, -1, -1, 15, implicit $exec + EXP 0, %5011, %5011, %5011, %5011, -1, -1, 15, implicit $exec + EXP 0, %5012, %5012, %5012, %5012, -1, -1, 15, implicit $exec + EXP 0, %5013, %5013, %5013, %5013, -1, -1, 15, implicit $exec + EXP 0, %5014, %5014, %5014, %5014, -1, -1, 15, implicit $exec + EXP 0, %5015, %5015, %5015, %5015, -1, -1, 15, implicit $exec + EXP 0, %5016, %5016, %5016, %5016, -1, -1, 15, implicit $exec + EXP 0, %5017, %5017, %5017, %5017, -1, -1, 15, implicit $exec + EXP 0, %5018, %5018, %5018, %5018, -1, -1, 15, implicit $exec + EXP 0, %5019, %5019, %5019, %5019, -1, -1, 15, implicit $exec + EXP 0, %5020, %5020, %5020, %5020, -1, -1, 15, implicit $exec + EXP 0, %5021, %5021, %5021, %5021, -1, -1, 15, implicit $exec + EXP 0, %5022, %5022, %5022, %5022, -1, -1, 15, implicit $exec + EXP 0, %5023, %5023, %5023, %5023, -1, -1, 15, implicit $exec + EXP 0, %5024, %5024, %5024, %5024, -1, -1, 15, implicit $exec + EXP 0, %5025, %5025, %5025, %5025, -1, -1, 15, implicit $exec + EXP 0, %5026, %5026, %5026, %5026, -1, -1, 15, implicit $exec + EXP 0, %5027, %5027, %5027, %5027, -1, -1, 15, implicit $exec + EXP 0, %5028, %5028, %5028, %5028, -1, -1, 15, implicit $exec + EXP 0, %5029, %5029, %5029, %5029, -1, -1, 15, implicit $exec + EXP 0, %5030, %5030, %5030, %5030, -1, -1, 15, implicit $exec + EXP 0, %5031, %5031, %5031, %5031, -1, -1, 15, implicit $exec + EXP 0, %5032, %5032, %5032, %5032, -1, -1, 15, implicit $exec + EXP 0, %5033, %5033, %5033, %5033, -1, -1, 15, implicit $exec + EXP 0, %5034, %5034, %5034, %5034, -1, -1, 15, implicit $exec + EXP 0, %5035, %5035, %5035, %5035, -1, -1, 15, implicit $exec + EXP 0, %5036, %5036, %5036, %5036, -1, -1, 15, implicit $exec + EXP 0, %5037, %5037, %5037, %5037, -1, -1, 15, implicit $exec + EXP 0, %5038, %5038, %5038, %5038, -1, -1, 15, implicit $exec + EXP 0, %5039, %5039, %5039, %5039, -1, -1, 15, implicit $exec + EXP 0, %5040, %5040, %5040, %5040, -1, -1, 15, implicit $exec + EXP 0, %5041, %5041, %5041, %5041, -1, -1, 15, implicit $exec + EXP 0, %5042, %5042, %5042, %5042, -1, -1, 15, implicit $exec + EXP 0, %5043, %5043, %5043, %5043, -1, -1, 15, implicit $exec + EXP 0, %5044, %5044, %5044, %5044, -1, -1, 15, implicit $exec + EXP 0, %5045, %5045, %5045, %5045, -1, -1, 15, implicit $exec + EXP 0, %5046, %5046, %5046, %5046, -1, -1, 15, implicit $exec + EXP 0, %5047, %5047, %5047, %5047, -1, -1, 15, implicit $exec + EXP 0, %5048, %5048, %5048, %5048, -1, -1, 15, implicit $exec + EXP 0, %5049, %5049, %5049, %5049, -1, -1, 15, implicit $exec + EXP 0, %5050, %5050, %5050, %5050, -1, -1, 15, implicit $exec + EXP 0, %5051, 
%5051, %5051, %5051, -1, -1, 15, implicit $exec + EXP 0, %5052, %5052, %5052, %5052, -1, -1, 15, implicit $exec + EXP 0, %5053, %5053, %5053, %5053, -1, -1, 15, implicit $exec + EXP 0, %5054, %5054, %5054, %5054, -1, -1, 15, implicit $exec + EXP 0, %5055, %5055, %5055, %5055, -1, -1, 15, implicit $exec + EXP 0, %5056, %5056, %5056, %5056, -1, -1, 15, implicit $exec + EXP 0, %5057, %5057, %5057, %5057, -1, -1, 15, implicit $exec + EXP 0, %5058, %5058, %5058, %5058, -1, -1, 15, implicit $exec + EXP 0, %5059, %5059, %5059, %5059, -1, -1, 15, implicit $exec + EXP 0, %5060, %5060, %5060, %5060, -1, -1, 15, implicit $exec + EXP 0, %5061, %5061, %5061, %5061, -1, -1, 15, implicit $exec + EXP 0, %5062, %5062, %5062, %5062, -1, -1, 15, implicit $exec + EXP 0, %5063, %5063, %5063, %5063, -1, -1, 15, implicit $exec + EXP 0, %5064, %5064, %5064, %5064, -1, -1, 15, implicit $exec + EXP 0, %5065, %5065, %5065, %5065, -1, -1, 15, implicit $exec + EXP 0, %5066, %5066, %5066, %5066, -1, -1, 15, implicit $exec + EXP 0, %5067, %5067, %5067, %5067, -1, -1, 15, implicit $exec + EXP 0, %5068, %5068, %5068, %5068, -1, -1, 15, implicit $exec + EXP 0, %5069, %5069, %5069, %5069, -1, -1, 15, implicit $exec + EXP 0, %5070, %5070, %5070, %5070, -1, -1, 15, implicit $exec + EXP 0, %5071, %5071, %5071, %5071, -1, -1, 15, implicit $exec + EXP 0, %5072, %5072, %5072, %5072, -1, -1, 15, implicit $exec + EXP 0, %5073, %5073, %5073, %5073, -1, -1, 15, implicit $exec + EXP 0, %5074, %5074, %5074, %5074, -1, -1, 15, implicit $exec + EXP 0, %5075, %5075, %5075, %5075, -1, -1, 15, implicit $exec + EXP 0, %5076, %5076, %5076, %5076, -1, -1, 15, implicit $exec + EXP 0, %5077, %5077, %5077, %5077, -1, -1, 15, implicit $exec + EXP 0, %5078, %5078, %5078, %5078, -1, -1, 15, implicit $exec + EXP 0, %5079, %5079, %5079, %5079, -1, -1, 15, implicit $exec + EXP 0, %5080, %5080, %5080, %5080, -1, -1, 15, implicit $exec + EXP 0, %5081, %5081, %5081, %5081, -1, -1, 15, implicit $exec + EXP 0, %5082, %5082, %5082, %5082, -1, -1, 15, implicit $exec + EXP 0, %5083, %5083, %5083, %5083, -1, -1, 15, implicit $exec + EXP 0, %5084, %5084, %5084, %5084, -1, -1, 15, implicit $exec + EXP 0, %5085, %5085, %5085, %5085, -1, -1, 15, implicit $exec + EXP 0, %5086, %5086, %5086, %5086, -1, -1, 15, implicit $exec + EXP 0, %5087, %5087, %5087, %5087, -1, -1, 15, implicit $exec + EXP 0, %5088, %5088, %5088, %5088, -1, -1, 15, implicit $exec + EXP 0, %5089, %5089, %5089, %5089, -1, -1, 15, implicit $exec + EXP 0, %5090, %5090, %5090, %5090, -1, -1, 15, implicit $exec + EXP 0, %5091, %5091, %5091, %5091, -1, -1, 15, implicit $exec + EXP 0, %5092, %5092, %5092, %5092, -1, -1, 15, implicit $exec + EXP 0, %5093, %5093, %5093, %5093, -1, -1, 15, implicit $exec + EXP 0, %5094, %5094, %5094, %5094, -1, -1, 15, implicit $exec + EXP 0, %5095, %5095, %5095, %5095, -1, -1, 15, implicit $exec + EXP 0, %5096, %5096, %5096, %5096, -1, -1, 15, implicit $exec + EXP 0, %5097, %5097, %5097, %5097, -1, -1, 15, implicit $exec + EXP 0, %5098, %5098, %5098, %5098, -1, -1, 15, implicit $exec + EXP 0, %5099, %5099, %5099, %5099, -1, -1, 15, implicit $exec + EXP 0, %50100, %50100, %50100, %50100, -1, -1, 15, implicit $exec + EXP 0, %50101, %50101, %50101, %50101, -1, -1, 15, implicit $exec + EXP 0, %50102, %50102, %50102, %50102, -1, -1, 15, implicit $exec + EXP 0, %50103, %50103, %50103, %50103, -1, -1, 15, implicit $exec + EXP 0, %50104, %50104, %50104, %50104, -1, -1, 15, implicit $exec + EXP 0, %50105, %50105, %50105, %50105, -1, -1, 15, implicit $exec + EXP 0, %50106, %50106, 
%50106, %50106, -1, -1, 15, implicit $exec + EXP 0, %50107, %50107, %50107, %50107, -1, -1, 15, implicit $exec + EXP 0, %50108, %50108, %50108, %50108, -1, -1, 15, implicit $exec + EXP 0, %50109, %50109, %50109, %50109, -1, -1, 15, implicit $exec + EXP 0, %50110, %50110, %50110, %50110, -1, -1, 15, implicit $exec + EXP 0, %50111, %50111, %50111, %50111, -1, -1, 15, implicit $exec + EXP 0, %50112, %50112, %50112, %50112, -1, -1, 15, implicit $exec + EXP 0, %50113, %50113, %50113, %50113, -1, -1, 15, implicit $exec + EXP 0, %50114, %50114, %50114, %50114, -1, -1, 15, implicit $exec + EXP 0, %50115, %50115, %50115, %50115, -1, -1, 15, implicit $exec + EXP 0, %50116, %50116, %50116, %50116, -1, -1, 15, implicit $exec + EXP 0, %50117, %50117, %50117, %50117, -1, -1, 15, implicit $exec + EXP 0, %50118, %50118, %50118, %50118, -1, -1, 15, implicit $exec + EXP 0, %50119, %50119, %50119, %50119, -1, -1, 15, implicit $exec + EXP 0, %50120, %50120, %50120, %50120, -1, -1, 15, implicit $exec + EXP 0, %50121, %50121, %50121, %50121, -1, -1, 15, implicit $exec + EXP 0, %50122, %50122, %50122, %50122, -1, -1, 15, implicit $exec + EXP 0, %50123, %50123, %50123, %50123, -1, -1, 15, implicit $exec + EXP 0, %50124, %50124, %50124, %50124, -1, -1, 15, implicit $exec + EXP 0, %50125, %50125, %50125, %50125, -1, -1, 15, implicit $exec + EXP 0, %50126, %50126, %50126, %50126, -1, -1, 15, implicit $exec + EXP 0, %50127, %50127, %50127, %50127, -1, -1, 15, implicit $exec + EXP 0, %50128, %50128, %50128, %50128, -1, -1, 15, implicit $exec + EXP 0, %50129, %50129, %50129, %50129, -1, -1, 15, implicit $exec + EXP 0, %50130, %50130, %50130, %50130, -1, -1, 15, implicit $exec + EXP 0, %50131, %50131, %50131, %50131, -1, -1, 15, implicit $exec + EXP 0, %50132, %50132, %50132, %50132, -1, -1, 15, implicit $exec + EXP 0, %50133, %50133, %50133, %50133, -1, -1, 15, implicit $exec + + + %8000:vgpr_32 = IMPLICIT_DEF + %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %116:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + + %8001:vgpr_32 = COPY %8000 + %8002:vgpr_32 = COPY %8000 + %8003:vgpr_32 = COPY %8000 + %8004:vgpr_32 = COPY %8000 + %8005:vgpr_32 = COPY %8000 + %8006:vgpr_32 = COPY %8000 + %8007:vgpr_32 = COPY %8000 + %8008:vgpr_32 = COPY %8000 + %8009:vgpr_32 = COPY %8000 + %8010:vgpr_32 = COPY %8000 + %8011:vgpr_32 = COPY %8000 + %8012:vgpr_32 = COPY %8000 + %8013:vgpr_32 = COPY %8000 + %8014:vgpr_32 = COPY %8000 + %8015:vgpr_32 = COPY %8000 + %8016:vgpr_32 = COPY %8000 + %8017:vgpr_32 = COPY %8000 + + %9001:vgpr_32 = COPY %8001 + %9002:vgpr_32 = COPY %8002 + %9003:vgpr_32 = COPY %8003 + %9004:vgpr_32 = COPY %8004 + %9005:vgpr_32 = COPY %8005 + %9006:vgpr_32 = COPY %8006 + %9007:vgpr_32 = COPY %8007 + %9008:vgpr_32 = COPY %8008 + %9009:vgpr_32 = COPY %8009 + %9010:vgpr_32 = COPY %8010 + %9011:vgpr_32 = COPY %8011 + %9012:vgpr_32 = COPY %8012 + %9013:vgpr_32 = COPY %8013 + %9014:vgpr_32 = COPY %8014 + %9015:vgpr_32 = COPY %8015 + %9016:vgpr_32 = COPY %8016 + %9017:vgpr_32 = COPY %8017 + + S_BRANCH %bb.2 + + bb.2: + + %3:vgpr_32 = IMPLICIT_DEF + + EXP 0, killed %500, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %501, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %502, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %503, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %504, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %505, 
%3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %506, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %507, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %508, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %509, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5010, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5011, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5012, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5013, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5014, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5015, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5016, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5017, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5018, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5019, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5020, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5021, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5022, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5023, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5024, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5025, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5026, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5027, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5028, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5029, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5030, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5031, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5032, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5033, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5034, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5035, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5036, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5037, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5038, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5039, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5040, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5041, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5042, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5043, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5044, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5045, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5046, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5047, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5048, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5049, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5050, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5051, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5052, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5053, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5054, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5055, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5056, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5057, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5058, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5059, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5060, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5061, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5062, %3, %3, %3, -1, -1, 15, 
implicit $exec + EXP 0, killed %5063, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5064, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5065, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5066, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5067, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5068, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5069, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5070, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5071, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5072, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5073, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5074, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5075, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5076, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5077, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5078, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5079, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5080, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5081, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5082, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5083, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5084, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5085, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5086, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5087, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5088, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5089, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5090, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5091, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5092, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5093, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5094, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5095, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5096, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5097, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5098, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %5099, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50100, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50101, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50102, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50103, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50104, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50105, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50106, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50107, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50108, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50109, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50110, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50111, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50112, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50113, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50114, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50115, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50116, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50117, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50118, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50119, %3, %3, %3, -1, -1, 15, 
implicit $exec + EXP 0, killed %50120, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50121, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50122, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50123, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50124, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50125, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50126, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50127, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50128, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50129, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50130, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50131, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50132, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %50133, %3, %3, %3, -1, -1, 15, implicit $exec + + + S_ENDPGM 0 +... + \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir new file mode 100644 index 0000000000000..bc2c97f91f46c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir @@ -0,0 +1,450 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat | FileCheck %s + +# Check that the loads have been moved to the use +# CHECK: bb.2: +# CHECK: %[[#reg0:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 0, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg0]], %{{.+}}, 0, 0 +# CHECK: %[[#reg1:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 16, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg1]], %{{.+}}, 16, 0 +# CHECK: %[[#reg2:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 32, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg2]], %{{.+}}, 32, 0 +# CHECK: %[[#reg3:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 48, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg3]], %{{.+}}, 48, 0 +# CHECK: %[[#reg4:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 64, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg4]], %{{.+}}, 64, 0 +# CHECK: %[[#reg5:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 80, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg5]], %{{.+}}, 80, 0 +# CHECK: %[[#reg6:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 96, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg6]], %{{.+}}, 96, 0 +# CHECK: %[[#reg7:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 112, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg7]], %{{.+}}, 112, 0 +# CHECK: %[[#reg8:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 128, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg8]], %{{.+}}, 128, 0 +# CHECK: %[[#reg9:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 144, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg9]], %{{.+}}, 144, 0 +# CHECK: %[[#reg10:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 160, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg10]], %{{.+}}, 160, 0 +# CHECK: %[[#reg11:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 176, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg11]], %{{.+}}, 176, 0 +# CHECK: %[[#reg12:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 192, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg12]], %{{.+}}, 192, 0 +# CHECK: %[[#reg13:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 208, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg13]], %{{.+}}, 208, 0 +# CHECK: %[[#reg14:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 224, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg14]], %{{.+}}, 224, 0 +# CHECK: %[[#reg15:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 240, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg15]], %{{.+}}, 240, 0 +# CHECK: 
%[[#reg16:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 256, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg16]], %{{.+}}, 256, 0 +# CHECK: %[[#reg17:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 272, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg17]], %{{.+}}, 272, 0 +# CHECK: %[[#reg18:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 288, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg18]], %{{.+}}, 288, 0 +# CHECK: %[[#reg19:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 304, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg19]], %{{.+}}, 304, 0 +# CHECK: %[[#reg20:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 320, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg20]], %{{.+}}, 320, 0 +# CHECK: %[[#reg21:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 336, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg21]], %{{.+}}, 336, 0 +# CHECK: %[[#reg22:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 352, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg22]], %{{.+}}, 352, 0 +# CHECK: %[[#reg23:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 368, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg23]], %{{.+}}, 368, 0 +# CHECK: %[[#reg24:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 384, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg24]], %{{.+}}, 384, 0 +# CHECK: %[[#reg25:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 400, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg25]], %{{.+}}, 400, 0 +# CHECK: %[[#reg26:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 416, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg26]], %{{.+}}, 416, 0 +# CHECK: %[[#reg27:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 432, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg27]], %{{.+}}, 432, 0 +# CHECK: %[[#reg28:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 448, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg28]], %{{.+}}, 448, 0 +# CHECK: %[[#reg29:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 464, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg29]], %{{.+}}, 464, 0 +# CHECK: %[[#reg30:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 480, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg30]], %{{.+}}, 480, 0 +# CHECK: %[[#reg31:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 496, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg31]], %{{.+}}, 496, 0 +# CHECK: %[[#reg32:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 512, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg32]], %{{.+}}, 512, 0 +# CHECK: %[[#reg33:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 528, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg33]], %{{.+}}, 528, 0 +# CHECK: %[[#reg34:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 544, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg34]], %{{.+}}, 544, 0 +# CHECK: %[[#reg35:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 560, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg35]], %{{.+}}, 560, 0 +# CHECK: %[[#reg36:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 576, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg36]], %{{.+}}, 576, 0 +# CHECK: %[[#reg37:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 592, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg37]], %{{.+}}, 592, 0 +# CHECK: %[[#reg38:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 608, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg38]], %{{.+}}, 608, 0 +# CHECK: %[[#reg39:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 624, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg39]], %{{.+}}, 624, 0 +# CHECK: %[[#reg40:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 640, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg40]], %{{.+}}, 640, 0 +# CHECK: %[[#reg41:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 656, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg41]], %{{.+}}, 656, 0 +# CHECK: %[[#reg42:]]:sgpr_128 = S_LOAD_DWORDX4_IMM 
%{{.+}}, 672, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg42]], %{{.+}}, 672, 0 +# CHECK: %[[#reg43:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 688, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg43]], %{{.+}}, 688, 0 +# CHECK: %[[#reg44:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 704, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg44]], %{{.+}}, 704, 0 +# CHECK: %[[#reg45:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 720, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg45]], %{{.+}}, 720, 0 +# CHECK: %[[#reg46:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 736, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg46]], %{{.+}}, 736, 0 +# CHECK: %[[#reg47:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 752, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg47]], %{{.+}}, 752, 0 +# CHECK: %[[#reg48:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 768, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg48]], %{{.+}}, 768, 0 +# CHECK: %[[#reg49:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 784, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg49]], %{{.+}}, 784, 0 +# CHECK: %[[#reg50:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 800, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg50]], %{{.+}}, 800, 0 +# CHECK: %[[#reg51:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 816, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg51]], %{{.+}}, 816, 0 +# CHECK: %[[#reg52:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 832, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg52]], %{{.+}}, 832, 0 +# CHECK: %[[#reg53:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 848, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg53]], %{{.+}}, 848, 0 +# CHECK: %[[#reg54:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 864, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg54]], %{{.+}}, 864, 0 +# CHECK: %[[#reg55:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 880, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg55]], %{{.+}}, 880, 0 +# CHECK: %[[#reg56:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 896, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg56]], %{{.+}}, 896, 0 +# CHECK: %[[#reg57:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 912, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg57]], %{{.+}}, 912, 0 +# CHECK: %[[#reg58:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 928, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg58]], %{{.+}}, 928, 0 +# CHECK: %[[#reg59:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 944, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg59]], %{{.+}}, 944, 0 +# CHECK: %[[#reg60:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 960, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg60]], %{{.+}}, 960, 0 +# CHECK: %[[#reg61:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 976, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg61]], %{{.+}}, 976, 0 +# CHECK: %[[#reg62:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 992, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg62]], %{{.+}}, 992, 0 +# CHECK: %[[#reg63:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 1008, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg63]], %{{.+}}, 1008, 0 + + +--- | + source_filename = ".\main.ll" + define amdgpu_ps void @main() #1 { + ret void + } + attributes #1 = { "target-cpu"="gfx1010" } + !llvm.ident = !{!0} + !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"} +... 
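+# Note on the function below: bb.0 issues 64 S_LOAD_DWORDX4_IMM loads at
+# offsets 0..1008 together with a run of V_MOV_B32 defs, and bb.2 consumes
+# each loaded value with an S_BUFFER_STORE_DWORDX4_IMM at the matching
+# offset, so the pass is expected to move each load down next to its store,
+# as the checks above require.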
+--- +name: main +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } + - { reg: '$sgpr1' } + - { reg: '$sgpr2' } + - { reg: '$sgpr3' } + - { reg: '$sgpr4' } + - { reg: '$sgpr5' } + - { reg: '$sgpr6' } + - { reg: '$sgpr7' } + - { reg: '$sgpr8' } + - { reg: '$sgpr8' } + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1 + + undef %0.sub0:sgpr_64 = COPY $sgpr0 + undef %0.sub1:sgpr_64 = COPY $sgpr1 + + undef %1.sub0:sgpr_128 = COPY $sgpr4 + undef %1.sub1:sgpr_128 = COPY $sgpr5 + undef %1.sub2:sgpr_128 = COPY $sgpr6 + undef %1.sub3:sgpr_128 = COPY $sgpr7 + + %3000:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 0, 0 + %3001:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 16, 0 + %3002:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 32, 0 + %3003:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 48, 0 + %3004:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 64, 0 + %3005:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 80, 0 + %3006:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 96, 0 + %3007:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 112, 0 + %3008:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 128, 0 + %3009:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 144, 0 + %30010:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 160, 0 + %30011:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 176, 0 + %30012:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 192, 0 + %30013:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 208, 0 + %30014:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 224, 0 + %30015:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 240, 0 + %30016:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 256, 0 + %30017:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 272, 0 + %30018:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 288, 0 + %30019:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 304, 0 + %30020:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 320, 0 + %30021:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 336, 0 + %30022:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 352, 0 + %30023:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 368, 0 + %30024:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 384, 0 + %30025:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 400, 0 + %30026:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 416, 0 + %30027:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 432, 0 + %30028:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 448, 0 + %30029:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 464, 0 + %30030:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 480, 0 + %30031:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 496, 0 + %30032:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 512, 0 + %30033:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 528, 0 + %30034:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 544, 0 + %30035:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 560, 0 + %30036:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 576, 0 + %30037:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 592, 0 + %30038:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 608, 0 + %30039:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 624, 0 + %30040:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 640, 0 + %30041:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 656, 0 + %30042:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 672, 0 + %30043:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 688, 0 + %30044:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 704, 0 + %30045:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 720, 0 + %30046:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 736, 0 + %30047:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 752, 0 + %30048:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 768, 0 + %30049:sgpr_128 = 
S_LOAD_DWORDX4_IMM %0:sgpr_64, 784, 0 + %30050:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 800, 0 + %30051:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 816, 0 + %30052:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 832, 0 + %30053:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 848, 0 + %30054:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 864, 0 + %30055:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 880, 0 + %30056:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 896, 0 + %30057:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 912, 0 + %30058:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 928, 0 + %30059:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 944, 0 + %30060:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 960, 0 + %30061:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 976, 0 + %30062:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 992, 0 + %30063:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 1008, 0 + + %100:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %101:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %102:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %103:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %104:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %105:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %106:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %107:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %108:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %109:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1010:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1011:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1012:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1013:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1014:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1015:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1016:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1017:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1018:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1019:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1020:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1021:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1022:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1023:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1024:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1025:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1026:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1027:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1028:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1029:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1030:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1031:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1032:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1033:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1034:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1035:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1036:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1037:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1038:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1039:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1040:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1041:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1042:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1043:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1044:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1045:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1046:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1047:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1048:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1049:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1050:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1051:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1052:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1053:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + 
%1054:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1055:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1056:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1057:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1058:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1059:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1060:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1061:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1062:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1063:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + + + %8000:vgpr_32 = IMPLICIT_DEF + %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %116:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + %8001:vgpr_32 = COPY %8000 + S_BRANCH %bb.2 + + bb.2: + + %3:vgpr_32 = IMPLICIT_DEF + S_BUFFER_STORE_DWORDX4_IMM killed %3000:sgpr_128, %1:sgpr_128, 0, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3001:sgpr_128, %1:sgpr_128, 16, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3002:sgpr_128, %1:sgpr_128, 32, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3003:sgpr_128, %1:sgpr_128, 48, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3004:sgpr_128, %1:sgpr_128, 64, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3005:sgpr_128, %1:sgpr_128, 80, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3006:sgpr_128, %1:sgpr_128, 96, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3007:sgpr_128, %1:sgpr_128, 112, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3008:sgpr_128, %1:sgpr_128, 128, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %3009:sgpr_128, %1:sgpr_128, 144, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30010:sgpr_128, %1:sgpr_128, 160, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30011:sgpr_128, %1:sgpr_128, 176, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30012:sgpr_128, %1:sgpr_128, 192, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30013:sgpr_128, %1:sgpr_128, 208, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30014:sgpr_128, %1:sgpr_128, 224, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30015:sgpr_128, %1:sgpr_128, 240, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30016:sgpr_128, %1:sgpr_128, 256, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30017:sgpr_128, %1:sgpr_128, 272, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30018:sgpr_128, %1:sgpr_128, 288, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30019:sgpr_128, %1:sgpr_128, 304, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30020:sgpr_128, %1:sgpr_128, 320, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30021:sgpr_128, %1:sgpr_128, 336, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30022:sgpr_128, %1:sgpr_128, 352, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30023:sgpr_128, %1:sgpr_128, 368, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30024:sgpr_128, %1:sgpr_128, 384, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30025:sgpr_128, %1:sgpr_128, 400, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30026:sgpr_128, %1:sgpr_128, 416, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30027:sgpr_128, %1:sgpr_128, 432, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30028:sgpr_128, %1:sgpr_128, 448, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30029:sgpr_128, %1:sgpr_128, 464, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30030:sgpr_128, %1:sgpr_128, 480, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30031:sgpr_128, %1:sgpr_128, 496, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30032:sgpr_128, %1:sgpr_128, 512, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30033:sgpr_128, %1:sgpr_128, 528, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30034:sgpr_128, %1:sgpr_128, 544, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30035:sgpr_128, %1:sgpr_128, 560, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30036:sgpr_128, %1:sgpr_128, 
576, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30037:sgpr_128, %1:sgpr_128, 592, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30038:sgpr_128, %1:sgpr_128, 608, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30039:sgpr_128, %1:sgpr_128, 624, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30040:sgpr_128, %1:sgpr_128, 640, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30041:sgpr_128, %1:sgpr_128, 656, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30042:sgpr_128, %1:sgpr_128, 672, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30043:sgpr_128, %1:sgpr_128, 688, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30044:sgpr_128, %1:sgpr_128, 704, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30045:sgpr_128, %1:sgpr_128, 720, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30046:sgpr_128, %1:sgpr_128, 736, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30047:sgpr_128, %1:sgpr_128, 752, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30048:sgpr_128, %1:sgpr_128, 768, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30049:sgpr_128, %1:sgpr_128, 784, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30050:sgpr_128, %1:sgpr_128, 800, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30051:sgpr_128, %1:sgpr_128, 816, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30052:sgpr_128, %1:sgpr_128, 832, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30053:sgpr_128, %1:sgpr_128, 848, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30054:sgpr_128, %1:sgpr_128, 864, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30055:sgpr_128, %1:sgpr_128, 880, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30056:sgpr_128, %1:sgpr_128, 896, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30057:sgpr_128, %1:sgpr_128, 912, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30058:sgpr_128, %1:sgpr_128, 928, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30059:sgpr_128, %1:sgpr_128, 944, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30060:sgpr_128, %1:sgpr_128, 960, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30061:sgpr_128, %1:sgpr_128, 976, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30062:sgpr_128, %1:sgpr_128, 992, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30063:sgpr_128, %1:sgpr_128, 1008, 0 + + EXP 0, killed %100, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %101, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %102, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %103, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %104, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %105, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %106, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %107, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %108, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %109, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1010, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1011, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1012, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1013, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1014, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1015, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1016, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1017, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1018, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1019, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1020, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1021, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1022, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1023, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1024, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1025, %3, %3, 
%3, -1, -1, 15, implicit $exec + EXP 0, killed %1026, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1027, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1028, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1029, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1030, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1031, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1032, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1033, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1034, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1035, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1036, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1037, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1038, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1039, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1040, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1041, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1042, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1043, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1044, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1045, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1046, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1047, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1048, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1049, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1050, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1051, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1052, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1053, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1054, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1055, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1056, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1057, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1058, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1059, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1060, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1061, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1062, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1063, %3, %3, %3, -1, -1, 15, implicit $exec + + + S_ENDPGM 0 +... 
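[Reviewer note, not part of the patch] The test above exercises the hot-block rematerialize pass on a long run of scalar loads whose only users are buffer stores much later in the function; the CHECK lines verify that each S_LOAD_DWORDX4_IMM ends up immediately before its matching S_BUFFER_STORE_DWORDX4_IMM, so the sixty-four sgpr_128 values no longer stay live across the branch. For readers following along, the sketch below shows the general shape of a "sink a single-use, trivially rematerializable def to its user" transform using stock CodeGen APIs. It is a minimal illustration under simplifying assumptions: the helper name sinkSingleUseRemats and the single-use-only handling are hypothetical and are not the logic in AMDGPUHotBlockRematerialize.cpp, which additionally models occupancy, register pressure, and divergence.

  // Illustrative sketch only; assumes SSA virtual registers and ignores
  // pressure/occupancy heuristics that the real pass uses to pick candidates.
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/CodeGen/MachineRegisterInfo.h"
  #include "llvm/CodeGen/TargetInstrInfo.h"
  #include "llvm/CodeGen/TargetSubtargetInfo.h"

  using namespace llvm;

  static bool sinkSingleUseRemats(MachineFunction &MF) {
    const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
    MachineRegisterInfo &MRI = MF.getRegInfo();
    SmallVector<MachineInstr *, 16> Candidates;

    // First pass: collect defs that are safe to re-execute and have exactly
    // one (non-debug) user, e.g. the S_LOAD_DWORDX4_IMMs in the test above.
    for (MachineBasicBlock &MBB : MF)
      for (MachineInstr &MI : MBB) {
        if (MI.getNumExplicitDefs() != 1 || !MI.getOperand(0).isReg())
          continue;
        Register Def = MI.getOperand(0).getReg();
        if (!Def.isVirtual() || !MRI.hasOneNonDBGUse(Def))
          continue;
        if (!TII->isTriviallyReMaterializable(MI))
          continue;
        Candidates.push_back(&MI);
      }

    // Second pass: move each candidate right in front of its only user, which
    // shortens the live range instead of keeping the value live across blocks.
    bool Changed = false;
    for (MachineInstr *MI : Candidates) {
      Register Def = MI->getOperand(0).getReg();
      MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(Def);
      if (UseMI.isPHI())
        continue; // A def cannot be placed immediately before a PHI.
      UseMI.getParent()->splice(UseMI.getIterator(), MI->getParent(),
                                MI->getIterator());
      Changed = true;
    }
    return Changed;
  }

A pass built around this shape would still have to update SlotIndexes/LiveIntervals after the splice and decide between moving and cloning the def; the tests that follow (group_remat_clone.mir vs. group_remat_move.mir) distinguish exactly those two outcomes.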
From 3539ab3386d1cfe3798c7b2294bfa163dddc5745 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Thu, 6 Feb 2025 14:09:32 -0800 Subject: [PATCH 03/25] Test renames, only keeping the required flags for the tests --- .../remat/{group_remat_with_uses.mir => group_remat_clone.mir} | 2 +- .../AMDGPU/remat/{group_remat.mir => group_remat_move.mir} | 0 llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir | 2 +- llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename llvm/test/CodeGen/AMDGPU/remat/{group_remat_with_uses.mir => group_remat_clone.mir} (99%) rename llvm/test/CodeGen/AMDGPU/remat/{group_remat.mir => group_remat_move.mir} (100%) diff --git a/llvm/test/CodeGen/AMDGPU/remat/group_remat_with_uses.mir b/llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir similarity index 99% rename from llvm/test/CodeGen/AMDGPU/remat/group_remat_with_uses.mir rename to llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir index 637a683bdd041..c99a1835454fd 100644 --- a/llvm/test/CodeGen/AMDGPU/remat/group_remat_with_uses.mir +++ b/llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir @@ -638,4 +638,4 @@ body: | S_ENDPGM 0 ... - \ No newline at end of file + diff --git a/llvm/test/CodeGen/AMDGPU/remat/group_remat.mir b/llvm/test/CodeGen/AMDGPU/remat/group_remat_move.mir similarity index 100% rename from llvm/test/CodeGen/AMDGPU/remat/group_remat.mir rename to llvm/test/CodeGen/AMDGPU/remat/group_remat_move.mir diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir index bc2c97f91f46c..528515d235c8b 100644 --- a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir +++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir @@ -1,6 +1,6 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat | FileCheck %s -# Check that the loads have been moved to the use +# Check that the scalar loads have been moved to the use # CHECK: bb.2: # CHECK: %[[#reg0:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 0, 0 # CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg0]], %{{.+}}, 0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir b/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir index e8a66b47ac732..53f59cc3f8b0b 100644 --- a/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir +++ b/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-sub-exp-remat-aggressive -amdgpu-remat-enable-late-float-vtos -amdgpu-remat-enable-hot-block-remat-aggressive -amdgpu-remat-enable-sub-exp-remat-aggressive -amdgpu-remat-enable-sub-exp-remat | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-late-float-vtos -amdgpu-remat-enable-sub-exp-remat | FileCheck %s # DEFS # CHECK: %[[#div00:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec From a33f944e94d4e1036085a84370a8fc94b05ff1ff Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Mon, 10 Feb 2025 13:22:19 -0800 Subject: [PATCH 04/25] Using the mir uniformity analysis instead, which DOES require SSA; but I don't see any reason why we can't just require SSA --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 13 +++++++++++-- .../CodeGen/AMDGPU/remat/group_remat_clone.mir | 15 ++++++++------- .../CodeGen/AMDGPU/remat/group_remat_move.mir | 17 +++++++++-------- 
llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir | 14 ++++++++------ 4 files changed, 36 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index 8647185bf5d51..f0a2dcdb5cc11 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -34,6 +34,9 @@ #include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/CodeGen/MachineCycleAnalysis.h" +#include "llvm/CodeGen/MachineUniformityAnalysis.h" + #include #define DEBUG_TYPE "amdgpu-hot-block-remat" @@ -4619,10 +4622,16 @@ bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) { AliasAnalysis *AA = &getAnalysis().getAAResults(); { - llvm::MirGPUDivergenceAnalysis DA(MF, *DT, *PDT, *MLI); + MachineCycleInfo CI; + CI.compute(MF); + auto TTI = MF.getTarget().getTargetTransformInfo(MF.getFunction()); + MachineUniformityInfo MachineUniformity = + llvm::computeMachineUniformityInfo(MF, CI, *DT, /*HasBranchDivergence*/true); + + //llvm::MirGPUDivergenceAnalysis DA(MF, *DT, *PDT, *MLI); for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { - if (DA.isUniform(&MI)) { + if (MachineUniformity.isUniform(&MI)) { TotalUniformInsts.insert(&MI); } } diff --git a/llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir b/llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir index c99a1835454fd..06ea907aca44d 100644 --- a/llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir +++ b/llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir @@ -170,14 +170,15 @@ body: | successors: %bb.1, %bb.2 liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1 - undef %0.sub0:sgpr_64 = COPY $sgpr0 - undef %0.sub1:sgpr_64 = COPY $sgpr1 - - undef %1.sub0:sgpr_128 = COPY $sgpr4 - undef %1.sub1:sgpr_128 = COPY $sgpr5 - undef %1.sub2:sgpr_128 = COPY $sgpr6 - undef %1.sub3:sgpr_128 = COPY $sgpr7 + %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1 + ; undef %0.sub0:sgpr_64 = COPY $sgpr0 + ; undef %0.sub1:sgpr_64 = COPY $sgpr1 + %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3 + ; undef %1.sub0:sgpr_128 = COPY $sgpr4 + ; undef %1.sub1:sgpr_128 = COPY $sgpr5 + ; undef %1.sub2:sgpr_128 = COPY $sgpr6 + ; undef %1.sub3:sgpr_128 = COPY $sgpr7 %500:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec %501:vgpr_32 = V_MOV_B32_e32 $vgpr1, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/remat/group_remat_move.mir b/llvm/test/CodeGen/AMDGPU/remat/group_remat_move.mir index 7f3483c66a5d9..ebd89451154ae 100644 --- a/llvm/test/CodeGen/AMDGPU/remat/group_remat_move.mir +++ b/llvm/test/CodeGen/AMDGPU/remat/group_remat_move.mir @@ -170,14 +170,15 @@ body: | successors: %bb.1, %bb.2 liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1 - undef %0.sub0:sgpr_64 = COPY $sgpr0 - undef %0.sub1:sgpr_64 = COPY $sgpr1 - - undef %1.sub0:sgpr_128 = COPY $sgpr4 - undef %1.sub1:sgpr_128 = COPY $sgpr5 - undef %1.sub2:sgpr_128 = COPY $sgpr6 - undef %1.sub3:sgpr_128 = COPY $sgpr7 + %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1 + ; undef %0.sub0:sgpr_64 = COPY $sgpr0 + ; undef %0.sub1:sgpr_64 = COPY $sgpr1 + %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3 + ; undef %1.sub0:sgpr_128 = COPY $sgpr4 + ; undef %1.sub1:sgpr_128 = COPY $sgpr5 
+ ; undef %1.sub2:sgpr_128 = COPY $sgpr6 + ; undef %1.sub3:sgpr_128 = COPY $sgpr7 %500:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec %501:vgpr_32 = V_MOV_B32_e32 $vgpr1, implicit $exec @@ -504,4 +505,4 @@ body: | S_ENDPGM 0 ... - \ No newline at end of file + diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir index 528515d235c8b..a702f7fc8011e 100644 --- a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir +++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir @@ -162,13 +162,15 @@ body: | successors: %bb.1, %bb.2 liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1 - undef %0.sub0:sgpr_64 = COPY $sgpr0 - undef %0.sub1:sgpr_64 = COPY $sgpr1 + %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1 + ; undef %0.sub0:sgpr_64 = COPY $sgpr0 + ; undef %0.sub1:sgpr_64 = COPY $sgpr1 - undef %1.sub0:sgpr_128 = COPY $sgpr4 - undef %1.sub1:sgpr_128 = COPY $sgpr5 - undef %1.sub2:sgpr_128 = COPY $sgpr6 - undef %1.sub3:sgpr_128 = COPY $sgpr7 + %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3 + ; undef %1.sub0:sgpr_128 = COPY $sgpr4 + ; undef %1.sub1:sgpr_128 = COPY $sgpr5 + ; undef %1.sub2:sgpr_128 = COPY $sgpr6 + ; undef %1.sub3:sgpr_128 = COPY $sgpr7 %3000:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 0, 0 %3001:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 16, 0 From 2215b797765b0fe5d04db70cfb0bb015918f5f50 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Tue, 11 Feb 2025 16:40:00 -0800 Subject: [PATCH 05/25] In block remat AND making v to s slightly more robust --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 6 +- .../AMDGPU/remat/group_remat_clone.mir | 3 +- llvm/test/CodeGen/AMDGPU/remat/in_blk.mir | 760 ++++++++++++++++++ .../test/CodeGen/AMDGPU/remat/simple_sgpr.mir | 2 +- .../CodeGen/AMDGPU/remat/vector_to_scalar.mir | 240 +++--- 5 files changed, 887 insertions(+), 124 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/remat/in_blk.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index f0a2dcdb5cc11..a3a20765c2df6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -2676,7 +2676,7 @@ bool collectPacifist(MachineInstr &MI, continue; Register Reg = MO.getReg(); - if (MO.isImplicit() && (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)) + if (MO.isImplicit() && (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO || Reg == AMDGPU::MODE)) continue; if (Reg.isPhysical()) return false; @@ -2794,7 +2794,9 @@ bool tryHoldPacifist(MachineBasicBlock &MBB, LiveIntervals *LIS, bool bUpdated = false; // Move pacifist to its first user. 
- for (MachineInstr *MI : pacifistList) { + //for (MachineInstr *MI : pacifistList) { + for (auto it = pacifistList.rbegin(); it != pacifistList.rend(); it++) { + MachineInstr *MI = *it; MachineInstr *firstUser = findPacifistInsertPoint(*MI, MBB, MRI, AA, slotIndexes); if (firstUser == MI) continue; diff --git a/llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir b/llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir index 06ea907aca44d..bfb8e85c8aef6 100644 --- a/llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir +++ b/llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir @@ -180,6 +180,7 @@ body: | ; undef %1.sub2:sgpr_128 = COPY $sgpr6 ; undef %1.sub3:sgpr_128 = COPY $sgpr7 + %500:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec %501:vgpr_32 = V_MOV_B32_e32 $vgpr1, implicit $exec %502:vgpr_32 = V_MUL_F32_e32 %500, %500, implicit $exec, implicit $mode @@ -639,4 +640,4 @@ body: | S_ENDPGM 0 ... - + \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/remat/in_blk.mir b/llvm/test/CodeGen/AMDGPU/remat/in_blk.mir new file mode 100644 index 0000000000000..6db673b849ef2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/in_blk.mir @@ -0,0 +1,760 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-sub-exp-remat -amdgpu-remat-enable-in-blk-remat | FileCheck %s + +# Check that pacifist insts are moved to their users within the block. +# CHECK: bb.0: +# CHECK: %[[#r500:]]:vgpr_32 = V_MOV_B32_e32 $vgpr0 +# CHECK: %[[#r501:]]:vgpr_32 = V_MOV_B32_e32 $vgpr1 +# CHECK: %[[#r502:]]:vgpr_32 = V_MUL_F32_e32 %[[#r500]], %[[#r500]] +# CHECK: EXP 0, %[[#r502]] +# CHECK: %[[#r503:]]:vgpr_32 = V_MUL_F32_e32 %[[#r500]], %[[#r501]] +# CHECK: EXP 0, %[[#r503]] +# CHECK: %[[#r504:]]:vgpr_32 = V_MUL_F32_e32 %[[#r501]], %[[#r501]] +# CHECK: EXP 0, %[[#r504]] +# CHECK: %[[#r505:]]:vgpr_32 = V_MUL_F32_e32 %[[#r502]], %[[#r502]] +# CHECK: EXP 0, %[[#r505]] +# CHECK: %[[#r506:]]:vgpr_32 = V_MUL_F32_e32 %[[#r502]], %[[#r503]] +# CHECK: EXP 0, %[[#r506]] +# CHECK: %[[#r507:]]:vgpr_32 = V_MUL_F32_e32 %[[#r503]], %[[#r503]] +# CHECK: EXP 0, %[[#r507]] +# CHECK: %[[#r508:]]:vgpr_32 = V_MUL_F32_e32 %[[#r503]], %[[#r504]] +# CHECK: EXP 0, %[[#r508]] +# CHECK: %[[#r509:]]:vgpr_32 = V_MUL_F32_e32 %[[#r504]], %[[#r504]] +# CHECK: EXP 0, %[[#r509]] +# CHECK: %[[#r5010:]]:vgpr_32 = V_MUL_F32_e32 %[[#r505]], %[[#r505]] +# CHECK: EXP 0, %[[#r5010]] +# CHECK: %[[#r5011:]]:vgpr_32 = V_MUL_F32_e32 %[[#r505]], %[[#r506]] +# CHECK: EXP 0, %[[#r5011]] +# CHECK: %[[#r5012:]]:vgpr_32 = V_MUL_F32_e32 %[[#r506]], %[[#r506]] +# CHECK: EXP 0, %[[#r5012]] +# CHECK: %[[#r5013:]]:vgpr_32 = V_MUL_F32_e32 %[[#r506]], %[[#r507]] +# CHECK: EXP 0, %[[#r5013]] +# CHECK: %[[#r5014:]]:vgpr_32 = V_MUL_F32_e32 %[[#r507]], %[[#r507]] +# CHECK: EXP 0, %[[#r5014]] +# CHECK: %[[#r5015:]]:vgpr_32 = V_MUL_F32_e32 %[[#r507]], %[[#r508]] +# CHECK: EXP 0, %[[#r5015]] +# CHECK: %[[#r5016:]]:vgpr_32 = V_MUL_F32_e32 %[[#r508]], %[[#r508]] +# CHECK: EXP 0, %[[#r5016]] +# CHECK: %[[#r5017:]]:vgpr_32 = V_MUL_F32_e32 %[[#r508]], %[[#r509]] +# CHECK: EXP 0, %[[#r5017]] +# CHECK: %[[#r5018:]]:vgpr_32 = V_MUL_F32_e32 %[[#r509]], %[[#r509]] +# CHECK: EXP 0, %[[#r5018]] +# CHECK: %[[#r5019:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5010]], %[[#r5010]] +# CHECK: EXP 0, %[[#r5019]] +# CHECK: %[[#r5020:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5010]], %[[#r5011]] +# CHECK: EXP 0, %[[#r5020]] +# CHECK: %[[#r5021:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5011]], %[[#r5011]] +# CHECK: EXP 0, %[[#r5021]] 
+# CHECK: %[[#r5022:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5011]], %[[#r5012]] +# CHECK: EXP 0, %[[#r5022]] +# CHECK: %[[#r5023:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5012]], %[[#r5012]] +# CHECK: EXP 0, %[[#r5023]] +# CHECK: %[[#r5024:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5012]], %[[#r5013]] +# CHECK: EXP 0, %[[#r5024]] +# CHECK: %[[#r5025:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5013]], %[[#r5013]] +# CHECK: EXP 0, %[[#r5025]] +# CHECK: %[[#r5026:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5013]], %[[#r5014]] +# CHECK: EXP 0, %[[#r5026]] +# CHECK: %[[#r5027:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5014]], %[[#r5014]] +# CHECK: EXP 0, %[[#r5027]] +# CHECK: %[[#r5028:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5014]], %[[#r5015]] +# CHECK: EXP 0, %[[#r5028]] +# CHECK: %[[#r5029:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5015]], %[[#r5015]] +# CHECK: EXP 0, %[[#r5029]] +# CHECK: %[[#r5030:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5015]], %[[#r5016]] +# CHECK: EXP 0, %[[#r5030]] +# CHECK: %[[#r5031:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5016]], %[[#r5016]] +# CHECK: EXP 0, %[[#r5031]] +# CHECK: %[[#r5032:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5016]], %[[#r5017]] +# CHECK: EXP 0, %[[#r5032]] +# CHECK: %[[#r5033:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5017]], %[[#r5017]] +# CHECK: EXP 0, %[[#r5033]] +# CHECK: %[[#r5034:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5017]], %[[#r5018]] +# CHECK: EXP 0, %[[#r5034]] +# CHECK: %[[#r5035:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5018]], %[[#r5018]] +# CHECK: EXP 0, %[[#r5035]] +# CHECK: %[[#r5036:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5019]], %[[#r5019]] +# CHECK: EXP 0, %[[#r5036]] +# CHECK: %[[#r5037:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5019]], %[[#r5020]] +# CHECK: EXP 0, %[[#r5037]] +# CHECK: %[[#r5038:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5020]], %[[#r5020]] +# CHECK: EXP 0, %[[#r5038]] +# CHECK: %[[#r5039:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5020]], %[[#r5021]] +# CHECK: EXP 0, %[[#r5039]] +# CHECK: %[[#r5040:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5021]], %[[#r5021]] +# CHECK: EXP 0, %[[#r5040]] +# CHECK: %[[#r5041:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5021]], %[[#r5022]] +# CHECK: EXP 0, %[[#r5041]] +# CHECK: %[[#r5042:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5022]], %[[#r5022]] +# CHECK: EXP 0, %[[#r5042]] +# CHECK: %[[#r5043:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5022]], %[[#r5023]] +# CHECK: EXP 0, %[[#r5043]] +# CHECK: %[[#r5044:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5023]], %[[#r5023]] +# CHECK: EXP 0, %[[#r5044]] +# CHECK: %[[#r5045:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5023]], %[[#r5024]] +# CHECK: EXP 0, %[[#r5045]] +# CHECK: %[[#r5046:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5024]], %[[#r5024]] +# CHECK: EXP 0, %[[#r5046]] +# CHECK: %[[#r5047:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5024]], %[[#r5025]] +# CHECK: EXP 0, %[[#r5047]] +# CHECK: %[[#r5048:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5025]], %[[#r5025]] +# CHECK: EXP 0, %[[#r5048]] +# CHECK: %[[#r5049:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5025]], %[[#r5026]] +# CHECK: EXP 0, %[[#r5049]] +# CHECK: %[[#r5050:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5026]], %[[#r5026]] +# CHECK: EXP 0, %[[#r5050]] +# CHECK: %[[#r5051:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5026]], %[[#r5027]] +# CHECK: EXP 0, %[[#r5051]] +# CHECK: %[[#r5052:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5027]], %[[#r5027]] +# CHECK: EXP 0, %[[#r5052]] +# CHECK: %[[#r5053:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5027]], %[[#r5028]] +# CHECK: EXP 0, %[[#r5053]] +# CHECK: %[[#r5054:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5028]], %[[#r5028]] +# CHECK: EXP 0, %[[#r5054]] +# CHECK: %[[#r5055:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5028]], %[[#r5029]] +# CHECK: EXP 0, %[[#r5055]] +# CHECK: %[[#r5056:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5029]], %[[#r5029]] +# CHECK: EXP 0, %[[#r5056]] +# CHECK: 
%[[#r5057:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5029]], %[[#r5030]] +# CHECK: EXP 0, %[[#r5057]] +# CHECK: %[[#r5058:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5030]], %[[#r5030]] +# CHECK: EXP 0, %[[#r5058]] +# CHECK: %[[#r5059:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5030]], %[[#r5031]] +# CHECK: EXP 0, %[[#r5059]] +# CHECK: %[[#r5060:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5031]], %[[#r5031]] +# CHECK: EXP 0, %[[#r5060]] +# CHECK: %[[#r5061:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5031]], %[[#r5032]] +# CHECK: EXP 0, %[[#r5061]] +# CHECK: %[[#r5062:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5032]], %[[#r5032]] +# CHECK: EXP 0, %[[#r5062]] +# CHECK: %[[#r5063:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5032]], %[[#r5033]] +# CHECK: EXP 0, %[[#r5063]] +# CHECK: %[[#r5064:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5033]], %[[#r5033]] +# CHECK: EXP 0, %[[#r5064]] +# CHECK: %[[#r5065:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5033]], %[[#r5034]] +# CHECK: EXP 0, %[[#r5065]] +# CHECK: %[[#r5066:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5034]], %[[#r5034]] +# CHECK: EXP 0, %[[#r5066]] +# CHECK: %[[#r5067:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5034]], %[[#r5035]] +# CHECK: EXP 0, %[[#r5067]] +# CHECK: %[[#r5068:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5035]], %[[#r5035]] +# CHECK: EXP 0, %[[#r5068]] +# CHECK: %[[#r5069:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5036]], %[[#r5036]] +# CHECK: EXP 0, %[[#r5069]] +# CHECK: %[[#r5070:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5036]], %[[#r5037]] +# CHECK: EXP 0, %[[#r5070]] +# CHECK: %[[#r5071:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5037]], %[[#r5037]] +# CHECK: EXP 0, %[[#r5071]] +# CHECK: %[[#r5072:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5037]], %[[#r5038]] +# CHECK: EXP 0, %[[#r5072]] +# CHECK: %[[#r5073:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5038]], %[[#r5038]] +# CHECK: EXP 0, %[[#r5073]] +# CHECK: %[[#r5074:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5038]], %[[#r5039]] +# CHECK: EXP 0, %[[#r5074]] +# CHECK: %[[#r5075:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5039]], %[[#r5039]] +# CHECK: EXP 0, %[[#r5075]] +# CHECK: %[[#r5076:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5039]], %[[#r5040]] +# CHECK: EXP 0, %[[#r5076]] +# CHECK: %[[#r5077:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5040]], %[[#r5040]] +# CHECK: EXP 0, %[[#r5077]] +# CHECK: %[[#r5078:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5040]], %[[#r5041]] +# CHECK: EXP 0, %[[#r5078]] +# CHECK: %[[#r5079:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5041]], %[[#r5041]] +# CHECK: EXP 0, %[[#r5079]] +# CHECK: %[[#r5080:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5041]], %[[#r5042]] +# CHECK: EXP 0, %[[#r5080]] +# CHECK: %[[#r5081:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5042]], %[[#r5042]] +# CHECK: EXP 0, %[[#r5081]] +# CHECK: %[[#r5082:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5042]], %[[#r5043]] +# CHECK: EXP 0, %[[#r5082]] +# CHECK: %[[#r5083:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5043]], %[[#r5043]] +# CHECK: EXP 0, %[[#r5083]] +# CHECK: %[[#r5084:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5043]], %[[#r5044]] +# CHECK: EXP 0, %[[#r5084]] +# CHECK: %[[#r5085:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5044]], %[[#r5044]] +# CHECK: EXP 0, %[[#r5085]] +# CHECK: %[[#r5086:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5044]], %[[#r5045]] +# CHECK: EXP 0, %[[#r5086]] +# CHECK: %[[#r5087:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5045]], %[[#r5045]] +# CHECK: EXP 0, %[[#r5087]] +# CHECK: %[[#r5088:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5045]], %[[#r5046]] +# CHECK: EXP 0, %[[#r5088]] +# CHECK: %[[#r5089:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5046]], %[[#r5046]] +# CHECK: EXP 0, %[[#r5089]] +# CHECK: %[[#r5090:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5046]], %[[#r5047]] +# CHECK: EXP 0, %[[#r5090]] +# CHECK: %[[#r5091:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5047]], %[[#r5047]] +# CHECK: EXP 0, %[[#r5091]] +# CHECK: 
%[[#r5092:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5047]], %[[#r5048]] +# CHECK: EXP 0, %[[#r5092]] +# CHECK: %[[#r5093:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5048]], %[[#r5048]] +# CHECK: EXP 0, %[[#r5093]] +# CHECK: %[[#r5094:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5048]], %[[#r5049]] +# CHECK: EXP 0, %[[#r5094]] +# CHECK: %[[#r5095:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5049]], %[[#r5049]] +# CHECK: EXP 0, %[[#r5095]] +# CHECK: %[[#r5096:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5049]], %[[#r5050]] +# CHECK: EXP 0, %[[#r5096]] +# CHECK: %[[#r5097:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5050]], %[[#r5050]] +# CHECK: EXP 0, %[[#r5097]] +# CHECK: %[[#r5098:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5050]], %[[#r5051]] +# CHECK: EXP 0, %[[#r5098]] +# CHECK: %[[#r5099:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5051]], %[[#r5051]] +# CHECK: EXP 0, %[[#r5099]] +# CHECK: %[[#r50100:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5051]], %[[#r5052]] +# CHECK: EXP 0, %[[#r50100]] +# CHECK: %[[#r50101:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5052]], %[[#r5052]] +# CHECK: EXP 0, %[[#r50101]] +# CHECK: %[[#r50102:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5052]], %[[#r5053]] +# CHECK: EXP 0, %[[#r50102]] +# CHECK: %[[#r50103:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5053]], %[[#r5053]] +# CHECK: EXP 0, %[[#r50103]] +# CHECK: %[[#r50104:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5053]], %[[#r5054]] +# CHECK: EXP 0, %[[#r50104]] +# CHECK: %[[#r50105:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5054]], %[[#r5054]] +# CHECK: EXP 0, %[[#r50105]] +# CHECK: %[[#r50106:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5054]], %[[#r5055]] +# CHECK: EXP 0, %[[#r50106]] +# CHECK: %[[#r50107:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5055]], %[[#r5055]] +# CHECK: EXP 0, %[[#r50107]] +# CHECK: %[[#r50108:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5055]], %[[#r5056]] +# CHECK: EXP 0, %[[#r50108]] +# CHECK: %[[#r50109:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5056]], %[[#r5056]] +# CHECK: EXP 0, %[[#r50109]] +# CHECK: %[[#r50110:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5056]], %[[#r5057]] +# CHECK: EXP 0, %[[#r50110]] +# CHECK: %[[#r50111:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5057]], %[[#r5057]] +# CHECK: EXP 0, %[[#r50111]] +# CHECK: %[[#r50112:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5057]], %[[#r5058]] +# CHECK: EXP 0, %[[#r50112]] +# CHECK: %[[#r50113:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5058]], %[[#r5058]] +# CHECK: EXP 0, %[[#r50113]] +# CHECK: %[[#r50114:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5058]], %[[#r5059]] +# CHECK: EXP 0, %[[#r50114]] +# CHECK: %[[#r50115:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5059]], %[[#r5059]] +# CHECK: EXP 0, %[[#r50115]] +# CHECK: %[[#r50116:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5059]], %[[#r5060]] +# CHECK: EXP 0, %[[#r50116]] +# CHECK: %[[#r50117:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5060]], %[[#r5060]] +# CHECK: EXP 0, %[[#r50117]] +# CHECK: %[[#r50118:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5060]], %[[#r5061]] +# CHECK: EXP 0, %[[#r50118]] +# CHECK: %[[#r50119:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5061]], %[[#r5061]] +# CHECK: EXP 0, %[[#r50119]] +# CHECK: %[[#r50120:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5061]], %[[#r5062]] +# CHECK: EXP 0, %[[#r50120]] +# CHECK: %[[#r50121:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5062]], %[[#r5062]] +# CHECK: EXP 0, %[[#r50121]] +# CHECK: %[[#r50122:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5062]], %[[#r5063]] +# CHECK: EXP 0, %[[#r50122]] +# CHECK: %[[#r50123:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5063]], %[[#r5063]] +# CHECK: EXP 0, %[[#r50123]] +# CHECK: %[[#r50124:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5063]], %[[#r5064]] +# CHECK: EXP 0, %[[#r50124]] +# CHECK: %[[#r50125:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5064]], %[[#r5064]] +# CHECK: EXP 0, %[[#r50125]] +# CHECK: %[[#r50126:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5064]], %[[#r5065]] +# 
CHECK: EXP 0, %[[#r50126]] +# CHECK: %[[#r50127:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5065]], %[[#r5065]] +# CHECK: EXP 0, %[[#r50127]] +# CHECK: %[[#r50128:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5065]], %[[#r5066]] +# CHECK: EXP 0, %[[#r50128]] +# CHECK: %[[#r50129:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5066]], %[[#r5066]] +# CHECK: EXP 0, %[[#r50129]] +# CHECK: %[[#r50130:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5066]], %[[#r5067]] +# CHECK: EXP 0, %[[#r50130]] +# CHECK: %[[#r50131:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5067]], %[[#r5067]] +# CHECK: EXP 0, %[[#r50131]] +# CHECK: %[[#r50132:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5067]], %[[#r5068]] +# CHECK: EXP 0, %[[#r50132]] +# CHECK: %[[#r50133:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5068]], %[[#r5068]] +# CHECK: EXP 0, %[[#r50133]] + + +--- | + source_filename = ".\main.ll" + define amdgpu_ps void @main() #1 { + ret void + } + attributes #1 = { "target-cpu"="gfx1010" } + !llvm.ident = !{!0} + !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"} +... +--- +name: main +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } + - { reg: '$sgpr1' } + - { reg: '$sgpr2' } + - { reg: '$sgpr3' } + - { reg: '$sgpr4' } + - { reg: '$sgpr5' } + - { reg: '$sgpr6' } + - { reg: '$sgpr7' } + - { reg: '$sgpr8' } + - { reg: '$sgpr8' } + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +body: | + bb.0: + successors: %bb.1 + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1 + + %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1 + %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3 + + %3:vgpr_32 = IMPLICIT_DEF + + + %500:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %501:vgpr_32 = V_MOV_B32_e32 $vgpr1, implicit $exec + %502:vgpr_32 = V_MUL_F32_e32 %500, %500, implicit $exec, implicit $mode + %503:vgpr_32 = V_MUL_F32_e32 %500, %501, implicit $exec, implicit $mode + %504:vgpr_32 = V_MUL_F32_e32 %501, %501, implicit $exec, implicit $mode + %505:vgpr_32 = V_MUL_F32_e32 %502, %502, implicit $exec, implicit $mode + %506:vgpr_32 = V_MUL_F32_e32 %502, %503, implicit $exec, implicit $mode + %507:vgpr_32 = V_MUL_F32_e32 %503, %503, implicit $exec, implicit $mode + %508:vgpr_32 = V_MUL_F32_e32 %503, %504, implicit $exec, implicit $mode + %509:vgpr_32 = V_MUL_F32_e32 %504, %504, implicit $exec, implicit $mode + %5010:vgpr_32 = V_MUL_F32_e32 %505, %505, implicit $exec, implicit $mode + %5011:vgpr_32 = V_MUL_F32_e32 %505, %506, implicit $exec, implicit $mode + %5012:vgpr_32 = V_MUL_F32_e32 %506, %506, implicit $exec, implicit $mode + %5013:vgpr_32 = V_MUL_F32_e32 %506, %507, implicit $exec, implicit $mode + %5014:vgpr_32 = V_MUL_F32_e32 %507, %507, implicit $exec, implicit $mode + %5015:vgpr_32 = V_MUL_F32_e32 %507, %508, implicit $exec, implicit $mode + %5016:vgpr_32 = V_MUL_F32_e32 %508, %508, implicit $exec, implicit $mode + %5017:vgpr_32 = V_MUL_F32_e32 %508, %509, implicit $exec, implicit $mode + %5018:vgpr_32 = V_MUL_F32_e32 %509, %509, implicit $exec, implicit $mode + %5019:vgpr_32 = V_MUL_F32_e32 %5010, %5010, implicit $exec, implicit $mode + %5020:vgpr_32 = V_MUL_F32_e32 %5010, %5011, implicit $exec, implicit $mode + %5021:vgpr_32 = V_MUL_F32_e32 %5011, %5011, implicit $exec, implicit $mode + %5022:vgpr_32 = V_MUL_F32_e32 %5011, %5012, implicit $exec, implicit $mode + %5023:vgpr_32 = V_MUL_F32_e32 %5012, %5012, implicit $exec, implicit $mode + %5024:vgpr_32 = V_MUL_F32_e32 %5012, %5013, implicit $exec, implicit $mode + %5025:vgpr_32 = V_MUL_F32_e32 %5013, %5013, implicit $exec, implicit 
$mode + %5026:vgpr_32 = V_MUL_F32_e32 %5013, %5014, implicit $exec, implicit $mode + %5027:vgpr_32 = V_MUL_F32_e32 %5014, %5014, implicit $exec, implicit $mode + %5028:vgpr_32 = V_MUL_F32_e32 %5014, %5015, implicit $exec, implicit $mode + %5029:vgpr_32 = V_MUL_F32_e32 %5015, %5015, implicit $exec, implicit $mode + %5030:vgpr_32 = V_MUL_F32_e32 %5015, %5016, implicit $exec, implicit $mode + %5031:vgpr_32 = V_MUL_F32_e32 %5016, %5016, implicit $exec, implicit $mode + %5032:vgpr_32 = V_MUL_F32_e32 %5016, %5017, implicit $exec, implicit $mode + %5033:vgpr_32 = V_MUL_F32_e32 %5017, %5017, implicit $exec, implicit $mode + %5034:vgpr_32 = V_MUL_F32_e32 %5017, %5018, implicit $exec, implicit $mode + %5035:vgpr_32 = V_MUL_F32_e32 %5018, %5018, implicit $exec, implicit $mode + %5036:vgpr_32 = V_MUL_F32_e32 %5019, %5019, implicit $exec, implicit $mode + %5037:vgpr_32 = V_MUL_F32_e32 %5019, %5020, implicit $exec, implicit $mode + %5038:vgpr_32 = V_MUL_F32_e32 %5020, %5020, implicit $exec, implicit $mode + %5039:vgpr_32 = V_MUL_F32_e32 %5020, %5021, implicit $exec, implicit $mode + %5040:vgpr_32 = V_MUL_F32_e32 %5021, %5021, implicit $exec, implicit $mode + %5041:vgpr_32 = V_MUL_F32_e32 %5021, %5022, implicit $exec, implicit $mode + %5042:vgpr_32 = V_MUL_F32_e32 %5022, %5022, implicit $exec, implicit $mode + %5043:vgpr_32 = V_MUL_F32_e32 %5022, %5023, implicit $exec, implicit $mode + %5044:vgpr_32 = V_MUL_F32_e32 %5023, %5023, implicit $exec, implicit $mode + %5045:vgpr_32 = V_MUL_F32_e32 %5023, %5024, implicit $exec, implicit $mode + %5046:vgpr_32 = V_MUL_F32_e32 %5024, %5024, implicit $exec, implicit $mode + %5047:vgpr_32 = V_MUL_F32_e32 %5024, %5025, implicit $exec, implicit $mode + %5048:vgpr_32 = V_MUL_F32_e32 %5025, %5025, implicit $exec, implicit $mode + %5049:vgpr_32 = V_MUL_F32_e32 %5025, %5026, implicit $exec, implicit $mode + %5050:vgpr_32 = V_MUL_F32_e32 %5026, %5026, implicit $exec, implicit $mode + %5051:vgpr_32 = V_MUL_F32_e32 %5026, %5027, implicit $exec, implicit $mode + %5052:vgpr_32 = V_MUL_F32_e32 %5027, %5027, implicit $exec, implicit $mode + %5053:vgpr_32 = V_MUL_F32_e32 %5027, %5028, implicit $exec, implicit $mode + %5054:vgpr_32 = V_MUL_F32_e32 %5028, %5028, implicit $exec, implicit $mode + %5055:vgpr_32 = V_MUL_F32_e32 %5028, %5029, implicit $exec, implicit $mode + %5056:vgpr_32 = V_MUL_F32_e32 %5029, %5029, implicit $exec, implicit $mode + %5057:vgpr_32 = V_MUL_F32_e32 %5029, %5030, implicit $exec, implicit $mode + %5058:vgpr_32 = V_MUL_F32_e32 %5030, %5030, implicit $exec, implicit $mode + %5059:vgpr_32 = V_MUL_F32_e32 %5030, %5031, implicit $exec, implicit $mode + %5060:vgpr_32 = V_MUL_F32_e32 %5031, %5031, implicit $exec, implicit $mode + %5061:vgpr_32 = V_MUL_F32_e32 %5031, %5032, implicit $exec, implicit $mode + %5062:vgpr_32 = V_MUL_F32_e32 %5032, %5032, implicit $exec, implicit $mode + %5063:vgpr_32 = V_MUL_F32_e32 %5032, %5033, implicit $exec, implicit $mode + %5064:vgpr_32 = V_MUL_F32_e32 %5033, %5033, implicit $exec, implicit $mode + %5065:vgpr_32 = V_MUL_F32_e32 %5033, %5034, implicit $exec, implicit $mode + %5066:vgpr_32 = V_MUL_F32_e32 %5034, %5034, implicit $exec, implicit $mode + %5067:vgpr_32 = V_MUL_F32_e32 %5034, %5035, implicit $exec, implicit $mode + %5068:vgpr_32 = V_MUL_F32_e32 %5035, %5035, implicit $exec, implicit $mode + %5069:vgpr_32 = V_MUL_F32_e32 %5036, %5036, implicit $exec, implicit $mode + %5070:vgpr_32 = V_MUL_F32_e32 %5036, %5037, implicit $exec, implicit $mode + %5071:vgpr_32 = V_MUL_F32_e32 %5037, %5037, implicit $exec, implicit $mode + 
%5072:vgpr_32 = V_MUL_F32_e32 %5037, %5038, implicit $exec, implicit $mode + %5073:vgpr_32 = V_MUL_F32_e32 %5038, %5038, implicit $exec, implicit $mode + %5074:vgpr_32 = V_MUL_F32_e32 %5038, %5039, implicit $exec, implicit $mode + %5075:vgpr_32 = V_MUL_F32_e32 %5039, %5039, implicit $exec, implicit $mode + %5076:vgpr_32 = V_MUL_F32_e32 %5039, %5040, implicit $exec, implicit $mode + %5077:vgpr_32 = V_MUL_F32_e32 %5040, %5040, implicit $exec, implicit $mode + %5078:vgpr_32 = V_MUL_F32_e32 %5040, %5041, implicit $exec, implicit $mode + %5079:vgpr_32 = V_MUL_F32_e32 %5041, %5041, implicit $exec, implicit $mode + %5080:vgpr_32 = V_MUL_F32_e32 %5041, %5042, implicit $exec, implicit $mode + %5081:vgpr_32 = V_MUL_F32_e32 %5042, %5042, implicit $exec, implicit $mode + %5082:vgpr_32 = V_MUL_F32_e32 %5042, %5043, implicit $exec, implicit $mode + %5083:vgpr_32 = V_MUL_F32_e32 %5043, %5043, implicit $exec, implicit $mode + %5084:vgpr_32 = V_MUL_F32_e32 %5043, %5044, implicit $exec, implicit $mode + %5085:vgpr_32 = V_MUL_F32_e32 %5044, %5044, implicit $exec, implicit $mode + %5086:vgpr_32 = V_MUL_F32_e32 %5044, %5045, implicit $exec, implicit $mode + %5087:vgpr_32 = V_MUL_F32_e32 %5045, %5045, implicit $exec, implicit $mode + %5088:vgpr_32 = V_MUL_F32_e32 %5045, %5046, implicit $exec, implicit $mode + %5089:vgpr_32 = V_MUL_F32_e32 %5046, %5046, implicit $exec, implicit $mode + %5090:vgpr_32 = V_MUL_F32_e32 %5046, %5047, implicit $exec, implicit $mode + %5091:vgpr_32 = V_MUL_F32_e32 %5047, %5047, implicit $exec, implicit $mode + %5092:vgpr_32 = V_MUL_F32_e32 %5047, %5048, implicit $exec, implicit $mode + %5093:vgpr_32 = V_MUL_F32_e32 %5048, %5048, implicit $exec, implicit $mode + %5094:vgpr_32 = V_MUL_F32_e32 %5048, %5049, implicit $exec, implicit $mode + %5095:vgpr_32 = V_MUL_F32_e32 %5049, %5049, implicit $exec, implicit $mode + %5096:vgpr_32 = V_MUL_F32_e32 %5049, %5050, implicit $exec, implicit $mode + %5097:vgpr_32 = V_MUL_F32_e32 %5050, %5050, implicit $exec, implicit $mode + %5098:vgpr_32 = V_MUL_F32_e32 %5050, %5051, implicit $exec, implicit $mode + %5099:vgpr_32 = V_MUL_F32_e32 %5051, %5051, implicit $exec, implicit $mode + %50100:vgpr_32 = V_MUL_F32_e32 %5051, %5052, implicit $exec, implicit $mode + %50101:vgpr_32 = V_MUL_F32_e32 %5052, %5052, implicit $exec, implicit $mode + %50102:vgpr_32 = V_MUL_F32_e32 %5052, %5053, implicit $exec, implicit $mode + %50103:vgpr_32 = V_MUL_F32_e32 %5053, %5053, implicit $exec, implicit $mode + %50104:vgpr_32 = V_MUL_F32_e32 %5053, %5054, implicit $exec, implicit $mode + %50105:vgpr_32 = V_MUL_F32_e32 %5054, %5054, implicit $exec, implicit $mode + %50106:vgpr_32 = V_MUL_F32_e32 %5054, %5055, implicit $exec, implicit $mode + %50107:vgpr_32 = V_MUL_F32_e32 %5055, %5055, implicit $exec, implicit $mode + %50108:vgpr_32 = V_MUL_F32_e32 %5055, %5056, implicit $exec, implicit $mode + %50109:vgpr_32 = V_MUL_F32_e32 %5056, %5056, implicit $exec, implicit $mode + %50110:vgpr_32 = V_MUL_F32_e32 %5056, %5057, implicit $exec, implicit $mode + %50111:vgpr_32 = V_MUL_F32_e32 %5057, %5057, implicit $exec, implicit $mode + %50112:vgpr_32 = V_MUL_F32_e32 %5057, %5058, implicit $exec, implicit $mode + %50113:vgpr_32 = V_MUL_F32_e32 %5058, %5058, implicit $exec, implicit $mode + %50114:vgpr_32 = V_MUL_F32_e32 %5058, %5059, implicit $exec, implicit $mode + %50115:vgpr_32 = V_MUL_F32_e32 %5059, %5059, implicit $exec, implicit $mode + %50116:vgpr_32 = V_MUL_F32_e32 %5059, %5060, implicit $exec, implicit $mode + %50117:vgpr_32 = V_MUL_F32_e32 %5060, %5060, implicit $exec, implicit 
$mode + %50118:vgpr_32 = V_MUL_F32_e32 %5060, %5061, implicit $exec, implicit $mode + %50119:vgpr_32 = V_MUL_F32_e32 %5061, %5061, implicit $exec, implicit $mode + %50120:vgpr_32 = V_MUL_F32_e32 %5061, %5062, implicit $exec, implicit $mode + %50121:vgpr_32 = V_MUL_F32_e32 %5062, %5062, implicit $exec, implicit $mode + %50122:vgpr_32 = V_MUL_F32_e32 %5062, %5063, implicit $exec, implicit $mode + %50123:vgpr_32 = V_MUL_F32_e32 %5063, %5063, implicit $exec, implicit $mode + %50124:vgpr_32 = V_MUL_F32_e32 %5063, %5064, implicit $exec, implicit $mode + %50125:vgpr_32 = V_MUL_F32_e32 %5064, %5064, implicit $exec, implicit $mode + %50126:vgpr_32 = V_MUL_F32_e32 %5064, %5065, implicit $exec, implicit $mode + %50127:vgpr_32 = V_MUL_F32_e32 %5065, %5065, implicit $exec, implicit $mode + %50128:vgpr_32 = V_MUL_F32_e32 %5065, %5066, implicit $exec, implicit $mode + %50129:vgpr_32 = V_MUL_F32_e32 %5066, %5066, implicit $exec, implicit $mode + %50130:vgpr_32 = V_MUL_F32_e32 %5066, %5067, implicit $exec, implicit $mode + %50131:vgpr_32 = V_MUL_F32_e32 %5067, %5067, implicit $exec, implicit $mode + %50132:vgpr_32 = V_MUL_F32_e32 %5067, %5068, implicit $exec, implicit $mode + %50133:vgpr_32 = V_MUL_F32_e32 %5068, %5068, implicit $exec, implicit $mode + + + + EXP 0, %500, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %501, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %502, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %503, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %504, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %505, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %506, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %507, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %508, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %509, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5010, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5011, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5012, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5013, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5014, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5015, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5016, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5017, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5018, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5019, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5020, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5021, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5022, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5023, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5024, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5025, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5026, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5027, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5028, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5029, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5030, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5031, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5032, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5033, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5034, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5035, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5036, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5037, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5038, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5039, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5040, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5041, %3, %3, %3, -1, -1, 15, implicit 
$exec + EXP 0, %5042, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5043, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5044, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5045, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5046, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5047, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5048, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5049, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5050, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5051, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5052, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5053, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5054, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5055, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5056, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5057, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5058, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5059, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5060, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5061, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5062, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5063, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5064, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5065, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5066, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5067, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5068, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5069, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5070, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5071, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5072, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5073, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5074, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5075, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5076, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5077, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5078, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5079, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5080, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5081, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5082, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5083, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5084, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5085, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5086, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5087, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5088, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5089, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5090, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5091, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5092, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5093, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5094, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5095, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5096, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5097, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5098, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5099, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50100, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50101, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50102, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50103, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50104, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50105, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50106, %3, 
%3, %3, -1, -1, 15, implicit $exec + EXP 0, %50107, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50108, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50109, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50110, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50111, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50112, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50113, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50114, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50115, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50116, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50117, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50118, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50119, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50120, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50121, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50122, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50123, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50124, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50125, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50126, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50127, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50128, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50129, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50130, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50131, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50132, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50133, %3, %3, %3, -1, -1, 15, implicit $exec + + + S_BRANCH %bb.1 + + ; %8001:vgpr_32 = COPY %8000 + ; %8002:vgpr_32 = COPY %8000 + ; %8003:vgpr_32 = COPY %8000 + ; %8004:vgpr_32 = COPY %8000 + ; %8005:vgpr_32 = COPY %8000 + ; %8006:vgpr_32 = COPY %8000 + ; %8007:vgpr_32 = COPY %8000 + ; %8008:vgpr_32 = COPY %8000 + ; %8009:vgpr_32 = COPY %8000 + ; %8010:vgpr_32 = COPY %8000 + ; %8011:vgpr_32 = COPY %8000 + ; %8012:vgpr_32 = COPY %8000 + ; %8013:vgpr_32 = COPY %8000 + ; %8014:vgpr_32 = COPY %8000 + ; %8015:vgpr_32 = COPY %8000 + ; %8016:vgpr_32 = COPY %8000 + ; %8017:vgpr_32 = COPY %8000 + + ; %9001:vgpr_32 = COPY %8001 + ; %9002:vgpr_32 = COPY %8002 + ; %9003:vgpr_32 = COPY %8003 + ; %9004:vgpr_32 = COPY %8004 + ; %9005:vgpr_32 = COPY %8005 + ; %9006:vgpr_32 = COPY %8006 + ; %9007:vgpr_32 = COPY %8007 + ; %9008:vgpr_32 = COPY %8008 + ; %9009:vgpr_32 = COPY %8009 + ; %9010:vgpr_32 = COPY %8010 + ; %9011:vgpr_32 = COPY %8011 + ; %9012:vgpr_32 = COPY %8012 + ; %9013:vgpr_32 = COPY %8013 + ; %9014:vgpr_32 = COPY %8014 + ; %9015:vgpr_32 = COPY %8015 + ; %9016:vgpr_32 = COPY %8016 + ; %9017:vgpr_32 = COPY %8017 + + bb.1: + + EXP 0, %500, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %501, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %502, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %503, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %504, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %505, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %506, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %507, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %508, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %509, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5010, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5011, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5012, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5013, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5014, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5015, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5016, %3, %3, %3, -1, -1, 15, 
implicit $exec + EXP 0, %5017, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5018, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5019, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5020, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5021, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5022, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5023, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5024, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5025, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5026, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5027, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5028, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5029, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5030, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5031, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5032, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5033, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5034, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5035, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5036, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5037, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5038, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5039, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5040, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5041, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5042, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5043, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5044, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5045, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5046, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5047, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5048, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5049, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5050, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5051, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5052, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5053, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5054, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5055, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5056, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5057, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5058, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5059, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5060, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5061, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5062, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5063, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5064, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5065, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5066, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5067, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5068, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5069, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5070, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5071, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5072, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5073, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5074, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5075, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5076, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5077, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5078, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5079, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5080, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5081, %3, 
%3, %3, -1, -1, 15, implicit $exec + EXP 0, %5082, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5083, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5084, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5085, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5086, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5087, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5088, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5089, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5090, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5091, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5092, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5093, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5094, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5095, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5096, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5097, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5098, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %5099, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50100, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50101, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50102, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50103, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50104, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50105, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50106, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50107, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50108, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50109, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50110, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50111, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50112, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50113, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50114, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50115, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50116, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50117, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50118, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50119, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50120, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50121, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50122, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50123, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50124, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50125, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50126, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50127, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50128, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50129, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50130, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50131, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50132, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %50133, %3, %3, %3, -1, -1, 15, implicit $exec + + + S_ENDPGM 0 +... 
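The repeated exports above keep %500 through %50133 alive across the branch into bb.1; test bodies of this shape are normally generated rather than written by hand. A throwaway generator along these lines (a sketch, not part of the patch; the count and register naming are simply read off the test above) reproduces the EXP stream:

#include <cstdio>

int main() {
  // Emit one export per live value, matching the %500 .. %50133 pattern above.
  const int NumExports = 134;
  for (int I = 0; I < NumExports; ++I)
    std::printf("    EXP 0, %%50%d, %%3, %%3, %%3, -1, -1, 15, implicit $exec\n", I);
  return 0;
}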
+ \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir index a702f7fc8011e..69875261b74e9 100644 --- a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir +++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir @@ -1,6 +1,6 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat | FileCheck %s -# Check that the scalar loads have been moved to the use +# Check that the loads have been moved to the use # CHECK: bb.2: # CHECK: %[[#reg0:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 0, 0 # CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg0]], %{{.+}}, 0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir b/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir index 53f59cc3f8b0b..3a2d61555c0b4 100644 --- a/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir +++ b/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir @@ -1,125 +1,125 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-late-float-vtos -amdgpu-remat-enable-sub-exp-remat | FileCheck %s # DEFS -# CHECK: %[[#div00:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div00:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni00:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div00]], implicit $exec -# CHECK: %[[#div01:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div01:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni01:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div01]], implicit $exec -# CHECK: %[[#div02:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div02:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni02:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div02]], implicit $exec -# CHECK: %[[#div03:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div03:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni03:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div03]], implicit $exec -# CHECK: %[[#div04:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div04:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni04:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div04]], implicit $exec -# CHECK: %[[#div05:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div05:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni05:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div05]], implicit $exec -# CHECK: %[[#div06:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div06:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni06:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div06]], implicit $exec -# CHECK: %[[#div07:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div07:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni07:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div07]], implicit $exec -# CHECK: %[[#div08:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div08:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni08:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div08]], implicit $exec -# CHECK: %[[#div09:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div09:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni09:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div09]], implicit $exec -# CHECK: %[[#div10:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div10:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni10:]]:sgpr_32 = V_READFIRSTLANE_B32 
%[[#div10]], implicit $exec -# CHECK: %[[#div11:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div11:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni11:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div11]], implicit $exec -# CHECK: %[[#div12:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div12:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni12:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div12]], implicit $exec -# CHECK: %[[#div13:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div13:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni13:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div13]], implicit $exec -# CHECK: %[[#div14:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div14:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni14:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div14]], implicit $exec -# CHECK: %[[#div15:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div15:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni15:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div15]], implicit $exec -# CHECK: %[[#div16:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div16:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni16:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div16]], implicit $exec -# CHECK: %[[#div17:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div17:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni17:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div17]], implicit $exec -# CHECK: %[[#div18:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div18:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni18:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div18]], implicit $exec -# CHECK: %[[#div19:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div19:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni19:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div19]], implicit $exec -# CHECK: %[[#div20:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div20:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni20:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div20]], implicit $exec -# CHECK: %[[#div21:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div21:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni21:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div21]], implicit $exec -# CHECK: %[[#div22:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div22:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni22:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div22]], implicit $exec -# CHECK: %[[#div23:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div23:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni23:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div23]], implicit $exec -# CHECK: %[[#div24:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div24:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni24:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div24]], implicit $exec -# CHECK: %[[#div25:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div25:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni25:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div25]], implicit $exec -# CHECK: %[[#div26:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div26:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni26:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div26]], implicit $exec -# CHECK: %[[#div27:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: 
%[[#div27:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni27:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div27]], implicit $exec -# CHECK: %[[#div28:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div28:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni28:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div28]], implicit $exec -# CHECK: %[[#div29:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div29:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni29:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div29]], implicit $exec -# CHECK: %[[#div30:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div30:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni30:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div30]], implicit $exec -# CHECK: %[[#div31:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div31:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni31:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div31]], implicit $exec -# CHECK: %[[#div32:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div32:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni32:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div32]], implicit $exec -# CHECK: %[[#div33:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div33:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni33:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div33]], implicit $exec -# CHECK: %[[#div34:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div34:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni34:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div34]], implicit $exec -# CHECK: %[[#div35:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div35:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni35:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div35]], implicit $exec -# CHECK: %[[#div36:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div36:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni36:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div36]], implicit $exec -# CHECK: %[[#div37:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div37:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni37:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div37]], implicit $exec -# CHECK: %[[#div38:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div38:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni38:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div38]], implicit $exec -# CHECK: %[[#div39:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div39:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni39:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div39]], implicit $exec -# CHECK: %[[#div40:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div40:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni40:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div40]], implicit $exec -# CHECK: %[[#div41:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div41:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni41:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div41]], implicit $exec -# CHECK: %[[#div42:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div42:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni42:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div42]], implicit $exec -# CHECK: %[[#div43:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div43:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni43:]]:sgpr_32 = V_READFIRSTLANE_B32 
%[[#div43]], implicit $exec -# CHECK: %[[#div44:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div44:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni44:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div44]], implicit $exec -# CHECK: %[[#div45:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div45:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni45:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div45]], implicit $exec -# CHECK: %[[#div46:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div46:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni46:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div46]], implicit $exec -# CHECK: %[[#div47:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div47:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni47:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div47]], implicit $exec -# CHECK: %[[#div48:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div48:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni48:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div48]], implicit $exec -# CHECK: %[[#div49:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div49:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni49:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div49]], implicit $exec -# CHECK: %[[#div50:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div50:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni50:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div50]], implicit $exec -# CHECK: %[[#div51:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div51:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni51:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div51]], implicit $exec -# CHECK: %[[#div52:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div52:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni52:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div52]], implicit $exec -# CHECK: %[[#div53:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div53:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni53:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div53]], implicit $exec -# CHECK: %[[#div54:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div54:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni54:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div54]], implicit $exec -# CHECK: %[[#div55:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div55:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni55:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div55]], implicit $exec -# CHECK: %[[#div56:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div56:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni56:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div56]], implicit $exec -# CHECK: %[[#div57:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div57:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni57:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div57]], implicit $exec -# CHECK: %[[#div58:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div58:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni58:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div58]], implicit $exec -# CHECK: %[[#div59:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# CHECK: %[[#div59:]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec # CHECK: %[[#uni59:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div59]], implicit $exec @@ -269,66 +269,66 @@ body: | successors: %bb.1, %bb.2 liveins: $sgpr0, $sgpr1, 
$sgpr8, $vgpr0, $vgpr1 - %1000:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1001:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1002:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1003:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1004:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1005:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1006:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1007:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1008:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1009:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1010:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1011:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1012:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1013:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1014:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1015:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1016:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1017:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1018:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1019:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1020:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1021:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1022:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1023:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1024:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1025:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1026:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1027:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1028:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1029:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1030:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1031:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1032:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1033:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1034:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1035:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1036:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1037:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1038:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1039:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1040:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1041:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1042:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1043:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1044:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1045:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1046:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1047:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1048:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1049:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1050:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1051:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1052:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1053:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1054:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1055:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1056:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1057:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1058:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %1059:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1000:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1001:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1002:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1003:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1004:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1005:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1006:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1007:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1008:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1009:vgpr_32 = V_MOV_B32_e32 
$sgpr0, implicit $exec + %1010:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1011:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1012:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1013:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1014:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1015:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1016:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1017:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1018:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1019:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1020:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1021:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1022:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1023:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1024:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1025:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1026:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1027:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1028:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1029:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1030:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1031:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1032:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1033:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1034:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1035:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1036:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1037:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1038:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1039:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1040:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1041:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1042:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1043:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1044:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1045:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1046:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1047:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1048:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1049:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1050:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1051:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1052:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1053:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1054:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1055:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1056:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1057:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1058:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec + %1059:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %1059, 0, implicit $exec, implicit $mode $exec_lo = S_MOV_B32_term %116:sreg_32_xm0 S_CBRANCH_EXECZ %bb.2, implicit $exec From d36a4ae0143602d8e858a72745e735e92b2f7f30 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Wed, 12 Feb 2025 09:29:11 -0800 Subject: [PATCH 06/25] clang-format --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 582 +++++++++--------- llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp | 372 +++++------ llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h | 107 ++-- .../AMDGPU/AMDGPUMirDivergenceAnalysis.cpp | 215 +++---- .../AMDGPU/AMDGPUMirDivergenceAnalysis.h | 48 +- .../AMDGPUMirSyncDependenceAnalysis.cpp | 152 ++--- .../AMDGPU/AMDGPUMirSyncDependenceAnalysis.h | 17 +- 
.../AMDGPUOccupancyAndLatencyHelper.cpp | 21 +- .../AMDGPU/AMDGPUOccupancyAndLatencyHelper.h | 15 +- llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp | 106 ++-- llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h | 23 +- llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h | 27 +- 12 files changed, 820 insertions(+), 865 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index a3a20765c2df6..4656e28499a0d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -1,4 +1,4 @@ -//===-- AMDGPUHotBlockRematerialize.cpp - AMDGPU Hot Block Rematerialize-------===// +//===- AMDGPUHotBlockRematerialize.cpp - AMDGPU Hot BlockRematerialize ----===// // // The LLVM Compiler Infrastructure // @@ -13,24 +13,24 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" -#include "AMDGPUSubtarget.h" +#include "AMDGPUMIRUtils.h" #include "AMDGPUMirDivergenceAnalysis.h" +#include "AMDGPUOccupancyAndLatencyHelper.h" #include "AMDGPUSubExpDag.h" +#include "AMDGPUSubtarget.h" #include "AMDGPUVMemDegreeDAG.h" -#include "AMDGPUOccupancyAndLatencyHelper.h" #include "GCNRegPressure.h" #include "SIInstrInfo.h" -#include "SIRegisterInfo.h" #include "SIMachineFunctionInfo.h" -#include "AMDGPUMIRUtils.h" +#include "SIRegisterInfo.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/SlotIndexes.h" @@ -43,20 +43,24 @@ using namespace llvm; static cl::opt TargetOccupancy("amdgpu-remat-target-occupancy"); -static cl::opt EnableAggressive("amdgpu-remat-enable-hot-block-remat-aggressive"); -static cl::opt EnableSubExpAggressive("amdgpu-remat-enable-sub-exp-remat-aggressive"); -static cl::opt EnableSubExpClone("amdgpu-remat-enable-sub-exp-remat-clone"); +static cl::opt + EnableAggressive("amdgpu-remat-enable-hot-block-remat-aggressive"); +static cl::opt + EnableSubExpAggressive("amdgpu-remat-enable-sub-exp-remat-aggressive"); +static cl::opt + EnableSubExpClone("amdgpu-remat-enable-sub-exp-remat-clone"); static cl::opt EnableVmemDegree("amdgpu-remat-enable-vmem-degree"); static cl::opt EnableInBlockRemat("amdgpu-remat-enable-in-blk-remat"); static cl::opt EnableSubExp("amdgpu-remat-enable-sub-exp-remat"); -static cl::opt EnableUniformVectorToScalar("amdgpu-remat-enable-late-float-vtos"); -static cl::opt EnableSubExpMinReg("amdgpu-remat-enable-sub-exp-remat-min-reg"); +static cl::opt + EnableUniformVectorToScalar("amdgpu-remat-enable-late-float-vtos"); +static cl::opt + EnableSubExpMinReg("amdgpu-remat-enable-sub-exp-remat-min-reg"); namespace { typedef DenseSet InstSet; typedef DenseSet BlockSet; -template -using BlockMap = MapVector; +template using BlockMap = MapVector; // Rematerialize in a single pass instead of doing in register allcation. // If in register allocation, fail to rematerialize will cause spill. 
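The comment above states the motivation for the pass: do rematerialization in its own pass, because when it is left to register allocation a failed rematerialization turns into a spill. As a standalone illustration of the basic move (plain C++; ToyInst and rematSingleUseDefs are invented names, and this is a sketch of the idea, not the MachineIR implementation in this patch), re-emitting an input-free definition next to its only user shrinks its live range:

#include <cstdio>
#include <string>
#include <vector>

struct ToyInst {
  std::string Def;                 // register defined ("" if none)
  std::vector<std::string> Uses;   // registers read
  bool NoRegInputs() const { return Uses.empty(); }
};

// Move each single-use, input-free def directly in front of its one user.
static void rematSingleUseDefs(std::vector<ToyInst> &Block) {
  for (size_t I = 0; I < Block.size(); ++I) {
    if (Block[I].Def.empty() || !Block[I].NoRegInputs())
      continue;
    // Find the single user, if any.
    int User = -1, NumUsers = 0;
    for (size_t J = I + 1; J < Block.size(); ++J)
      for (const std::string &U : Block[J].Uses)
        if (U == Block[I].Def) { User = (int)J; ++NumUsers; }
    if (NumUsers != 1 || User == (int)I + 1)
      continue;
    // "Rematerialize": re-emit the def right before its user, so the value
    // is no longer live through the instructions in between.
    ToyInst Def = Block[I];
    Block.erase(Block.begin() + I);
    Block.insert(Block.begin() + (User - 1), Def);
  }
}

int main() {
  // %a is defined early but only used at the end: high pressure in between.
  std::vector<ToyInst> Block = {
      {"a", {}}, {"b", {}}, {"c", {"b"}}, {"d", {"c"}}, {"e", {"a", "d"}}};
  rematSingleUseDefs(Block);
  for (const ToyInst &I : Block)
    std::printf("%s <- %zu inputs\n", I.Def.c_str(), I.Uses.size());
  return 0;
}

In the pass itself a candidate must additionally be safe to move (no SCC or exec hazards) and profitable against the target occupancy, which is what the surrounding hunks implement.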
@@ -65,9 +69,9 @@ class AMDGPUHotBlockRematerialize : public MachineFunctionPass { public: static char ID; - DenseSet TotalUniformInsts; - DenseSet SafeToRemoveInsts; - DenseSet DivergentInsts; + DenseSet TotalUniformInsts; + DenseSet SafeToRemoveInsts; + DenseSet DivergentInsts; void RemoveInst(const MachineInstr *MI) { TotalUniformInsts.erase(MI); SafeToRemoveInsts.erase(MI); @@ -99,9 +103,8 @@ typedef AMDGPUHotBlockRematerialize Remat; // Util functions. namespace { -MachineBasicBlock * -nearest_common_dominator(MachineDominatorTree *DT, - BlockSet &Blocks) { +MachineBasicBlock *nearest_common_dominator(MachineDominatorTree *DT, + BlockSet &Blocks) { auto I = Blocks.begin(), E = Blocks.end(); MachineBasicBlock *DomB = cast(*(I++)); @@ -217,10 +220,10 @@ bool IsSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) { return true; } - // SGPR has alignment requirment, cannot get accurate reg number. const unsigned NearTargetRegLimit = 10; -bool nearSgprSpill(unsigned maxSPressure, const GCNSubtarget *ST, MachineFunction &MF) { +bool nearSgprSpill(unsigned maxSPressure, const GCNSubtarget *ST, + MachineFunction &MF) { unsigned maxSGPR = ST->getAddressableNumSGPRs(); const SIMachineFunctionInfo *MFI = MF.getInfo(); unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); @@ -254,10 +257,10 @@ struct RematStatus { DenseSet MemWriteMBBSet; }; -unsigned CollectMBBPressure( - MachineBasicBlock &MBB, LiveIntervals *LIS, const MachineRegisterInfo &MRI, - const GCNSubtarget *ST, unsigned &maxVPressure, unsigned &maxSPressure, - RematStatus &status) { +unsigned CollectMBBPressure(MachineBasicBlock &MBB, LiveIntervals *LIS, + const MachineRegisterInfo &MRI, + const GCNSubtarget *ST, unsigned &maxVPressure, + unsigned &maxSPressure, RematStatus &status) { // Skip processing current block if it has only debug instructions if (MBB.getFirstNonDebugInstr() == MBB.end()) return ST->getOccupancyWithNumVGPRs(0); @@ -290,10 +293,10 @@ unsigned CollectMBBPressure( return RP.getOccupancy(*ST); } -unsigned CollectFnPressure( - MachineFunction &MF, LiveIntervals *LIS, const MachineRegisterInfo &MRI, - const GCNSubtarget *ST, unsigned &maxVPressure, unsigned &maxSPressure, - RematStatus &status) { +unsigned CollectFnPressure(MachineFunction &MF, LiveIntervals *LIS, + const MachineRegisterInfo &MRI, + const GCNSubtarget *ST, unsigned &maxVPressure, + unsigned &maxSPressure, RematStatus &status) { unsigned TgtOcc = ST->getOccupancyWithWorkGroupSizes(MF).second; // If only have one block, input/ouput virtual live set are empty. 
if (MF.size() > 1) { @@ -376,14 +379,14 @@ unsigned CollectFnPressure( } return TgtOcc; } -RematStatus -GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS, - const MachineRegisterInfo &MRI, const GCNSubtarget *ST) { +RematStatus GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, + LiveIntervals *LIS, const MachineRegisterInfo &MRI, + const GCNSubtarget *ST) { unsigned maxSPressure = 0; unsigned maxVPressure = 0; RematStatus status; - unsigned TgtOcc = CollectFnPressure(MF, LIS, MRI, ST, maxVPressure, - maxSPressure, status); + unsigned TgtOcc = + CollectFnPressure(MF, LIS, MRI, ST, maxVPressure, maxSPressure, status); const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second; if (TgtOcc >= MaxOcc) { status.TargetOcc = TgtOcc; @@ -418,7 +421,7 @@ GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS, TgtOcc = bigOcc; bNotBalance = true; if (TgtOcc >= MaxOccupancy) - TgtOcc = MaxOccupancy-1; + TgtOcc = MaxOccupancy - 1; } } @@ -436,7 +439,7 @@ GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS, vInputPressure += RegSize; } else { unsigned RegIndex = SIRI->getHWRegIndex(Reg); - uint64_t mask = ((1 << RegSize) - 1 ) << RegIndex; + uint64_t mask = ((1 << RegSize) - 1) << RegIndex; sInputMask |= mask; } } @@ -451,7 +454,6 @@ GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS, mask = mask << 4; } - // If balanced, try next occupancy. TgtOcc = bNotBalance ? TgtOcc : (TgtOcc + 1); @@ -614,8 +616,7 @@ int GetSharedReducedSize(InstSet &ReducedInsts, bool bVGPR, } int GetReducedSize(MapVector &RematMap, bool bVGPR, - GCNRPTracker::LiveRegSet &CanidateSet, - InstSet &ReducedInsts, + GCNRPTracker::LiveRegSet &CanidateSet, InstSet &ReducedInsts, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, BlockLiveInfo &LiveInfo, DenseMap &RPOTIndexMap) { @@ -791,9 +792,11 @@ void BuildRematCandiates(std::vector &Candidates, } // For case like -// %477:sreg_32_xm0 = S_AND_B32 %472.sub0:sreg_64_xexec, %304:sreg_32_xm0, implicit-def dead $scc; xb.uniform -// S_CMP_EQ_U32 %302:sreg_32_xm0, %475:sreg_32_xm0, implicit-def $scc; xb.uniform -// %2489:sreg_32_xm0 = S_CSELECT_B32 %477:sreg_32_xm0, 16, implicit killed $scc; xb.uniform +// %477:sreg_32_xm0 = S_AND_B32 %472.sub0:sreg_64_xexec, %304:sreg_32_xm0, +// implicit-def dead $scc; xb.uniform +// S_CMP_EQ_U32 %302:sreg_32_xm0, %475:sreg_32_xm0, implicit-def $scc; +// xb.uniform %2489:sreg_32_xm0 = S_CSELECT_B32 %477:sreg_32_xm0, 16, implicit +// killed $scc; xb.uniform // Sink S_AND right before S_CSELECT will overwrite SCC. // To avoid it, skip case when DefMI and UseMI has implicit define use. bool isImplicitDefUse(MachineInstr *DefMI, MachineInstr *UseMI) { @@ -973,7 +976,7 @@ int FilterRematCandiates(std::vector &Candidates, } void updateUsers(unsigned Reg, unsigned NewReg, bool bSubRegDef, - SmallVector &userMIs) { + SmallVector &userMIs) { for (MachineInstr *UseMI : userMIs) { for (MachineOperand &MO : UseMI->operands()) { if (!MO.isReg()) @@ -999,7 +1002,6 @@ DenseMap reduceClonedMBBs( } } - // For userBlocks which dominate all hotBlocks, don't need to clone because // the value not cross hotBlocks when later blocks are cloned. // For userBlocks which dominated by all hotBlocks, they could share clones @@ -1064,68 +1066,45 @@ DenseMap reduceClonedMBBs( // Look for an earlier insert point if the InstructionToMove // writes to scc and scc is live at the CurrentInsertPoint. 
static MachineBasicBlock::iterator AdjustInsertPointToAvoidSccSmash( - MachineInstr *InstructionToMove, - MachineBasicBlock *MBB, - MachineBasicBlock::iterator CurrentInsertPoint, - MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI, - const SIInstrInfo *SIII -) -{ - const bool WillSmashScc = InstructionToMove->modifiesRegister(AMDGPU::SCC, SIRI); - if (WillSmashScc) - { - CurrentInsertPoint = llvm::FindOrCreateInsertionPointForSccDef(MBB, - CurrentInsertPoint, - SIRI, - SIII, - &MRI - ); - } - - return CurrentInsertPoint; + MachineInstr *InstructionToMove, MachineBasicBlock *MBB, + MachineBasicBlock::iterator CurrentInsertPoint, MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { + const bool WillSmashScc = + InstructionToMove->modifiesRegister(AMDGPU::SCC, SIRI); + if (WillSmashScc) { + CurrentInsertPoint = llvm::FindOrCreateInsertionPointForSccDef( + MBB, CurrentInsertPoint, SIRI, SIII, &MRI); + } + + return CurrentInsertPoint; } // Look for an earlier insert point if the SubExp // writes to scc and scc is live at the CurrentInsertPoint. static MachineBasicBlock::iterator AdjustInsertPointForSubExpToAvoidSccSmash( - const SubExp &SubExpToMove, - MachineBasicBlock *MBB, - MachineBasicBlock::iterator CurrentInsertPoint, - MachineRegisterInfo& MRI, - const SIRegisterInfo* SIRI, - const SIInstrInfo* SIII -) -{ - const bool WillSmashScc = SubExpToMove.modifiesRegister(AMDGPU::SCC, SIRI); - if (WillSmashScc) - { - CurrentInsertPoint = llvm::FindOrCreateInsertionPointForSccDef(MBB, - CurrentInsertPoint, - SIRI, - SIII, - &MRI - ); - } - - return CurrentInsertPoint; + const SubExp &SubExpToMove, MachineBasicBlock *MBB, + MachineBasicBlock::iterator CurrentInsertPoint, MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { + const bool WillSmashScc = SubExpToMove.modifiesRegister(AMDGPU::SCC, SIRI); + if (WillSmashScc) { + CurrentInsertPoint = llvm::FindOrCreateInsertionPointForSccDef( + MBB, CurrentInsertPoint, SIRI, SIII, &MRI); + } + + return CurrentInsertPoint; } // Return trun if moving MI to Location will smash a live scc value. -static bool WillSmashSccAtLocation( - MachineInstr* MI, - MachineBasicBlock* MBB, - MachineBasicBlock::iterator Location -) -{ - // It is ok to pass nullptr to `modifiesRegister` for TRI here since - // SCC has no subreg/suprereg relationships. - return MI->modifiesRegister(AMDGPU::SCC, nullptr) - && llvm::IsSccLiveAt(MBB, Location); +static bool WillSmashSccAtLocation(MachineInstr *MI, MachineBasicBlock *MBB, + MachineBasicBlock::iterator Location) { + // It is ok to pass nullptr to `modifiesRegister` for TRI here since + // SCC has no subreg/suprereg relationships. 
+ return MI->modifiesRegister(AMDGPU::SCC, nullptr) && + llvm::IsSccLiveAt(MBB, Location); } -void ApplyCloneRemat(Remat *Remat, - RematNode &Node, std::vector &hotBlocks, +void ApplyCloneRemat(Remat *Remat, RematNode &Node, + std::vector &hotBlocks, MachineDominatorTree *pDT, MachineRegisterInfo &MRI, SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, MachineFunction &MF) { @@ -1185,10 +1164,9 @@ void ApplyCloneRemat(Remat *Remat, InsertPointMI = UseMI; } } - + MachineBasicBlock::iterator InsertPoint = AdjustInsertPointToAvoidSccSmash( - DefMI, InsertPointMI->getParent(), InsertPointMI, MRI, SIRI, SIII - ); + DefMI, InsertPointMI->getParent(), InsertPointMI, MRI, SIRI, SIII); for (MachineMemOperand *MO : DefMI->memoperands()) { NewDef->addMemOperand(MF, MO); @@ -1221,10 +1199,11 @@ void ApplyCloneRemat(Remat *Remat, void ApplyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI, SlotIndexes *slotIndexes, - const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { + const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII) { MachineInstr *DefMI = Node.DefMI; MachineInstr *InsertPointMI = Node.InsertPointMI; - MachineBasicBlock* MBB = nullptr; + MachineBasicBlock *MBB = nullptr; // Find a valid insert point. MachineBasicBlock::iterator InsertPoint; @@ -1236,10 +1215,9 @@ void ApplyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI, MBB = Node.InsertBlock; } - InsertPoint = AdjustInsertPointToAvoidSccSmash( - DefMI, MBB, InsertPoint, MRI, SIRI, SIII - ); - + InsertPoint = AdjustInsertPointToAvoidSccSmash(DefMI, MBB, InsertPoint, MRI, + SIRI, SIII); + // Move instruction to new location. DefMI->removeFromParent(); InsertPoint->getParent()->insert(InsertPoint, DefMI); @@ -1271,7 +1249,8 @@ void ApplyRemat(Remat *Remat, MapVector &RematMap, if (Node.Kind == RematNode::RematKind::OneDefOneUse) { ApplyOneDefOneUseRemat(Node, MRI, slotIndexes, SIRI, SIII); } else if (Node.Kind == RematNode::RematKind::Clone) { - ApplyCloneRemat(Remat, Node, hotBlocks, pDT, MRI, slotIndexes, SIRI, SIII, MF); + ApplyCloneRemat(Remat, Node, hotBlocks, pDT, MRI, slotIndexes, SIRI, SIII, + MF); } } } @@ -1505,7 +1484,8 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(Reg); if (UseMI.getParent() != MBB) continue; - int gain = RematGain(&MI, Reg, CandidateRegs, MRI, SIRI, /*bVGPR*/false); + int gain = RematGain(&MI, Reg, CandidateRegs, MRI, SIRI, + /*bVGPR*/ false); if (gain > 0) { // Skip case when DefMI has implicit define which used by UseMI. if (isImplicitDefUse(&MI, &UseMI)) { @@ -1539,8 +1519,7 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, bool bNeedVRemat = rematVCnt > 0; // If sgpr spill, always do remat. 
bool bSRematOK = - (newRematSCnt <= 0 && !SRematMap.empty()) || - bForceRematSgpr; + (newRematSCnt <= 0 && !SRematMap.empty()) || bForceRematSgpr; bool bVRematOK = (status.bNotBalance || newRematVCnt <= 0) && !VRematMap.empty(); if (bNeedSRemat && bNeedVRemat) { @@ -1575,7 +1554,8 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, if (!SRematMap.empty()) { bUpdated = true; - ApplyRemat(Remat, SRematMap, hotBlocks, pDT, SlotIndexes, MRI, SIRI, SIII, MF); + ApplyRemat(Remat, SRematMap, hotBlocks, pDT, SlotIndexes, MRI, SIRI, SIII, + MF); LLVM_DEBUG(llvm::dbgs() << "after hotremat"; MF.print(dbgs());); } @@ -1595,49 +1575,46 @@ bool isPhyRegUniqueDef(unsigned Reg, const MachineRegisterInfo &MRI) { return DefMIs.size() == 1; } -static bool IsImplicitUseOfReg(const MachineOperand &MO, unsigned Reg) -{ - if (!MO.isImplicit() || !MO.isUse() || !MO.isReg()) - { - return false; - } +static bool IsImplicitUseOfReg(const MachineOperand &MO, unsigned Reg) { + if (!MO.isImplicit() || !MO.isUse() || !MO.isReg()) { + return false; + } - return MO.getReg() == Reg; + return MO.getReg() == Reg; } -static bool IsImplicitDefOfReg(const MachineOperand &MO, unsigned Reg) -{ - if (!MO.isImplicit() || !MO.isDef() || !MO.isReg()) - { - return false; - } +static bool IsImplicitDefOfReg(const MachineOperand &MO, unsigned Reg) { + if (!MO.isImplicit() || !MO.isDef() || !MO.isReg()) { + return false; + } - return MO.getReg() == Reg; + return MO.getReg() == Reg; } -static bool IsSafeRematCandidateUser(const MachineInstr *UseMI, const SIInstrInfo *SIII) -{ - // Make sure UseMI is not wqm like sample. - if (SIII->isWQM(UseMI->getOpcode())) - return false; - if (UseMI->getOpcode() == AMDGPU::PHI) - return false; - - return true; +static bool IsSafeRematCandidateUser(const MachineInstr *UseMI, + const SIInstrInfo *SIII) { + // Make sure UseMI is not wqm like sample. + if (SIII->isWQM(UseMI->getOpcode())) + return false; + if (UseMI->getOpcode() == AMDGPU::PHI) + return false; + + return true; } static bool isConvergent(Remat *Remat, const MachineInstr &MI) { return MI.isConvergent() && - // This flag is set on readfirstlane's to indicate that they - // are redundant (the value being read is already uniform). - // Normally, readfirstlanes are convergent, because different exec - // will cause a different value to be read; a known uniform - // readfirstlane is safe to move or clone and not actually convergent. - !Remat->TotalUniformInsts.count(&MI); + // This flag is set on readfirstlane's to indicate that they + // are redundant (the value being read is already uniform). + // Normally, readfirstlanes are convergent, because different exec + // will cause a different value to be read; a known uniform + // readfirstlane is safe to move or clone and not actually convergent. 
+ !Remat->TotalUniformInsts.count(&MI); } bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, bool bSink) { + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, + bool bSink) { if (Reg.isPhysical()) return false; bool bVGPR = SIRI->isVGPR(MRI, Reg); @@ -1664,7 +1641,8 @@ bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI, if (!Op.isReg()) continue; Register OpReg = Op.getReg(); - if (IsImplicitUseOfReg(Op, AMDGPU::EXEC) || IsImplicitUseOfReg(Op, AMDGPU::EXEC_LO)) + if (IsImplicitUseOfReg(Op, AMDGPU::EXEC) || + IsImplicitUseOfReg(Op, AMDGPU::EXEC_LO)) continue; if (IsImplicitUseOfReg(Op, AMDGPU::MODE)) continue; @@ -1675,7 +1653,8 @@ bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI, continue; if (OpReg.isPhysical()) return false; - if (!MRI.getUniqueVRegDef(OpReg) && !llvm::IsSub0Sub1SingleDef(OpReg, MRI)) { + if (!MRI.getUniqueVRegDef(OpReg) && + !llvm::IsSub0Sub1SingleDef(OpReg, MRI)) { return false; } } @@ -1696,12 +1675,10 @@ bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI, } std::vector buildSubExpFromCandidates( - Remat *Remat, - GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB, + Remat *Remat, GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, const MachineRegisterInfo &MRI, SlotIndexes *slotIndexes, - GCNRPTracker::LiveRegSet &unUsedPassThrus, - bool bAllowPartialUseInSubExp) { + GCNRPTracker::LiveRegSet &unUsedPassThrus, bool bAllowPartialUseInSubExp) { InstSet CandidateDefs; DenseSet RemovedCandidates; std::vector CandidateRegs; @@ -1798,7 +1775,7 @@ std::vector buildSubExpFromCandidates( break; } - if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*bSink*/true)) + if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*bSink*/ true)) continue; // If all users of MI are in candidate defs, add MI into candidate defs. @@ -1877,10 +1854,8 @@ std::vector buildSubExpFromCandidates( return dag.SubExps; } - std::vector buildSubExpFromCandidatesTopBottom( - Remat* Remat, - GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB, + Remat *Remat, GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, const MachineRegisterInfo &MRI, SlotIndexes *slotIndexes) { InstSet CandidateDefs; @@ -2052,7 +2027,7 @@ std::vector buildSubExpFromCandidatesTopBottom( } dbgs() << "\nFinished Candidate Defs End\n";); LLVM_DEBUG(dbgs() << "\nLocalCandidates:\n"; for (auto it - : LocalCandidates) { + : LocalCandidates) { pressure::print_reg(it.first, MRI, SIRI, llvm::dbgs()); } dbgs() << "\nLocalCandidates End\n";); // Make sure all input reg are uniqueDef. @@ -2064,7 +2039,6 @@ std::vector buildSubExpFromCandidatesTopBottom( return dag.SubExps; } - void print_vreg(Register Reg, const MachineRegisterInfo &MRI) { if (Reg.isVirtual()) { StringRef Name = MRI.getVRegName(Reg); @@ -2102,8 +2076,7 @@ MachineBasicBlock *FindTargetBlock(unsigned Reg, MachineBasicBlock *FromBB, void ApplySubExpMoveNearUser(SubExp &Exp, const MachineRegisterInfo &MRI, MachineDominatorTree *pDT, - SlotIndexes *slotIndexes, - const SIInstrInfo *SIII, + SlotIndexes *slotIndexes, const SIInstrInfo *SIII, const SIRegisterInfo *SIRI) { // Move from bottom. MachineBasicBlock *FromBB = Exp.FromBB; @@ -2118,12 +2091,14 @@ void ApplySubExpMoveNearUser(SubExp &Exp, const MachineRegisterInfo &MRI, continue; // Do not overwrite a live scc. 
- MachineBasicBlock::iterator InsertPoint = ToBB->SkipPHIsAndLabels(ToBB->begin()); + MachineBasicBlock::iterator InsertPoint = + ToBB->SkipPHIsAndLabels(ToBB->begin()); if (WillSmashSccAtLocation(DefMI, ToBB, InsertPoint)) continue; DefMI->removeFromParent(); - assert(!llvm::isExecUpdateForControlFlow(*InsertPoint) && "invalid insert point"); + assert(!llvm::isExecUpdateForControlFlow(*InsertPoint) && + "invalid insert point"); ToBB->insert(InsertPoint, DefMI); // Debug insts don't need slot index. if (DefMI->isDebugInstr()) @@ -2134,12 +2109,11 @@ void ApplySubExpMoveNearUser(SubExp &Exp, const MachineRegisterInfo &MRI, } } - void ApplySubExpMoveNearDefine(SubExp &Exp, MachineRegisterInfo &MRI, - MachineDominatorTree *pDT, - SlotIndexes *slotIndexes, - const SIInstrInfo *SIII, - const SIRegisterInfo *SIRI) { + MachineDominatorTree *pDT, + SlotIndexes *slotIndexes, + const SIInstrInfo *SIII, + const SIRegisterInfo *SIRI) { // Move from top. // Find lowest input def. MachineBasicBlock *ToBB = Exp.ToBB; @@ -2155,9 +2129,8 @@ void ApplySubExpMoveNearDefine(SubExp &Exp, MachineRegisterInfo &MRI, Terminator = ToBB->end(); } - Terminator = AdjustInsertPointForSubExpToAvoidSccSmash( - Exp, ToBB, Terminator, MRI, SIRI, SIII - ); + Terminator = AdjustInsertPointForSubExpToAvoidSccSmash(Exp, ToBB, Terminator, + MRI, SIRI, SIII); for (auto it = Exp.SUnits.begin(); it != Exp.SUnits.end(); it++) { MachineInstr *DefMI = *it; @@ -2391,11 +2364,12 @@ void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, reduceClonedMBBs(Exp, userBlocks, userBlocksLiveRegs, hotBlocks, pDT); // Sort to make stable order. - std::sort(userBlocks.begin(), userBlocks.end(), - [](std::pair>& it0, - std::pair>& it1) { + std::sort( + userBlocks.begin(), userBlocks.end(), + [](std::pair> &it0, + std::pair> &it1) { return it0.first->getNumber() < it1.first->getNumber(); - }); + }); const bool bModifiesScc = Exp.modifiesRegister(AMDGPU::SCC, SIRI); @@ -2484,7 +2458,6 @@ void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, } } - void ApplySubExpCloneNearUserInBlock( SubExp &Exp, DenseMap &inBlockHotVInstMap, @@ -2623,7 +2596,7 @@ unsigned getPacifistLevel(unsigned Reg, } bool hasInBlockDef(unsigned Reg, MachineBasicBlock *MBB, - const MachineRegisterInfo &MRI) { + const MachineRegisterInfo &MRI) { for (MachineInstr &def : MRI.def_instructions(Reg)) { if (def.getParent() != MBB) continue; @@ -2658,8 +2631,8 @@ bool isPassThru(unsigned Reg, const GCNRPTracker::LiveRegSet &inputLive, return inputLive.count(Reg) && outputLive.count(Reg); } -// Instructions which only use imm/passThru reg/output only reg will not kill any -// live reg, so name them pacifist here. +// Instructions which only use imm/passThru reg/output only reg will not kill +// any live reg, so name them pacifist here. 
bool collectPacifist(MachineInstr &MI, const GCNRPTracker::LiveRegSet &inputLive, const GCNRPTracker::LiveRegSet &outputLive, @@ -2676,7 +2649,8 @@ bool collectPacifist(MachineInstr &MI, continue; Register Reg = MO.getReg(); - if (MO.isImplicit() && (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO || Reg == AMDGPU::MODE)) + if (MO.isImplicit() && + (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO || Reg == AMDGPU::MODE)) continue; if (Reg.isPhysical()) return false; @@ -2702,7 +2676,8 @@ bool collectPacifist(MachineInstr &MI, if (Reg.isPhysical()) return false; - if (nullptr == getInBlockUniqueDef(Reg, MI.getParent(), inputLive, outputLive, MRI)) + if (nullptr == + getInBlockUniqueDef(Reg, MI.getParent(), inputLive, outputLive, MRI)) return false; bHasDef = true; @@ -2711,30 +2686,27 @@ bool collectPacifist(MachineInstr &MI, return bHasDef; } -static MachineInstr* findFirstAliasingLoadOrStoreInMBB( - MachineInstr &MI, - MachineBasicBlock &MBB, - AliasAnalysis *AA -) -{ - if (MI.mayLoadOrStore()) - { - for (MachineBasicBlock::iterator I = MI.getIterator(), E = MBB.end(); I != E; ++I) - { - const bool UseTBAA = false; - if (MI.mayAlias(AA, *I, UseTBAA)) - { - return &*I; - } - } +static MachineInstr *findFirstAliasingLoadOrStoreInMBB(MachineInstr &MI, + MachineBasicBlock &MBB, + AliasAnalysis *AA) { + if (MI.mayLoadOrStore()) { + for (MachineBasicBlock::iterator I = MI.getIterator(), E = MBB.end(); + I != E; ++I) { + const bool UseTBAA = false; + if (MI.mayAlias(AA, *I, UseTBAA)) { + return &*I; + } } + } - return nullptr; + return nullptr; } -static MachineInstr *findPacifistInsertPoint(MachineInstr &MI, MachineBasicBlock &MBB, MachineRegisterInfo &MRI, - AliasAnalysis *AA, - SlotIndexes *slotIndexes) { +static MachineInstr *findPacifistInsertPoint(MachineInstr &MI, + MachineBasicBlock &MBB, + MachineRegisterInfo &MRI, + AliasAnalysis *AA, + SlotIndexes *slotIndexes) { SmallVector users; @@ -2742,14 +2714,13 @@ static MachineInstr *findPacifistInsertPoint(MachineInstr &MI, MachineBasicBlock // op with which it aliases. Find the first instruction // that aliases the pacifist MI (if any) and add it to the list // of users. The sort() below will select the earliest user instruction. - if (MachineInstr* AliasMI = findFirstAliasingLoadOrStoreInMBB(MI, MBB, AA)) { + if (MachineInstr *AliasMI = findFirstAliasingLoadOrStoreInMBB(MI, MBB, AA)) { users.push_back(AliasMI); } for (MachineOperand &MO : MI.defs()) { unsigned Reg = MO.getReg(); - for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) - { + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { if (&MBB != UseMI.getParent()) continue; users.emplace_back(&UseMI); @@ -2773,8 +2744,7 @@ static MachineInstr *findPacifistInsertPoint(MachineInstr &MI, MachineBasicBlock bool tryHoldPacifist(MachineBasicBlock &MBB, LiveIntervals *LIS, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, AliasAnalysis *AA, - RematStatus &status) -{ + RematStatus &status) { const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[&MBB]; const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[&MBB]; @@ -2794,10 +2764,11 @@ bool tryHoldPacifist(MachineBasicBlock &MBB, LiveIntervals *LIS, bool bUpdated = false; // Move pacifist to its first user. 
- //for (MachineInstr *MI : pacifistList) { + // for (MachineInstr *MI : pacifistList) { for (auto it = pacifistList.rbegin(); it != pacifistList.rend(); it++) { MachineInstr *MI = *it; - MachineInstr *firstUser = findPacifistInsertPoint(*MI, MBB, MRI, AA, slotIndexes); + MachineInstr *firstUser = + findPacifistInsertPoint(*MI, MBB, MRI, AA, slotIndexes); if (firstUser == MI) continue; if (firstUser == MI->getNextNode()) @@ -2814,14 +2785,15 @@ bool tryHoldPacifist(MachineBasicBlock &MBB, LiveIntervals *LIS, // BRANCH may have exec update before it. insertPoint--; - insertPoint = llvm::skipDebugInstructionsBackward(insertPoint, MBB.instr_begin()); + insertPoint = + llvm::skipDebugInstructionsBackward(insertPoint, MBB.instr_begin()); while ((insertPoint->definesRegister(AMDGPU::EXEC, SIRI) || insertPoint->definesRegister(AMDGPU::EXEC_LO, SIRI)) && - insertPoint != MI->getIterator()) - { + insertPoint != MI->getIterator()) { insertPoint--; - insertPoint = llvm::skipDebugInstructionsBackward(insertPoint, MBB.instr_begin()); + insertPoint = + llvm::skipDebugInstructionsBackward(insertPoint, MBB.instr_begin()); } if (insertPoint == MI->getIterator()) continue; @@ -2887,7 +2859,7 @@ bool collectVToSCrossHotSpot( const SIInstrInfo *SIII) { unsigned VLimit = status.TargetVLimit; unsigned SLimit = status.TargetSLimit; - auto& ST = MBB.getParent()->getSubtarget(); + auto &ST = MBB.getParent()->getSubtarget(); GCNDownwardRPTracker Tracker(*LIS); @@ -2926,24 +2898,23 @@ bool collectVToSCrossHotSpot( VExtra--; bUpdated = true; } - } return bUpdated; } // Return true if the user is outside of the def's loop. -static bool IsCrossLoopUse(MachineInstr *Def, MachineInstr *User, MachineLoopInfo *MLI) -{ - MachineLoop* L = MLI->getLoopFor(Def->getParent()); +static bool IsCrossLoopUse(MachineInstr *Def, MachineInstr *User, + MachineLoopInfo *MLI) { + MachineLoop *L = MLI->getLoopFor(Def->getParent()); return L && !L->contains(User->getParent()); } bool rematUniformVgprToSgpr( - Remat *Remat, - MachineFunction &MF, RematStatus &status, + Remat *Remat, MachineFunction &MF, RematStatus &status, DenseMap &MBBPressureMap, - std::vector &hotBlocks, LiveIntervals *LIS, MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, MachineLoopInfo *MLI) { + std::vector &hotBlocks, LiveIntervals *LIS, + MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, MachineLoopInfo *MLI) { DenseMap UniformVgprMap = collectUniformVgprs(Remat, MF, MRI, SIRI); @@ -2977,7 +2948,8 @@ bool rematUniformVgprToSgpr( // Do not replace v->s across loops. Even if the value is uniform // branch divergence can cause a uniform value in a loop to be // non-uniform when used outside a loop. 
- if (IsSafeRematCandidateUser(&userMI, SIII) && !IsCrossLoopUse(MI, &userMI, MLI)) + if (IsSafeRematCandidateUser(&userMI, SIII) && + !IsCrossLoopUse(MI, &userMI, MLI)) userMIs.emplace_back(&userMI); } @@ -2993,7 +2965,7 @@ bool rematUniformVgprToSgpr( for (MachineInstr *userMI : userMIs) { const auto &Desc = userMI->getDesc(); bool bIllegal = false; - for (unsigned i=0;igetNumOperands();i++) { + for (unsigned i = 0; i < userMI->getNumOperands(); i++) { MachineOperand &MO = userMI->getOperand(i); if (!MO.isReg()) continue; @@ -3026,7 +2998,8 @@ bool rematUniformVgprToSgpr( auto rit = userMI->getReverseIterator(); rit++; auto endIt = userMI->getParent()->rend(); - while (rit != endIt && !rit->isDebugInstr() && !slotIndexes->hasIndex(*rit)) + while (rit != endIt && !rit->isDebugInstr() && + !slotIndexes->hasIndex(*rit)) slotIndexes->insertMachineInstrInMaps(*(rit++)); } } @@ -3112,9 +3085,8 @@ bool tryRemat(MachineBasicBlock &MBB, MachineInstr *hotMI, DenseSet &hotSet, int vDistance, int sDistance, unsigned VLimit, unsigned SLimit, const DenseSet &MemWriteMBBSet, - LiveIntervals *LIS, - const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - const SIInstrInfo *SIII) { + LiveIntervals *LIS, const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { auto &ST = MBB.getParent()->getSubtarget(); const auto &SI = LIS->getInstructionIndex(*hotMI).getBaseIndex(); const auto LISLR = llvm::getLiveRegs(SI, *LIS, MRI); @@ -3139,7 +3111,8 @@ bool tryRemat(MachineBasicBlock &MBB, MachineInstr *hotMI, continue; // Igonre inst in hot range. - if (RP.getVGPRNum(ST.hasGFX90AInsts()) > VLimit || RP.getMaxSGPR() > SLimit) { + if (RP.getVGPRNum(ST.hasGFX90AInsts()) > VLimit || + RP.getMaxSGPR() > SLimit) { Tracker.advance(); continue; } @@ -3254,7 +3227,7 @@ bool tryRematInHotSpot( unsigned VLimit = status.TargetVLimit; unsigned SLimit = status.TargetSLimit; - auto& ST = MBB.getParent()->getSubtarget(); + auto &ST = MBB.getParent()->getSubtarget(); const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[&MBB]; const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[&MBB]; @@ -3305,9 +3278,8 @@ bool tryRematInHotSpot( // Use hotVMI when apply. inBlockHotSInstMap[&MBB] = nullptr; if (tryRemat(MBB, hotVMI, inBlockCloneSubExps, /*bVGPR*/ true, inputLive, - outputLive, hotSet, vDistance, sDistance, VLimit, SLimit, - status.MemWriteMBBSet, - LIS, MRI, SIRI, SIII)) + outputLive, hotSet, vDistance, sDistance, VLimit, SLimit, + status.MemWriteMBBSet, LIS, MRI, SIRI, SIII)) return true; } @@ -3317,8 +3289,7 @@ bool tryRematInHotSpot( inBlockHotVInstMap[&MBB] = nullptr; return tryRemat(MBB, hotSMI, inBlockCloneSubExps, /*bVGPR*/ false, inputLive, outputLive, hotSet, vDistance, sDistance, VLimit, - SLimit, status.MemWriteMBBSet, - LIS, MRI, SIRI, SIII); + SLimit, status.MemWriteMBBSet, LIS, MRI, SIRI, SIII); } return false; } @@ -3449,7 +3420,8 @@ void sortSubExpCandidates(std::vector &subExpCandidates) { } } -// Compare pressure, return ture if maxV0/maxS0 pressure is higher than maxV1/maxS1. +// Compare pressure, return ture if maxV0/maxS0 pressure is higher than +// maxV1/maxS1. bool pressureHigher(unsigned maxV0, unsigned maxS0, unsigned maxV1, unsigned maxS1, const GCNSubtarget *ST) { unsigned VTgtOcc0 = ST->getOccupancyWithNumVGPRs(maxV0); @@ -3472,10 +3444,11 @@ bool pressureHigher(unsigned maxV0, unsigned maxS0, unsigned maxV1, } // Return true if the subExp can help pressure for passThrus. 
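// Before the next helper, a note on the pressureHigher comparison above
// (illustration, not part of the patch): "higher pressure" is decided by the
// occupancy each register budget would allow, not by raw register counts. A
// plausible shape of that comparison, assuming the GCNSubtarget occupancy
// queries already used in this file; the real implementation may break ties
// differently.
static unsigned occupancyForPressure(unsigned MaxV, unsigned MaxS,
                                     const GCNSubtarget *ST) {
  // The achievable wave count is limited by whichever register file is the
  // tighter constraint.
  return std::min(ST->getOccupancyWithNumVGPRs(MaxV),
                  ST->getOccupancyWithNumSGPRs(MaxS));
}
static bool pressureHigherSketch(unsigned MaxV0, unsigned MaxS0,
                                 unsigned MaxV1, unsigned MaxS1,
                                 const GCNSubtarget *ST) {
  // Lower achievable occupancy means the first block is effectively hotter.
  return occupancyForPressure(MaxV0, MaxS0, ST) <
         occupancyForPressure(MaxV1, MaxS1, ST);
}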
-bool canHelpPressureWhenSink(SubExp &subExp, const GCNRPTracker::LiveRegSet &passThrus, - const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - const SIInstrInfo *SIII, const MachineLoopInfo *MLI, - MachineDominatorTree *pDT, bool bCanClone,bool bSgprBound) { +bool canHelpPressureWhenSink( + SubExp &subExp, const GCNRPTracker::LiveRegSet &passThrus, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, const MachineLoopInfo *MLI, + MachineDominatorTree *pDT, bool bCanClone, bool bSgprBound) { LLVM_DEBUG(subExp.dump(MRI, SIRI)); if (!subExp.isSafeToMove(MRI, /*bMoveUp*/ false)) return false; @@ -3591,8 +3564,7 @@ bool canHelpPressureWhenHoist(SubExp &subExp, const MachineRegisterInfo &MRI, } SmallVector> -groupPassThruByDefBlock(Remat *Remat, - const GCNRPTracker::LiveRegSet &passThrus, +groupPassThruByDefBlock(Remat *Remat, const GCNRPTracker::LiveRegSet &passThrus, GCNRPTracker::LiveRegSet &usedPassThrus, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { @@ -3618,8 +3590,9 @@ groupPassThruByDefBlock(Remat *Remat, GCNRPTracker::LiveRegSet &DefInMBB = Candidates[DefMI->getParent()]; DefInMBB[Reg] = it.second; } - - llvm::SmallVector> result = Candidates.takeVector(); + + llvm::SmallVector> + result = Candidates.takeVector(); LLVM_DEBUG(llvm::dbgs() << "Before sort candidates\n"; for (auto it : result) { @@ -3636,7 +3609,7 @@ groupPassThruByDefBlock(Remat *Remat, }); LLVM_DEBUG(llvm::dbgs() << "After sort candidates\n"; for (auto it - : result) { + : result) { MachineBasicBlock *MBB = it.first; auto &defInMBB = it.second; MBB->dump(); @@ -3693,7 +3666,8 @@ collectPassThrus(MachineBasicBlock *MBB, return passThrus; } // Try to build a free subExp which all input is passThrus. -SubExp buildFreeSubExp(Remat *Remat, SubExp &subExp, GCNRPTracker::LiveRegSet &passThrus, +SubExp buildFreeSubExp(Remat *Remat, SubExp &subExp, + GCNRPTracker::LiveRegSet &passThrus, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) { SubExp freeExp; // Try to split the subExp to find a help case. @@ -3818,9 +3792,9 @@ std::vector buildSubExpCandidates( // Try to remove out reg def sub exp from DefMBB. GCNRPTracker::LiveRegSet &DefInMBB = it.second; // Go up on the dag until reach share node. - auto subExps = - buildSubExpFromCandidates(Remat, DefInMBB, DefMBB, SIRI, SIII, MRI, - slotIndexes, unUsedPassThrus, bAllowPartialUseInSubExp); + auto subExps = buildSubExpFromCandidates( + Remat, DefInMBB, DefMBB, SIRI, SIII, MRI, slotIndexes, unUsedPassThrus, + bAllowPartialUseInSubExp); for (SubExp &subExp : subExps) { if (subExp.bHasMemInst) { // Skip when memory ld/st inst need to cross MBB which write memory. 
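// Illustration, not part of the patch: one way to implement the "crosses a
// memory-writing block" test described above is to ask, for every block in
// MemWriteMBBSet, whether it can sit on a path from the defining block to the
// sink destination. The sketch below reuses the reach_block helper this patch
// adds (assuming, as its parameter order suggests, that it answers "can From
// reach To"); the actual check in buildSubExpCandidates may be more precise.
static bool mayCrossMemWrite(MachineBasicBlock *DefMBB,
                             MachineBasicBlock *SinkMBB,
                             const DenseSet<MachineBasicBlock *> &MemWriteMBBSet,
                             MachineDominatorTree *DT,
                             MachinePostDominatorTree *PDT,
                             MachineLoopInfo *MLI) {
  for (MachineBasicBlock *WriteMBB : MemWriteMBBSet) {
    // A store block reachable after the def and able to reach the new
    // location could be reordered against the moved load/store.
    if (llvm::reach_block(DefMBB, DT, PDT, MLI, WriteMBB) &&
        llvm::reach_block(WriteMBB, DT, PDT, MLI, SinkMBB))
      return true;
  }
  return false;
}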
@@ -3847,11 +3821,13 @@ std::vector buildSubExpCandidates( } } if (!canHelpPressureWhenSink(subExp, passThrus, MRI, SIRI, SIII, MLI, pDT, - bCanClone, bSgprBound)) { - if (bAllowPartialUseInSubExp && subExp.isSafeToMove(MRI, /*bMoveUp*/ false)) { - SubExp freeSubExp = buildFreeSubExp(Remat, subExp, passThrus, MRI, SIRI); - if (canHelpPressureWhenSink(freeSubExp, passThrus, MRI, SIRI, SIII, MLI, pDT, - bCanClone, bSgprBound)) { + bCanClone, bSgprBound)) { + if (bAllowPartialUseInSubExp && + subExp.isSafeToMove(MRI, /*bMoveUp*/ false)) { + SubExp freeSubExp = + buildFreeSubExp(Remat, subExp, passThrus, MRI, SIRI); + if (canHelpPressureWhenSink(freeSubExp, passThrus, MRI, SIRI, SIII, + MLI, pDT, bCanClone, bSgprBound)) { subExpCandidates.emplace_back(freeSubExp); } } @@ -3936,8 +3912,8 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates, unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI); LLVM_DEBUG(std::string movStr = Exp.bHoist ? "output hoist:" : "output sink:"; - dbgs() << movStr << Register::virtReg2Index(Reg) - << " " << Size); + dbgs() + << movStr << Register::virtReg2Index(Reg) << " " << Size); // Exp out live at block input. // It will descrease live for MBB when sink and increase when hoist. if (SIRI->isVGPR(MRI, Reg)) { @@ -3974,10 +3950,9 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates, // It will increase live for MBB. unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI); - LLVM_DEBUG(std::string movStr = - Exp.bHoist ? "input hoist:" : "input sink:"; - dbgs() << movStr << Register::virtReg2Index(Reg) - << " " << Size); + LLVM_DEBUG( + std::string movStr = Exp.bHoist ? "input hoist:" : "input sink:"; + dbgs() << movStr << Register::virtReg2Index(Reg) << " " << Size); if (SIRI->isVGPR(MRI, Reg)) { LLVM_DEBUG(dbgs() << "v\n"); if (Exp.bHoist) @@ -4019,8 +3994,8 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates, LaneBitmask profitMask = outMask & MBBBeginMask; if (MBBBeginMask.any()) { unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI); - LLVM_DEBUG(dbgs() << "move:" << Register::virtReg2Index(Reg) - << " " << Size); + LLVM_DEBUG(dbgs() + << "move:" << Register::virtReg2Index(Reg) << " " << Size); // Exp out live at block input. // It will descrease live for MBB. if (SIRI->isVGPR(MRI, Reg)) { @@ -4048,8 +4023,8 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates, // It will increase live for MBB. 
unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI); - LLVM_DEBUG(dbgs() << "add:" << Register::virtReg2Index(Reg) - << " " << Size); + LLVM_DEBUG(dbgs() + << "add:" << Register::virtReg2Index(Reg) << " " << Size); if (SIRI->isVGPR(MRI, Reg)) { LLVM_DEBUG(dbgs() << "v\n"); vgprDiff += Size; @@ -4095,8 +4070,8 @@ void addExpCandidates(std::vector &subExpCandidates, } bool tryToAddSubExps( - Remat *Remat, - HotBlock &hotBB, RematStatus &status, std::vector &subExpCandidates, + Remat *Remat, HotBlock &hotBB, RematStatus &status, + std::vector &subExpCandidates, std::vector &inBlockCloneSubExps, DenseMap &inBlockHotVInstMap, DenseMap &inBlockHotSInstMap, @@ -4110,9 +4085,9 @@ bool tryToAddSubExps( SlotIndexes *slotIndexes, LiveIntervals *LIS, MachineDominatorTree *pDT, bool bCanClone, bool bVOutBound, bool bSOutBound, GCNRPTracker::LiveRegSet &unUsedPassThrus, bool bAllowPartialUseInSubExp) { - std::vector partialSubExps = buildSubExpCandidates(Remat, - Candidates, passThrus, MRI, SIRI, SIII, MLI, slotIndexes, pDT, bCanClone, - bSOutBound, unUsedPassThrus, status.MemWriteMBBSet, + std::vector partialSubExps = buildSubExpCandidates( + Remat, Candidates, passThrus, MRI, SIRI, SIII, MLI, slotIndexes, pDT, + bCanClone, bSOutBound, unUsedPassThrus, status.MemWriteMBBSet, bAllowPartialUseInSubExp); GCNRPTracker::LiveRegSet tmpSavingInputLive = savingInputLive; @@ -4182,8 +4157,8 @@ bool tryToAddSubExps( // Try to remove out reg def sub exp from DefMBB. GCNRPTracker::LiveRegSet &UseInMBB = it.second; // Go up on the dag until reach share node. - auto subExps = buildSubExpFromCandidatesTopBottom(Remat, UseInMBB, UseMBB, SIRI, - SIII, MRI, slotIndexes); + auto subExps = buildSubExpFromCandidatesTopBottom( + Remat, UseInMBB, UseMBB, SIRI, SIII, MRI, slotIndexes); for (SubExp &subExp : subExps) { if (!canHelpPressureWhenHoist(subExp, MRI, SIRI, SIII, MLI, bSOutBound)) continue; @@ -4216,8 +4191,7 @@ bool tryToAddSubExps( if (EnableVmemDegree && // Only expect vmem when last tryToAddSubExps. // If not, bAllowPartialUseInSubExp will no chance to be true. - (bAllowPartialUseInSubExp || - !EnableSubExpAggressive)) { + (bAllowPartialUseInSubExp || !EnableSubExpAggressive)) { // Assume vmemLdSize could be optimized by not parallel. if (((vgpr - hotBB.vmemLdInputSize) <= VLimit || (vgpr - hotBB.vmemLdOutputSize) <= VLimit) && @@ -4256,8 +4230,7 @@ bool tryToAddSubExps( // Reason to do it per block is to make sure passthru reuse is precise. // If try remat on all hot blocks together, the passthru might be on one block, // but the reuse in on another block which the reg is not passthru there. -bool perBlockPassthruRemat(Remat *Remat, - std::vector &hotBlocks, +bool perBlockPassthruRemat(Remat *Remat, std::vector &hotBlocks, RematStatus &status, GCNRPTracker::LiveRegSet &liveRegCandidates, const GCNSubtarget *ST, LiveIntervals *LIS, @@ -4266,8 +4239,7 @@ bool perBlockPassthruRemat(Remat *Remat, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { bool bUpdated = false; - bool bCanClone = EnableSubExpClone | - EnableSubExpAggressive; + bool bCanClone = EnableSubExpClone | EnableSubExpAggressive; SlotIndexes *slotIndexes = LIS->getSlotIndexes(); // Sort hot blocks by pressure first. @@ -4331,19 +4303,19 @@ bool perBlockPassthruRemat(Remat *Remat, // Group pass thru regs by def MBB. 
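// Illustration, not part of the patch: grouping the pass-through registers by
// their defining block is essentially one pass over the set, bucketing each
// register under the parent block of its unique def. Hypothetical stand-alone
// sketch (the real groupPassThruByDefBlock, called right below, does the same
// through a MapVector and takeVector() to keep the order deterministic):
static MapVector<MachineBasicBlock *, GCNRPTracker::LiveRegSet>
groupByDefBlock(const GCNRPTracker::LiveRegSet &PassThrus,
                const MachineRegisterInfo &MRI) {
  MapVector<MachineBasicBlock *, GCNRPTracker::LiveRegSet> Groups;
  for (const auto &It : PassThrus) {
    Register Reg(It.first);
    // Virtual registers here have a single defining instruction.
    if (MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg))
      Groups[DefMI->getParent()][Reg] = It.second; // keep the live lane mask
  }
  return Groups;
}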
SmallVector> - Candidates = - groupPassThruByDefBlock(Remat, passThrus, usedPassThrus, MRI, SIRI, SIII); + Candidates = groupPassThruByDefBlock(Remat, passThrus, usedPassThrus, + MRI, SIRI, SIII); // unUsedPassThrus used to collect passThru which is skipped when build // subExp. GCNRPTracker::LiveRegSet unusedPassThrus; // Build exp dag on define blocks. bool bAllowPartialUseInSubExp = false; - if (tryToAddSubExps(Remat, it, status, subExpCandidates, inBlockCloneSubExps, - inBlockHotVInstMap, inBlockHotSInstMap, Candidates, - vgpr, sgpr, savingInputLive, savingOutputLive, - passThrus, usedRegs, MRI, SIRI, SIII, MLI, slotIndexes, - LIS, pDT, bCanClone, bVOutBound, bSOutBound, - unusedPassThrus, bAllowPartialUseInSubExp)) { + if (tryToAddSubExps( + Remat, it, status, subExpCandidates, inBlockCloneSubExps, + inBlockHotVInstMap, inBlockHotSInstMap, Candidates, vgpr, sgpr, + savingInputLive, savingOutputLive, passThrus, usedRegs, MRI, SIRI, + SIII, MLI, slotIndexes, LIS, pDT, bCanClone, bVOutBound, bSOutBound, + unusedPassThrus, bAllowPartialUseInSubExp)) { // Remove unusedPassThrus from passThrus first. llvm::andNotLiveRegSet(passThrus, unusedPassThrus); llvm::mergeLiveRegSet(usedPassThrus, passThrus); @@ -4359,12 +4331,12 @@ bool perBlockPassthruRemat(Remat *Remat, return false; bAllowPartialUseInSubExp = true; - if (!tryToAddSubExps(Remat, it, status, subExpCandidates, inBlockCloneSubExps, - inBlockHotVInstMap, inBlockHotSInstMap, Candidates, - vgpr, sgpr, savingInputLive, savingOutputLive, - passThrus, usedRegs, MRI, SIRI, SIII, MLI, slotIndexes, - LIS, pDT, bCanClone, bVOutBound, bSOutBound, - unusedPassThrus, bAllowPartialUseInSubExp)) { + if (!tryToAddSubExps( + Remat, it, status, subExpCandidates, inBlockCloneSubExps, + inBlockHotVInstMap, inBlockHotSInstMap, Candidates, vgpr, sgpr, + savingInputLive, savingOutputLive, passThrus, usedRegs, MRI, SIRI, + SIII, MLI, slotIndexes, LIS, pDT, bCanClone, bVOutBound, bSOutBound, + unusedPassThrus, bAllowPartialUseInSubExp)) { return false; } // Just merge all passThrus after tryToAddSubExps allow partialUseInSubExp. @@ -4430,10 +4402,9 @@ int getVMemLdSize(MachineBasicBlock &MBB, const SIInstrInfo *SIII, } // namespace -bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS, - MachineDominatorTree *pDT, MachinePostDominatorTree *pPDT, - AliasAnalysis *AA) -{ +bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, + LiveIntervals *LIS, MachineDominatorTree *pDT, + MachinePostDominatorTree *pPDT, AliasAnalysis *AA) { if (MF.size() < 2) return false; const GCNSubtarget *ST = &MF.getSubtarget(); @@ -4495,7 +4466,6 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveInt maxLocalSPressure, status); maxLocalSPressure += RegForVCC; - } if (maxLocalVPressure <= VLimit && maxLocalSPressure <= SLimit) continue; @@ -4504,7 +4474,9 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveInt if (bBothOutLimit && maxLocalVPressure <= VLimit) continue; GCNRPTracker::LiveRegSet liveSet; - hotBlocks.push_back({ &MBB, liveSet,std::make_pair(maxLocalVPressure, maxLocalSPressure), 0, 0 }); + hotBlocks.push_back({&MBB, liveSet, + std::make_pair(maxLocalVPressure, maxLocalSPressure), + 0, 0}); } // Collect vmemLdInput/OutputSize. 
if (EnableVmemDegree) { @@ -4546,8 +4518,8 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveInt } if (EnableUniformVectorToScalar) { - if (rematUniformVgprToSgpr(Remat, MF, status, status.MBBPressureMap, hotBlocks, LIS, MRI, - SIRI, SIII, MLI)) { + if (rematUniformVgprToSgpr(Remat, MF, status, status.MBBPressureMap, + hotBlocks, LIS, MRI, SIRI, SIII, MLI)) { // Rebuild LIS. LIS->reanalyze(MF); status = GetRematStatus(MF, MLI, LIS, MRI, ST); @@ -4601,15 +4573,17 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveInt PressureUnderLimitSet.insert(MBB); } else { if (MaxLocalVGPR < it.maxPressures.first) - it.maxPressures = std::make_pair(MaxLocalVGPR, it.maxPressures.second); + it.maxPressures = + std::make_pair(MaxLocalVGPR, it.maxPressures.second); if (MaxLocalSGPR < it.maxPressures.second) it.maxPressures = std::make_pair(it.maxPressures.first, MaxLocalSGPR); } } } - bool bUpdated = perBlockPassthruRemat(Remat, hotBlocks, status, liveRegCandidates, - ST, LIS, MLI, pDT, MRI, SIRI, SIII); + bool bUpdated = + perBlockPassthruRemat(Remat, hotBlocks, status, liveRegCandidates, ST, + LIS, MLI, pDT, MRI, SIRI, SIII); return bUpdated; } @@ -4618,8 +4592,10 @@ bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) { if (MF.size() < 2) return false; LiveIntervals *LIS = &getAnalysis().getLIS(); - MachineDominatorTree *DT = &getAnalysis().getDomTree(); - MachinePostDominatorTree *PDT = &getAnalysis().getPostDomTree(); + MachineDominatorTree *DT = + &getAnalysis().getDomTree(); + MachinePostDominatorTree *PDT = + &getAnalysis().getPostDomTree(); MachineLoopInfo *MLI = &getAnalysis().getLI(); AliasAnalysis *AA = &getAnalysis().getAAResults(); @@ -4628,9 +4604,10 @@ bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) { CI.compute(MF); auto TTI = MF.getTarget().getTargetTransformInfo(MF.getFunction()); MachineUniformityInfo MachineUniformity = - llvm::computeMachineUniformityInfo(MF, CI, *DT, /*HasBranchDivergence*/true); + llvm::computeMachineUniformityInfo(MF, CI, *DT, + /*HasBranchDivergence*/ true); - //llvm::MirGPUDivergenceAnalysis DA(MF, *DT, *PDT, *MLI); + // llvm::MirGPUDivergenceAnalysis DA(MF, *DT, *PDT, *MLI); for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { if (MachineUniformity.isUniform(&MI)) { @@ -4640,8 +4617,8 @@ bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) { } } - //LLVM_DEBUG(pressure::write_pressure(MF, LIS, R"(D:\Temp\d.json)")); - // For non-cs/ps, set target occ as 4. + // LLVM_DEBUG(pressure::write_pressure(MF, LIS, R"(D:\Temp\d.json)")); + // For non-cs/ps, set target occ as 4. 
bool bNearTarget = false; bool bFinalUpdated = false; bool bUpdated = hotBlockRemat(this, MF, MLI, LIS, DT, PDT, bNearTarget); @@ -4666,8 +4643,8 @@ INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass) INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) -INITIALIZE_PASS_END(AMDGPUHotBlockRematerialize, DEBUG_TYPE, "AMDGPU rematerialize", - false, false) +INITIALIZE_PASS_END(AMDGPUHotBlockRematerialize, DEBUG_TYPE, + "AMDGPU rematerialize", false, false) char AMDGPUHotBlockRematerialize::ID = 0; char &llvm::AMDGPUHotBlockRematerializeID = AMDGPUHotBlockRematerialize::ID; @@ -4675,4 +4652,3 @@ char &llvm::AMDGPUHotBlockRematerializeID = AMDGPUHotBlockRematerialize::ID; FunctionPass *llvm::createAMDGPUHotBlockRematerializePass() { return new AMDGPUHotBlockRematerialize(); } - diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp index 6f44fec08239c..365fb058bf6b3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp @@ -1,22 +1,11 @@ -/////////////////////////////////////////////////////////////////////////////// -// // -// AMDGPUMIRUtils.cpp // -// Copyright (C) Microsoft Corporation. All rights reserved. // -// This file is distributed under the University of Illinois Open Source // -// License. See LICENSE.TXT for details. // -// // -// Util functions for llvm MIR Passes. // -// // -/////////////////////////////////////////////////////////////////////////////// - -#include "llvm/CodeGen/MachinePostDominators.h" -#include "llvm/CodeGen/SlotIndexes.h" -#include "llvm/CodeGen/MachineLoopInfo.h" #include "SIInstrInfo.h" -#include "SIRegisterInfo.h" #include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/SlotIndexes.h" -//#include "dxc/DXIL/DxilMetadataHelper.h" +// #include "dxc/DXIL/DxilMetadataHelper.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/raw_ostream.h" @@ -26,9 +15,9 @@ #include "llvm/Support/Debug.h" -#include "GCNRegPressure.h" #include "AMDGPUMIRUtils.h" #include "AMDGPUSubExpDag.h" +#include "GCNRegPressure.h" #include #define DEBUG_TYPE "xb-mir-util" @@ -48,7 +37,7 @@ class CFGWithPhi { phiInsts.insert(&I); unsigned Reg = I.getOperand(0).getReg(); // Add incoming values. - for (unsigned i=1;i &) const {} MachineFunction &F; - DenseMap> blockToPhiInstsMap; + DenseMap> + blockToPhiInstsMap; void dump(); }; @@ -110,7 +100,8 @@ template <> struct DOTGraphTraits : public DefaultDOTGraphTraits { return R; } - static std::string getNodeLabel(const MachineBasicBlock *BB, const CFGWithPhi *G) { + static std::string getNodeLabel(const MachineBasicBlock *BB, + const CFGWithPhi *G) { enum { MaxColumns = 8000 }; std::string Str; raw_string_ostream OS(Str); @@ -347,7 +338,7 @@ void andNotLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet) { } MachineBasicBlock *split(MachineInstr *Inst) { - + // Create the fall-through block. 
MachineBasicBlock *MBB = Inst->getParent(); MachineFunction *MF = MBB->getParent(); @@ -462,9 +453,8 @@ bool reduceChannel(unsigned offset, MachineInstr &MI, const MCInstrDesc &desc, .addImm(offset * LaneSize); MachineInstr *OffsetAddMI = OffsetAdd.getInstr(); MachineBasicBlock::iterator InsertPoint = - llvm::FindOrCreateInsertionPointForSccDef( - MI.getParent(), MI, SIRI, SIII, &MRI - ); + llvm::FindOrCreateInsertionPointForSccDef(MI.getParent(), MI, SIRI, + SIII, &MRI); MI.getParent()->insert(InsertPoint, OffsetAddMI); SIII->legalizeOperands(*OffsetAddMI); OffsetOp->setReg(NewOffsetReg); @@ -631,7 +621,7 @@ bool reach_blocks(MachineBasicBlock *BB, MachineDominatorTree *DT, return bCross; } -} +} // namespace llvm namespace llvm { void viewCFGWithPhi(llvm::MachineFunction &F) { @@ -1520,12 +1510,12 @@ void write_pressure(MachineFunction &MF, LiveIntervals *LIS, raw_ostream &os) { } } // namespace pressure -}// namespace llvm +} // namespace llvm namespace { class ContributionList { public: - ContributionList(MachineFunction &MF) : MF(MF){}; + ContributionList(MachineFunction &MF) : MF(MF) {}; void build(); bool propagateContribution(); MachineFunction &MF; @@ -1754,46 +1744,45 @@ void write_contribution_list(llvm::MachineFunction &MF, const char *Filename) { } } // namespace llvm -static bool IsPhysReg(const MachineOperand &Op) -{ - return Op.isReg() && Op.getReg().isPhysical(); +static bool IsPhysReg(const MachineOperand &Op) { + return Op.isReg() && Op.getReg().isPhysical(); } // Sometimes split bb uses physical registers defined in BB, have to add them to // live-in or the ir is malformed. -void llvm::UpdatePhysRegLiveInForBlock(MachineBasicBlock *NewBB, const MachineRegisterInfo *MRI) -{ - // Initialize with current set of liveins. For new blocks this will be empty. - SmallDenseSet DefSet; - for (const MachineBasicBlock::RegisterMaskPair &P : NewBB->liveins()) - { - DefSet.insert(P.PhysReg); - } +void llvm::UpdatePhysRegLiveInForBlock(MachineBasicBlock *NewBB, + const MachineRegisterInfo *MRI) { + // Initialize with current set of liveins. For new blocks this will be empty. + SmallDenseSet DefSet; + for (const MachineBasicBlock::RegisterMaskPair &P : NewBB->liveins()) { + DefSet.insert(P.PhysReg); + } - for (auto &MI : *NewBB) - { - // Add all undefined physical registers to the live in set. - for (MachineOperand &Use : MI.operands()) - { - // Only process physreg uses. - if (!IsPhysReg(Use) || !Use.isUse()) continue; + for (auto &MI : *NewBB) { + // Add all undefined physical registers to the live in set. + for (MachineOperand &Use : MI.operands()) { + // Only process physreg uses. + if (!IsPhysReg(Use) || !Use.isUse()) + continue; - // Reserved regs do not need to be tracked through live-in sets. - unsigned Reg = Use.getReg(); - if (Use.isImplicit() && MRI && MRI->isReserved(Reg)) continue; + // Reserved regs do not need to be tracked through live-in sets. + unsigned Reg = Use.getReg(); + if (Use.isImplicit() && MRI && MRI->isReserved(Reg)) + continue; - if (!DefSet.count(Reg)) - NewBB->addLiveIn(Reg); - } + if (!DefSet.count(Reg)) + NewBB->addLiveIn(Reg); + } - // Add all physical register defs (exlicit+implicit) to the def register set. - for (MachineOperand &Def : MI.operands()) - { - // Only process physreg defs. - if (!IsPhysReg(Def) || !Def.isDef()) continue; - DefSet.insert(Def.getReg()); - } + // Add all physical register defs (exlicit+implicit) to the def register + // set. + for (MachineOperand &Def : MI.operands()) { + // Only process physreg defs. 
+ if (!IsPhysReg(Def) || !Def.isDef()) + continue; + DefSet.insert(Def.getReg()); } + } } void llvm::BuildPhysRegLiveInForBlock(MachineBasicBlock *NewBB, @@ -1829,50 +1818,41 @@ void llvm::BuildPhysRegLiveInForBlock(MachineBasicBlock *NewBB, } } -MachineReg llvm::CreateVirtualRegForOperand( - MachineOpcode Opcode, - unsigned OpNum, - MachineFunction &MF -) -{ - const TargetSubtargetInfo &ST = MF.getSubtarget(); - const TargetRegisterInfo *TRI = ST.getRegisterInfo(); - const TargetInstrInfo *TII = ST.getInstrInfo(); - const MCInstrDesc &Desc = TII->get(Opcode); - const TargetRegisterClass *RC = TII->getRegClass(Desc, OpNum, TRI, MF); - if (!RC) - { - llvm::report_fatal_error("Unable to create virtual reg for instruction operand"); - } +MachineReg llvm::CreateVirtualRegForOperand(MachineOpcode Opcode, + unsigned OpNum, + MachineFunction &MF) { + const TargetSubtargetInfo &ST = MF.getSubtarget(); + const TargetRegisterInfo *TRI = ST.getRegisterInfo(); + const TargetInstrInfo *TII = ST.getInstrInfo(); + const MCInstrDesc &Desc = TII->get(Opcode); + const TargetRegisterClass *RC = TII->getRegClass(Desc, OpNum, TRI, MF); + if (!RC) { + llvm::report_fatal_error( + "Unable to create virtual reg for instruction operand"); + } - MachineRegisterInfo &MRI = MF.getRegInfo(); - return MRI.createVirtualRegister(RC); + MachineRegisterInfo &MRI = MF.getRegInfo(); + return MRI.createVirtualRegister(RC); } -MachineReg llvm::CreateVirtualDstReg( - MachineOpcode Opcode, - MachineFunction &MF -) -{ - return llvm::CreateVirtualRegForOperand(Opcode, 0, MF); +MachineReg llvm::CreateVirtualDstReg(MachineOpcode Opcode, + MachineFunction &MF) { + return llvm::CreateVirtualRegForOperand(Opcode, 0, MF); } // Return true if the MI is a copy of exec. // If true then sets pDst to the destination register. -bool llvm::IsExecCopy(const MachineInstr &MI, MachineReg Exec, MachineReg *pDst) -{ - enum {DST=0, SRC=1}; - bool FoundCopy = false; - if (MI.getOpcode() == AMDGPU::COPY - || MI.getOpcode() == AMDGPU::S_MOV_B32 - || MI.getOpcode() == AMDGPU::S_MOV_B64) - { - const MachineOperand &Src = MI.getOperand(SRC); - if (Src.isReg() && Src.getReg() == Exec) - { - FoundCopy = true; - } +bool llvm::IsExecCopy(const MachineInstr &MI, MachineReg Exec, + MachineReg *pDst) { + enum { DST = 0, SRC = 1 }; + bool FoundCopy = false; + if (MI.getOpcode() == AMDGPU::COPY || MI.getOpcode() == AMDGPU::S_MOV_B32 || + MI.getOpcode() == AMDGPU::S_MOV_B64) { + const MachineOperand &Src = MI.getOperand(SRC); + if (Src.isReg() && Src.getReg() == Exec) { + FoundCopy = true; } + } #if 0 // TODO: Delete this. 
else if (MI.getOpcode() == AMDGPU::AMDGPU_GET_ENTRY_ACTIVE_MASK_PSEUDO || MI.getOpcode() == AMDGPU::AMDGPU_GET_ENTRY_ACTIVE_MASK_PSEUDO_32) @@ -1880,29 +1860,26 @@ bool llvm::IsExecCopy(const MachineInstr &MI, MachineReg Exec, MachineReg *pDst) FoundCopy = true; } #endif - - if (FoundCopy) - { - *pDst = MI.getOperand(DST).getReg(); - } - return FoundCopy; + if (FoundCopy) { + *pDst = MI.getOperand(DST).getReg(); + } + + return FoundCopy; } -llvm::MachineRegWithSubReg llvm::GetWqmEntryActiveMask(MachineFunction &MF) -{ - llvm::MachineRegWithSubReg LiveLaneMask = {AMDGPU::NoRegister, AMDGPU::NoSubRegister}; - if (MachineInstr* MI = GetWqmEntryActiveMaskInst(MF)) - { - LiveLaneMask.Reg = MI->getOperand(0).getReg(); - LiveLaneMask.SubReg = MI->getOperand(0).getSubReg(); - } +llvm::MachineRegWithSubReg llvm::GetWqmEntryActiveMask(MachineFunction &MF) { + llvm::MachineRegWithSubReg LiveLaneMask = {AMDGPU::NoRegister, + AMDGPU::NoSubRegister}; + if (MachineInstr *MI = GetWqmEntryActiveMaskInst(MF)) { + LiveLaneMask.Reg = MI->getOperand(0).getReg(); + LiveLaneMask.SubReg = MI->getOperand(0).getSubReg(); + } - return LiveLaneMask; + return LiveLaneMask; } -MachineInstr* llvm::GetWqmEntryActiveMaskInst(MachineFunction &MF) -{ +MachineInstr *llvm::GetWqmEntryActiveMaskInst(MachineFunction &MF) { #if 0 // TODO: Get rid of this // Look forward in the entry block for the SET_LIVE_LANE_MASK instruction. // This instruction is added by the SIWholeQuadMode pass. @@ -1917,22 +1894,23 @@ MachineInstr* llvm::GetWqmEntryActiveMaskInst(MachineFunction &MF) } #endif - return nullptr; + return nullptr; } -bool llvm::IsFetchShaderCall(const MachineInstr *MI) -{ +bool llvm::IsFetchShaderCall(const MachineInstr *MI) { #if 0 // TODO: Get rid of this. return MI->getOpcode() == AMDGPU::AMDGPU_CALL_FETCH_SHADER || MI->getAMDGPUFlag(MachineInstr::AMDGPUMIFlag::FetchShaderCall); #else - return false; + return false; #endif } -bool llvm::IsSccLiveAt(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator MI) { - const TargetRegisterInfo* TRI = MBB->getParent()->getRegInfo().getTargetRegisterInfo(); +bool llvm::IsSccLiveAt(llvm::MachineBasicBlock *MBB, + llvm::MachineBasicBlock::iterator MI) { + const TargetRegisterInfo *TRI = + MBB->getParent()->getRegInfo().getTargetRegisterInfo(); for (auto it = MI; it != MBB->end(); ++it) { const MachineInstr &CurMI = *it; // Hit use of scc, it is live. @@ -1962,79 +1940,70 @@ bool llvm::IsSccLiveAt(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::it // as the new insert location. // MachineBasicBlock::iterator llvm::FindOrCreateInsertionPointForSccDef( - MachineBasicBlock *MBB, - MachineBasicBlock::iterator MI, - const TargetRegisterInfo* TRI, - const SIInstrInfo* TII, - MachineRegisterInfo* MRI, - SccDefInsertPointConstraintFlags Constraints -) -{ - // If SCC is dead at MI when we can use MI as the insert point. - if (!llvm::IsSccLiveAt(MBB, MI)) - { - return MI; - } + MachineBasicBlock *MBB, MachineBasicBlock::iterator MI, + const TargetRegisterInfo *TRI, const SIInstrInfo *TII, + MachineRegisterInfo *MRI, SccDefInsertPointConstraintFlags Constraints) { + // If SCC is dead at MI when we can use MI as the insert point. + if (!llvm::IsSccLiveAt(MBB, MI)) { + return MI; + } - const bool CheckForExecWrite = - Constraints & SccDefInsertPointConstraintFlags::NoExecWrite; + const bool CheckForExecWrite = + Constraints & SccDefInsertPointConstraintFlags::NoExecWrite; - // Get the starting reverse iterator taking care to handle the MBB->end() case. 
- MachineBasicBlock::reverse_iterator Start; - if (MI == MBB->end()) - { - Start = MBB->rbegin(); - } - else - { - Start = MI.getReverse(); - } - - // Otherwise, walk backwards through the block looking for a location where - // SCC is dead. - for (MachineBasicBlock::reverse_iterator It = Start, End = MBB->rend(); It != End; ++It) - { - // If the instruction modifies exec then we cannot use it as - // an insertion point (if that is a constraint from the caller). - // The check for EXEC works for both wave64 and wave32 because - // it will also catch writes to the subregisters (e.g. exec_lo). - if (CheckForExecWrite && It->modifiesRegister(AMDGPU::EXEC, TRI)) - { - break; - } + // Get the starting reverse iterator taking care to handle the MBB->end() + // case. + MachineBasicBlock::reverse_iterator Start; + if (MI == MBB->end()) { + Start = MBB->rbegin(); + } else { + Start = MI.getReverse(); + } - if (It->modifiesRegister(AMDGPU::SCC, TRI) - && !It->readsRegister(AMDGPU::SCC, TRI)) - { - return It->getIterator(); - } + // Otherwise, walk backwards through the block looking for a location where + // SCC is dead. + for (MachineBasicBlock::reverse_iterator It = Start, End = MBB->rend(); + It != End; ++It) { + // If the instruction modifies exec then we cannot use it as + // an insertion point (if that is a constraint from the caller). + // The check for EXEC works for both wave64 and wave32 because + // it will also catch writes to the subregisters (e.g. exec_lo). + if (CheckForExecWrite && It->modifiesRegister(AMDGPU::EXEC, TRI)) { + break; } - // If no safe location can be found in the block we can save and restore - // SCC around MI. There is no way to directly read or write SCC so we use - // s_cselect to read the current value of SCC and s_cmp to write the saved - // value back to SCC. - // - // The generated code will look like this; - // - // S_CSELECT_B32 %SavedSCC, -1, 0 # Save SCC - // <----- Newly created safe insert point. - // MI - // S_CMP_LG_U32 %SavedSCC, 0 # Restore SCC - // - unsigned int TmpScc = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - DebugLoc DL = MI->getDebugLoc(); - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), TmpScc) - .addImm(-1) - .addImm(0); - BuildMI(*MBB, std::next(MI->getIterator()), DL, TII->get(AMDGPU::S_CMP_LG_U32)) - .addReg(TmpScc, RegState::Kill) - .addImm(0); + if (It->modifiesRegister(AMDGPU::SCC, TRI) && + !It->readsRegister(AMDGPU::SCC, TRI)) { + return It->getIterator(); + } + } - return MI; + // If no safe location can be found in the block we can save and restore + // SCC around MI. There is no way to directly read or write SCC so we use + // s_cselect to read the current value of SCC and s_cmp to write the saved + // value back to SCC. + // + // The generated code will look like this; + // + // S_CSELECT_B32 %SavedSCC, -1, 0 # Save SCC + // <----- Newly created safe insert point. 
+ // MI + // S_CMP_LG_U32 %SavedSCC, 0 # Restore SCC + // + unsigned int TmpScc = + MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + DebugLoc DL = MI->getDebugLoc(); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), TmpScc) + .addImm(-1) + .addImm(0); + BuildMI(*MBB, std::next(MI->getIterator()), DL, + TII->get(AMDGPU::S_CMP_LG_U32)) + .addReg(TmpScc, RegState::Kill) + .addImm(0); + + return MI; } - namespace { bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes, SmallDenseSet &touchedMBBSet) { @@ -2099,9 +2068,7 @@ bool llvm::isLocalLiveInterval( return isLocalLiveRange(&LI, Indexes, touchedMBBSet); } - -bool llvm::isLocalLiveInterval( - const LiveInterval &LI, SlotIndexes *Indexes) { +bool llvm::isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes) { if (LI.hasSubRanges()) { for (const auto &S : LI.subranges()) { if (!isLocalLiveRange(&S, Indexes)) @@ -2117,8 +2084,8 @@ bool llvm::isLocalLiveInterval( void llvm::buildEndLiveMap( llvm::LiveIntervals *LIS, llvm::MachineFunction &MF, const llvm::MachineRegisterInfo &MRI, - llvm::DenseMap - &MBBLiveMap, bool After) { + llvm::DenseMap &MBBLiveMap, + bool After) { // When only have one block, end live reg must be empty. if (MF.size() == 1) return; @@ -2158,7 +2125,8 @@ void llvm::buildEndLiveMap( } } -unsigned llvm::GetCurrentVGPRCount(llvm::MachineFunction &MF, const SIRegisterInfo *SIRI) { +unsigned llvm::GetCurrentVGPRCount(llvm::MachineFunction &MF, + const SIRegisterInfo *SIRI) { auto &MRI = MF.getRegInfo(); for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) { if (MRI.isPhysRegUsed(Reg)) { @@ -2168,14 +2136,16 @@ unsigned llvm::GetCurrentVGPRCount(llvm::MachineFunction &MF, const SIRegisterIn return 0; } -unsigned llvm::GetCurrentSGPRCount(llvm::MachineFunction &MF, const SIRegisterInfo *SIRI) { +unsigned llvm::GetCurrentSGPRCount(llvm::MachineFunction &MF, + const SIRegisterInfo *SIRI) { const SIMachineFunctionInfo *MFI = MF.getInfo(); unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); MachineRegisterInfo &MRI = MF.getRegInfo(); unsigned MaxSGPR = 0; for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { if (MRI.isPhysRegUsed(Reg)) { - // Skip scratch reserved reg, which is a big register that don't really contribute to this stat. + // Skip scratch reserved reg, which is a big register that don't really + // contribute to this stat. 
if (ScratchRSrcReg != 0) { if (SIRI->isSubRegister(ScratchRSrcReg, Reg)) continue; @@ -2187,8 +2157,7 @@ unsigned llvm::GetCurrentSGPRCount(llvm::MachineFunction &MF, const SIRegisterIn return 1 + llvm::RegForVCC + MaxSGPR; } -void llvm::dumpLiveSet(const LiveSet &LiveSet, - const SIRegisterInfo *SIRI) { +void llvm::dumpLiveSet(const LiveSet &LiveSet, const SIRegisterInfo *SIRI) { dbgs() << "\n live set: \n"; for (auto it : LiveSet) { @@ -2227,15 +2196,16 @@ bool llvm::IsLdsSpillSupportedForHwStage(xmd::HwStage Stage) } #endif -MachineBasicBlock::succ_iterator llvm::FindSuccessor(llvm::MachineBasicBlock* MBB, llvm::MachineBasicBlock* Succ) -{ - for (MachineBasicBlock::succ_iterator It = MBB->succ_begin(), End = MBB->succ_end(); It != End; ++It) - { - if (*It == Succ) - { - return It; - } +MachineBasicBlock::succ_iterator +llvm::FindSuccessor(llvm::MachineBasicBlock *MBB, + llvm::MachineBasicBlock *Succ) { + for (MachineBasicBlock::succ_iterator It = MBB->succ_begin(), + End = MBB->succ_end(); + It != End; ++It) { + if (*It == Succ) { + return It; } + } - return MBB->succ_end(); + return MBB->succ_end(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h index 16b55c5c94583..1e9f0bad12d19 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h @@ -2,9 +2,9 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" -#include "llvm/MC/LaneBitmask.h" -#include "llvm/IR/CallingConv.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/MC/LaneBitmask.h" namespace llvm { @@ -37,10 +37,10 @@ using LiveSet = llvm::DenseMap; unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask, const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI); -void CollectLiveSetPressure( - const LiveSet &liveSet, - const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI, - unsigned &VPressure, unsigned &SPressure); +void CollectLiveSetPressure(const LiveSet &liveSet, + const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI, + unsigned &VPressure, unsigned &SPressure); bool isExecUpdateForControlFlow(llvm::MachineInstr &MI); @@ -60,60 +60,57 @@ bool removeUnusedLanes(llvm::MachineInstr &MI, llvm::MachineRegisterInfo &MRI, const llvm::SIInstrInfo *TII, llvm::SlotIndexes *SlotIndexes); -bool reach_block(llvm::MachineBasicBlock *FromBB, llvm::MachineDominatorTree *DT, +bool reach_block(llvm::MachineBasicBlock *FromBB, + llvm::MachineDominatorTree *DT, llvm::MachinePostDominatorTree *PDT, llvm::MachineLoopInfo *LI, llvm::MachineBasicBlock *ToBB); - void viewCFGWithPhi(llvm::MachineFunction &MF); void write_contribution_list(llvm::MachineFunction &MF, const char *Filename); -llvm::MachineBasicBlock *CreateNullExportBlock(llvm::MachineFunction &MF, const llvm::SIInstrInfo *TII); +llvm::MachineBasicBlock *CreateNullExportBlock(llvm::MachineFunction &MF, + const llvm::SIInstrInfo *TII); bool GetNonDebugMBBEnd(llvm::MachineBasicBlock::reverse_iterator &BBEnd, llvm::MachineBasicBlock &MBB); -void UpdatePhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB, const llvm::MachineRegisterInfo *MRI); +void UpdatePhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB, + const llvm::MachineRegisterInfo *MRI); void BuildPhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB, - llvm::SmallDenseSet &LiveOutSet, - const llvm::MachineRegisterInfo *MRI); + llvm::SmallDenseSet &LiveOutSet, + const llvm::MachineRegisterInfo *MRI); -MachineReg 
CreateVirtualRegForOperand( - MachineOpcode Opcode, - unsigned Operand, - llvm::MachineFunction &MF -); +MachineReg CreateVirtualRegForOperand(MachineOpcode Opcode, unsigned Operand, + llvm::MachineFunction &MF); -MachineReg CreateVirtualDstReg( - MachineOpcode Opcode, - llvm::MachineFunction &MF -); +MachineReg CreateVirtualDstReg(MachineOpcode Opcode, llvm::MachineFunction &MF); -bool IsExecCopy(const llvm::MachineInstr &MI, MachineReg Exec, MachineReg *pDst); +bool IsExecCopy(const llvm::MachineInstr &MI, MachineReg Exec, + MachineReg *pDst); struct MachineRegWithSubReg { - MachineReg Reg = AMDGPU::NoRegister; - unsigned SubReg = AMDGPU::NoSubRegister; + MachineReg Reg = /*NoRegister*/ 0; + unsigned SubReg = /*NoSubRegister*/ 0; }; MachineRegWithSubReg GetWqmEntryActiveMask(llvm::MachineFunction &MF); llvm::MachineInstr *GetWqmEntryActiveMaskInst(llvm::MachineFunction &MF); -// Return true if this machine instruction represents a call to the fetch shader. -// We curently have two mechanisims for calling fetch shader: +// Return true if this machine instruction represents a call to the fetch +// shader. We curently have two mechanisims for calling fetch shader: // 1. The AMDGPU_CALL_FETCH_SHADER pseudo-instruction // 2. A CALL instruction with the `FetchShaderCall` flag set to true. -bool IsFetchShaderCall(const llvm::MachineInstr* MI); - -bool IsSccLiveAt(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator MI); +bool IsFetchShaderCall(const llvm::MachineInstr *MI); +bool IsSccLiveAt(llvm::MachineBasicBlock *MBB, + llvm::MachineBasicBlock::iterator MI); // An enum used to pass additional constraints to // `FindOrCreateInsertionPointForSccDef()`. This will further // constrain the location where the scc def can be inserted. -enum SccDefInsertPointConstraintFlags -{ - None = 0, // No additional constraints. - NoExecWrite = 1, // Should be no modification of exec between BeforeInst and insert point. +enum SccDefInsertPointConstraintFlags { + None = 0, // No additional constraints. + NoExecWrite = 1, // Should be no modification of exec between BeforeInst and + // insert point. }; // Look for a safe place to insert an instruction that defines scc. @@ -130,55 +127,53 @@ enum SccDefInsertPointConstraintFlags // as the new insert location. // llvm::MachineBasicBlock::iterator FindOrCreateInsertionPointForSccDef( - llvm::MachineBasicBlock* MBB, - llvm::MachineBasicBlock::iterator BeforeInst, - const llvm::TargetRegisterInfo* TRI, - const llvm::SIInstrInfo* TII, - llvm::MachineRegisterInfo* MRI, - SccDefInsertPointConstraintFlags Constraints = SccDefInsertPointConstraintFlags::None -); + llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator BeforeInst, + const llvm::TargetRegisterInfo *TRI, const llvm::SIInstrInfo *TII, + llvm::MachineRegisterInfo *MRI, + SccDefInsertPointConstraintFlags Constraints = + SccDefInsertPointConstraintFlags::None); // Check if LI live cross basic blocks, save all touched basic block if is // local. bool isLocalLiveInterval( const llvm::LiveInterval &LI, llvm::SlotIndexes *Indexes, llvm::SmallDenseSet &touchedMBBSet); -bool isLocalLiveInterval( - const llvm::LiveInterval &LI, llvm::SlotIndexes *Indexes); +bool isLocalLiveInterval(const llvm::LiveInterval &LI, + llvm::SlotIndexes *Indexes); // build liveRegSet at end of each MBB. 
void buildEndLiveMap( llvm::LiveIntervals *LIS, llvm::MachineFunction &MF, const llvm::MachineRegisterInfo &MRI, - llvm::DenseMap - &MBBLiveMap, bool After); + llvm::DenseMap &MBBLiveMap, bool After); -void dumpLiveSet(const LiveSet &LiveSet, - const llvm::SIRegisterInfo *SIRI); +void dumpLiveSet(const LiveSet &LiveSet, const llvm::SIRegisterInfo *SIRI); -unsigned GetCurrentVGPRCount(llvm::MachineFunction &MF, const llvm::SIRegisterInfo *SIRI); -unsigned GetCurrentSGPRCount(llvm::MachineFunction &MF, const llvm::SIRegisterInfo *SIRI); +unsigned GetCurrentVGPRCount(llvm::MachineFunction &MF, + const llvm::SIRegisterInfo *SIRI); +unsigned GetCurrentSGPRCount(llvm::MachineFunction &MF, + const llvm::SIRegisterInfo *SIRI); bool isFastMathInst(llvm::MachineInstr &MI); namespace pressure { void print_reg(llvm::Register Reg, const llvm::MachineRegisterInfo &MRI, - const llvm::SIRegisterInfo *SIRI, - llvm::raw_ostream &os); + const llvm::SIRegisterInfo *SIRI, llvm::raw_ostream &os); void write_pressure(llvm::MachineFunction &MF, llvm::LiveIntervals *LIS, const char *Filename); void write_pressure(llvm::MachineFunction &MF, llvm::LiveIntervals *LIS, llvm::raw_ostream &os); -} +} // namespace pressure // bool IsLdsSpillSupportedForHwStage(xmd::HwStage Stage); // Look for the successor `Succ` of the given `MBB`. // Returns MBB->succ_end() if `Succ` is not a successor of MBB. -llvm::MachineBasicBlock::succ_iterator FindSuccessor(llvm::MachineBasicBlock* MBB, llvm::MachineBasicBlock* Succ); +llvm::MachineBasicBlock::succ_iterator +FindSuccessor(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock *Succ); // The enum and helper function for v_perm selection mask. // -// The input byte layout of v_perm is as below: +// The input byte layout of v_perm is as below: // // BYTE in[8] // in[0] = $src1_BYTE0; @@ -211,7 +206,7 @@ constexpr int buildVPermSelectMask(V_PERM_IN_BYTE_POS Sel_0, V_PERM_IN_BYTE_POS Sel_1, V_PERM_IN_BYTE_POS Sel_2, V_PERM_IN_BYTE_POS Sel_3) { - return (((int)Sel_3 << 24) | ((int)Sel_2 << 16) | - ((int)Sel_1 << 8) | (int)Sel_0); -} + return (((int)Sel_3 << 24) | ((int)Sel_2 << 16) | ((int)Sel_1 << 8) | + (int)Sel_0); } +} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp index ceb22b5ff9243..21aa5db0c6f27 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp @@ -69,7 +69,8 @@ // ... // // label: -// v3 = phi v0, v1 ; divergent! because of divergent branch. +// v3 = phi v0, v1 ; divergent! because of divergent +// branch. // // The boolean value is bit-divergent. When passed to the branch as an operand, // the branch becomes divergent, whose sync dependency will be computed as @@ -81,13 +82,14 @@ // control flow. 
// For case like // %163:sreg_64_xexec = S_MOV_B64 $exec -//bb.1: +// bb.1: //; predecessors: %bb.1, %bb.0 -// successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), %bb.2(50.00%) -// %162:vreg_512 = PHI %41:vreg_512, %bb.0, %40:vreg_512, %bb.1 +// successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), +// %bb.2(50.00%) %162:vreg_512 = PHI %41:vreg_512, %bb.0, %40:vreg_512, %bb.1 // %167:sgpr_32 = V_READFIRSTLANE_B32 %17:vgpr_32, implicit $exec // %168:sreg_64 = V_CMP_EQ_U32_e64 %167:sgpr_32, %17:vgpr_32, implicit $exec -// %166:sreg_64 = S_AND_SAVEEXEC_B64 %168:sreg_64, implicit-def $exec, implicit-def $scc, implicit $exec +// %166:sreg_64 = S_AND_SAVEEXEC_B64 %168:sreg_64, implicit-def $exec, +// implicit-def $scc, implicit $exec //... // $exec = S_XOR_B64_term $exec, %166:sreg_64, implicit-def $scc // S_CBRANCH_EXECNZ %bb.1, implicit $exec @@ -164,20 +166,20 @@ // //===----------------------------------------------------------------------===// -#include "AMDGPU.h" #include "AMDGPUMirDivergenceAnalysis.h" -#include "GCNSubtarget.h" +#include "AMDGPU.h" #include "AMDGPUSubtarget.h" +#include "GCNSubtarget.h" +#include "SIInstrInfo.h" +#include "TargetInfo/AMDGPUTargetInfo.h" #include "Utils/AMDGPUAsmUtils.h" #include "Utils/AMDGPUBaseInfo.h" -#include "TargetInfo/AMDGPUTargetInfo.h" -#include "SIInstrInfo.h" -//#include "llvm/Analysis/Passes.h" -#include "llvm/CodeGen/MachinePostDominators.h" +// #include "llvm/Analysis/Passes.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/Support/Debug.h" -//#include "newbe/cli/newbe_opts.h" // AMDGPU change. +// #include "newbe/cli/newbe_opts.h" // AMDGPU change. #include "llvm/Support/raw_ostream.h" #include @@ -1223,24 +1225,24 @@ bool llvm::isAMDGPUOpcodeDivergent(class MachineInstr *MI) { case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_nsa_gfx10: case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_si: case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_vi: - //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_gfx10: - //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_si: - //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_vi: + // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_gfx10: + // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_si: + // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_vi: case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10: case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_si: case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_vi: - //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_gfx10: - //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_nsa_gfx10: - //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_si: - //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_vi: + // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_gfx10: + // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_nsa_gfx10: + // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_si: + // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_vi: case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_gfx10: case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_nsa_gfx10: case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_si: case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_vi: - //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_gfx10: - //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_nsa_gfx10: - //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_si: - //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_vi: + // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_gfx10: + // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_nsa_gfx10: + // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_si: + // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_vi: case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_gfx10: case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_nsa_gfx10: case 
AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_si: @@ -1555,8 +1557,8 @@ bool writeBoolDst(const MachineInstr *MI, const SIRegisterInfo *SIRI, if (MO.isUse()) continue; unsigned Reg = MO.getReg(); - if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO || - Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO) + if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO || Reg == AMDGPU::VCC || + Reg == AMDGPU::VCC_LO) return true; // Check if the written register class overlaps the bool register class. @@ -1567,15 +1569,15 @@ bool writeBoolDst(const MachineInstr *MI, const SIRegisterInfo *SIRI, // // The underlying problem is that we have two notions of divergence // (bit divergence and wave divergence) but the algorithm only propagates - // wave divergence. The bit divergence is important for bools because it determines - // if a branch is uniform or not (and thus catches cases where a uniform value is - // used outside of a divergent control flow region). For bool values the - // algorithm will treat normally uniform values (i.e. scalar registers) as divergent - // in order to try and propagate bit divergence. + // wave divergence. The bit divergence is important for bools because it + // determines if a branch is uniform or not (and thus catches cases where a + // uniform value is used outside of a divergent control flow region). For + // bool values the algorithm will treat normally uniform values (i.e. scalar + // registers) as divergent in order to try and propagate bit divergence. // - // To fix all the possible bugs here I think we need to actually proagate bit - // divergence as well as wave divergences. That is a bigger fix and this check should - // cover most cases of treating a bool value as divergent. + // To fix all the possible bugs here I think we need to actually proagate + // bit divergence as well as wave divergences. That is a bigger fix and this + // check should cover most cases of treating a bool value as divergent. const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg); if (SIRI->getCommonSubClass(BoolRC, RC)) return true; @@ -1597,13 +1599,13 @@ bool isAlwaysUniformMI(const MachineInstr *MI, const SIInstrInfo *SIII, !MI->isTerminator()) return true; break; - //case AMDGPU::AMDGPU_MAKE_UNIFORM: - //case AMDGPU::AMDGPU_WAVE_READ_LANE_FIRST: + // case AMDGPU::AMDGPU_MAKE_UNIFORM: + // case AMDGPU::AMDGPU_WAVE_READ_LANE_FIRST: case AMDGPU::V_READFIRSTLANE_B32: case AMDGPU::V_READLANE_B32: - //case AMDGPU::AMDGPU_WAVE_ACTIVE_BALLOT_W32: - //case AMDGPU::AMDGPU_WAVE_ACTIVE_BALLOT_W64: - // bool readfirstlane should be 1 bit, which means bit uniform. + // case AMDGPU::AMDGPU_WAVE_ACTIVE_BALLOT_W32: + // case AMDGPU::AMDGPU_WAVE_ACTIVE_BALLOT_W64: + // bool readfirstlane should be 1 bit, which means bit uniform. return true; case AMDGPU::S_OR_B32: case AMDGPU::S_OR_B64: { @@ -1638,7 +1640,8 @@ bool isAlwaysUniformMI(const MachineInstr *MI, const SIInstrInfo *SIII, } bool isPhysicalReg(MachineRegisterInfo &MRI, Register reg) { - return reg.isPhysical();; + return reg.isPhysical(); + ; } bool isRegClass(MachineRegisterInfo &MRI, unsigned reg, unsigned regClassID) { @@ -1646,13 +1649,14 @@ bool isRegClass(MachineRegisterInfo &MRI, unsigned reg, unsigned regClassID) { } // For input reg of MF, vgpr will be divergent. 
-bool isDivergentInputReg(unsigned Reg, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) { +bool isDivergentInputReg(unsigned Reg, MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI) { if (isPhysicalReg(MRI, Reg)) { unsigned vir_reg = MRI.getLiveInVirtReg(Reg); if (SIRI->isVGPR(MRI, vir_reg)) return true; } else { - if (SIRI->isVGPR(MRI, Reg)) + if (SIRI->isVGPR(MRI, Reg)) return true; } return false; @@ -1660,8 +1664,8 @@ bool isDivergentInputReg(unsigned Reg, MachineRegisterInfo &MRI, const SIRegiste bool isSourceOfDivergence(MachineInstr *MI, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { - //if (MI->getAMDGPUFlag(MachineInstr::AMDGPUMIFlag::IsDivergent)) - // return true; + // if (MI->getAMDGPUFlag(MachineInstr::AMDGPUMIFlag::IsDivergent)) + // return true; if (isAMDGPUOpcodeDivergent(MI)) return true; @@ -1715,8 +1719,7 @@ bool isWriteExec(const MachineInstr *MI) { if (MO.isUse()) continue; unsigned Reg = MO.getReg(); - if (Reg == AMDGPU::EXEC || - Reg == AMDGPU::EXEC_LO) + if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO) return true; } return false; @@ -1735,7 +1738,6 @@ bool isVCndMask(unsigned Opcode) { } } - bool isExecRegionOp(unsigned Op) { switch (Op) { default: @@ -1812,17 +1814,18 @@ bool isInsideExecRegion(const MachineBasicBlock &MBB, return PDT.dominates(RegionEndMBB, &MBB); } -// Map from BB to nearest Exec Region. How to build? Add every MBB unless already has smaller region? -// Then when hit saveExec, propagate leaked users of define inside the exec region. +// Map from BB to nearest Exec Region. How to build? Add every MBB unless +// already has smaller region? Then when hit saveExec, propagate leaked users of +// define inside the exec region. } // namespace namespace llvm { // class DivergenceAnalysis DivergenceAnalysis::DivergenceAnalysis( - const MachineFunction &F, const MachineLoop *RegionLoop, const MachineDominatorTree &DT, - const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI, - SyncDependenceAnalysis &SDA, bool IsLCSSAForm, + const MachineFunction &F, const MachineLoop *RegionLoop, + const MachineDominatorTree &DT, const MachinePostDominatorTree &PDT, + const MachineLoopInfo &LI, SyncDependenceAnalysis &SDA, bool IsLCSSAForm, // AMDGPU change begin. DivergentJoinMapTy &JoinMap // AMDGPU change end. @@ -1841,7 +1844,7 @@ void DivergenceAnalysis::markDivergent(const ValueTy DivVal) { LLVM_DEBUG(const GCNSubtarget *ST = &F.getSubtarget(); const SIRegisterInfo *SIRI = ST->getRegisterInfo(); dbgs() << "\t MarkDivergent :"; printReg(DivVal, SIRI);); - //AMDGPU change end. + // AMDGPU change end. DivergentValues.insert(DivVal); } @@ -1948,7 +1951,7 @@ bool DivergenceAnalysis::updateTerminator(const MachineInstr &Term) const { // Check bit uniform here if not divergent. 
return !isBitUniform(Term, Processed); } - //case AMDGPU::AMDGPU_CALL_INDIRECT: + // case AMDGPU::AMDGPU_CALL_INDIRECT: case AMDGPU::SI_CALL: return true; } @@ -1965,13 +1968,10 @@ bool DivergenceAnalysis::updateNormalInstruction(const MachineInstr &I) const { continue; Register Reg = Op.getReg(); if (Reg.isPhysical()) { - if (Reg == AMDGPU::EXEC || - Reg == AMDGPU::EXEC_LO || - Reg == AMDGPU::SCC) + if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO || Reg == AMDGPU::SCC) continue; - else - if (const MachineInstr *DefMI = - findPhysicalDefineInSameMBB(Op.getParent(), Reg)) { + else if (const MachineInstr *DefMI = + findPhysicalDefineInSameMBB(Op.getParent(), Reg)) { if (isDivergent(*DefMI)) return true; } else { @@ -1986,15 +1986,17 @@ bool DivergenceAnalysis::updateNormalInstruction(const MachineInstr &I) const { return false; } -bool DivergenceAnalysis::isTemporalDivergent(const MachineBasicBlock &ObservingBlock, - const ValueTy Val, - const MachineBasicBlock &IncomingBlock) const { // AMDGPU change - const MachineBasicBlock *DefBlock = &IncomingBlock; // AMDGPU change: Take def point as incoming block for constants. +bool DivergenceAnalysis::isTemporalDivergent( + const MachineBasicBlock &ObservingBlock, const ValueTy Val, + const MachineBasicBlock &IncomingBlock) const { // AMDGPU change + const MachineBasicBlock *DefBlock = + &IncomingBlock; // AMDGPU change: Take def point as incoming block for + // constants. const auto *Inst = MRI.getUniqueVRegDef(Val); if (Inst == nullptr) return true; if (Inst) - DefBlock = Inst->getParent(); + DefBlock = Inst->getParent(); // check whether any divergent loop carrying Val terminates before control // proceeds to ObservingBlock @@ -2020,13 +2022,14 @@ static bool HasIncomingUndefValue(const PHINode_ *Phi) { // For case like // %163:sreg_64_xexec = S_MOV_B64 $exec -//bb.1: +// bb.1: //; predecessors: %bb.1, %bb.0 -// successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), %bb.2(50.00%) -// %162:vreg_512 = PHI %41:vreg_512, %bb.0, %40:vreg_512, %bb.1 +// successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), +// %bb.2(50.00%) %162:vreg_512 = PHI %41:vreg_512, %bb.0, %40:vreg_512, %bb.1 // %167:sgpr_32 = V_READFIRSTLANE_B32 %17:vgpr_32, implicit $exec // %168:sreg_64 = V_CMP_EQ_U32_e64 %167:sgpr_32, %17:vgpr_32, implicit $exec -// %166:sreg_64 = S_AND_SAVEEXEC_B64 %168:sreg_64, implicit-def $exec, implicit-def $scc, implicit $exec +// %166:sreg_64 = S_AND_SAVEEXEC_B64 %168:sreg_64, implicit-def $exec, +// implicit-def $scc, implicit $exec //... // $exec = S_XOR_B64_term $exec, %166:sreg_64, implicit-def $scc // S_CBRANCH_EXECNZ %bb.1, implicit $exec @@ -2091,8 +2094,8 @@ findSaveExec(const MachineInstr *MI, // It will only run on divergent branch, so (A, B) is not in // DivergentDisjointMap when A is uniform. static bool isJoinDivergentOnlyOnSameIncomingValue( - const PHINode_ &Phi, const DivergenceAnalysis *pDA, const MachineDominatorTree &DT, - DivergentJoinMapTy &DivergentJoinMap) { + const PHINode_ &Phi, const DivergenceAnalysis *pDA, + const MachineDominatorTree &DT, DivergentJoinMapTy &DivergentJoinMap) { // for phi which join divergent, if the incoming values from divergent // branch are the same, the phi is still uniform. 
// A @@ -2183,14 +2186,14 @@ bool DivergenceAnalysis::updatePHINode(const PHINode_ &Phi) const { // joining divergent disjoint path in Phi parent block if (isJoinDivergent(*Phi.getParent())) { // AMDGPU CHANGE BEGIN - if (true/*TODO: ENABLE_AGGRESSIVE_UNIFORM_ANALYSIS*/) { + if (true /*TODO: ENABLE_AGGRESSIVE_UNIFORM_ANALYSIS*/) { // Continue if the divergent join only on same incoming value. if (!isJoinDivergentOnlyOnSameIncomingValue(Phi, this, DT, DivergentJoinMap)) return true; } else - // AMDGPU CHANGE END - return true; + // AMDGPU CHANGE END + return true; } // An incoming value could be divergent by itself. @@ -2213,7 +2216,6 @@ bool DivergenceAnalysis::updatePHINode(const PHINode_ &Phi) const { if (isDivergent(Reg) || isTemporalDivergent(*Phi.getParent(), Reg, *BB.getMBB())) return true; - } return false; @@ -2259,7 +2261,8 @@ bool DivergenceAnalysis::inRegion(const MachineBasicBlock &BB) const { // marks all users of loop-carried values of the loop headed by LoopHeader as // divergent -void DivergenceAnalysis::taintLoopLiveOuts(const MachineBasicBlock &LoopHeader) { +void DivergenceAnalysis::taintLoopLiveOuts( + const MachineBasicBlock &LoopHeader) { auto *DivLoop = LI.getLoopFor(&LoopHeader); assert(DivLoop && "loopHeader is not actually part of a loop"); @@ -2324,7 +2327,7 @@ void DivergenceAnalysis::taintLoopLiveOuts(const MachineBasicBlock &LoopHeader) } } -void DivergenceAnalysis::pushInstruction(const MachineInstr &I) { +void DivergenceAnalysis::pushInstruction(const MachineInstr &I) { Worklist.push_back(&I); } void DivergenceAnalysis::pushPHINodes(const MachineBasicBlock &Block) { @@ -2355,8 +2358,8 @@ void DivergenceAnalysis::pushUsers(const MachineInstr &I) { } } -bool DivergenceAnalysis::propagateJoinDivergence(const MachineBasicBlock &JoinBlock, - const MachineLoop *BranchLoop) { +bool DivergenceAnalysis::propagateJoinDivergence( + const MachineBasicBlock &JoinBlock, const MachineLoop *BranchLoop) { LLVM_DEBUG(dbgs() << "\tpropJoinDiv " << JoinBlock.getName() << "\n"); // ignore divergence outside the region @@ -2403,8 +2406,10 @@ void DivergenceAnalysis::propagateBranchDivergence(const MachineInstr &Term) { } } -void DivergenceAnalysis::propagateLoopDivergence(const MachineLoop &ExitingLoop) { - LLVM_DEBUG(dbgs() << "propLoopDiv " << ExitingLoop.getHeader()->getNumber() << "\n"); +void DivergenceAnalysis::propagateLoopDivergence( + const MachineLoop &ExitingLoop) { + LLVM_DEBUG(dbgs() << "propLoopDiv " << ExitingLoop.getHeader()->getNumber() + << "\n"); // don't propagate beyond region if (!inRegion(*ExitingLoop.getHeader())) @@ -2444,20 +2449,21 @@ void DivergenceAnalysis::propagateLoopDivergence(const MachineLoop &ExitingLoop) // For case like // %149:sreg_64_xexec = S_MOV_B64 $exec // -//bb.3: +// bb.3: //; predecessors: %bb.3, %bb.2 -// successors: %bb.3(0x40000000), %bb.4(0x40000000); %bb.3(50.00%), %bb.4(50.00%) +// successors: %bb.3(0x40000000), %bb.4(0x40000000); %bb.3(50.00%), +// %bb.4(50.00%) // // %148:vreg_512 = PHI %56:vreg_512, %bb.2, %55:vreg_512, %bb.3 // %153:sgpr_32 = V_READFIRSTLANE_B32 %36:vgpr_32, implicit $exec // %154:sreg_64 = V_CMP_EQ_U32_e64 %153:sgpr_32, %36:vgpr_32, implicit $exec -// %152:sreg_64 = S_AND_SAVEEXEC_B64 %154:sreg_64, implicit-def $exec, implicit-def $scc, implicit $exec -// $m0 = S_MOV_B32 %153:sgpr_32 -// %55:vreg_512 = V_MOVRELD_B32_V16 %148:vreg_512(tied-def 0), -2, 0, implicit $m0, implicit $exec -// $exec = S_XOR_B64_term $exec, %152:sreg_64, implicit-def $scc +// %152:sreg_64 = S_AND_SAVEEXEC_B64 %154:sreg_64, 
implicit-def $exec, +// implicit-def $scc, implicit $exec $m0 = S_MOV_B32 %153:sgpr_32 %55:vreg_512 +// = V_MOVRELD_B32_V16 %148:vreg_512(tied-def 0), -2, 0, implicit $m0, implicit +// $exec $exec = S_XOR_B64_term $exec, %152:sreg_64, implicit-def $scc // S_CBRANCH_EXECNZ %bb.3, implicit $exec // -//bb.4: +// bb.4: //; predecessors: %bb.3 // successors: %bb.5(0x80000000); %bb.5(100.00%) // @@ -2596,7 +2602,7 @@ void DivergenceAnalysis::compute() { // propagate divergence while (!Worklist.empty()) { - const MachineInstr *I= Worklist.back(); + const MachineInstr *I = Worklist.back(); Worklist.pop_back(); // maintain uniformity of overrides @@ -2715,23 +2721,23 @@ bool DivergenceAnalysis::isDivergent(const MachineInstr &I) const { void DivergenceAnalysis::print(raw_ostream &OS, const Module_ *) const { // iterate instructions using instructions() to ensure a deterministic order. for (auto &MBB : F) - for (auto &I : MBB) { - if (isDivergent(I)) - OS << "DIVERGENT:" << I ; - // AMDGPU changes begin - else - OS << "UNIFORM:" << I ; - // AMDGPU changes end - } + for (auto &I : MBB) { + if (isDivergent(I)) + OS << "DIVERGENT:" << I; + // AMDGPU changes begin + else + OS << "UNIFORM:" << I; + // AMDGPU changes end + } } // class GPUDivergenceAnalysis -MirGPUDivergenceAnalysis::MirGPUDivergenceAnalysis(MachineFunction &F, - const MachineDominatorTree &DT, - const MachinePostDominatorTree &PDT, - const MachineLoopInfo &LI) - : SDA(DT, PDT, LI, /*AMDGPU change*/DivergentJoinMap), - DA(F, nullptr, DT, PDT, LI, SDA, false, /*AMDGPU change*/DivergentJoinMap) { +MirGPUDivergenceAnalysis::MirGPUDivergenceAnalysis( + MachineFunction &F, const MachineDominatorTree &DT, + const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI) + : SDA(DT, PDT, LI, /*AMDGPU change*/ DivergentJoinMap), + DA(F, nullptr, DT, PDT, LI, SDA, false, + /*AMDGPU change*/ DivergentJoinMap) { MachineRegisterInfo &MRI = F.getRegInfo(); const GCNSubtarget *ST = &F.getSubtarget(); const SIRegisterInfo *SIRI = ST->getRegisterInfo(); @@ -2758,10 +2764,11 @@ bool MirGPUDivergenceAnalysis::isDivergent(const MachineInstr *I) const { return DA.isDivergent(*I); } -void MirGPUDivergenceAnalysis::print(raw_ostream &OS, const Module_ *mod) const { +void MirGPUDivergenceAnalysis::print(raw_ostream &OS, + const Module_ *mod) const { OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n"; DA.print(OS, mod); OS << "}\n"; } -} +} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h index edcf96ec44a4d..e721ac323255e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h @@ -1,4 +1,4 @@ -//===- AMDGPUMirDivergenceAnalysis.h - Mir Divergence Analysis -*- C++ -*-===// +//===- AMDGPUMirDivergenceAnalysis.h - Mir Divergence Analysis -*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -14,11 +14,11 @@ #pragma once -#include "llvm/ADT/DenseSet.h" +#include "AMDGPUMirSyncDependenceAnalysis.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineFunction.h" -#include "AMDGPUMirSyncDependenceAnalysis.h" #include "llvm/Pass.h" #include @@ -50,8 +50,10 @@ class DivergenceAnalysis { /// Otherwise the whole function is analyzed. 
/// \param IsLCSSAForm whether the analysis may assume that the IR in the /// region in in LCSSA form. - DivergenceAnalysis(const llvm::MachineFunction &F, const MachineLoop *RegionLoop, - const MachineDominatorTree &DT, const MachinePostDominatorTree &PDT, + DivergenceAnalysis(const llvm::MachineFunction &F, + const MachineLoop *RegionLoop, + const MachineDominatorTree &DT, + const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI, SyncDependenceAnalysis &SDA, bool IsLCSSAForm, // AMDGPU change begin. @@ -98,10 +100,12 @@ class DivergenceAnalysis { bool updateTerminator(const MachineInstr &Term) const; bool updatePHINode(const PHINode_ &Phi) const; bool updateVCndMask(const MachineInstr &VCndMask) const; - bool isBitUniform(const MachineInstr &I, - llvm::DenseMap &Processed) const; - bool isBitUniform(const MachineInstr &I, const llvm::MachineOperand &UseMO, - llvm::DenseMap &Processed) const; + bool + isBitUniform(const MachineInstr &I, + llvm::DenseMap &Processed) const; + bool + isBitUniform(const MachineInstr &I, const llvm::MachineOperand &UseMO, + llvm::DenseMap &Processed) const; /// \brief Computes whether \p Inst is divergent based on the /// divergence of its operands. @@ -136,9 +140,9 @@ class DivergenceAnalysis { } /// \brief Whether \p Val is divergent when read in \p ObservingBlock. - bool isTemporalDivergent(const MachineBasicBlock &ObservingBlock, - const ValueTy Val, - const MachineBasicBlock &incomingBlock) const; // AMDGPU change + bool isTemporalDivergent( + const MachineBasicBlock &ObservingBlock, const ValueTy Val, + const MachineBasicBlock &incomingBlock) const; // AMDGPU change /// \brief Whether \p Block is join divergent /// @@ -207,14 +211,14 @@ class DivergenceAnalysis { // Set of known-uniform values. llvm::DenseSet UniformOverrides; - llvm::DenseSet UniformOverridesInsts; + llvm::DenseSet UniformOverridesInsts; // Blocks with joining divergent control from different predecessors. llvm::DenseSet DivergentJoinBlocks; // Detected/marked divergent values. llvm::DenseSet DivergentValues; - llvm::DenseSet DivergentInsts; + llvm::DenseSet DivergentInsts; // Mir change for EXEC control flow. // Map from MBB to the exec region it belongs too. @@ -226,16 +230,15 @@ class DivergenceAnalysis { struct ExecRegion { const llvm::MachineInstr *begin; const llvm::MachineInstr *end; - std::vector blocks; + std::vector blocks; bool bPropagated = false; - ExecRegion(const llvm::MachineInstr *b, - const llvm::MachineInstr *e) + ExecRegion(const llvm::MachineInstr *b, const llvm::MachineInstr *e) : begin(b), end(e), bPropagated(false) {} }; llvm::DenseMap ExecRegionMap; // Internal worklist for divergence propagation. - std::vector Worklist; + std::vector Worklist; }; /// \brief Divergence analysis frontend for GPU kernels. @@ -251,15 +254,17 @@ class MirGPUDivergenceAnalysis { // When A is divergent branch, B and C are divergent join at D. // Then DivergentJoinMap[B].count(C) > 0 and // DivergentJoinMap[C].count(B) > 0. - DivergentJoinMapTy DivergentJoinMap; + DivergentJoinMapTy DivergentJoinMap; // AMDGPU change end SyncDependenceAnalysis SDA; DivergenceAnalysis DA; public: /// Runs the divergence analysis on @F, a GPU kernel - MirGPUDivergenceAnalysis(llvm::MachineFunction &F, const MachineDominatorTree &DT, - const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI); + MirGPUDivergenceAnalysis(llvm::MachineFunction &F, + const MachineDominatorTree &DT, + const MachinePostDominatorTree &PDT, + const MachineLoopInfo &LI); /// Whether any divergence was detected. 
bool hasDivergence() const { return DA.hasDetectedDivergence(); } @@ -278,4 +283,3 @@ class MirGPUDivergenceAnalysis { }; } // namespace llvm - diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp index 7213f7b4b11b4..302939c76a4df 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp @@ -1,4 +1,5 @@ -//===- MirSyncDependenceAnalysis.cpp - Mir Divergent Branch Dependence Calculation +//===- MirSyncDependenceAnalysis.cpp - Mir Divergent Branch Dependence +//Calculation //--===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. @@ -101,15 +102,15 @@ // loop exit and the loop header (_after_ SSA construction). // //===----------------------------------------------------------------------===// +#include "AMDGPUMirSyncDependenceAnalysis.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachinePostDominators.h" -#include "AMDGPUMirSyncDependenceAnalysis.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include #include @@ -120,19 +121,18 @@ namespace llvm { ConstBlockSet SyncDependenceAnalysis::EmptyBlockSet; -SyncDependenceAnalysis::SyncDependenceAnalysis(const MachineDominatorTree &DT, - const MachinePostDominatorTree &PDT, - const MachineLoopInfo &LI, - // AMDGPU change begin. - DivergentJoinMapTy &JoinMap - // AMDGPU change end. +SyncDependenceAnalysis::SyncDependenceAnalysis( + const MachineDominatorTree &DT, const MachinePostDominatorTree &PDT, + const MachineLoopInfo &LI, + // AMDGPU change begin. + DivergentJoinMapTy &JoinMap + // AMDGPU change end. ) : FuncRPOT(DT.getRoot()->getParent()), DT(DT), PDT(PDT), LI(LI), - // AMDGPU change begin. + // AMDGPU change begin. DivergentJoinMap(JoinMap) - // AMDGPU change end. -{ -} +// AMDGPU change end. +{} SyncDependenceAnalysis::~SyncDependenceAnalysis() {} @@ -155,19 +155,23 @@ struct DivergencePropagator { // if DefMap[B] ~ undef then we haven't seen B yet // if DefMap[B] == B then B is a join point of disjoint paths from X or B is // an immediate successor of X (initial value). 
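  // (Editorial sketch, not part of the original patch; block names are made
  // up.) How this reaching-definition map detects a join on a simple diamond:
  // a divergent branch in X targets A and B, and both fall through to J.
  //   bootstrap:       DefMap[A] = A; DefMap[B] = B;   // immediate successors
  //   visit J via A:   DefMap[J] = A;
  //   visit J via B:   A != B, so DefMap[J] = J  -> J is recorded as a join.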
- using DefiningBlockMap = std::map; + using DefiningBlockMap = + std::map; DefiningBlockMap DefMap; // all blocks with pending visits std::unordered_set PendingUpdates; - DivergencePropagator(const FunctionRPOT &FuncRPOT, const MachineDominatorTree &DT, - const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI) + DivergencePropagator(const FunctionRPOT &FuncRPOT, + const MachineDominatorTree &DT, + const MachinePostDominatorTree &PDT, + const MachineLoopInfo &LI) : FuncRPOT(FuncRPOT), DT(DT), PDT(PDT), LI(LI), JoinBlocks(new ConstBlockSet) {} // set the definition at @block and mark @block as pending for a visit - void addPending(const MachineBasicBlock &Block, const MachineBasicBlock &DefBlock) { + void addPending(const MachineBasicBlock &Block, + const MachineBasicBlock &DefBlock) { bool WasAdded = DefMap.emplace(&Block, &DefBlock).second; if (WasAdded) PendingUpdates.insert(&Block); @@ -190,7 +194,8 @@ struct DivergencePropagator { // process @succBlock with reaching definition @defBlock // the original divergent branch was in @parentLoop (if any) - void visitSuccessor(const MachineBasicBlock &SuccBlock, const MachineLoop *ParentLoop, + void visitSuccessor(const MachineBasicBlock &SuccBlock, + const MachineLoop *ParentLoop, const MachineBasicBlock &DefBlock) { // @succBlock is a loop exit @@ -223,14 +228,14 @@ struct DivergencePropagator { // divergent exits. // @rootBlock is either the block containing the branch or the header of the // divergent loop. - // @nodeSuccessors is the set of successors of the node (MachineLoop or Terminator) - // headed by @rootBlock. - // @parentLoop is the parent loop of the MachineLoop or the loop that contains the - // Terminator. + // @nodeSuccessors is the set of successors of the node (MachineLoop or + // Terminator) headed by @rootBlock. + // @parentLoop is the parent loop of the MachineLoop or the loop that contains + // the Terminator. template - std::unique_ptr - computeJoinPoints(const MachineBasicBlock &RootBlock, - SuccessorIterable NodeSuccessors, const MachineLoop *ParentLoop, const MachineBasicBlock * PdBoundBlock) { + std::unique_ptr computeJoinPoints( + const MachineBasicBlock &RootBlock, SuccessorIterable NodeSuccessors, + const MachineLoop *ParentLoop, const MachineBasicBlock *PdBoundBlock) { assert(JoinBlocks); // bootstrap with branch targets @@ -250,7 +255,8 @@ struct DivergencePropagator { auto ItBeginRPO = FuncRPOT.begin(); // skip until term (TODO RPOT won't let us start at @term directly) - for (; *ItBeginRPO != &RootBlock; ++ItBeginRPO) {} + for (; *ItBeginRPO != &RootBlock; ++ItBeginRPO) { + } auto ItEndRPO = FuncRPOT.end(); assert(ItBeginRPO != ItEndRPO); @@ -337,30 +343,26 @@ struct DivergencePropagator { // | B C // | | / | // +--L P - // + // // In this cfg, C is the RootBlock and P is C's post-dominator. // It will only visit L and P and then stop because it hits the // post dominator. Most loops do not hit this case because the // loop exiting block (C) will branch directly back to the loop // header. 
- // - if (HeaderDefBlock) - { - for (const auto *ExitBlock : ReachedLoopExits) { - auto ItExitDef = DefMap.find(ExitBlock); - assert((ItExitDef != DefMap.end()) && - "no reaching def at reachable loop exit"); - if (ItExitDef->second != HeaderDefBlock) { - JoinBlocks->insert(ExitBlock); - } - } - } - else - { - for (const auto *ExitBlock : ReachedLoopExits) - { - JoinBlocks->insert(ExitBlock); + // + if (HeaderDefBlock) { + for (const auto *ExitBlock : ReachedLoopExits) { + auto ItExitDef = DefMap.find(ExitBlock); + assert((ItExitDef != DefMap.end()) && + "no reaching def at reachable loop exit"); + if (ItExitDef->second != HeaderDefBlock) { + JoinBlocks->insert(ExitBlock); } + } + } else { + for (const auto *ExitBlock : ReachedLoopExits) { + JoinBlocks->insert(ExitBlock); + } } } @@ -370,12 +372,14 @@ struct DivergencePropagator { // AMDGPU change begin. // For all join blocks caused by divergent RootBlock, the prevs of a join block -// which are in DefMap or the RootBlock are divergent join each other on the join block because -// of divergent RootBlock. -static void updateJoinMap( - const MachineBasicBlock *RootBlock, - DenseMap> &JoinMap, - DivergencePropagator::DefiningBlockMap &DefMap, ConstBlockSet &JoinBlocks) { +// which are in DefMap or the RootBlock are divergent join each other on the +// join block because of divergent RootBlock. +static void +updateJoinMap(const MachineBasicBlock *RootBlock, + DenseMap> &JoinMap, + DivergencePropagator::DefiningBlockMap &DefMap, + ConstBlockSet &JoinBlocks) { for (const MachineBasicBlock *JoinBB : JoinBlocks) { // makr divergent join for all pred pair which in DefMap. for (auto predIt = JoinBB->pred_begin(); predIt != JoinBB->pred_end(); @@ -400,7 +404,8 @@ static void updateJoinMap( } // AMDGPU change end. -const ConstBlockSet &SyncDependenceAnalysis::join_blocks(const MachineLoop &MachineLoop) { +const ConstBlockSet & +SyncDependenceAnalysis::join_blocks(const MachineLoop &MachineLoop) { using LoopExitVec = SmallVector; LoopExitVec LoopExits; MachineLoop.getExitBlocks(LoopExits); @@ -415,7 +420,8 @@ const ConstBlockSet &SyncDependenceAnalysis::join_blocks(const MachineLoop &Mach } // dont propagte beyond the immediate post dom of the loop - const auto *PdNode = PDT.getNode(const_cast(MachineLoop.getHeader())); + const auto *PdNode = + PDT.getNode(const_cast(MachineLoop.getHeader())); const auto *IpdNode = PdNode->getIDom(); const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; while (PdBoundBlock && MachineLoop.contains(PdBoundBlock)) { @@ -426,15 +432,17 @@ const ConstBlockSet &SyncDependenceAnalysis::join_blocks(const MachineLoop &Mach // compute all join points DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI}; auto JoinBlocks = Propagator.computeJoinPoints( - *MachineLoop.getHeader(), LoopExits, MachineLoop.getParentLoop(), PdBoundBlock); + *MachineLoop.getHeader(), LoopExits, MachineLoop.getParentLoop(), + PdBoundBlock); // AMDGPU change begin. // Save divergent join pairs. updateJoinMap(MachineLoop.getHeader(), DivergentJoinMap, Propagator.DefMap, - *JoinBlocks.get()); + *JoinBlocks.get()); // AMDGPU change end. 
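  // (Editorial note, not part of the original patch.) Tying this to the
  // example in the header: when a divergent branch in A has disjoint paths
  // through B and C that meet at D, updateJoinMap records B and C as a
  // divergent-join pair, so that DivergentJoinMap[B].count(C) > 0 and
  // DivergentJoinMap[C].count(B) > 0. isJoinDivergentOnlyOnSameIncomingValue
  // later uses these pairs to keep a phi in D uniform when B and C feed it
  // the same incoming value.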
- auto ItInserted = CachedLoopExitJoins.emplace(&MachineLoop, std::move(JoinBlocks)); + auto ItInserted = + CachedLoopExitJoins.emplace(&MachineLoop, std::move(JoinBlocks)); assert(ItInserted.second); return *ItInserted.first->second; } @@ -452,18 +460,18 @@ SyncDependenceAnalysis::join_blocks(const MachineInstr &Term) { return *ItCached->second; // dont propagate beyond the immediate post dominator of the branch - const auto *PdNode = PDT.getNode(const_cast(Term.getParent())); + const auto *PdNode = + PDT.getNode(const_cast(Term.getParent())); const auto *IpdNode = PdNode->getIDom(); const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; - // compute all join points DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI}; const auto &TermBlock = *Term.getParent(); - + // AMDGPU CHANGE // Make sure the post-dominator is outside the loop for the loop header. - // Otherwise, we may not find all the join blocks in the loop + // Otherwise, we may not find all the join blocks in the loop // because the search stops too early. Some join points can be reached // after the post-dominator! // @@ -477,30 +485,30 @@ SyncDependenceAnalysis::join_blocks(const MachineInstr &Term) { // // In this cfg, A is the loop header and P is A's post-dominator. // The algorithm to mark join points does an Reverse Post Order walk - // from A and stops when it reaches the post dominator. It would not + // from A and stops when it reaches the post dominator. It would not // mark the phi node in L as divergent even when A had a divergent branch. // The fix we made was to make the join point search continue all the way // to the loops post dominator (which is X in this example). // // NOTE: They already made this change for the loop case above, but for - // a different bug apparently. See SyncDependenceAnalysis::join_blocks(MachineLoop&) - // + // a different bug apparently. See + // SyncDependenceAnalysis::join_blocks(MachineLoop&) + // const MachineLoop *MachineLoop = LI.getLoopFor(&TermBlock); - if (MachineLoop && (MachineLoop->getHeader() == &TermBlock)) - { - while (PdBoundBlock && MachineLoop->contains(PdBoundBlock)) { - IpdNode = IpdNode->getIDom(); - PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; - } + if (MachineLoop && (MachineLoop->getHeader() == &TermBlock)) { + while (PdBoundBlock && MachineLoop->contains(PdBoundBlock)) { + IpdNode = IpdNode->getIDom(); + PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; + } } - + auto JoinBlocks = Propagator.computeJoinPoints( TermBlock, Term.getParent()->successors(), MachineLoop, PdBoundBlock); // AMDGPU change begin. // Save divergent join pairs. updateJoinMap(&TermBlock, DivergentJoinMap, Propagator.DefMap, - *JoinBlocks.get()); + *JoinBlocks.get()); // AMDGPU change end. auto ItInserted = CachedBranchJoins.emplace(&Term, std::move(JoinBlocks)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h index a52bcc7bc9e7c..92059d85b848a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h @@ -1,4 +1,5 @@ -//===- MirSyncDependenceAnalysis.h - MirDivergent Branch Dependence -*- C++ -*-===// +//===- MirSyncDependenceAnalysis.h - MirDivergent Branch Dependence -*- C++ +//-*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -20,8 +21,9 @@ #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/CodeGen/MachineBasicBlock.h" -#include +#include "llvm/CodeGen/MachineFunction.h" #include +#include namespace llvm { class MachineBasicBlock; @@ -44,14 +46,16 @@ using ConstBlockSet = llvm::SmallPtrSet; /// This analysis relates points of divergent control to points of converging /// divergent control. The analysis requires all loops to be reducible. class SyncDependenceAnalysis { - void visitSuccessor(const MachineBasicBlock &succBlock, const MachineLoop *termLoop, + void visitSuccessor(const MachineBasicBlock &succBlock, + const MachineLoop *termLoop, const MachineBasicBlock *defBlock); public: bool inRegion(const MachineBasicBlock &BB) const; ~SyncDependenceAnalysis(); - SyncDependenceAnalysis(const MachineDominatorTree &DT, const MachinePostDominatorTree &PDT, + SyncDependenceAnalysis(const MachineDominatorTree &DT, + const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI, // AMDGPU change begin DivergentJoinMapTy &JoinMap @@ -88,11 +92,10 @@ class SyncDependenceAnalysis { // AMDGPU change begin. DivergentJoinMapTy &DivergentJoinMap; // AMDGPU change end. - std::map> CachedLoopExitJoins; + std::map> + CachedLoopExitJoins; std::map> CachedBranchJoins; }; } // namespace llvm - - diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp index 648df7f724617..2e48ec44f979c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp @@ -1,22 +1,22 @@ -//===-- AMDGPUOccupancyAndLatencyHelper - Helper functions for occupancy and latency --===// +//==- AMDGPUOccupancyAndLatencyHelper.cpp - Helpers for occupancy + latency ==// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -//===--------------------------------------------------------------------------------===// +//==------------------------------------------------------------------------==// // /// \file /// \brief Helper functions for occupancy and latency. // -//===--------------------------------------------------------------------------------===// +//==------------------------------------------------------------------------==// -#include "SIInstrInfo.h" -#include "SIRegisterInfo.h" +#include "AMDGPUOccupancyAndLatencyHelper.h" #include "AMDGPUSubtarget.h" #include "GCNSubtarget.h" -#include "AMDGPUOccupancyAndLatencyHelper.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" #include "llvm/CodeGen/MachineLoopInfo.h" @@ -57,7 +57,7 @@ bool SchedScore::isBetter(const SchedScore &s) const { bool SchedScore::isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc) const { unsigned gain = latencyGain(TargetOccupancy, ExtraOcc); // 10% is good enough. 
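  // (Editorial example with made-up numbers, not part of the original patch.)
  // With MemLatency = 2000, TargetOccupancy = 4 and ExtraOcc = 1, latencyGain
  // returns 2000/4 - 2000/5 = 100, so the score counts as memory bound
  // whenever 10 * 100 >= Alu, i.e. the extra wave hides latency worth at
  // least 10% of the ALU work.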
- if ((10*gain) >= Alu) + if ((10 * gain) >= Alu) return true; else return false; @@ -65,7 +65,7 @@ bool SchedScore::isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc) const { unsigned SchedScore::latencyGain(unsigned TgtOcc, unsigned ExtraOcc) const { unsigned latency = MemLatency; - return (latency / (TgtOcc))- (latency / (TgtOcc + ExtraOcc)); + return (latency / (TgtOcc)) - (latency / (TgtOcc + ExtraOcc)); } // AMDGPULatencyTracker @@ -73,7 +73,8 @@ AMDGPULatencyTracker::AMDGPULatencyTracker(const GCNSubtarget &ST) : SIII(ST.getInstrInfo()), ItinerayData(ST.getInstrItineraryData()) {} void AMDGPULatencyTracker::scan(const MachineInstr &MI) { - if (MI.isDebugInstr()) return; + if (MI.isDebugInstr()) + return; int latency = SIII->getInstrLatency(ItinerayData, MI); // If inside latency hide. if (!LatencyMIs.empty()) { @@ -184,5 +185,3 @@ SchedScore CollectLatency(MachineFunction &MF, const llvm::GCNSubtarget &ST, } } // namespace llvm - - diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h index f108bab24bd39..a9a15f7538a58 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h @@ -1,18 +1,19 @@ -//===-- AMDGPUOccupancyAndLatencyHelper - Helper functions for occupancy and latency --===// +//==- AMDGPUOccupancyAndLatencyHelper.cpp - Helpers for occupancy + latency ==// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -//===--------------------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// // /// \file /// \brief Helper functions for occupancy and latency. // -//===--------------------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// #include "llvm/ADT/DenseMap.h" +#include "llvm/MC/MCInstrItineraries.h" namespace llvm { @@ -30,7 +31,7 @@ struct SchedScore { unsigned MemLatency = 0; // Only save mem latency. // We want mem latency small and hide big. Compare // memLatency - hide * Occ, smaller is better. - unsigned MixAlu = 0; // VAlu and SAlu can running parallel if Occ > 1. + unsigned MixAlu = 0; // VAlu and SAlu can running parallel if Occ > 1. unsigned Alu = 0; // avoid sequence of s_alu inst count less then occupancy. unsigned Lds = 0; // Todo: count lds. SchedScore() {} @@ -39,9 +40,9 @@ struct SchedScore { float computeScore() const; float computeScore2() const; - void sum(const SchedScore &s, unsigned loopDepth=0); + void sum(const SchedScore &s, unsigned loopDepth = 0); bool isBetter(const SchedScore &s) const; - bool isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc=1) const; + bool isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc = 1) const; // More latency can be hiden with ExtraOcc. 
unsigned latencyGain(unsigned TargetOccupancy, unsigned ExtraOcc) const; }; @@ -71,4 +72,4 @@ struct AMDGPULatencyTracker { SchedScore CollectLatency(llvm::MachineFunction &MF, const llvm::GCNSubtarget &ST, const llvm::MachineLoopInfo *MLI = nullptr); -} +} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp index a0f2a5d4dc121..b133659d8fb66 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp @@ -1,9 +1,9 @@ -#include "llvm/CodeGen/MachinePostDominators.h" -#include "llvm/CodeGen/SlotIndexes.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/SlotIndexes.h" -//#include "dxc/DXIL/DxilMetadataHelper.h" +// #include "dxc/DXIL/DxilMetadataHelper.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/raw_ostream.h" @@ -14,9 +14,9 @@ #include "llvm/Support/Debug.h" -#include "GCNRegPressure.h" #include "AMDGPUMIRUtils.h" #include "AMDGPUSubExpDag.h" +#include "GCNRegPressure.h" #include #define DEBUG_TYPE "xb-sub-exp-dag" @@ -27,37 +27,35 @@ namespace llvm { // Expression Dag. #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void SubExp::dump(const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) const { - dbgs() << "\nSubExp:\n"; - dbgs() << "input regs:\n"; - for (auto &input : inputLive) { - pressure::print_reg(input.first, MRI, SIRI, llvm::dbgs()); - dbgs() << "\n"; - } - dbgs() << "output regs:\n"; - for (auto &output : outputLive) { - pressure::print_reg(output.first, MRI, SIRI, llvm::dbgs()); - dbgs() << "\n"; - } +void SubExp::dump(const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI) const { + dbgs() << "\nSubExp:\n"; + dbgs() << "input regs:\n"; + for (auto &input : inputLive) { + pressure::print_reg(input.first, MRI, SIRI, llvm::dbgs()); + dbgs() << "\n"; + } + dbgs() << "output regs:\n"; + for (auto &output : outputLive) { + pressure::print_reg(output.first, MRI, SIRI, llvm::dbgs()); + dbgs() << "\n"; + } - for (MachineInstr *MI : SUnits) { - MI->dump(); - } - dbgs() << "End of SubExp\n"; + for (MachineInstr *MI : SUnits) { + MI->dump(); + } + dbgs() << "End of SubExp\n"; } #endif -bool SubExp::modifiesRegister(unsigned Reg, const SIRegisterInfo* SIRI) const -{ - for (const MachineInstr *MI : SUnits) - { - if (MI->modifiesRegister(Reg, SIRI)) - { - return true; - } +bool SubExp::modifiesRegister(unsigned Reg, const SIRegisterInfo *SIRI) const { + for (const MachineInstr *MI : SUnits) { + if (MI->modifiesRegister(Reg, SIRI)) { + return true; } + } - return false; + return false; } void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI, @@ -95,7 +93,9 @@ void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI, for (auto it = SUnits.rbegin(); it != SUnits.rend(); it++) { MachineInstr *MI = *it; - auto *ST = &MI->getMF()->getSubtarget(); // TODO: Better way to get this. + auto *ST = + &MI->getMF() + ->getSubtarget(); // TODO: Better way to get this. 
for (MachineOperand &MO : MI->operands()) { if (!MO.isReg()) continue; @@ -149,8 +149,8 @@ bool SubExp::isSafeToMove(const MachineRegisterInfo &MRI, bool bMoveUp) const { } ExpDag::ExpDag(const llvm::MachineRegisterInfo &MRI, - const llvm::SIRegisterInfo *SIRI, - const SIInstrInfo *SIII, const bool bJoinInput) + const llvm::SIRegisterInfo *SIRI, const SIInstrInfo *SIII, + const bool bJoinInput) : MRI(MRI), SIRI(SIRI), SIII(SIII), bJoinInputToSubExp(bJoinInput) {} template @@ -196,9 +196,9 @@ template void ExpDag::build>(const LiveSet &InputLiveReg, const LiveSet &OutputLiveReg, DenseSet &instRange); -template void ExpDag::build>(const LiveSet &InputLiveReg, - const LiveSet &OutputLiveReg, - std::vector &instRange); +template void ExpDag::build>( + const LiveSet &InputLiveReg, const LiveSet &OutputLiveReg, + std::vector &instRange); void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { @@ -311,7 +311,8 @@ void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, // UserMI should always be in same subExp. unsigned UseSubIdx = SubtreeClasses[UseSU->NodeNum]; if (UseSubIdx != OriginSubIdx) { - // When reg has multiple def, it is possible for user def in different subExp. + // When reg has multiple def, it is possible for user def in + // different subExp. if (MRI.getUniqueVRegDef(Reg)) llvm::report_fatal_error("user and def in different subExp"); break; @@ -470,9 +471,8 @@ void BlockExpDag::buildWithPressure() { buildPressure(StartLiveReg, EndLiveReg); } -void BlockExpDag::buildAvail( - const LiveSet &passThruSet, - DenseMap &DagAvailRegMap) { +void BlockExpDag::buildAvail(const LiveSet &passThruSet, + DenseMap &DagAvailRegMap) { DenseSet Processed; DenseSet WorkList; @@ -596,10 +596,10 @@ void BlockExpDag::buildPressure(const LiveSet &StartLiveReg, // Using pass thru as base because output of current SU should not // affect other output SUs. GCNUpwardRPTracker RP(*LIS); - RP.reset(BeginMI, &passThruSet, /*After*/true); + RP.reset(BeginMI, &passThruSet, /*After*/ true); MachineInstr *MI = SU.getInstr(); if (MI) { - RP.reset(*MI, &passThruSet, /*After*/true); + RP.reset(*MI, &passThruSet, /*After*/ true); RP.recede(*MI); } DagPressureMap[&SU] = RP.getLiveRegs(); @@ -639,9 +639,9 @@ void BlockExpDag::buildPressure(const LiveSet &StartLiveReg, GCNRPTracker::LiveRegSet SuccLive = DagPressureMap[SuccSU]; GCNUpwardRPTracker RP(*LIS); - RP.reset(BeginMI, &SuccLive, /*After*/true); + RP.reset(BeginMI, &SuccLive, /*After*/ true); if (MI) { - RP.reset(*MI, &SuccLive, /*After*/true); + RP.reset(*MI, &SuccLive, /*After*/ true); // Update SuccLive based on MI. RP.recede(*MI); } @@ -684,9 +684,7 @@ std::string ExpDag::getGraphNodeLabel(const SUnit *SU) const { } /// Return the label. -std::string ExpDag::getDAGName() const { - return "dag.exp"; -} +std::string ExpDag::getDAGName() const { return "dag.exp"; } /// viewGraph - Pop up a ghostview window with the reachable parts of the DAG /// rendered using 'dot'. @@ -707,7 +705,7 @@ void ExpDag::dump() { viewGraph(getDAGName(), "Exp Dag Graph for " + getDAGName()); } -} +} // namespace llvm // Expression Dag dump. 
namespace llvm { @@ -757,7 +755,8 @@ struct DOTGraphTraits : public DefaultDOTGraphTraits { SS << "SU:" << SU->NodeNum; return SS.str(); } - static std::string getNodeDescription(const SUnit *SU, const llvm::ExpDag *G) { + static std::string getNodeDescription(const SUnit *SU, + const llvm::ExpDag *G) { return G->getGraphNodeLabel(SU); } static std::string getNodeAttributes(const SUnit *N, @@ -804,7 +803,9 @@ void getRegBound(llvm::MachineBasicBlock *MBB, const GCNRPTracker::LiveRegSet outputLive = llvm::getLiveRegs(EndSlot, *LIS, MRI); - auto* ST = &MBB->getParent()->getSubtarget(); // TODO: Better way to get this. + auto *ST = + &MBB->getParent() + ->getSubtarget(); // TODO: Better way to get this. if (MBB->empty()) { GCNRegPressure MaxPressure = getRegPressure(MRI, outputLive); MaxSGPR = MaxPressure.getSGPRNum(); @@ -845,7 +846,7 @@ void getRegBound(llvm::MachineBasicBlock *MBB, auto SchedResult = hrbSched(SUnits, BotRoots, MRI, SIRI); GCNUpwardRPTracker RPTracker(*LIS); - RPTracker.reset(MBB->front(), &outputLive, /*After*/true); + RPTracker.reset(MBB->front(), &outputLive, /*After*/ true); for (auto it = SchedResult.rbegin(); it != SchedResult.rend(); it++) { const SUnit *SU = *it; if (!SU->isInstr()) @@ -1116,8 +1117,7 @@ SUnit *HRB::findHeir(SUnit *SU, std::vector &SUnits) { return Heir; } -HRB::Lineage HRB::buildChain(SUnit *Node, - std::vector &SUnits) { +HRB::Lineage HRB::buildChain(SUnit *Node, std::vector &SUnits) { HRB::Lineage chain; chain.addNode(Node); ChainedNodes.insert(Node); @@ -1754,7 +1754,7 @@ std::vector hrbSched(std::vector &SUnits, SUnit *SU = *it; if (!Color.isHead(SU)) { - continue; + continue; } Candidate = SU; // Remove Candidate from ReadyList. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h index c234f32370793..a7d29430b4276 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h @@ -4,7 +4,7 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/MC/LaneBitmask.h" -#include "llvm/CodeGen/ScheduleDAG.h" // For SUnit. +#include "llvm/CodeGen/ScheduleDAG.h" // For SUnit. namespace llvm { class MachineFunction; @@ -14,8 +14,7 @@ class SIRegisterInfo; class SIInstrInfo; class MachineInstr; class MachineBasicBlock; -template -class GraphWriter; +template class GraphWriter; class SUnit; class IntEqClasses; class Twine; @@ -55,13 +54,12 @@ struct SubExp { const llvm::SIRegisterInfo *SIRI); void dump(const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI) const; - bool modifiesRegister(unsigned Reg, const llvm::SIRegisterInfo* SIRI) const; + bool modifiesRegister(unsigned Reg, const llvm::SIRegisterInfo *SIRI) const; }; struct ExpDag { ExpDag(const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI, - const llvm::SIInstrInfo *SIII, - const bool bJoinInput); + const llvm::SIInstrInfo *SIII, const bool bJoinInput); const llvm::MachineRegisterInfo &MRI; const llvm::SIRegisterInfo *SIRI; const llvm::SIInstrInfo *SIII; @@ -83,13 +81,14 @@ struct ExpDag { std::string getDAGName() const; /// Adds custom features for a visualization of the ScheduleDAG. 
void addCustomGraphFeatures(llvm::GraphWriter &) const {} + private: - template - void initNodes(const LiveSet &InputLiveReg, T &insts); + template void initNodes(const LiveSet &InputLiveReg, T &insts); void addDataDep(const llvm::SIRegisterInfo *SIRI); void addCtrlDep(); void buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, - const llvm::SIRegisterInfo *SIRI, const llvm::SIInstrInfo *SIII); + const llvm::SIRegisterInfo *SIRI, + const llvm::SIInstrInfo *SIII); }; struct BlockExpDag : public ExpDag { @@ -103,11 +102,11 @@ struct BlockExpDag : public ExpDag { std::vector SubExps; void build(); void buildWithPressure(); + private: void buildAvail(const LiveSet &passThruSet, llvm::DenseMap &DagAvailRegMap); - void buildPressure(const LiveSet &StartLiveReg, - const LiveSet &EndLiveReg); + void buildPressure(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg); }; void getRegBound(llvm::MachineBasicBlock *MBB, @@ -194,4 +193,4 @@ std::vector hrbSched(std::vector &SUnits, const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI); -} +} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h index c9172bae2cb4a..c49590a7d8f7f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h @@ -1,4 +1,4 @@ -//===-- AMDGPUVMemDegreeDAG.h - Build degree about VMem on DAG --------------===// +//===-- AMDGPUVMemDegreeDAG.h - Build degree about VMem on DAG ------------===// // // The LLVM Compiler Infrastructure // @@ -14,9 +14,9 @@ //===----------------------------------------------------------------------===// #pragma once -#include #include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/ScheduleDAG.h" // For SUnit. +#include "llvm/CodeGen/ScheduleDAG.h" // For SUnit. +#include namespace llvm { class MachineBasicBlock; @@ -42,7 +42,6 @@ class SimpleDAG { void addCtrlDep(); }; - // Collect height/depth for high latency mem ld, which only update height/depth // when cross high latency mem ld. Call the height/depth as VMem degree here. // The rule is sample and its user should has different degree. @@ -60,15 +59,13 @@ class SimpleDAG { class VMemDegreeDAG { public: - VMemDegreeDAG(std::vector &Units, - const llvm::SIInstrInfo *TII) + VMemDegreeDAG(std::vector &Units, const llvm::SIInstrInfo *TII) : SUnits(Units), SIII(TII) {} std::vector &SUnits; // InstrInfo. const llvm::SIInstrInfo *SIII; void build(); - bool isHighLatency(const llvm::SUnit *SU) const; bool isHighLatency(const llvm::MachineInstr *MI) const; // height/depth based on Long latency inst. @@ -79,28 +76,24 @@ class VMemDegreeDAG { std::vector VMemFullDepth; llvm::SmallVector VMemSUs; llvm::SmallVector, 16> GroupedVMemSUs; - llvm::SmallVector, 16> GroupedVMemSUsByDepth; - + llvm::SmallVector, 16> + GroupedVMemSUsByDepth; void dump(); private: static constexpr unsigned kNoReg = -1; - - std::pair buildVMemDepthHeight(std::vector &VMemHeight, - std::vector &VMemDepth, bool bDataOnly); + std::pair + buildVMemDepthHeight(std::vector &VMemHeight, + std::vector &VMemDepth, bool bDataOnly); // Compute vmem height/depth. void buildVMemDepthHeight(); void buildVMemDataDepthHeight(); void groupVmemSUnits(); - }; - - // Split block based on vmem depth. 
void buildVMemDepth(llvm::MachineBasicBlock &MBB, llvm::VMemDegreeDAG &dag); -} - +} // namespace llvm From bf396df7b968e2c82e58504957a2fd9bacb3a307 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Tue, 11 Mar 2025 12:20:21 -0700 Subject: [PATCH 07/25] Added option to enable it in the target profile --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 564c92239acdf..ec39b385ecbd2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -394,6 +394,12 @@ static cl::opt cl::desc("Enable s_delay_alu insertion"), cl::init(true), cl::Hidden); +// Enable Hot block rematerialize +static cl::opt + EnableHotBlockRemat("amdgpu-enable-hot-block-remat", + cl::desc("Enable HotBlock Rematerialize optimization"), + cl::init(false), cl::Hidden); + // Enable GFX11+ VOPD static cl::opt EnableVOPD("amdgpu-enable-vopd", @@ -1523,6 +1529,10 @@ void GCNPassConfig::addOptimizedRegAlloc() { if (TM->getOptLevel() > CodeGenOptLevel::Less) insertPass(&MachineSchedulerID, &SIFormMemoryClausesID); + // Must be run before phi elimination + if (isPassEnabled(EnableHotBlockRemat)) + addPass(&AMDGPUHotBlockRematerializeID); + TargetPassConfig::addOptimizedRegAlloc(); } From c64c4e40dd8fe0391ac4cd135bbcf913380c46fe Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Tue, 11 Mar 2025 11:28:59 -0700 Subject: [PATCH 08/25] Fix PHI node handling in regpressure tracker --- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index a438ad00bc41d..390c2f05ffe69 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -549,22 +549,22 @@ bool GCNDownwardRPTracker::advanceBeforeNext(MachineInstr *MI, if (!S.liveAt(SI)) { if (It == LiveRegs.end()) { It = LiveRegs.find(MO.getReg()); - if (It == LiveRegs.end()) - llvm_unreachable("register isn't live"); } - auto PrevMask = It->second; - It->second &= ~S.LaneMask; - CurPressure.inc(MO.getReg(), PrevMask, It->second, *MRI); + if (It != LiveRegs.end()) { + auto PrevMask = It->second; + It->second &= ~S.LaneMask; + CurPressure.inc(MO.getReg(), PrevMask, It->second, *MRI); + } } } if (It != LiveRegs.end() && It->second.none()) LiveRegs.erase(It); } else if (!LI.liveAt(SI)) { auto It = LiveRegs.find(MO.getReg()); - if (It == LiveRegs.end()) - llvm_unreachable("register isn't live"); - CurPressure.inc(MO.getReg(), It->second, LaneBitmask::getNone(), *MRI); - LiveRegs.erase(It); + if (It != LiveRegs.end()) { + CurPressure.inc(MO.getReg(), It->second, LaneBitmask::getNone(), *MRI); + LiveRegs.erase(It); + } } } From 3dc22d43a3cda3abfd13bab02d5c75a948485cfa Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Tue, 11 Mar 2025 16:31:14 -0700 Subject: [PATCH 09/25] Fixed the PHI issue --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 16 +- llvm/test/CodeGen/AMDGPU/remat/phi.mir | 709 ++++++++++++++++++ 2 files changed, 724 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/remat/phi.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index 4656e28499a0d..2cd28513f10f3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ 
b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -204,9 +204,23 @@ FindInsertBlock(MachineInstr &DefMI, unsigned Reg, MachineDominatorTree *DT, return BB; } +// Maybe expensive to be called all over the place +bool IsUsedByPhi(MachineInstr *DefMI, MachineRegisterInfo &MRI) { + for (auto &Def : DefMI->defs()) { + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Def.getReg())) { + if (UseMI.isPHI()) + return true; + } + } + return false; +} + bool IsSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) { - unsigned OpNum = DefMI->getNumOperands(); + // Do not move PHI nodes + if (IsUsedByPhi(DefMI, MRI)) + return false; + unsigned OpNum = DefMI->getNumOperands(); // Only move DefMI which all operand is unique def. for (unsigned i = 0; i < OpNum; i++) { MachineOperand &Op = DefMI->getOperand(i); diff --git a/llvm/test/CodeGen/AMDGPU/remat/phi.mir b/llvm/test/CodeGen/AMDGPU/remat/phi.mir new file mode 100644 index 0000000000000..5ee563e7a633f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/phi.mir @@ -0,0 +1,709 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat | FileCheck %s + +# Check that none of these defs are moved to their uses since they're used by +# PHIS. +# CHECK: bb.0: +# CHECK: %[[#r3000:]]:sgpr_32 = S_MOV_B32 0 +# CHECK: %[[#r3001:]]:sgpr_32 = S_MOV_B32 1 +# CHECK: %[[#r3002:]]:sgpr_32 = S_MOV_B32 2 +# CHECK: %[[#r3003:]]:sgpr_32 = S_MOV_B32 3 +# CHECK: %[[#r3004:]]:sgpr_32 = S_MOV_B32 4 +# CHECK: %[[#r3005:]]:sgpr_32 = S_MOV_B32 5 +# CHECK: %[[#r3006:]]:sgpr_32 = S_MOV_B32 6 +# CHECK: %[[#r3007:]]:sgpr_32 = S_MOV_B32 7 +# CHECK: %[[#r3008:]]:sgpr_32 = S_MOV_B32 8 +# CHECK: %[[#r3009:]]:sgpr_32 = S_MOV_B32 9 +# CHECK: %[[#r3010:]]:sgpr_32 = S_MOV_B32 10 +# CHECK: %[[#r3011:]]:sgpr_32 = S_MOV_B32 11 +# CHECK: %[[#r3012:]]:sgpr_32 = S_MOV_B32 12 +# CHECK: %[[#r3013:]]:sgpr_32 = S_MOV_B32 13 +# CHECK: %[[#r3014:]]:sgpr_32 = S_MOV_B32 14 +# CHECK: %[[#r3015:]]:sgpr_32 = S_MOV_B32 15 +# CHECK: %[[#r3016:]]:sgpr_32 = S_MOV_B32 16 +# CHECK: %[[#r3017:]]:sgpr_32 = S_MOV_B32 17 +# CHECK: %[[#r3018:]]:sgpr_32 = S_MOV_B32 18 +# CHECK: %[[#r3019:]]:sgpr_32 = S_MOV_B32 19 +# CHECK: %[[#r3020:]]:sgpr_32 = S_MOV_B32 20 +# CHECK: %[[#r3021:]]:sgpr_32 = S_MOV_B32 21 +# CHECK: %[[#r3022:]]:sgpr_32 = S_MOV_B32 22 +# CHECK: %[[#r3023:]]:sgpr_32 = S_MOV_B32 23 +# CHECK: %[[#r3024:]]:sgpr_32 = S_MOV_B32 24 +# CHECK: %[[#r3025:]]:sgpr_32 = S_MOV_B32 25 +# CHECK: %[[#r3026:]]:sgpr_32 = S_MOV_B32 26 +# CHECK: %[[#r3027:]]:sgpr_32 = S_MOV_B32 27 +# CHECK: %[[#r3028:]]:sgpr_32 = S_MOV_B32 28 +# CHECK: %[[#r3029:]]:sgpr_32 = S_MOV_B32 29 +# CHECK: %[[#r3030:]]:sgpr_32 = S_MOV_B32 30 +# CHECK: %[[#r3031:]]:sgpr_32 = S_MOV_B32 31 +# CHECK: %[[#r3032:]]:sgpr_32 = S_MOV_B32 32 +# CHECK: %[[#r3033:]]:sgpr_32 = S_MOV_B32 33 +# CHECK: %[[#r3034:]]:sgpr_32 = S_MOV_B32 34 +# CHECK: %[[#r3035:]]:sgpr_32 = S_MOV_B32 35 +# CHECK: %[[#r3036:]]:sgpr_32 = S_MOV_B32 36 +# CHECK: %[[#r3037:]]:sgpr_32 = S_MOV_B32 37 +# CHECK: %[[#r3038:]]:sgpr_32 = S_MOV_B32 38 +# CHECK: %[[#r3039:]]:sgpr_32 = S_MOV_B32 39 +# CHECK: %[[#r3040:]]:sgpr_32 = S_MOV_B32 40 +# CHECK: %[[#r3041:]]:sgpr_32 = S_MOV_B32 41 +# CHECK: %[[#r3042:]]:sgpr_32 = S_MOV_B32 42 +# CHECK: %[[#r3043:]]:sgpr_32 = S_MOV_B32 43 +# CHECK: %[[#r3044:]]:sgpr_32 = S_MOV_B32 44 +# CHECK: %[[#r3045:]]:sgpr_32 = S_MOV_B32 45 +# CHECK: %[[#r3046:]]:sgpr_32 = S_MOV_B32 46 +# CHECK: %[[#r3047:]]:sgpr_32 = S_MOV_B32 47 +# CHECK: %[[#r3048:]]:sgpr_32 = S_MOV_B32 48 +# CHECK: 
%[[#r3049:]]:sgpr_32 = S_MOV_B32 49 +# CHECK: %[[#r3050:]]:sgpr_32 = S_MOV_B32 50 +# CHECK: %[[#r3051:]]:sgpr_32 = S_MOV_B32 51 +# CHECK: %[[#r3052:]]:sgpr_32 = S_MOV_B32 52 +# CHECK: %[[#r3053:]]:sgpr_32 = S_MOV_B32 53 +# CHECK: %[[#r3054:]]:sgpr_32 = S_MOV_B32 54 +# CHECK: %[[#r3055:]]:sgpr_32 = S_MOV_B32 55 +# CHECK: %[[#r3056:]]:sgpr_32 = S_MOV_B32 56 +# CHECK: %[[#r3057:]]:sgpr_32 = S_MOV_B32 57 +# CHECK: %[[#r3058:]]:sgpr_32 = S_MOV_B32 58 +# CHECK: %[[#r3059:]]:sgpr_32 = S_MOV_B32 59 +# CHECK: %[[#r3060:]]:sgpr_32 = S_MOV_B32 60 +# CHECK: %[[#r3061:]]:sgpr_32 = S_MOV_B32 61 +# CHECK: %[[#r3062:]]:sgpr_32 = S_MOV_B32 62 +# CHECK: %[[#r3063:]]:sgpr_32 = S_MOV_B32 63 +# CHECK: %[[#r3064:]]:sgpr_32 = S_MOV_B32 64 +# CHECK: %[[#r3065:]]:sgpr_32 = S_MOV_B32 65 +# CHECK: %[[#r3066:]]:sgpr_32 = S_MOV_B32 66 +# CHECK: %[[#r3067:]]:sgpr_32 = S_MOV_B32 67 +# CHECK: %[[#r3068:]]:sgpr_32 = S_MOV_B32 68 +# CHECK: %[[#r3069:]]:sgpr_32 = S_MOV_B32 69 +# CHECK: %[[#r3070:]]:sgpr_32 = S_MOV_B32 70 +# CHECK: %[[#r3071:]]:sgpr_32 = S_MOV_B32 71 +# CHECK: %[[#r3072:]]:sgpr_32 = S_MOV_B32 72 +# CHECK: %[[#r3073:]]:sgpr_32 = S_MOV_B32 73 +# CHECK: %[[#r3074:]]:sgpr_32 = S_MOV_B32 74 +# CHECK: %[[#r3075:]]:sgpr_32 = S_MOV_B32 75 +# CHECK: %[[#r3076:]]:sgpr_32 = S_MOV_B32 76 +# CHECK: %[[#r3077:]]:sgpr_32 = S_MOV_B32 77 +# CHECK: %[[#r3078:]]:sgpr_32 = S_MOV_B32 78 +# CHECK: %[[#r3079:]]:sgpr_32 = S_MOV_B32 79 +# CHECK: %[[#r3080:]]:sgpr_32 = S_MOV_B32 80 +# CHECK: %[[#r3081:]]:sgpr_32 = S_MOV_B32 81 +# CHECK: %[[#r3082:]]:sgpr_32 = S_MOV_B32 82 +# CHECK: %[[#r3083:]]:sgpr_32 = S_MOV_B32 83 +# CHECK: %[[#r3084:]]:sgpr_32 = S_MOV_B32 84 +# CHECK: %[[#r3085:]]:sgpr_32 = S_MOV_B32 85 +# CHECK: %[[#r3086:]]:sgpr_32 = S_MOV_B32 86 +# CHECK: %[[#r3087:]]:sgpr_32 = S_MOV_B32 87 +# CHECK: %[[#r3088:]]:sgpr_32 = S_MOV_B32 88 +# CHECK: %[[#r3089:]]:sgpr_32 = S_MOV_B32 89 +# CHECK: %[[#r3090:]]:sgpr_32 = S_MOV_B32 90 +# CHECK: %[[#r3091:]]:sgpr_32 = S_MOV_B32 91 +# CHECK: %[[#r3092:]]:sgpr_32 = S_MOV_B32 92 +# CHECK: %[[#r3093:]]:sgpr_32 = S_MOV_B32 93 +# CHECK: %[[#r3094:]]:sgpr_32 = S_MOV_B32 94 +# CHECK: %[[#r3095:]]:sgpr_32 = S_MOV_B32 95 +# CHECK: %[[#r3096:]]:sgpr_32 = S_MOV_B32 96 +# CHECK: %[[#r3097:]]:sgpr_32 = S_MOV_B32 97 +# CHECK: %[[#r3098:]]:sgpr_32 = S_MOV_B32 98 +# CHECK: %[[#r3099:]]:sgpr_32 = S_MOV_B32 99 +# CHECK: bb.1: +# CHECK: bb.2: + + +--- | + source_filename = ".\main.ll" + define amdgpu_ps void @main() #1 { + ret void + } + attributes #1 = { "target-cpu"="gfx1010" } + !llvm.ident = !{!0} + !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"} +... 
+--- +name: main +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } + - { reg: '$sgpr1' } + - { reg: '$sgpr2' } + - { reg: '$sgpr3' } + - { reg: '$sgpr4' } + - { reg: '$sgpr5' } + - { reg: '$sgpr6' } + - { reg: '$sgpr7' } + - { reg: '$sgpr8' } + - { reg: '$sgpr8' } + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1 + + %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1 + ; undef %0.sub0:sgpr_64 = COPY $sgpr0 + ; undef %0.sub1:sgpr_64 = COPY $sgpr1 + + %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3 + ; undef %1.sub0:sgpr_128 = COPY $sgpr4 + ; undef %1.sub1:sgpr_128 = COPY $sgpr5 + ; undef %1.sub2:sgpr_128 = COPY $sgpr6 + ; undef %1.sub3:sgpr_128 = COPY $sgpr7 + + + %2000:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2001:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2002:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2003:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2004:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2005:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2006:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2007:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2008:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2009:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2010:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2011:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2012:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2013:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2014:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2015:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2016:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2017:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2018:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2019:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2020:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2021:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2022:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2023:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2024:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2025:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2026:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2027:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2028:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2029:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2030:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2031:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2032:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2033:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2034:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2035:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2036:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2037:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2038:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2039:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2040:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2041:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2042:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2043:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2044:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2045:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2046:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2047:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2048:vgpr_32 = 
V_MOV_B32_e32 $vgpr0, implicit $exec + %2049:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2050:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2051:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2052:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2053:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2054:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2055:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2056:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2057:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2058:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2059:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2060:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2061:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2062:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2063:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2064:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2065:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2066:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2067:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2068:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2069:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2070:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2071:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2072:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2073:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2074:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2075:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2076:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2077:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2078:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2079:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2080:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2081:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2082:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2083:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2084:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2085:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2086:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2087:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2088:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2089:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2090:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2091:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2092:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2093:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2094:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2095:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2096:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2097:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2098:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %2099:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %3000:sgpr_32 = S_MOV_B32 0 + %3001:sgpr_32 = S_MOV_B32 1 + %3002:sgpr_32 = S_MOV_B32 2 + %3003:sgpr_32 = S_MOV_B32 3 + %3004:sgpr_32 = S_MOV_B32 4 + %3005:sgpr_32 = S_MOV_B32 5 + %3006:sgpr_32 = S_MOV_B32 6 + %3007:sgpr_32 = S_MOV_B32 7 + %3008:sgpr_32 = S_MOV_B32 8 + %3009:sgpr_32 = S_MOV_B32 9 + %3010:sgpr_32 = S_MOV_B32 10 + %3011:sgpr_32 = S_MOV_B32 11 + %3012:sgpr_32 = S_MOV_B32 12 + %3013:sgpr_32 = S_MOV_B32 13 + %3014:sgpr_32 = S_MOV_B32 14 + %3015:sgpr_32 = S_MOV_B32 15 + %3016:sgpr_32 = S_MOV_B32 16 + %3017:sgpr_32 = S_MOV_B32 17 + %3018:sgpr_32 = S_MOV_B32 18 + %3019:sgpr_32 = S_MOV_B32 19 + %3020:sgpr_32 = S_MOV_B32 20 + %3021:sgpr_32 = S_MOV_B32 21 + %3022:sgpr_32 = S_MOV_B32 22 + 
%3023:sgpr_32 = S_MOV_B32 23 + %3024:sgpr_32 = S_MOV_B32 24 + %3025:sgpr_32 = S_MOV_B32 25 + %3026:sgpr_32 = S_MOV_B32 26 + %3027:sgpr_32 = S_MOV_B32 27 + %3028:sgpr_32 = S_MOV_B32 28 + %3029:sgpr_32 = S_MOV_B32 29 + %3030:sgpr_32 = S_MOV_B32 30 + %3031:sgpr_32 = S_MOV_B32 31 + %3032:sgpr_32 = S_MOV_B32 32 + %3033:sgpr_32 = S_MOV_B32 33 + %3034:sgpr_32 = S_MOV_B32 34 + %3035:sgpr_32 = S_MOV_B32 35 + %3036:sgpr_32 = S_MOV_B32 36 + %3037:sgpr_32 = S_MOV_B32 37 + %3038:sgpr_32 = S_MOV_B32 38 + %3039:sgpr_32 = S_MOV_B32 39 + %3040:sgpr_32 = S_MOV_B32 40 + %3041:sgpr_32 = S_MOV_B32 41 + %3042:sgpr_32 = S_MOV_B32 42 + %3043:sgpr_32 = S_MOV_B32 43 + %3044:sgpr_32 = S_MOV_B32 44 + %3045:sgpr_32 = S_MOV_B32 45 + %3046:sgpr_32 = S_MOV_B32 46 + %3047:sgpr_32 = S_MOV_B32 47 + %3048:sgpr_32 = S_MOV_B32 48 + %3049:sgpr_32 = S_MOV_B32 49 + %3050:sgpr_32 = S_MOV_B32 50 + %3051:sgpr_32 = S_MOV_B32 51 + %3052:sgpr_32 = S_MOV_B32 52 + %3053:sgpr_32 = S_MOV_B32 53 + %3054:sgpr_32 = S_MOV_B32 54 + %3055:sgpr_32 = S_MOV_B32 55 + %3056:sgpr_32 = S_MOV_B32 56 + %3057:sgpr_32 = S_MOV_B32 57 + %3058:sgpr_32 = S_MOV_B32 58 + %3059:sgpr_32 = S_MOV_B32 59 + %3060:sgpr_32 = S_MOV_B32 60 + %3061:sgpr_32 = S_MOV_B32 61 + %3062:sgpr_32 = S_MOV_B32 62 + %3063:sgpr_32 = S_MOV_B32 63 + %3064:sgpr_32 = S_MOV_B32 64 + %3065:sgpr_32 = S_MOV_B32 65 + %3066:sgpr_32 = S_MOV_B32 66 + %3067:sgpr_32 = S_MOV_B32 67 + %3068:sgpr_32 = S_MOV_B32 68 + %3069:sgpr_32 = S_MOV_B32 69 + %3070:sgpr_32 = S_MOV_B32 70 + %3071:sgpr_32 = S_MOV_B32 71 + %3072:sgpr_32 = S_MOV_B32 72 + %3073:sgpr_32 = S_MOV_B32 73 + %3074:sgpr_32 = S_MOV_B32 74 + %3075:sgpr_32 = S_MOV_B32 75 + %3076:sgpr_32 = S_MOV_B32 76 + %3077:sgpr_32 = S_MOV_B32 77 + %3078:sgpr_32 = S_MOV_B32 78 + %3079:sgpr_32 = S_MOV_B32 79 + %3080:sgpr_32 = S_MOV_B32 80 + %3081:sgpr_32 = S_MOV_B32 81 + %3082:sgpr_32 = S_MOV_B32 82 + %3083:sgpr_32 = S_MOV_B32 83 + %3084:sgpr_32 = S_MOV_B32 84 + %3085:sgpr_32 = S_MOV_B32 85 + %3086:sgpr_32 = S_MOV_B32 86 + %3087:sgpr_32 = S_MOV_B32 87 + %3088:sgpr_32 = S_MOV_B32 88 + %3089:sgpr_32 = S_MOV_B32 89 + %3090:sgpr_32 = S_MOV_B32 90 + %3091:sgpr_32 = S_MOV_B32 91 + %3092:sgpr_32 = S_MOV_B32 92 + %3093:sgpr_32 = S_MOV_B32 93 + %3094:sgpr_32 = S_MOV_B32 94 + %3095:sgpr_32 = S_MOV_B32 95 + %3096:sgpr_32 = S_MOV_B32 96 + %3097:sgpr_32 = S_MOV_B32 97 + %3098:sgpr_32 = S_MOV_B32 98 + %3099:sgpr_32 = S_MOV_B32 99 + + + %8000:vgpr_32 = IMPLICIT_DEF + %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %116:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + + %8001:vgpr_32 = COPY %8000 + %8002:vgpr_32 = COPY %8000 + %8003:vgpr_32 = COPY %8000 + %8004:vgpr_32 = COPY %8000 + %8005:vgpr_32 = COPY %8000 + %8006:vgpr_32 = COPY %8000 + %8007:vgpr_32 = COPY %8000 + %8008:vgpr_32 = COPY %8000 + %8009:vgpr_32 = COPY %8000 + %8010:vgpr_32 = COPY %8000 + %8011:vgpr_32 = COPY %8000 + %8012:vgpr_32 = COPY %8000 + %8013:vgpr_32 = COPY %8000 + %8014:vgpr_32 = COPY %8000 + %8015:vgpr_32 = COPY %8000 + %8016:vgpr_32 = COPY %8000 + %8017:vgpr_32 = COPY %8000 + + %9001:vgpr_32 = COPY %8001 + %9002:vgpr_32 = COPY %8002 + %9003:vgpr_32 = COPY %8003 + %9004:vgpr_32 = COPY %8004 + %9005:vgpr_32 = COPY %8005 + %9006:vgpr_32 = COPY %8006 + %9007:vgpr_32 = COPY %8007 + %9008:vgpr_32 = COPY %8008 + %9009:vgpr_32 = COPY %8009 + %9010:vgpr_32 = COPY %8010 + %9011:vgpr_32 = COPY %8011 + %9012:vgpr_32 = COPY %8012 + %9013:vgpr_32 = COPY %8013 
+ %9014:vgpr_32 = COPY %8014 + %9015:vgpr_32 = COPY %8015 + %9016:vgpr_32 = COPY %8016 + %9017:vgpr_32 = COPY %8017 + + S_BRANCH %bb.2 + + bb.2: + %5000:sgpr_32 = PHI %3000, %bb.0, %8001, %bb.1 + %5001:sgpr_32 = PHI %3001, %bb.0, %8001, %bb.1 + %5002:sgpr_32 = PHI %3002, %bb.0, %8001, %bb.1 + %5003:sgpr_32 = PHI %3003, %bb.0, %8001, %bb.1 + %5004:sgpr_32 = PHI %3004, %bb.0, %8001, %bb.1 + %5005:sgpr_32 = PHI %3005, %bb.0, %8001, %bb.1 + %5006:sgpr_32 = PHI %3006, %bb.0, %8001, %bb.1 + %5007:sgpr_32 = PHI %3007, %bb.0, %8001, %bb.1 + %5008:sgpr_32 = PHI %3008, %bb.0, %8001, %bb.1 + %5009:sgpr_32 = PHI %3009, %bb.0, %8001, %bb.1 + %5010:sgpr_32 = PHI %3010, %bb.0, %8001, %bb.1 + %5011:sgpr_32 = PHI %3011, %bb.0, %8001, %bb.1 + %5012:sgpr_32 = PHI %3012, %bb.0, %8001, %bb.1 + %5013:sgpr_32 = PHI %3013, %bb.0, %8001, %bb.1 + %5014:sgpr_32 = PHI %3014, %bb.0, %8001, %bb.1 + %5015:sgpr_32 = PHI %3015, %bb.0, %8001, %bb.1 + %5016:sgpr_32 = PHI %3016, %bb.0, %8001, %bb.1 + %5017:sgpr_32 = PHI %3017, %bb.0, %8001, %bb.1 + %5018:sgpr_32 = PHI %3018, %bb.0, %8001, %bb.1 + %5019:sgpr_32 = PHI %3019, %bb.0, %8001, %bb.1 + %5020:sgpr_32 = PHI %3020, %bb.0, %8001, %bb.1 + %5021:sgpr_32 = PHI %3021, %bb.0, %8001, %bb.1 + %5022:sgpr_32 = PHI %3022, %bb.0, %8001, %bb.1 + %5023:sgpr_32 = PHI %3023, %bb.0, %8001, %bb.1 + %5024:sgpr_32 = PHI %3024, %bb.0, %8001, %bb.1 + %5025:sgpr_32 = PHI %3025, %bb.0, %8001, %bb.1 + %5026:sgpr_32 = PHI %3026, %bb.0, %8001, %bb.1 + %5027:sgpr_32 = PHI %3027, %bb.0, %8001, %bb.1 + %5028:sgpr_32 = PHI %3028, %bb.0, %8001, %bb.1 + %5029:sgpr_32 = PHI %3029, %bb.0, %8001, %bb.1 + %5030:sgpr_32 = PHI %3030, %bb.0, %8001, %bb.1 + %5031:sgpr_32 = PHI %3031, %bb.0, %8001, %bb.1 + %5032:sgpr_32 = PHI %3032, %bb.0, %8001, %bb.1 + %5033:sgpr_32 = PHI %3033, %bb.0, %8001, %bb.1 + %5034:sgpr_32 = PHI %3034, %bb.0, %8001, %bb.1 + %5035:sgpr_32 = PHI %3035, %bb.0, %8001, %bb.1 + %5036:sgpr_32 = PHI %3036, %bb.0, %8001, %bb.1 + %5037:sgpr_32 = PHI %3037, %bb.0, %8001, %bb.1 + %5038:sgpr_32 = PHI %3038, %bb.0, %8001, %bb.1 + %5039:sgpr_32 = PHI %3039, %bb.0, %8001, %bb.1 + %5040:sgpr_32 = PHI %3040, %bb.0, %8001, %bb.1 + %5041:sgpr_32 = PHI %3041, %bb.0, %8001, %bb.1 + %5042:sgpr_32 = PHI %3042, %bb.0, %8001, %bb.1 + %5043:sgpr_32 = PHI %3043, %bb.0, %8001, %bb.1 + %5044:sgpr_32 = PHI %3044, %bb.0, %8001, %bb.1 + %5045:sgpr_32 = PHI %3045, %bb.0, %8001, %bb.1 + %5046:sgpr_32 = PHI %3046, %bb.0, %8001, %bb.1 + %5047:sgpr_32 = PHI %3047, %bb.0, %8001, %bb.1 + %5048:sgpr_32 = PHI %3048, %bb.0, %8001, %bb.1 + %5049:sgpr_32 = PHI %3049, %bb.0, %8001, %bb.1 + %5050:sgpr_32 = PHI %3050, %bb.0, %8001, %bb.1 + %5051:sgpr_32 = PHI %3051, %bb.0, %8001, %bb.1 + %5052:sgpr_32 = PHI %3052, %bb.0, %8001, %bb.1 + %5053:sgpr_32 = PHI %3053, %bb.0, %8001, %bb.1 + %5054:sgpr_32 = PHI %3054, %bb.0, %8001, %bb.1 + %5055:sgpr_32 = PHI %3055, %bb.0, %8001, %bb.1 + %5056:sgpr_32 = PHI %3056, %bb.0, %8001, %bb.1 + %5057:sgpr_32 = PHI %3057, %bb.0, %8001, %bb.1 + %5058:sgpr_32 = PHI %3058, %bb.0, %8001, %bb.1 + %5059:sgpr_32 = PHI %3059, %bb.0, %8001, %bb.1 + %5060:sgpr_32 = PHI %3060, %bb.0, %8001, %bb.1 + %5061:sgpr_32 = PHI %3061, %bb.0, %8001, %bb.1 + %5062:sgpr_32 = PHI %3062, %bb.0, %8001, %bb.1 + %5063:sgpr_32 = PHI %3063, %bb.0, %8001, %bb.1 + %5064:sgpr_32 = PHI %3064, %bb.0, %8001, %bb.1 + %5065:sgpr_32 = PHI %3065, %bb.0, %8001, %bb.1 + %5066:sgpr_32 = PHI %3066, %bb.0, %8001, %bb.1 + %5067:sgpr_32 = PHI %3067, %bb.0, %8001, %bb.1 + %5068:sgpr_32 = PHI %3068, %bb.0, %8001, %bb.1 + %5069:sgpr_32 = PHI %3069, 
%bb.0, %8001, %bb.1 + %5070:sgpr_32 = PHI %3070, %bb.0, %8001, %bb.1 + %5071:sgpr_32 = PHI %3071, %bb.0, %8001, %bb.1 + %5072:sgpr_32 = PHI %3072, %bb.0, %8001, %bb.1 + %5073:sgpr_32 = PHI %3073, %bb.0, %8001, %bb.1 + %5074:sgpr_32 = PHI %3074, %bb.0, %8001, %bb.1 + %5075:sgpr_32 = PHI %3075, %bb.0, %8001, %bb.1 + %5076:sgpr_32 = PHI %3076, %bb.0, %8001, %bb.1 + %5077:sgpr_32 = PHI %3077, %bb.0, %8001, %bb.1 + %5078:sgpr_32 = PHI %3078, %bb.0, %8001, %bb.1 + %5079:sgpr_32 = PHI %3079, %bb.0, %8001, %bb.1 + %5080:sgpr_32 = PHI %3080, %bb.0, %8001, %bb.1 + %5081:sgpr_32 = PHI %3081, %bb.0, %8001, %bb.1 + %5082:sgpr_32 = PHI %3082, %bb.0, %8001, %bb.1 + %5083:sgpr_32 = PHI %3083, %bb.0, %8001, %bb.1 + %5084:sgpr_32 = PHI %3084, %bb.0, %8001, %bb.1 + %5085:sgpr_32 = PHI %3085, %bb.0, %8001, %bb.1 + %5086:sgpr_32 = PHI %3086, %bb.0, %8001, %bb.1 + %5087:sgpr_32 = PHI %3087, %bb.0, %8001, %bb.1 + %5088:sgpr_32 = PHI %3088, %bb.0, %8001, %bb.1 + %5089:sgpr_32 = PHI %3089, %bb.0, %8001, %bb.1 + %5090:sgpr_32 = PHI %3090, %bb.0, %8001, %bb.1 + %5091:sgpr_32 = PHI %3091, %bb.0, %8001, %bb.1 + %5092:sgpr_32 = PHI %3092, %bb.0, %8001, %bb.1 + %5093:sgpr_32 = PHI %3093, %bb.0, %8001, %bb.1 + %5094:sgpr_32 = PHI %3094, %bb.0, %8001, %bb.1 + %5095:sgpr_32 = PHI %3095, %bb.0, %8001, %bb.1 + %5096:sgpr_32 = PHI %3096, %bb.0, %8001, %bb.1 + %5097:sgpr_32 = PHI %3097, %bb.0, %8001, %bb.1 + %5098:sgpr_32 = PHI %3098, %bb.0, %8001, %bb.1 + %5099:sgpr_32 = PHI %3099, %bb.0, %8001, %bb.1 + + + %3:vgpr_32 = IMPLICIT_DEF + + %6000:vgpr_32 = V_MOV_B32_e32 %5000, implicit $exec + %6001:vgpr_32 = V_MOV_B32_e32 %5001, implicit $exec + %6002:vgpr_32 = V_MOV_B32_e32 %5002, implicit $exec + %6003:vgpr_32 = V_MOV_B32_e32 %5003, implicit $exec + %6004:vgpr_32 = V_MOV_B32_e32 %5004, implicit $exec + %6005:vgpr_32 = V_MOV_B32_e32 %5005, implicit $exec + %6006:vgpr_32 = V_MOV_B32_e32 %5006, implicit $exec + %6007:vgpr_32 = V_MOV_B32_e32 %5007, implicit $exec + %6008:vgpr_32 = V_MOV_B32_e32 %5008, implicit $exec + %6009:vgpr_32 = V_MOV_B32_e32 %5009, implicit $exec + %6010:vgpr_32 = V_MOV_B32_e32 %5010, implicit $exec + %6011:vgpr_32 = V_MOV_B32_e32 %5011, implicit $exec + %6012:vgpr_32 = V_MOV_B32_e32 %5012, implicit $exec + %6013:vgpr_32 = V_MOV_B32_e32 %5013, implicit $exec + %6014:vgpr_32 = V_MOV_B32_e32 %5014, implicit $exec + %6015:vgpr_32 = V_MOV_B32_e32 %5015, implicit $exec + %6016:vgpr_32 = V_MOV_B32_e32 %5016, implicit $exec + %6017:vgpr_32 = V_MOV_B32_e32 %5017, implicit $exec + %6018:vgpr_32 = V_MOV_B32_e32 %5018, implicit $exec + %6019:vgpr_32 = V_MOV_B32_e32 %5019, implicit $exec + %6020:vgpr_32 = V_MOV_B32_e32 %5020, implicit $exec + %6021:vgpr_32 = V_MOV_B32_e32 %5021, implicit $exec + %6022:vgpr_32 = V_MOV_B32_e32 %5022, implicit $exec + %6023:vgpr_32 = V_MOV_B32_e32 %5023, implicit $exec + %6024:vgpr_32 = V_MOV_B32_e32 %5024, implicit $exec + %6025:vgpr_32 = V_MOV_B32_e32 %5025, implicit $exec + %6026:vgpr_32 = V_MOV_B32_e32 %5026, implicit $exec + %6027:vgpr_32 = V_MOV_B32_e32 %5027, implicit $exec + %6028:vgpr_32 = V_MOV_B32_e32 %5028, implicit $exec + %6029:vgpr_32 = V_MOV_B32_e32 %5029, implicit $exec + %6030:vgpr_32 = V_MOV_B32_e32 %5030, implicit $exec + %6031:vgpr_32 = V_MOV_B32_e32 %5031, implicit $exec + %6032:vgpr_32 = V_MOV_B32_e32 %5032, implicit $exec + %6033:vgpr_32 = V_MOV_B32_e32 %5033, implicit $exec + %6034:vgpr_32 = V_MOV_B32_e32 %5034, implicit $exec + %6035:vgpr_32 = V_MOV_B32_e32 %5035, implicit $exec + %6036:vgpr_32 = V_MOV_B32_e32 %5036, implicit $exec + %6037:vgpr_32 = V_MOV_B32_e32 
%5037, implicit $exec + %6038:vgpr_32 = V_MOV_B32_e32 %5038, implicit $exec + %6039:vgpr_32 = V_MOV_B32_e32 %5039, implicit $exec + %6040:vgpr_32 = V_MOV_B32_e32 %5040, implicit $exec + %6041:vgpr_32 = V_MOV_B32_e32 %5041, implicit $exec + %6042:vgpr_32 = V_MOV_B32_e32 %5042, implicit $exec + %6043:vgpr_32 = V_MOV_B32_e32 %5043, implicit $exec + %6044:vgpr_32 = V_MOV_B32_e32 %5044, implicit $exec + %6045:vgpr_32 = V_MOV_B32_e32 %5045, implicit $exec + %6046:vgpr_32 = V_MOV_B32_e32 %5046, implicit $exec + %6047:vgpr_32 = V_MOV_B32_e32 %5047, implicit $exec + %6048:vgpr_32 = V_MOV_B32_e32 %5048, implicit $exec + %6049:vgpr_32 = V_MOV_B32_e32 %5049, implicit $exec + %6050:vgpr_32 = V_MOV_B32_e32 %5050, implicit $exec + %6051:vgpr_32 = V_MOV_B32_e32 %5051, implicit $exec + %6052:vgpr_32 = V_MOV_B32_e32 %5052, implicit $exec + %6053:vgpr_32 = V_MOV_B32_e32 %5053, implicit $exec + %6054:vgpr_32 = V_MOV_B32_e32 %5054, implicit $exec + %6055:vgpr_32 = V_MOV_B32_e32 %5055, implicit $exec + %6056:vgpr_32 = V_MOV_B32_e32 %5056, implicit $exec + %6057:vgpr_32 = V_MOV_B32_e32 %5057, implicit $exec + %6058:vgpr_32 = V_MOV_B32_e32 %5058, implicit $exec + %6059:vgpr_32 = V_MOV_B32_e32 %5059, implicit $exec + %6060:vgpr_32 = V_MOV_B32_e32 %5060, implicit $exec + %6061:vgpr_32 = V_MOV_B32_e32 %5061, implicit $exec + %6062:vgpr_32 = V_MOV_B32_e32 %5062, implicit $exec + %6063:vgpr_32 = V_MOV_B32_e32 %5063, implicit $exec + %6064:vgpr_32 = V_MOV_B32_e32 %5064, implicit $exec + %6065:vgpr_32 = V_MOV_B32_e32 %5065, implicit $exec + %6066:vgpr_32 = V_MOV_B32_e32 %5066, implicit $exec + %6067:vgpr_32 = V_MOV_B32_e32 %5067, implicit $exec + %6068:vgpr_32 = V_MOV_B32_e32 %5068, implicit $exec + %6069:vgpr_32 = V_MOV_B32_e32 %5069, implicit $exec + %6070:vgpr_32 = V_MOV_B32_e32 %5070, implicit $exec + %6071:vgpr_32 = V_MOV_B32_e32 %5071, implicit $exec + %6072:vgpr_32 = V_MOV_B32_e32 %5072, implicit $exec + %6073:vgpr_32 = V_MOV_B32_e32 %5073, implicit $exec + %6074:vgpr_32 = V_MOV_B32_e32 %5074, implicit $exec + %6075:vgpr_32 = V_MOV_B32_e32 %5075, implicit $exec + %6076:vgpr_32 = V_MOV_B32_e32 %5076, implicit $exec + %6077:vgpr_32 = V_MOV_B32_e32 %5077, implicit $exec + %6078:vgpr_32 = V_MOV_B32_e32 %5078, implicit $exec + %6079:vgpr_32 = V_MOV_B32_e32 %5079, implicit $exec + %6080:vgpr_32 = V_MOV_B32_e32 %5080, implicit $exec + %6081:vgpr_32 = V_MOV_B32_e32 %5081, implicit $exec + %6082:vgpr_32 = V_MOV_B32_e32 %5082, implicit $exec + %6083:vgpr_32 = V_MOV_B32_e32 %5083, implicit $exec + %6084:vgpr_32 = V_MOV_B32_e32 %5084, implicit $exec + %6085:vgpr_32 = V_MOV_B32_e32 %5085, implicit $exec + %6086:vgpr_32 = V_MOV_B32_e32 %5086, implicit $exec + %6087:vgpr_32 = V_MOV_B32_e32 %5087, implicit $exec + %6088:vgpr_32 = V_MOV_B32_e32 %5088, implicit $exec + %6089:vgpr_32 = V_MOV_B32_e32 %5089, implicit $exec + %6090:vgpr_32 = V_MOV_B32_e32 %5090, implicit $exec + %6091:vgpr_32 = V_MOV_B32_e32 %5091, implicit $exec + %6092:vgpr_32 = V_MOV_B32_e32 %5092, implicit $exec + %6093:vgpr_32 = V_MOV_B32_e32 %5093, implicit $exec + %6094:vgpr_32 = V_MOV_B32_e32 %5094, implicit $exec + %6095:vgpr_32 = V_MOV_B32_e32 %5095, implicit $exec + %6096:vgpr_32 = V_MOV_B32_e32 %5096, implicit $exec + %6097:vgpr_32 = V_MOV_B32_e32 %5097, implicit $exec + %6098:vgpr_32 = V_MOV_B32_e32 %5098, implicit $exec + %6099:vgpr_32 = V_MOV_B32_e32 %5099, implicit $exec + EXP 0, %6000, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6001, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6002, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6003, %3, 
%3, %3, -1, -1, 15, implicit $exec + EXP 0, %6004, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6005, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6006, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6007, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6008, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6009, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6010, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6011, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6012, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6013, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6014, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6015, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6016, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6017, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6018, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6019, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6020, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6021, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6022, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6023, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6024, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6025, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6026, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6027, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6028, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6029, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6030, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6031, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6032, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6033, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6034, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6035, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6036, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6037, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6038, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6039, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6040, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6041, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6042, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6043, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6044, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6045, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6046, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6047, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6048, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6049, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6050, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6051, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6052, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6053, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6054, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6055, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6056, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6057, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6058, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6059, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6060, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6061, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6062, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6063, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6064, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6065, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6066, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6067, %3, %3, %3, -1, -1, 15, implicit $exec 
+ EXP 0, %6068, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6069, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6070, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6071, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6072, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6073, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6074, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6075, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6076, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6077, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6078, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6079, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6080, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6081, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6082, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6083, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6084, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6085, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6086, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6087, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6088, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6089, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6090, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6091, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6092, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6093, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6094, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6095, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6096, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6097, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6098, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, %6099, %3, %3, %3, -1, -1, 15, implicit $exec + + + S_ENDPGM 0 +... 
+ \ No newline at end of file From 29eca4aa9b360ec1e98ece2539cbdeba2f7c24dd Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Wed, 12 Mar 2025 10:03:01 -0700 Subject: [PATCH 10/25] Removed old forks of things --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 4 +- .../AMDGPU/AMDGPUMirDivergenceAnalysis.cpp | 2774 ----------------- .../AMDGPU/AMDGPUMirDivergenceAnalysis.h | 285 -- .../AMDGPUMirSyncDependenceAnalysis.cpp | 519 --- .../AMDGPU/AMDGPUMirSyncDependenceAnalysis.h | 101 - llvm/lib/Target/AMDGPU/CMakeLists.txt | 2 - 6 files changed, 1 insertion(+), 3684 deletions(-) delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index 2cd28513f10f3..e508ed2a6e2cd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -14,7 +14,6 @@ #include "AMDGPU.h" #include "AMDGPUMIRUtils.h" -#include "AMDGPUMirDivergenceAnalysis.h" #include "AMDGPUOccupancyAndLatencyHelper.h" #include "AMDGPUSubExpDag.h" #include "AMDGPUSubtarget.h" @@ -4620,8 +4619,7 @@ bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) { MachineUniformityInfo MachineUniformity = llvm::computeMachineUniformityInfo(MF, CI, *DT, /*HasBranchDivergence*/ true); - - // llvm::MirGPUDivergenceAnalysis DA(MF, *DT, *PDT, *MLI); + TotalUniformInsts.clear(); for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { if (MachineUniformity.isUniform(&MI)) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp deleted file mode 100644 index 21aa5db0c6f27..0000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp +++ /dev/null @@ -1,2774 +0,0 @@ -//===- MirDivergenceAnalysis.cpp -- Mir Divergence Analysis Implementation -==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file is based on Analysis/DivergenceAnalysis.cpp, -// The most important difference is -// introduction of the idea of "Bit-Divergence". -// -// The way booleans are represented in in AMD GPU is a 64-bit uint in a pair of -// scalar registers, where each bit represents a boolean value for one lane. If -// all active lanes have the same bool value (all 1's or all 0's), then we can -// generate a scalar branch, otherwise we must use exec mask to selectively -// execute lanes based on the boolean mask. When all values in a boolean mask -// are the same for all active lanes, we call that mask "bit-uniform", -// otherwise we call it "bit-divergent". This differs from the normal concept -// of "uniform" and "divergent", which represents whether the value may be -// different across the 64 lanes. 
A "bit-divergent" value is still "uniform" in -// the sense that it is the same 64-bit value from the perspective of all the -// lanes, but when used as branch condition, will cause the branch to be -// divergent, which will cause the uses of any values outside of the control -// flow region to be divergent. -// -// The original DA marks everything including bools as divergent or uniform -// based on the propagation of divergent sources. However, booleans in AMDGPU -// are in fact never "divergent". Comparison operations that receive divergent -// operands instead produce "bit-divergent" or "bit-uniform" 64-bit booleans. -// Between the definition of any boolean mask and its use (particularly in -// branches, cndmasks, or anything that specifially consumes booleans), there -// can be any arbitrary number and types of operations performed on it, -// including combining it with other boolean masks via bit operations. -// -// The XDA algorithm is a modified version of the original DA algorithm to -// simultaneously propagate regular divergence and bit-divergence. -// -// First off, XDA identifies all sources of divergence as well as -// bit-divergence and adds them to the worklist. Then, just like with LLVM DA, -// it pops values off of the worklist to propagate (bit-)divergence to all its -// users, unless the user is always (bit-)uniform when given (bit-)divergent -// operand. It's possible for a value to be marked as both divergent and -// bit-divergent, in which case the regular divergence will trump -// bit-divergence. -// -// The important difference in this propagation step is that there are special -// instructions that when given bit-divergent operands, produce divergent -// values and vice versa. -// -// An example is comparison: -// -// v0 = interp ... ; divergent -// v1 = interp ... ; divergent -// s[0:1] = v_cmp v0, v1 ; bit-divergent -// -// v0 and v1 are both divergent, but when propagating them, the v_cmp (and its -// result) is bit-divergent value instead of divergent. -// -// -// An example of the reverse: -// -// v0 = ... ; uniform -// s[0:1] = v_cmp v0, v1 ; bit-divergent -// ... -// branch s[0:1], label ; divergent! -// ... -// v1 = ... ; uniform -// ... -// -// label: -// v3 = phi v0, v1 ; divergent! because of divergent -// branch. -// -// The boolean value is bit-divergent. When passed to the branch as an operand, -// the branch becomes divergent, whose sync dependency will be computed as -// normal to mark the appropriate values divergent (see description in normal -// DA on how this works). -// -// Another difference is in MIR, some branch will be changed into exec update, -// so only propagate control flow divergent on branch inst will not cover exec -// control flow. -// For case like -// %163:sreg_64_xexec = S_MOV_B64 $exec -// bb.1: -//; predecessors: %bb.1, %bb.0 -// successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), -// %bb.2(50.00%) %162:vreg_512 = PHI %41:vreg_512, %bb.0, %40:vreg_512, %bb.1 -// %167:sgpr_32 = V_READFIRSTLANE_B32 %17:vgpr_32, implicit $exec -// %168:sreg_64 = V_CMP_EQ_U32_e64 %167:sgpr_32, %17:vgpr_32, implicit $exec -// %166:sreg_64 = S_AND_SAVEEXEC_B64 %168:sreg_64, implicit-def $exec, -// implicit-def $scc, implicit $exec -//... -// $exec = S_XOR_B64_term $exec, %166:sreg_64, implicit-def $scc -// S_CBRANCH_EXECNZ %bb.1, implicit $exec -// The ... code after SAVEEXEC will be divergent if %168 is divergent. -// The PHI should be divergent when %40 is inside the ... 
-// To propagate divergent from %168 to the PHI, need to start the propagate from -// SAVEEXEC which is the control flow by update exec. -// -// -// Original: -// This file implements a general divergence analysis for loop vectorization -// and GPU programs. It determines which branches and values in a loop or GPU -// program are divergent. It can help branch optimizations such as jump -// threading and loop unswitching to make better decisions. -// -// GPU programs typically use the SIMD execution model, where multiple threads -// in the same execution group have to execute in lock-step. Therefore, if the -// code contains divergent branches (i.e., threads in a group do not agree on -// which path of the branch to take), the group of threads has to execute all -// the paths from that branch with different subsets of threads enabled until -// they re-converge. -// -// Due to this execution model, some optimizations such as jump -// threading and loop unswitching can interfere with thread re-convergence. -// Therefore, an analysis that computes which branches in a GPU program are -// divergent can help the compiler to selectively run these optimizations. -// -// This implementation is derived from the Vectorization Analysis of the -// Region Vectorizer (RV). That implementation in turn is based on the approach -// described in -// -// Improving Performance of OpenCL on CPUs -// Ralf Karrenberg and Sebastian Hack -// CC '12 -// -// This DivergenceAnalysis implementation is generic in the sense that it does -// not itself identify original sources of divergence. -// Instead specialized adapter classes, (LoopDivergenceAnalysis) for loops and -// (GPUDivergenceAnalysis) for GPU programs, identify the sources of divergence -// (e.g., special variables that hold the thread ID or the iteration variable). -// -// The generic implementation propagates divergence to variables that are data -// or sync dependent on a source of divergence. -// -// While data dependency is a well-known concept, the notion of sync dependency -// is worth more explanation. Sync dependence characterizes the control flow -// aspect of the propagation of branch divergence. For example, -// -// %cond = icmp slt i32 %tid, 10 -// br i1 %cond, label %then, label %else -// then: -// br label %merge -// else: -// br label %merge -// merge: -// %a = phi i32 [ 0, %then ], [ 1, %else ] -// -// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid -// because %tid is not on its use-def chains, %a is sync dependent on %tid -// because the branch "br i1 %cond" depends on %tid and affects which value %a -// is assigned to. -// -// The sync dependence detection (which branch induces divergence in which join -// points) is implemented in the SyncDependenceAnalysis. -// -// The current DivergenceAnalysis implementation has the following limitations: -// 1. intra-procedural. It conservatively considers the arguments of a -// non-kernel-entry function and the return value of a function call as -// divergent. -// 2. memory as black box. It conservatively considers values loaded from -// generic or local address as divergent. This can be improved by leveraging -// pointer analysis and/or by modelling non-escaping memory objects in SSA -// as done in RV. 
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPUMirDivergenceAnalysis.h" -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "GCNSubtarget.h" -#include "SIInstrInfo.h" -#include "TargetInfo/AMDGPUTargetInfo.h" -#include "Utils/AMDGPUAsmUtils.h" -#include "Utils/AMDGPUBaseInfo.h" -// #include "llvm/Analysis/Passes.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/MachinePostDominators.h" -#include "llvm/Support/Debug.h" -// #include "newbe/cli/newbe_opts.h" // AMDGPU change. -#include "llvm/Support/raw_ostream.h" -#include - -using namespace llvm; - -#define DEBUG_TYPE "mir-divergence-analysis" - -namespace llvm { -bool isAMDGPUOpcodeDivergent(class MachineInstr *MI); -} - -// -// TODO: TableGen these -// -bool llvm::isAMDGPUOpcodeDivergent(class MachineInstr *MI) { - switch (MI->getOpcode()) { - // case R600::INTERP_LOAD_P0: - // case R600::INTERP_PAIR_XY: - // case R600::INTERP_PAIR_ZW: - // case R600::INTERP_VEC_LOAD: - // case R600::INTERP_XY: - // case R600::INTERP_ZW: - case AMDGPU::V_WRITELANE_B32: - - case AMDGPU::V_INTERP_MOV_F32: - case AMDGPU::V_INTERP_MOV_F32_e64: - case AMDGPU::V_INTERP_MOV_F32_e64_vi: - case AMDGPU::V_INTERP_MOV_F32_si: - case AMDGPU::V_INTERP_MOV_F32_vi: - case AMDGPU::V_INTERP_P1LL_F16: - case AMDGPU::V_INTERP_P1LL_F16_vi: - case AMDGPU::V_INTERP_P1LV_F16: - case AMDGPU::V_INTERP_P1LV_F16_vi: - case AMDGPU::V_INTERP_P1_F32: - case AMDGPU::V_INTERP_P1_F32_16bank: - case AMDGPU::V_INTERP_P1_F32_16bank_si: - case AMDGPU::V_INTERP_P1_F32_16bank_vi: - case AMDGPU::V_INTERP_P1_F32_e64: - case AMDGPU::V_INTERP_P1_F32_e64_vi: - case AMDGPU::V_INTERP_P1_F32_si: - case AMDGPU::V_INTERP_P1_F32_vi: - case AMDGPU::V_INTERP_P2_F16: - case AMDGPU::V_INTERP_P2_F16_vi: - case AMDGPU::V_INTERP_P2_F32: - case AMDGPU::V_INTERP_P2_F32_e64: - case AMDGPU::V_INTERP_P2_F32_e64_vi: - case AMDGPU::V_INTERP_P2_F32_si: - case AMDGPU::V_INTERP_P2_F32_vi: - - case AMDGPU::V_MBCNT_HI_U32_B32_e32: - case AMDGPU::V_MBCNT_HI_U32_B32_e32_gfx6_gfx7: - case AMDGPU::V_MBCNT_HI_U32_B32_e64: - case AMDGPU::V_MBCNT_HI_U32_B32_e64_gfx10: - case AMDGPU::V_MBCNT_HI_U32_B32_e64_gfx6_gfx7: - case AMDGPU::V_MBCNT_HI_U32_B32_e64_vi: - case AMDGPU::V_MBCNT_HI_U32_B32_sdwa: - case AMDGPU::V_MBCNT_LO_U32_B32_e32: - case AMDGPU::V_MBCNT_LO_U32_B32_e32_gfx6_gfx7: - case AMDGPU::V_MBCNT_LO_U32_B32_e64: - case AMDGPU::V_MBCNT_LO_U32_B32_e64_gfx10: - case AMDGPU::V_MBCNT_LO_U32_B32_e64_gfx6_gfx7: - case AMDGPU::V_MBCNT_LO_U32_B32_e64_vi: - case AMDGPU::V_MBCNT_LO_U32_B32_sdwa: - - case AMDGPU::BUFFER_ATOMIC_ADD_ADDR64: - case AMDGPU::BUFFER_ATOMIC_ADD_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_ADD_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN: - case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_gfx6_gfx7: - case 
AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_ADDR64: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_AND_ADDR64: - case AMDGPU::BUFFER_ATOMIC_AND_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_AND_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_AND_IDXEN: - case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_vi: - 
case AMDGPU::BUFFER_ATOMIC_AND_OFFEN: - case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_AND_OFFSET: - case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_AND_X2_ADDR64: - case AMDGPU::BUFFER_ATOMIC_AND_X2_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_AND_X2_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_X2_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN: - case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_gfx6_gfx7: - case 
AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_ADDR64: - case AMDGPU::BUFFER_ATOMIC_DEC_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_DEC_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN: - case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_RTN_gfx10: - case 
AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_ADDR64: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_INC_ADDR64: - case AMDGPU::BUFFER_ATOMIC_INC_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_INC_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_INC_IDXEN: - case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_RTN_gfx10: - case 
AMDGPU::BUFFER_ATOMIC_INC_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_INC_OFFEN: - case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_INC_OFFSET: - case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_INC_X2_ADDR64: - case AMDGPU::BUFFER_ATOMIC_INC_X2_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_INC_X2_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_X2_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN: - case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_OR_ADDR64: - case AMDGPU::BUFFER_ATOMIC_OR_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_OR_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_OR_IDXEN: - case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_RTN_gfx6_gfx7: - case 
AMDGPU::BUFFER_ATOMIC_OR_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_OR_OFFEN: - case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_OR_OFFSET: - case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_OR_X2_ADDR64: - case AMDGPU::BUFFER_ATOMIC_OR_X2_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_OR_X2_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_X2_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN: - case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_ADDR64: - case AMDGPU::BUFFER_ATOMIC_SMAX_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_SMAX_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN: - case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_RTN_vi: - case 
AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_ADDR64: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_ADDR64: - case AMDGPU::BUFFER_ATOMIC_SMIN_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_SMIN_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN: - case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_RTN_gfx6_gfx7: - case 
AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_ADDR64: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_ADDR64: - case AMDGPU::BUFFER_ATOMIC_SUB_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_SUB_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN: - case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_RTN_gfx10: - case 
AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_ADDR64: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_ADDR64: - case AMDGPU::BUFFER_ATOMIC_SWAP_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_SWAP_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN: - case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_RTN_gfx10: - case 
AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_ADDR64: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_ADDR64: - case AMDGPU::BUFFER_ATOMIC_UMAX_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_UMAX_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN: - case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_RTN: - case 
AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_ADDR64: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_ADDR64: - case AMDGPU::BUFFER_ATOMIC_UMIN_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_UMIN_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN: - case 
AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_ADDR64: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_ADDR64: - case AMDGPU::BUFFER_ATOMIC_XOR_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_XOR_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_vi: - case 
AMDGPU::BUFFER_ATOMIC_XOR_IDXEN: - case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_ADDR64: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_ADDR64_RTN: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_ADDR64_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_ADDR64_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_RTN: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_RTN: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_RTN: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_RTN: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_RTN_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_RTN_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_RTN_vi: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_gfx10: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_gfx6_gfx7: - case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_vi: - - case AMDGPU::IMAGE_ATOMIC_ADD_V1_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_ADD_V1_V1_si: - case AMDGPU::IMAGE_ATOMIC_ADD_V1_V1_vi: - case AMDGPU::IMAGE_ATOMIC_ADD_V2_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_ADD_V2_V1_si: - case AMDGPU::IMAGE_ATOMIC_ADD_V2_V1_vi: - case AMDGPU::IMAGE_ATOMIC_ADD_V1_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_ADD_V1_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_ADD_V1_V2_si: - case AMDGPU::IMAGE_ATOMIC_ADD_V1_V2_vi: - case AMDGPU::IMAGE_ATOMIC_ADD_V2_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_ADD_V2_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_ADD_V2_V2_si: - case AMDGPU::IMAGE_ATOMIC_ADD_V2_V2_vi: - 
case AMDGPU::IMAGE_ATOMIC_ADD_V1_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_ADD_V1_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_ADD_V1_V4_si: - case AMDGPU::IMAGE_ATOMIC_ADD_V1_V4_vi: - case AMDGPU::IMAGE_ATOMIC_ADD_V2_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_ADD_V2_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_ADD_V2_V4_si: - case AMDGPU::IMAGE_ATOMIC_ADD_V2_V4_vi: - case AMDGPU::IMAGE_ATOMIC_AND_V1_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_AND_V1_V1_si: - case AMDGPU::IMAGE_ATOMIC_AND_V1_V1_vi: - case AMDGPU::IMAGE_ATOMIC_AND_V2_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_AND_V2_V1_si: - case AMDGPU::IMAGE_ATOMIC_AND_V2_V1_vi: - case AMDGPU::IMAGE_ATOMIC_AND_V1_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_AND_V1_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_AND_V1_V2_si: - case AMDGPU::IMAGE_ATOMIC_AND_V1_V2_vi: - case AMDGPU::IMAGE_ATOMIC_AND_V2_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_AND_V2_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_AND_V2_V2_si: - case AMDGPU::IMAGE_ATOMIC_AND_V2_V2_vi: - case AMDGPU::IMAGE_ATOMIC_AND_V1_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_AND_V1_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_AND_V1_V4_si: - case AMDGPU::IMAGE_ATOMIC_AND_V1_V4_vi: - case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_si: - case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_vi: - // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_gfx10: - // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_si: - // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_vi: - case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_si: - case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_vi: - // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_gfx10: - // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_nsa_gfx10: - // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_si: - // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_vi: - case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_si: - case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_vi: - // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_gfx10: - // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_nsa_gfx10: - // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_si: - // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_vi: - case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_si: - case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_vi: - case AMDGPU::IMAGE_ATOMIC_DEC_V1_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_DEC_V1_V1_si: - case AMDGPU::IMAGE_ATOMIC_DEC_V1_V1_vi: - case AMDGPU::IMAGE_ATOMIC_DEC_V2_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_DEC_V2_V1_si: - case AMDGPU::IMAGE_ATOMIC_DEC_V2_V1_vi: - case AMDGPU::IMAGE_ATOMIC_DEC_V1_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_DEC_V1_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_DEC_V1_V2_si: - case AMDGPU::IMAGE_ATOMIC_DEC_V1_V2_vi: - case AMDGPU::IMAGE_ATOMIC_DEC_V2_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_DEC_V2_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_DEC_V2_V2_si: - case AMDGPU::IMAGE_ATOMIC_DEC_V2_V2_vi: - case AMDGPU::IMAGE_ATOMIC_DEC_V1_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_DEC_V1_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_DEC_V1_V4_si: - case AMDGPU::IMAGE_ATOMIC_DEC_V1_V4_vi: - case AMDGPU::IMAGE_ATOMIC_DEC_V2_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_DEC_V2_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_DEC_V2_V4_si: - case AMDGPU::IMAGE_ATOMIC_DEC_V2_V4_vi: - case AMDGPU::IMAGE_ATOMIC_INC_V1_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_INC_V1_V1_si: - case AMDGPU::IMAGE_ATOMIC_INC_V1_V1_vi: - case 
AMDGPU::IMAGE_ATOMIC_INC_V2_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_INC_V2_V1_si: - case AMDGPU::IMAGE_ATOMIC_INC_V2_V1_vi: - case AMDGPU::IMAGE_ATOMIC_INC_V1_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_INC_V1_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_INC_V1_V2_si: - case AMDGPU::IMAGE_ATOMIC_INC_V1_V2_vi: - case AMDGPU::IMAGE_ATOMIC_INC_V2_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_INC_V2_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_INC_V2_V2_si: - case AMDGPU::IMAGE_ATOMIC_INC_V2_V2_vi: - case AMDGPU::IMAGE_ATOMIC_INC_V1_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_INC_V1_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_INC_V1_V4_si: - case AMDGPU::IMAGE_ATOMIC_INC_V1_V4_vi: - case AMDGPU::IMAGE_ATOMIC_INC_V2_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_INC_V2_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_INC_V2_V4_si: - case AMDGPU::IMAGE_ATOMIC_INC_V2_V4_vi: - case AMDGPU::IMAGE_ATOMIC_OR_V1_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_OR_V1_V1_si: - case AMDGPU::IMAGE_ATOMIC_OR_V1_V1_vi: - case AMDGPU::IMAGE_ATOMIC_OR_V2_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_OR_V2_V1_si: - case AMDGPU::IMAGE_ATOMIC_OR_V2_V1_vi: - case AMDGPU::IMAGE_ATOMIC_OR_V1_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_OR_V1_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_OR_V1_V2_si: - case AMDGPU::IMAGE_ATOMIC_OR_V1_V2_vi: - case AMDGPU::IMAGE_ATOMIC_OR_V2_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_OR_V2_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_OR_V2_V2_si: - case AMDGPU::IMAGE_ATOMIC_OR_V2_V2_vi: - case AMDGPU::IMAGE_ATOMIC_OR_V1_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_OR_V1_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_OR_V1_V4_si: - case AMDGPU::IMAGE_ATOMIC_OR_V1_V4_vi: - case AMDGPU::IMAGE_ATOMIC_OR_V2_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_OR_V2_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_OR_V2_V4_si: - case AMDGPU::IMAGE_ATOMIC_OR_V2_V4_vi: - case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V1_si: - case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V1_vi: - case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V1_si: - case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V1_vi: - case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V2_si: - case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V2_vi: - case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V2_si: - case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V2_vi: - case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V4_si: - case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V4_vi: - case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V4_si: - case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V4_vi: - case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V1_si: - case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V1_vi: - case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V1_si: - case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V1_vi: - case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V2_si: - case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V2_vi: - case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V2_si: - case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V2_vi: - case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V4_si: - case 
AMDGPU::IMAGE_ATOMIC_SMIN_V1_V4_vi: - case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V4_si: - case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V4_vi: - case AMDGPU::IMAGE_ATOMIC_SUB_V1_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_SUB_V1_V1_si: - case AMDGPU::IMAGE_ATOMIC_SUB_V1_V1_vi: - case AMDGPU::IMAGE_ATOMIC_SUB_V2_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_SUB_V2_V1_si: - case AMDGPU::IMAGE_ATOMIC_SUB_V2_V1_vi: - case AMDGPU::IMAGE_ATOMIC_SUB_V1_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_SUB_V1_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SUB_V1_V2_si: - case AMDGPU::IMAGE_ATOMIC_SUB_V1_V2_vi: - case AMDGPU::IMAGE_ATOMIC_SUB_V2_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_SUB_V2_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SUB_V2_V2_si: - case AMDGPU::IMAGE_ATOMIC_SUB_V2_V2_vi: - case AMDGPU::IMAGE_ATOMIC_SUB_V1_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_SUB_V1_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SUB_V1_V4_si: - case AMDGPU::IMAGE_ATOMIC_SUB_V1_V4_vi: - case AMDGPU::IMAGE_ATOMIC_SUB_V2_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_SUB_V2_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SUB_V2_V4_si: - case AMDGPU::IMAGE_ATOMIC_SUB_V2_V4_vi: - case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V1_si: - case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V1_vi: - case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V1_si: - case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V1_vi: - case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V2_si: - case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V2_vi: - case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V2_si: - case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V2_vi: - case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V4_si: - case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V4_vi: - case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V4_si: - case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V4_vi: - case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V1_si: - case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V1_vi: - case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V1_si: - case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V1_vi: - case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V2_si: - case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V2_vi: - case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V2_si: - case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V2_vi: - case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V4_si: - case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V4_vi: - case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V4_si: - case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V4_vi: - case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V1_si: - case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V1_vi: - case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V1_si: - case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V1_vi: - case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V2_nsa_gfx10: - case 
AMDGPU::IMAGE_ATOMIC_UMIN_V1_V2_si: - case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V2_vi: - case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V2_si: - case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V2_vi: - case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V4_si: - case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V4_vi: - case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V4_si: - case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V4_vi: - case AMDGPU::IMAGE_ATOMIC_XOR_V1_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_XOR_V1_V1_si: - case AMDGPU::IMAGE_ATOMIC_XOR_V1_V1_vi: - case AMDGPU::IMAGE_ATOMIC_XOR_V2_V1_gfx10: - case AMDGPU::IMAGE_ATOMIC_XOR_V2_V1_si: - case AMDGPU::IMAGE_ATOMIC_XOR_V2_V1_vi: - case AMDGPU::IMAGE_ATOMIC_XOR_V1_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_XOR_V1_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_XOR_V1_V2_si: - case AMDGPU::IMAGE_ATOMIC_XOR_V1_V2_vi: - case AMDGPU::IMAGE_ATOMIC_XOR_V2_V2_gfx10: - case AMDGPU::IMAGE_ATOMIC_XOR_V2_V2_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_XOR_V2_V2_si: - case AMDGPU::IMAGE_ATOMIC_XOR_V2_V2_vi: - case AMDGPU::IMAGE_ATOMIC_XOR_V1_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_XOR_V1_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_XOR_V1_V4_si: - case AMDGPU::IMAGE_ATOMIC_XOR_V1_V4_vi: - case AMDGPU::IMAGE_ATOMIC_XOR_V2_V4_gfx10: - case AMDGPU::IMAGE_ATOMIC_XOR_V2_V4_nsa_gfx10: - case AMDGPU::IMAGE_ATOMIC_XOR_V2_V4_si: - case AMDGPU::IMAGE_ATOMIC_XOR_V2_V4_vi: - - case AMDGPU::SI_PS_LIVE: - - case AMDGPU::DS_SWIZZLE_B32: - case AMDGPU::DS_SWIZZLE_B32_gfx10: - case AMDGPU::DS_SWIZZLE_B32_gfx6_gfx7: - case AMDGPU::DS_SWIZZLE_B32_vi: - - return true; - - default: - break; - } - return false; -} - -namespace { -bool hasImmOperandWithVal(const MachineInstr *MI, uint16_t srcNameIdx, - uint16_t srcModNameIdx, uint64_t Val) { - unsigned Op = MI->getOpcode(); - unsigned srcIdx = AMDGPU::getNamedOperandIdx(Op, srcNameIdx); - if (srcIdx == -1) - return false; - const MachineOperand &srcMO = MI->getOperand(srcIdx); - if (srcMO.isImm() && srcMO.getImm() == Val) { - - unsigned modIdx = AMDGPU::getNamedOperandIdx(Op, srcModNameIdx); - if (modIdx == -1) - return true; - - const MachineOperand &modMO = MI->getOperand(modIdx); - if (modMO.getImm() == 0) - return true; - } - return false; -} - -bool isConstant(const MachineInstr *MI) { - unsigned Op = MI->getOpcode(); - switch (Op) { - default: - break; - case AMDGPU::V_OR_B32_e32: - case AMDGPU::V_OR_B32_e64: { - // Check special case or -1, which will get result -1. - const uint64_t kImm = -1; - if (hasImmOperandWithVal(MI, AMDGPU::OpName::src0, - AMDGPU::OpName::src0_modifiers, kImm)) - return true; - if (hasImmOperandWithVal(MI, AMDGPU::OpName::src1, - AMDGPU::OpName::src1_modifiers, kImm)) - return true; - } break; - case AMDGPU::S_OR_B32: - case AMDGPU::S_OR_B64: { - // Check special case or -1, which will get result -1. - const uint64_t kImm = -1; - if (hasImmOperandWithVal(MI, AMDGPU::OpName::src0, - AMDGPU::OpName::src0_modifiers, kImm)) - return true; - if (hasImmOperandWithVal(MI, AMDGPU::OpName::src1, - AMDGPU::OpName::src1_modifiers, kImm)) - return true; - } break; - case AMDGPU::S_AND_B32: - case AMDGPU::S_AND_B64: - case AMDGPU::V_AND_B32_e32: - case AMDGPU::V_AND_B32_e64: { - // Check special case and 0, which will get result 0. 
-    const uint64_t kImm = 0;
-    if (hasImmOperandWithVal(MI, AMDGPU::OpName::src0,
-                             AMDGPU::OpName::src0_modifiers, kImm))
-      return true;
-    if (hasImmOperandWithVal(MI, AMDGPU::OpName::src1,
-                             AMDGPU::OpName::src1_modifiers, kImm))
-      return true;
-  } break;
-  }
-  return false;
-}
-
-bool writeBoolDst(const MachineInstr *MI, const SIRegisterInfo *SIRI,
-                  const MachineRegisterInfo &MRI) {
-  const auto *BoolRC = SIRI->getBoolRC();
-  for (const MachineOperand &MO : MI->operands()) {
-    if (!MO.isReg())
-      continue;
-    if (MO.isUse())
-      continue;
-    unsigned Reg = MO.getReg();
-    if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO || Reg == AMDGPU::VCC ||
-        Reg == AMDGPU::VCC_LO)
-      return true;
-
-    // Check if the written register class overlaps the bool register class.
-    //
-    // Note that this check is insufficient to catch all of the cases where
-    // a "bool" value could be created (for example writing to a register
-    // pair s[0:1], then using s0 as a bool value in wave32).
-    //
-    // The underlying problem is that we have two notions of divergence
-    // (bit divergence and wave divergence) but the algorithm only propagates
-    // wave divergence. Bit divergence is important for bools because it
-    // determines whether a branch is uniform or not (and thus catches cases
-    // where a uniform value is used outside of a divergent control flow
-    // region). For bool values the algorithm treats normally uniform values
-    // (i.e. scalar registers) as divergent in order to try to propagate bit
-    // divergence.
-    //
-    // To fix all the possible bugs here we would need to actually propagate
-    // bit divergence as well as wave divergence. That is a bigger fix; this
-    // check should cover most cases of treating a bool value as divergent.
-    const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg);
-    if (SIRI->getCommonSubClass(BoolRC, RC))
-      return true;
-  }
-  return false;
-}
-
-bool isAlwaysUniformMI(const MachineInstr *MI, const SIInstrInfo *SIII,
-                       const SIRegisterInfo *SIRI,
-                       const MachineRegisterInfo &MRI) {
-  unsigned Op = MI->getOpcode();
-  switch (Op) {
-  default:
-    // Mark all scalar-unit instructions as always uniform unless they write a
-    // bool destination. This does not mean the result is bit uniform; the
-    // branch/exec-region checks use isBitUniform for that. A bool may live in
-    // an SGPR and still be divergent, since it packs one bit per lane into a
-    // single 32/64-bit SGPR.
-    if (SIII->isScalarUnit(*MI) && !writeBoolDst(MI, SIRI, MRI) &&
-        !MI->isTerminator())
-      return true;
-    break;
-  // case AMDGPU::AMDGPU_MAKE_UNIFORM:
-  // case AMDGPU::AMDGPU_WAVE_READ_LANE_FIRST:
-  case AMDGPU::V_READFIRSTLANE_B32:
-  case AMDGPU::V_READLANE_B32:
-    // case AMDGPU::AMDGPU_WAVE_ACTIVE_BALLOT_W32:
-    // case AMDGPU::AMDGPU_WAVE_ACTIVE_BALLOT_W64:
-    // A bool produced by readfirstlane is a single bit, which means it is
-    // bit uniform.
-    return true;
-  case AMDGPU::S_OR_B32:
-  case AMDGPU::S_OR_B64: {
-    // Check the special case of OR with -1, which always yields -1.
-    if (isConstant(MI))
-      return true;
-
-    return !writeBoolDst(MI, SIRI, MRI);
-  } break;
-  case AMDGPU::V_OR_B32_e32:
-  case AMDGPU::V_OR_B32_e64: {
-    // Check the special case of OR with -1, which always yields -1.
-    if (isConstant(MI))
-      return true;
-  } break;
-  case AMDGPU::S_AND_B32:
-  case AMDGPU::S_AND_B64: {
-    // Check the special case of AND with 0, which always yields 0.
-    if (isConstant(MI))
-      return true;
-
-    return !writeBoolDst(MI, SIRI, MRI);
-  } break;
-  case AMDGPU::V_AND_B32_e32:
-  case AMDGPU::V_AND_B32_e64: {
-    // Check the special case of AND with 0, which always yields 0.
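// Illustrative aside (editorial sketch, not code from this patch): the reason
// OR with an all-ones immediate and AND with a zero immediate are treated as
// producing a uniform result is that the absorbing element fixes the value of
// every lane regardless of how divergent the other operand is. The identities
// can be sanity-checked on the host:
//
//   static_assert((0x12345678u | ~0u) == ~0u, "x | -1 == -1 for any lane x");
//   static_assert((0x12345678u & 0u) == 0u, "x & 0 == 0 for any lane x");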
- if (isConstant(MI)) - return true; - } break; - } - return false; -} - -bool isPhysicalReg(MachineRegisterInfo &MRI, Register reg) { - return reg.isPhysical(); - ; -} - -bool isRegClass(MachineRegisterInfo &MRI, unsigned reg, unsigned regClassID) { - return MRI.getRegClass(reg)->getID() == regClassID; -} - -// For input reg of MF, vgpr will be divergent. -bool isDivergentInputReg(unsigned Reg, MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI) { - if (isPhysicalReg(MRI, Reg)) { - unsigned vir_reg = MRI.getLiveInVirtReg(Reg); - if (SIRI->isVGPR(MRI, vir_reg)) - return true; - } else { - if (SIRI->isVGPR(MRI, Reg)) - return true; - } - return false; -} - -bool isSourceOfDivergence(MachineInstr *MI, MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { - // if (MI->getAMDGPUFlag(MachineInstr::AMDGPUMIFlag::IsDivergent)) - // return true; - if (isAMDGPUOpcodeDivergent(MI)) - return true; - - if (isAlwaysUniformMI(MI, SIII, SIRI, MRI)) - return false; - - // If the instruction is neither guaranteed to - // be uniform or divergent, check whether any - // of its operands are passed in to the shader as - // args through vector regs. - // - // This makes them divergent. - for (MachineOperand &op : MI->operands()) { - if (!op.isReg()) - continue; - if (op.isDef()) - continue; - unsigned reg = op.getReg(); - if (MRI.isLiveIn(reg)) { - if (isDivergentInputReg(reg, MRI, SIRI)) - return true; - } - } - - return false; -} - -// For VCC, try to find the nearest define inside same BB. -const MachineInstr *findPhysicalDefineInSameMBB(const MachineInstr *MI, - unsigned PhyReg) { - const MachineBasicBlock *MBB = MI->getParent(); - auto it = MI->getReverseIterator(); - for (it++; it != MBB->rend(); it++) { - const MachineInstr &TmpMI = *it; - for (const MachineOperand &DefMO : TmpMI.operands()) { - if (!DefMO.isReg()) - continue; - if (DefMO.isUse()) - continue; - if (DefMO.getReg() == PhyReg) - return &TmpMI; - } - } - return nullptr; -} - -bool isWriteExec(const MachineInstr *MI) { - for (const MachineOperand &MO : MI->operands()) { - if (!MO.isReg()) - continue; - if (MO.isUse()) - continue; - unsigned Reg = MO.getReg(); - if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO) - return true; - } - return false; -} - -bool isVCndMask(unsigned Opcode) { - switch (Opcode) { - default: - return false; - case AMDGPU::V_CNDMASK_B32_e32: - case AMDGPU::V_CNDMASK_B32_e64: - case AMDGPU::V_CNDMASK_B32_dpp: - case AMDGPU::V_CNDMASK_B32_sdwa: - case AMDGPU::V_CNDMASK_B64_PSEUDO: - return true; - } -} - -bool isExecRegionOp(unsigned Op) { - switch (Op) { - default: - return false; - case AMDGPU::COPY: - case AMDGPU::S_MOV_B32: - case AMDGPU::S_MOV_B64: - return true; - } -} - -bool isRestoreExec(const MachineInstr *MI) { - unsigned Op = MI->getOpcode(); - if (!isExecRegionOp(Op)) - return false; - - return isWriteExec(MI); -} - -const MachineInstr * -findExecRegionBeginFromRegionEnd(const MachineInstr *MI, - const MachineRegisterInfo &MRI) { - const MachineOperand &MO = MI->getOperand(1); - if (!MO.isReg()) - return nullptr; - unsigned Reg = MO.getReg(); - const MachineInstr *Def = MRI.getUniqueVRegDef(Reg); - if (!Def) - return nullptr; - // Make sure the def is S_MOV Reg, Exec. 
- if (!isExecRegionOp(Def->getOpcode())) - return nullptr; - const MachineOperand &ExecMO = Def->getOperand(1); - if (!ExecMO.isReg()) - return nullptr; - unsigned ExecReg = ExecMO.getReg(); - if (ExecReg == AMDGPU::EXEC || ExecReg == AMDGPU::EXEC_LO) - return Def; - else - return nullptr; -} - -bool isInsideExecRegion(const MachineInstr &MI, const MachineInstr &RegionBegin, - const MachineInstr &RegionEnd, - const MachineDominatorTree &DT, - const MachinePostDominatorTree &PDT) { - if (!DT.dominates(&RegionBegin, &MI)) - return false; - - const MachineBasicBlock *MBB = MI.getParent(); - const MachineBasicBlock *RegionEndMBB = RegionEnd.getParent(); - if (MBB != RegionEndMBB) { - return PDT.dominates(RegionEndMBB, MBB); - } else { - // MachineLoop through the basic block until we find A or B. - MachineBasicBlock::const_iterator I = MBB->begin(); - for (; I != MI.getIterator() && I != RegionEnd.getIterator(); ++I) - /*empty*/; - - // RegionEnd post-dominates MI if MI is found first in the basic block. - return I == MI.getIterator(); - } -} - -bool isInsideExecRegion(const MachineBasicBlock &MBB, - const MachineInstr &RegionBegin, - const MachineInstr &RegionEnd, - const MachineDominatorTree &DT, - const MachinePostDominatorTree &PDT) { - const MachineBasicBlock *RegionBeginMBB = RegionBegin.getParent(); - const MachineBasicBlock *RegionEndMBB = RegionEnd.getParent(); - if (!DT.dominates(RegionBeginMBB, &MBB)) - return false; - return PDT.dominates(RegionEndMBB, &MBB); -} - -// Map from BB to nearest Exec Region. How to build? Add every MBB unless -// already has smaller region? Then when hit saveExec, propagate leaked users of -// define inside the exec region. - -} // namespace - -namespace llvm { -// class DivergenceAnalysis -DivergenceAnalysis::DivergenceAnalysis( - const MachineFunction &F, const MachineLoop *RegionLoop, - const MachineDominatorTree &DT, const MachinePostDominatorTree &PDT, - const MachineLoopInfo &LI, SyncDependenceAnalysis &SDA, bool IsLCSSAForm, - // AMDGPU change begin. - DivergentJoinMapTy &JoinMap - // AMDGPU change end. - ) - : F(F), MRI(F.getRegInfo()), RegionLoop(RegionLoop), DT(DT), PDT(PDT), - LI(LI), SDA(SDA), DivergentJoinMap(JoinMap), // AMDGPU change - IsLCSSAForm(IsLCSSAForm) { - const GCNSubtarget *ST = &F.getSubtarget(); - SIRI = ST->getRegisterInfo(); - SIII = ST->getInstrInfo(); -} - -void DivergenceAnalysis::markDivergent(const ValueTy DivVal) { - assert(!isAlwaysUniform(DivVal) && "cannot be a divergent"); - // AMDGPU change begin. - LLVM_DEBUG(const GCNSubtarget *ST = &F.getSubtarget(); - const SIRegisterInfo *SIRI = ST->getRegisterInfo(); - dbgs() << "\t MarkDivergent :"; printReg(DivVal, SIRI);); - // AMDGPU change end. - DivergentValues.insert(DivVal); -} - -// Mir change. -void DivergenceAnalysis::markDivergent(const MachineInstr &I) { - for (const MachineOperand &DstMO : I.defs()) { - unsigned Reg = DstMO.getReg(); - markDivergent(Reg); - } - DivergentInsts.insert(&I); -} - -void DivergenceAnalysis::addUniformOverride(const ValueTy UniVal) { - // TODO: support uniform multi-def. 
- if (MRI.getUniqueVRegDef(UniVal) == nullptr) - return; - - UniformOverrides.insert(UniVal); -} - -void DivergenceAnalysis::addUniformOverride(const MachineInstr &I) { - for (const MachineOperand &DstMO : I.defs()) { - unsigned Reg = DstMO.getReg(); - addUniformOverride(Reg); - } - UniformOverridesInsts.insert(&I); -} - -bool DivergenceAnalysis::isBitUniform( - const MachineInstr &I, const llvm::MachineOperand &UseMO, - llvm::DenseMap &Processed) const { - if (UseMO.isImm()) { - uint64_t val = UseMO.getImm(); - // 0 and -1 are OK since all lanes are still the same. - if (val == 0 || val == -1) - return true; - else - return false; - } - if (!UseMO.isReg()) - return true; - unsigned Reg = UseMO.getReg(); - // Exec is always bituniform, because all active lanes are 1. - if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO || - // SCC only has 1 bit. Always bituniform. - Reg == AMDGPU::SCC) - return true; - - const MachineInstr *UseMI = nullptr; - if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO) { - // Try to find define of this VCC. - UseMI = findPhysicalDefineInSameMBB(&I, Reg); - } else { - UseMI = MRI.getUniqueVRegDef(Reg); - } - if (!UseMI) { - return false; - } - - bool bResult = isBitUniform(*UseMI, Processed); - Processed[UseMI] = bResult; - return bResult; -} - -bool DivergenceAnalysis::isBitUniform( - const MachineInstr &I, - llvm::DenseMap &Processed) const { - auto it = Processed.find(&I); - if (it != Processed.end()) - return it->second; - // For branch on MIR, need to make sure all activi lanes are the same. - // cmp of uniform value will make sure all active lanes are the same. - // Imm is also the same for all active lanes. - if (isDivergent(I)) - return false; - // Uniform cmp is bit uniform. - if (I.isCompare()) - return true; - if (isConstant(&I)) - return true; - - // Conservatively consider bituniform to be false. - Processed[&I] = false; - - // If all operand is bit uniform, then result is bit uniform. - bool bAllOperandBitUniform = true; - for (const MachineOperand &UseMO : I.uses()) { - if (isBitUniform(I, UseMO, Processed)) - continue; - bAllOperandBitUniform = false; - break; - } - return bAllOperandBitUniform; -} - -bool DivergenceAnalysis::updateTerminator(const MachineInstr &Term) const { - if (Term.getParent()->succ_size() <= 1) - return false; - switch (Term.getOpcode()) { - default: { - if (updateNormalInstruction(Term)) - return true; - llvm::DenseMap Processed; - // Check bit uniform here if not divergent. - return !isBitUniform(Term, Processed); - } - // case AMDGPU::AMDGPU_CALL_INDIRECT: - case AMDGPU::SI_CALL: - return true; - } -} - -bool DivergenceAnalysis::updateNormalInstruction(const MachineInstr &I) const { - // TODO function calls with side effects, etc - if (UniformOverridesInsts.find(&I) != UniformOverridesInsts.end()) - return false; - if (DivergentInsts.find(&I) != DivergentInsts.end()) - return true; - for (const auto &Op : I.uses()) { - if (!Op.isReg()) - continue; - Register Reg = Op.getReg(); - if (Reg.isPhysical()) { - if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO || Reg == AMDGPU::SCC) - continue; - else if (const MachineInstr *DefMI = - findPhysicalDefineInSameMBB(Op.getParent(), Reg)) { - if (isDivergent(*DefMI)) - return true; - } else { - // If cannot find def in same MBB, just treat it as divergent. 
- return true; - } - } else { - if (isDivergent(Op.getReg())) - return true; - } - } - return false; -} - -bool DivergenceAnalysis::isTemporalDivergent( - const MachineBasicBlock &ObservingBlock, const ValueTy Val, - const MachineBasicBlock &IncomingBlock) const { // AMDGPU change - const MachineBasicBlock *DefBlock = - &IncomingBlock; // AMDGPU change: Take def point as incoming block for - // constants. - const auto *Inst = MRI.getUniqueVRegDef(Val); - if (Inst == nullptr) - return true; - if (Inst) - DefBlock = Inst->getParent(); - - // check whether any divergent loop carrying Val terminates before control - // proceeds to ObservingBlock - for (const auto *MachineLoop = LI.getLoopFor(DefBlock); // AMDGPU change - MachineLoop != RegionLoop && !MachineLoop->contains(&ObservingBlock); - MachineLoop = MachineLoop->getParentLoop()) { - if (DivergentLoops.find(MachineLoop) != DivergentLoops.end()) - return true; - } - - return false; -} - -// AMDGPU CHANGE BEGIN -static bool HasIncomingUndefValue(const PHINode_ *Phi) { - for (unsigned I = 1, E = Phi->getNumOperands(); I != E; I += 2) { - const MachineOperand &Op = Phi->getOperand(I); - if (Op.isUndef()) - return true; - } - return false; -} - -// For case like -// %163:sreg_64_xexec = S_MOV_B64 $exec -// bb.1: -//; predecessors: %bb.1, %bb.0 -// successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), -// %bb.2(50.00%) %162:vreg_512 = PHI %41:vreg_512, %bb.0, %40:vreg_512, %bb.1 -// %167:sgpr_32 = V_READFIRSTLANE_B32 %17:vgpr_32, implicit $exec -// %168:sreg_64 = V_CMP_EQ_U32_e64 %167:sgpr_32, %17:vgpr_32, implicit $exec -// %166:sreg_64 = S_AND_SAVEEXEC_B64 %168:sreg_64, implicit-def $exec, -// implicit-def $scc, implicit $exec -//... -// $exec = S_XOR_B64_term $exec, %166:sreg_64, implicit-def $scc -// S_CBRANCH_EXECNZ %bb.1, implicit $exec -// The ... code after SAVEEXEC will be divergent if %168 is divergent. -// Return the SaveExec which affect MI. -// If not exist, return nullptr. -static const MachineInstr * -findSaveExec(const MachineInstr *MI, - const SmallVector &SaveExecs) { - // No save exec. - if (SaveExecs.empty()) - return nullptr; - if (SaveExecs.size() > 1) - llvm::report_fatal_error( - "Not support case where, MBB has more than one SaveExec"); - const MachineInstr *SaveExec = SaveExecs.front(); - const MachineBasicBlock *MBB = SaveExec->getParent(); - // Make sure MI is after SaveExec by check it is not before SaveExec. - // Assume MBB.begin to SaveExec is short here. - bool bIsAfterSaveExec = true; - for (auto it = MBB->begin(); it != SaveExec->getIterator(); it++) { - if (MI == it) { - bIsAfterSaveExec = false; - break; - } - } - // Not affect by save exec. - if (!bIsAfterSaveExec) - return nullptr; - - return SaveExec; -} - -// When a Phi's parent isJoinDivergent,the case make phi divergent is that 2 -// incoming values merged from different path of a divergent branch. -// isJoinDivergentOnlyOnSameIncomingValue will check for all -// combinations of incoming values except the BB with same incoming value, -// because if values are same then even divergent branch is not divergent. -// For example phi a:A, b:B, a:C. -// It will check (A,B) (B,C) but not (A, C) Because A -// and C has same value a. -// If only (A,C) is sharing divergent branch, -// then phi a:A, b:B, a:C is still uniform. -// DivergentJoinMap saving MachineBasicBlock pairs which on different path of a -// divergent branch and joined at one block. 
-// For example, -// A -// / \ -// | \ -// | \ -// B / -// | \ / -// | \ / -// C D -// | / -// \ / -// E -// If A is uniform branch, B is divergent branch. Then only (C, D) will be saved -// in DivergentJoinMap. -// DivergentJoinMap is build with updateDisjointMap in -// SyncDependenceAnalysis.cpp when SyncDependenceAnalysis::join_block is called. -// It will only run on divergent branch, so (A, B) is not in -// DivergentDisjointMap when A is uniform. -static bool isJoinDivergentOnlyOnSameIncomingValue( - const PHINode_ &Phi, const DivergenceAnalysis *pDA, - const MachineDominatorTree &DT, DivergentJoinMapTy &DivergentJoinMap) { - // for phi which join divergent, if the incoming values from divergent - // branch are the same, the phi is still uniform. - // A - // | \ - // | \ - // B \ - // |\ \ - // | \ | - // C D E - // | / / - // \/ / - // \ / - // F - // for phi in F like. - // phi (a:C, a:D, b:E) - // If A is uniform branch, B is non-uniform branch, phi is uniform. - SmallDenseSet ValueToBlockMap; - for (unsigned I = 1, E = Phi.getNumOperands(); I != E; I += 2) { - const MachineOperand &Op = Phi.getOperand(I); - if (!Op.isReg()) - continue; - unsigned Reg = Op.getReg(); - if (pDA->isDivergent(Reg)) - return false; - - ValueToBlockMap.insert(Reg); - } - unsigned NumIncoming = (Phi.getNumOperands() - 1) / 2; - // When there's same incoming value from different incoming block. - // If divergent select is only on same value, then it is still uniform. - if (ValueToBlockMap.size() != NumIncoming) { - // When a phi is on divergent join block, there is incoming block which is - // comeing from different path of a divergent branch. - // Check all combination here. - for (unsigned i = 0; i < NumIncoming; i++) { - MachineBasicBlock *BB0 = Phi.getOperand(2 + 2 * i).getMBB(); - const MachineOperand &MO0 = Phi.getOperand(1 + 2 * i); - for (unsigned j = i + 1; j < NumIncoming; j++) { - MachineBasicBlock *BB1 = Phi.getOperand(2 + 2 * j).getMBB(); - const MachineOperand &MO1 = Phi.getOperand(1 + 2 * j); - // If value match, no divergent. - if (MO0.isImm() && MO1.isImm() && MO0.getImm() == MO1.getImm()) - continue; - if (MO0.isReg() && MO1.isReg() && MO0.getReg() == MO1.getReg() && - MO0.getSubReg() == MO1.getSubReg()) - continue; - - // If BB and BB2 is from divergent disjoint, then they will - // divergent join on phi. - // This is for case like - // A - // / \ - // | \ - // | \ - // B / - // | \ / - // | \ / - // C D - // | / - // \ / - // E - // - // phi(a:C, b:D) - // When nearestCommonDominator is A, but B also can be divergent - // disjoint for C and D. - if (DivergentJoinMap[BB0].count(BB1)) - return false; - } - } - return true; - } else { - return false; - } -} -// AMDGPU CHANGE END - -bool DivergenceAnalysis::updatePHINode(const PHINode_ &Phi) const { - // AMDGPU CHANGE BEGIN - // Do not mark phis with undef as incoming values as uniform. - // When promoting to scalar we will readfirstlane on - // the phi output. If some of the inputs are undef then - // this could replace a well defined vector value with an - // undefined scalar value. - if (HasIncomingUndefValue(&Phi)) - return true; - // AMDGPU CHANGE END - - // joining divergent disjoint path in Phi parent block - if (isJoinDivergent(*Phi.getParent())) { - // AMDGPU CHANGE BEGIN - if (true /*TODO: ENABLE_AGGRESSIVE_UNIFORM_ANALYSIS*/) { - // Continue if the divergent join only on same incoming value. 
- if (!isJoinDivergentOnlyOnSameIncomingValue(Phi, this, DT, - DivergentJoinMap)) - return true; - } else - // AMDGPU CHANGE END - return true; - } - - // An incoming value could be divergent by itself. - // Otherwise, an incoming value could be uniform within the loop - // that carries its definition but it may appear divergent - // from outside the loop. This happens when divergent loop exits - // drop definitions of that uniform value in different iterations. - // - // for (int i = 0; i < n; ++i) { // 'i' is uniform inside the loop - // if (i % thread_id == 0) break; // divergent loop exit - // } - // int divI = i; // divI is divergent - for (unsigned I = 1, E = Phi.getNumOperands(); I != E; I += 2) { - const MachineOperand &Op = Phi.getOperand(I); - if (!Op.isReg()) - continue; - - unsigned Reg = Op.getReg(); - const MachineOperand &BB = Phi.getOperand(I + 1); - if (isDivergent(Reg) || - isTemporalDivergent(*Phi.getParent(), Reg, *BB.getMBB())) - return true; - } - - return false; -} - -bool DivergenceAnalysis::updateVCndMask(const MachineInstr &VCndMask) const { - // VCndMask require the Cond bituniform to be uniform. - unsigned Op = VCndMask.getOpcode(); - unsigned src0Idx = AMDGPU::getNamedOperandIdx(Op, AMDGPU::OpName::src0); - unsigned src1Idx = AMDGPU::getNamedOperandIdx(Op, AMDGPU::OpName::src1); - unsigned src2Idx = AMDGPU::getNamedOperandIdx(Op, AMDGPU::OpName::src2); - - const MachineOperand &src0 = VCndMask.getOperand(src0Idx); - const MachineOperand &src1 = VCndMask.getOperand(src1Idx); - - const MachineOperand &cond = VCndMask.getOperand(src2Idx); - - if (isDivergent(src0)) - return true; - - // If src0 == src1, then return src0 divergent. - if (src0.isReg() && src1.isReg() && src0.getReg() == src1.getReg()) { - if (src0.getSubReg() == src1.getSubReg() && - SIII->hasModifiersSet(VCndMask, AMDGPU::OpName::src0_modifiers) == - SIII->hasModifiersSet(VCndMask, AMDGPU::OpName::src1_modifiers)) - return false; - } - - if (isDivergent(src1)) - return true; - - llvm::DenseMap Processed; - return !isBitUniform(VCndMask, cond, Processed); -} - -bool DivergenceAnalysis::inRegion(const MachineInstr &I) const { - return I.getParent() && inRegion(*I.getParent()); -} - -bool DivergenceAnalysis::inRegion(const MachineBasicBlock &BB) const { - return (!RegionLoop && BB.getParent() == &F) || RegionLoop->contains(&BB); -} - -// marks all users of loop-carried values of the loop headed by LoopHeader as -// divergent -void DivergenceAnalysis::taintLoopLiveOuts( - const MachineBasicBlock &LoopHeader) { - auto *DivLoop = LI.getLoopFor(&LoopHeader); - assert(DivLoop && "loopHeader is not actually part of a loop"); - - SmallVector TaintStack; - DivLoop->getExitBlocks(TaintStack); - - // Otherwise potential users of loop-carried values could be anywhere in the - // dominance region of DivLoop (including its fringes for phi nodes) - DenseSet Visited; - for (auto *Block : TaintStack) { - Visited.insert(Block); - } - Visited.insert(&LoopHeader); - - while (!TaintStack.empty()) { - auto *UserBlock = TaintStack.back(); - TaintStack.pop_back(); - - // don't spread divergence beyond the region - if (!inRegion(*UserBlock)) - continue; - - assert(!DivLoop->contains(UserBlock) && - "irreducible control flow detected"); - - // phi nodes at the fringes of the dominance region - if (!DT.dominates(&LoopHeader, UserBlock)) { - // all PHI nodes of UserBlock become divergent - pushPHINodes(*UserBlock); - continue; - } - - // taint outside users of values carried by DivLoop - for (auto &I : *UserBlock) { - if 
(isAlwaysUniformMI(&I, SIII, SIRI, MRI)) - continue; - if (isDivergent(I)) - continue; - - for (auto &Op : I.uses()) { - if (!Op.isReg()) - continue; - unsigned OpReg = Op.getReg(); - MachineInstr *OpInst = MRI.getUniqueVRegDef(OpReg); - if (!OpInst) - continue; - if (DivLoop->contains(OpInst->getParent())) { - markDivergent(I); - pushUsers(I); - break; - } - } - } - - // visit all blocks in the dominance region - for (auto *SuccBlock : UserBlock->successors()) { - if (!Visited.insert(SuccBlock).second) { - continue; - } - TaintStack.push_back(SuccBlock); - } - } -} - -void DivergenceAnalysis::pushInstruction(const MachineInstr &I) { - Worklist.push_back(&I); -} -void DivergenceAnalysis::pushPHINodes(const MachineBasicBlock &Block) { - for (const auto &Phi : Block.phis()) { - if (isDivergent(Phi)) - continue; - pushInstruction(Phi); - } -} - -void DivergenceAnalysis::pushUsers(const ValueTy V) { - for (const auto &UserInst : MRI.use_nodbg_instructions(V)) { - - if (isDivergent(UserInst)) - continue; - - // only compute divergent inside loop - if (!inRegion(UserInst)) - continue; - - Worklist.push_back(&UserInst); - } -} -void DivergenceAnalysis::pushUsers(const MachineInstr &I) { - for (const auto &DstMO : I.defs()) { - unsigned Reg = DstMO.getReg(); - pushUsers(Reg); - } -} - -bool DivergenceAnalysis::propagateJoinDivergence( - const MachineBasicBlock &JoinBlock, const MachineLoop *BranchLoop) { - LLVM_DEBUG(dbgs() << "\tpropJoinDiv " << JoinBlock.getName() << "\n"); - - // ignore divergence outside the region - if (!inRegion(JoinBlock)) { - return false; - } - - // push non-divergent phi nodes in JoinBlock to the worklist - pushPHINodes(JoinBlock); - - // JoinBlock is a divergent loop exit - if (BranchLoop && !BranchLoop->contains(&JoinBlock)) { - return true; - } - - // disjoint-paths divergent at JoinBlock - markBlockJoinDivergent(JoinBlock); - return false; -} - -void DivergenceAnalysis::propagateBranchDivergence(const MachineInstr &Term) { - LLVM_DEBUG(dbgs() << "propBranchDiv " << Term.getParent()->getName() << "\n"); - - markDivergent(Term); - - const auto *BranchLoop = LI.getLoopFor(Term.getParent()); - - // whether there is a divergent loop exit from BranchLoop (if any) - bool IsBranchLoopDivergent = false; - - // iterate over all blocks reachable by disjoint from Term within the loop - // also iterates over loop exits that become divergent due to Term. - for (const auto *JoinBlock : SDA.join_blocks(Term)) { - IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop); - } - - // Branch loop is a divergent loop due to the divergent branch in Term - if (IsBranchLoopDivergent) { - assert(BranchLoop); - if (!DivergentLoops.insert(BranchLoop).second) { - return; - } - propagateLoopDivergence(*BranchLoop); - } -} - -void DivergenceAnalysis::propagateLoopDivergence( - const MachineLoop &ExitingLoop) { - LLVM_DEBUG(dbgs() << "propLoopDiv " << ExitingLoop.getHeader()->getNumber() - << "\n"); - - // don't propagate beyond region - if (!inRegion(*ExitingLoop.getHeader())) - return; - - const auto *BranchLoop = ExitingLoop.getParentLoop(); - - // Uses of loop-carried values could occur anywhere - // within the dominance region of the definition. All loop-carried - // definitions are dominated by the loop header (reducible control). - // Thus all users have to be in the dominance region of the loop header, - // except PHI nodes that can also live at the fringes of the dom region - // (incoming defining value). 
- if (!IsLCSSAForm) - taintLoopLiveOuts(*ExitingLoop.getHeader()); - - // whether there is a divergent loop exit from BranchLoop (if any) - bool IsBranchLoopDivergent = false; - - // iterate over all blocks reachable by disjoint paths from exits of - // ExitingLoop also iterates over loop exits (of BranchLoop) that in turn - // become divergent. - for (const auto *JoinBlock : SDA.join_blocks(ExitingLoop)) { - IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop); - } - - // Branch loop is a divergent due to divergent loop exit in ExitingLoop - if (IsBranchLoopDivergent) { - assert(BranchLoop); - if (!DivergentLoops.insert(BranchLoop).second) { - return; - } - propagateLoopDivergence(*BranchLoop); - } -} - -// For case like -// %149:sreg_64_xexec = S_MOV_B64 $exec -// -// bb.3: -//; predecessors: %bb.3, %bb.2 -// successors: %bb.3(0x40000000), %bb.4(0x40000000); %bb.3(50.00%), -// %bb.4(50.00%) -// -// %148:vreg_512 = PHI %56:vreg_512, %bb.2, %55:vreg_512, %bb.3 -// %153:sgpr_32 = V_READFIRSTLANE_B32 %36:vgpr_32, implicit $exec -// %154:sreg_64 = V_CMP_EQ_U32_e64 %153:sgpr_32, %36:vgpr_32, implicit $exec -// %152:sreg_64 = S_AND_SAVEEXEC_B64 %154:sreg_64, implicit-def $exec, -// implicit-def $scc, implicit $exec $m0 = S_MOV_B32 %153:sgpr_32 %55:vreg_512 -// = V_MOVRELD_B32_V16 %148:vreg_512(tied-def 0), -2, 0, implicit $m0, implicit -// $exec $exec = S_XOR_B64_term $exec, %152:sreg_64, implicit-def $scc -// S_CBRANCH_EXECNZ %bb.3, implicit $exec -// -// bb.4: -//; predecessors: %bb.3 -// successors: %bb.5(0x80000000); %bb.5(100.00%) -// -// $exec = S_MOV_B64 %149:sreg_64_xexec - -// bb.3 is inside exec region which exec is saved by %149. -// %152:sreg_64 = S_AND_SAVEEXEC_B64 will update the exec which cause divergence -// when it is not bituniform. Everything inside the exec region need to be -// scaned. Out region or phi use should be marked as divergent and add users to -// worklist. -void DivergenceAnalysis::propagateExecControlFlowDivergence( - const MachineInstr &SaveExec) { - const MachineBasicBlock *MBB = SaveExec.getParent(); - auto it = ExecRegionMap.find(MBB); - if (it == ExecRegionMap.end()) - return; - ExecRegion &Region = *it->second; - // One region only need to propagate once. - if (Region.bPropagated) - return; - Region.bPropagated = true; - // Scan all MIs in the region. Mark out region or phi use as divergent and add - // their users to worklist. - auto propagateExecDivergence = [this, Region](const MachineInstr *MI) { - for (const auto &DstMO : MI->defs()) { - Register Reg = DstMO.getReg(); - // Only VCC/Exec/m0. - // Exec always uniform. Assume VCC and m0 not cross region. - if (Reg.isPhysical()) - continue; - for (const auto &UserInst : MRI.use_nodbg_instructions(Reg)) { - - if (isDivergent(UserInst)) - continue; - - // only propagate user outside of region or phi which will not be - // guarded by saveExec. - if (UserInst.getOpcode() != AMDGPU::PHI && - isInsideExecRegion(UserInst, *Region.begin, *Region.end, DT, PDT)) { - continue; - } - // Write exec is not divergent. 
- if (isWriteExec(&UserInst)) - continue; - - markDivergent(UserInst); - pushUsers(UserInst); - } - } - }; - const MachineBasicBlock *RegionBeginMBB = Region.begin->getParent(); - const MachineBasicBlock *RegionEndMBB = Region.end->getParent(); - if (RegionBeginMBB != RegionEndMBB) { - auto it = Region.begin->getIterator(); - for (it++; it != RegionBeginMBB->end(); it++) { - const MachineInstr &MI = *it; - propagateExecDivergence(&MI); - } - - // All blocks between RegionBeginMBB and RegionEndMBB. - for (const MachineBasicBlock *MBB : Region.blocks) { - for (const MachineInstr &MI : *MBB) { - propagateExecDivergence(&MI); - } - } - - for (auto it = RegionEndMBB->begin(); it != Region.end->getIterator(); - it++) { - const MachineInstr &MI = *it; - propagateExecDivergence(&MI); - } - - } else { - auto it = Region.begin->getIterator(); - for (it++; it != Region.end->getIterator(); it++) { - const MachineInstr &MI = *it; - propagateExecDivergence(&MI); - } - } -} - -void DivergenceAnalysis::compute() { - SmallVector ExecRegions; - // Build exec regions. - // Add VCndMask for non-bituniform caused by input sreg. - for (const MachineBasicBlock &MBB : F) { - for (const MachineInstr &Term : MBB.terminators()) { - if (updateTerminator(Term)) - pushInstruction(Term); - } - - for (const MachineInstr &I : MBB) { - unsigned Opcode = I.getOpcode(); - if (isVCndMask(Opcode)) { - // Cond for CndMask needs bit uniform check. - // Add it to worklist to check bit uniform from input. - pushInstruction(I); - } else if (isRestoreExec(&I)) { - const MachineInstr *RegionBegin = - findExecRegionBeginFromRegionEnd(&I, MRI); - if (RegionBegin) { - ExecRegions.emplace_back(ExecRegion(RegionBegin, &I)); - } - } - } - } - - // Build exec region map. - for (const MachineBasicBlock &MBB : F) { - for (ExecRegion &Region : ExecRegions) { - if (isInsideExecRegion(MBB, *Region.begin, *Region.end, DT, PDT)) { - // Add block to region. - if (&MBB != Region.begin->getParent() && - &MBB != Region.end->getParent()) - Region.blocks.emplace_back(&MBB); - // Update ExecRegionMap. - auto it = ExecRegionMap.find(&MBB); - if (it == ExecRegionMap.end()) { - ExecRegionMap[&MBB] = &Region; - } else { - // When MBB inside multiple regions, save the smallest one. - if (isInsideExecRegion(*Region.begin, *it->second->begin, - *it->second->end, DT, PDT)) { - ExecRegionMap[&MBB] = &Region; - } - } - } - } - } - - for (auto DivVal : DivergentValues) { - LLVM_DEBUG(dbgs() << "\t sourceOfDivergence :"; printReg(DivVal, SIRI); - dbgs() << "\n";); - pushUsers(DivVal); - } - - // propagate divergence - while (!Worklist.empty()) { - const MachineInstr *I = Worklist.back(); - Worklist.pop_back(); - - // maintain uniformity of overrides - if (isAlwaysUniformMI(I, SIII, SIRI, MRI)) { - // If used by terminators, and not bit uniform. - // Add terminator. - SmallVector TermUsers; - for (const auto &DstMO : I->defs()) { - unsigned Reg = DstMO.getReg(); - for (const auto &UserInst : MRI.use_nodbg_instructions(Reg)) { - - if (isDivergent(UserInst)) - continue; - // Only check terminator here. 
- if (!UserInst.isTerminator()) - continue; - - // only compute divergent inside loop - if (!inRegion(UserInst)) - continue; - - TermUsers.emplace_back(&UserInst); - } - } - - if (!TermUsers.empty()) { - llvm::DenseMap Processed; - if (!isBitUniform(*I, Processed)) { - for (const MachineInstr *Term : TermUsers) { - Worklist.emplace_back(Term); - } - } - } - - continue; - } - - bool WasDivergent = isDivergent(*I); - if (WasDivergent) - continue; - - // propagate divergence caused by terminator - if (I->isTerminator()) { - if (updateTerminator(*I)) { - // propagate control divergence to affected instructions - propagateBranchDivergence(*I); - continue; - } - } - - // update divergence of I due to divergent operands - bool DivergentUpd = false; - unsigned Opcode = I->getOpcode(); - switch (I->getOpcode()) { - default: - if (isVCndMask(Opcode)) { - DivergentUpd = updateVCndMask(*I); - } else { - DivergentUpd = updateNormalInstruction(*I); - llvm::DenseMap Processed; - if ((DivergentUpd || !isBitUniform(*I, Processed)) && isWriteExec(I)) { - // propagate exec control divergence to affected instructions. - propagateExecControlFlowDivergence(*I); - } - } - break; - case AMDGPU::PHI: - DivergentUpd = updatePHINode(*I); - break; - } - - // propagate value divergence to users - if (DivergentUpd) { - markDivergent(*I); - pushUsers(*I); - } - } -} - -bool DivergenceAnalysis::isAlwaysUniform(const ValueTy V) const { - return UniformOverrides.find(V) != UniformOverrides.end(); -} - -bool DivergenceAnalysis::isDivergent(const ValueTy V) const { - return DivergentValues.find(V) != DivergentValues.end(); -} - -bool DivergenceAnalysis::isDivergent(const MachineOperand &MO) const { - if (!MO.isReg()) - return false; - Register Reg = MO.getReg(); - if (Reg.isPhysical()) { - const MachineInstr *MI = MO.getParent(); - if (MI) - return isDivergent(!MI); - - } else { - return isDivergent(Reg); - } - return true; -} - -bool DivergenceAnalysis::isDivergent(const MachineInstr &I) const { - if (UniformOverridesInsts.find(&I) != UniformOverridesInsts.end()) - return false; - if (DivergentInsts.find(&I) != DivergentInsts.end()) - return true; - for (const MachineOperand &DstMO : I.defs()) { - unsigned Reg = DstMO.getReg(); - if (isDivergent(Reg)) - return true; - } - return false; -} - -void DivergenceAnalysis::print(raw_ostream &OS, const Module_ *) const { - // iterate instructions using instructions() to ensure a deterministic order. 
- for (auto &MBB : F) - for (auto &I : MBB) { - if (isDivergent(I)) - OS << "DIVERGENT:" << I; - // AMDGPU changes begin - else - OS << "UNIFORM:" << I; - // AMDGPU changes end - } -} - -// class GPUDivergenceAnalysis -MirGPUDivergenceAnalysis::MirGPUDivergenceAnalysis( - MachineFunction &F, const MachineDominatorTree &DT, - const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI) - : SDA(DT, PDT, LI, /*AMDGPU change*/ DivergentJoinMap), - DA(F, nullptr, DT, PDT, LI, SDA, false, - /*AMDGPU change*/ DivergentJoinMap) { - MachineRegisterInfo &MRI = F.getRegInfo(); - const GCNSubtarget *ST = &F.getSubtarget(); - const SIRegisterInfo *SIRI = ST->getRegisterInfo(); - const SIInstrInfo *SIII = ST->getInstrInfo(); - for (auto &MBB : F) - for (auto &I : MBB) { - if (isSourceOfDivergence(&I, MRI, SIRI, SIII)) { - DA.markDivergent(I); - } else if (isAlwaysUniformMI(&I, SIII, SIRI, MRI)) { - DA.addUniformOverride(I); - } - } - for (auto &ArgIt : F.getRegInfo().liveins()) { - unsigned Reg = ArgIt.first; - if (isDivergentInputReg(Reg, MRI, SIRI)) { - DA.markDivergent(Reg); - } - } - - DA.compute(); -} - -bool MirGPUDivergenceAnalysis::isDivergent(const MachineInstr *I) const { - return DA.isDivergent(*I); -} - -void MirGPUDivergenceAnalysis::print(raw_ostream &OS, - const Module_ *mod) const { - OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n"; - DA.print(OS, mod); - OS << "}\n"; -} - -} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h deleted file mode 100644 index e721ac323255e..0000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h +++ /dev/null @@ -1,285 +0,0 @@ -//===- AMDGPUMirDivergenceAnalysis.h - Mir Divergence Analysis -*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// \file -// The divergence analysis determines which instructions and branches are -// divergent given a set of divergent source instructions. -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include "AMDGPUMirSyncDependenceAnalysis.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/Pass.h" -#include - -namespace llvm { -class raw_ostream; -class TargetTransformInfo; -class MachineRegisterInfo; -class SIInstrInfo; -class SIRegisterInfo; -class MachineOperand; -class MachineBasicBlock; - -using Module_ = void; -class TargetTransformInfo; -using ValueTy = unsigned; -using PHINode_ = MachineInstr; - -/// \brief Generic divergence analysis for reducible CFGs. -/// -/// This analysis propagates divergence in a data-parallel context from sources -/// of divergence to all users. It requires reducible CFGs. All assignments -/// should be in SSA form. -class DivergenceAnalysis { -public: - /// \brief This instance will analyze the whole function \p F or the loop \p - /// RegionLoop. - /// - /// \param RegionLoop if non-null the analysis is restricted to \p RegionLoop. - /// Otherwise the whole function is analyzed. - /// \param IsLCSSAForm whether the analysis may assume that the IR in the - /// region in in LCSSA form. 
- DivergenceAnalysis(const llvm::MachineFunction &F, - const MachineLoop *RegionLoop, - const MachineDominatorTree &DT, - const MachinePostDominatorTree &PDT, - const MachineLoopInfo &LI, SyncDependenceAnalysis &SDA, - bool IsLCSSAForm, - // AMDGPU change begin. - DivergentJoinMapTy &JoinMap - // AMDGPU change end. - ); - - /// \brief The loop that defines the analyzed region (if any). - const MachineLoop *getRegionLoop() const { return RegionLoop; } - const llvm::MachineFunction &getFunction() const { return F; } - - /// \brief Whether \p BB is part of the region. - bool inRegion(const MachineBasicBlock &BB) const; - /// \brief Whether \p I is part of the region. - bool inRegion(const MachineInstr &I) const; - - /// \brief Mark \p UniVal as a value that is always uniform. - void addUniformOverride(const ValueTy UniVal); - void addUniformOverride(const MachineInstr &I); - - /// \brief Mark \p DivVal as a value that is always divergent. - void markDivergent(const ValueTy DivVal); - void markDivergent(const MachineInstr &I); - - /// \brief Propagate divergence to all instructions in the region. - /// Divergence is seeded by calls to \p markDivergent. - void compute(); - - /// \brief Whether any value was marked or analyzed to be divergent. - bool hasDetectedDivergence() const { return !DivergentValues.empty(); } - - /// \brief Whether \p Val will always return a uniform value regardless of its - /// operands - bool isAlwaysUniform(const ValueTy Val) const; - - /// \brief Whether \p Val is a divergent value - bool isDivergent(const ValueTy Val) const; - bool isDivergent(const MachineInstr &I) const; - - void print(llvm::raw_ostream &OS, const Module_ *) const; - -private: - bool isDivergent(const llvm::MachineOperand &MO) const; - bool updateTerminator(const MachineInstr &Term) const; - bool updatePHINode(const PHINode_ &Phi) const; - bool updateVCndMask(const MachineInstr &VCndMask) const; - bool - isBitUniform(const MachineInstr &I, - llvm::DenseMap &Processed) const; - bool - isBitUniform(const MachineInstr &I, const llvm::MachineOperand &UseMO, - llvm::DenseMap &Processed) const; - - /// \brief Computes whether \p Inst is divergent based on the - /// divergence of its operands. - /// - /// \returns Whether \p Inst is divergent. - /// - /// This should only be called for non-phi, non-terminator instructions. - bool updateNormalInstruction(const MachineInstr &Inst) const; - - /// \brief Mark users of live-out users as divergent. - /// - /// \param LoopHeader the header of the divergent loop. - /// - /// Marks all users of live-out values of the loop headed by \p LoopHeader - /// as divergent and puts them on the worklist. - void taintLoopLiveOuts(const MachineBasicBlock &LoopHeader); - - /// \brief Push all users of \p Val (in the region) to the worklist - void pushUsers(const ValueTy I); - void pushUsers(const MachineInstr &I); - - void pushInstruction(const MachineInstr &I); - /// \brief Push all phi nodes in @block to the worklist - void pushPHINodes(const MachineBasicBlock &Block); - - /// \brief Mark \p Block as join divergent - /// - /// A block is join divergent if two threads may reach it from different - /// incoming blocks at the same time. - void markBlockJoinDivergent(const MachineBasicBlock &Block) { - DivergentJoinBlocks.insert(&Block); - } - - /// \brief Whether \p Val is divergent when read in \p ObservingBlock. 
- bool isTemporalDivergent( - const MachineBasicBlock &ObservingBlock, const ValueTy Val, - const MachineBasicBlock &incomingBlock) const; // AMDGPU change - - /// \brief Whether \p Block is join divergent - /// - /// (see markBlockJoinDivergent). - bool isJoinDivergent(const MachineBasicBlock &Block) const { - return DivergentJoinBlocks.find(&Block) != DivergentJoinBlocks.end(); - } - - /// \brief Propagate control-induced divergence to users (phi nodes and - /// instructions). - // - // \param JoinBlock is a divergent loop exit or join point of two disjoint - // paths. - // \returns Whether \p JoinBlock is a divergent loop exit of \p TermLoop. - bool propagateJoinDivergence(const MachineBasicBlock &JoinBlock, - const MachineLoop *TermLoop); - - /// \brief Propagate induced value divergence due to control divergence in \p - /// Term. - void propagateBranchDivergence(const MachineInstr &Term); - - /// \brief Propagate induced value divergence due to exec update caused by \p - /// SaveExec. - void propagateExecControlFlowDivergence(const MachineInstr &SaveExec); - - /// \brief Propagate divergent caused by a divergent loop exit. - /// - /// \param ExitingLoop is a divergent loop. - void propagateLoopDivergence(const MachineLoop &ExitingLoop); - -private: - const llvm::MachineFunction &F; - const llvm::MachineRegisterInfo &MRI; - const llvm::SIRegisterInfo *SIRI; - const llvm::SIInstrInfo *SIII; - // If regionLoop != nullptr, analysis is only performed within \p RegionLoop. - // Otw, analyze the whole function - const MachineLoop *RegionLoop; - - const MachineDominatorTree &DT; - const MachinePostDominatorTree &PDT; - const MachineLoopInfo &LI; - - // Recognized divergent loops - llvm::DenseSet DivergentLoops; - - // AMDGPU change begin - // Save block pair which divergent disjoint. - // A - // | \ - // | \ - // B C - // | / - // D - // When A is divergent branch, B and C are divergent join at D. - // Then DivergentJoinMap[B].count(C) > 0 and - // DivergentJoinMap[C].count(B) > 0. - DivergentJoinMapTy &DivergentJoinMap; - // AMDGPU change end - - // The SDA links divergent branches to divergent control-flow joins. - SyncDependenceAnalysis &SDA; - - // Use simplified code path for LCSSA form. - bool IsLCSSAForm; - - // Set of known-uniform values. - llvm::DenseSet UniformOverrides; - llvm::DenseSet UniformOverridesInsts; - - // Blocks with joining divergent control from different predecessors. - llvm::DenseSet DivergentJoinBlocks; - - // Detected/marked divergent values. - llvm::DenseSet DivergentValues; - llvm::DenseSet DivergentInsts; - - // Mir change for EXEC control flow. - // Map from MBB to the exec region it belongs too. - // A exec region is begin with - // S_MOV_B64 sreg, exec - // end with - // S_MOV_B64 exec, sreg - // Inside the region, exec might be updated to make control flow with exec. - struct ExecRegion { - const llvm::MachineInstr *begin; - const llvm::MachineInstr *end; - std::vector blocks; - bool bPropagated = false; - ExecRegion(const llvm::MachineInstr *b, const llvm::MachineInstr *e) - : begin(b), end(e), bPropagated(false) {} - }; - llvm::DenseMap ExecRegionMap; - - // Internal worklist for divergence propagation. - std::vector Worklist; -}; - -/// \brief Divergence analysis frontend for GPU kernels. -class MirGPUDivergenceAnalysis { - // AMDGPU change begin - // Save block pair which divergent disjoint. - // A - // | \ - // | \ - // B C - // | / - // D - // When A is divergent branch, B and C are divergent join at D. 
- // Then DivergentJoinMap[B].count(C) > 0 and - // DivergentJoinMap[C].count(B) > 0. - DivergentJoinMapTy DivergentJoinMap; - // AMDGPU change end - SyncDependenceAnalysis SDA; - DivergenceAnalysis DA; - -public: - /// Runs the divergence analysis on @F, a GPU kernel - MirGPUDivergenceAnalysis(llvm::MachineFunction &F, - const MachineDominatorTree &DT, - const MachinePostDominatorTree &PDT, - const MachineLoopInfo &LI); - - /// Whether any divergence was detected. - bool hasDivergence() const { return DA.hasDetectedDivergence(); } - - /// The GPU kernel this analysis result is for - const llvm::MachineFunction &getFunction() const { return DA.getFunction(); } - - /// Whether \p I is divergent. - bool isDivergent(const MachineInstr *I) const; - - /// Whether \p I is uniform/non-divergent - bool isUniform(const MachineInstr *I) const { return !isDivergent(I); } - - /// Print all divergent values in the kernel. - void print(llvm::raw_ostream &OS, const Module_ *) const; -}; - -} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp deleted file mode 100644 index 302939c76a4df..0000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp +++ /dev/null @@ -1,519 +0,0 @@ -//===- MirSyncDependenceAnalysis.cpp - Mir Divergent Branch Dependence -//Calculation -//--===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file is based on Analysis/MirSyncDependenceAnalysis.cpp, just change -// MachineBasicBlock to MachineBasicBlock. -// This file implements an algorithm that returns for a divergent branch -// the set of basic blocks whose phi nodes become divergent due to divergent -// control. These are the blocks that are reachable by two disjoint paths from -// the branch or loop exits that have a reaching path that is disjoint from a -// path to the loop latch. -// -// The SyncDependenceAnalysis is used in the DivergenceAnalysis to model -// control-induced divergence in phi nodes. -// -// -- Summary -- -// The SyncDependenceAnalysis lazily computes sync dependences [3]. -// The analysis evaluates the disjoint path criterion [2] by a reduction -// to SSA construction. The SSA construction algorithm is implemented as -// a simple data-flow analysis [1]. -// -// [1] "A Simple, Fast Dominance Algorithm", SPI '01, Cooper, Harvey and Kennedy -// [2] "Efficiently Computing Static Single Assignment Form -// and the Control Dependence Graph", TOPLAS '91, -// Cytron, Ferrante, Rosen, Wegman and Zadeck -// [3] "Improving Performance of OpenCL on CPUs", CC '12, Karrenberg and Hack -// [4] "Divergence Analysis", TOPLAS '13, Sampaio, Souza, Collange and Pereira -// -// -- Sync dependence -- -// Sync dependence [4] characterizes the control flow aspect of the -// propagation of branch divergence. For example, -// -// %cond = icmp slt i32 %tid, 10 -// br i1 %cond, label %then, label %else -// then: -// br label %merge -// else: -// br label %merge -// merge: -// %a = phi i32 [ 0, %then ], [ 1, %else ] -// -// Suppose %tid holds the thread ID. 
Although %a is not data dependent on %tid -// because %tid is not on its use-def chains, %a is sync dependent on %tid -// because the branch "br i1 %cond" depends on %tid and affects which value %a -// is assigned to. -// -// -- Reduction to SSA construction -- -// There are two disjoint paths from A to X, if a certain variant of SSA -// construction places a phi node in X under the following set-up scheme [2]. -// -// This variant of SSA construction ignores incoming undef values. -// That is paths from the entry without a definition do not result in -// phi nodes. -// -// entry -// / \ -// A \ -// / \ Y -// B C / -// \ / \ / -// D E -// \ / -// F -// Assume that A contains a divergent branch. We are interested -// in the set of all blocks where each block is reachable from A -// via two disjoint paths. This would be the set {D, F} in this -// case. -// To generally reduce this query to SSA construction we introduce -// a virtual variable x and assign to x different values in each -// successor block of A. -// entry -// / \ -// A \ -// / \ Y -// x = 0 x = 1 / -// \ / \ / -// D E -// \ / -// F -// Our flavor of SSA construction for x will construct the following -// entry -// / \ -// A \ -// / \ Y -// x0 = 0 x1 = 1 / -// \ / \ / -// x2=phi E -// \ / -// x3=phi -// The blocks D and F contain phi nodes and are thus each reachable -// by two disjoins paths from A. -// -// -- Remarks -- -// In case of loop exits we need to check the disjoint path criterion for loops -// [2]. To this end, we check whether the definition of x differs between the -// loop exit and the loop header (_after_ SSA construction). -// -//===----------------------------------------------------------------------===// -#include "AMDGPUMirSyncDependenceAnalysis.h" -#include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/MachinePostDominators.h" - -#include -#include - -#define DEBUG_TYPE "sync-dependence" - -namespace llvm { - -ConstBlockSet SyncDependenceAnalysis::EmptyBlockSet; - -SyncDependenceAnalysis::SyncDependenceAnalysis( - const MachineDominatorTree &DT, const MachinePostDominatorTree &PDT, - const MachineLoopInfo &LI, - // AMDGPU change begin. - DivergentJoinMapTy &JoinMap - // AMDGPU change end. - ) - : FuncRPOT(DT.getRoot()->getParent()), DT(DT), PDT(PDT), LI(LI), - // AMDGPU change begin. - DivergentJoinMap(JoinMap) -// AMDGPU change end. -{} - -SyncDependenceAnalysis::~SyncDependenceAnalysis() {} - -using FunctionRPOT = ReversePostOrderTraversal; - -// divergence propagator for reducible CFGs -struct DivergencePropagator { - const FunctionRPOT &FuncRPOT; - const MachineDominatorTree &DT; - const MachinePostDominatorTree &PDT; - const MachineLoopInfo &LI; - - // identified join points - std::unique_ptr JoinBlocks; - - // reached loop exits (by a path disjoint to a path to the loop header) - SmallPtrSet ReachedLoopExits; - - // if DefMap[B] == C then C is the dominating definition at block B - // if DefMap[B] ~ undef then we haven't seen B yet - // if DefMap[B] == B then B is a join point of disjoint paths from X or B is - // an immediate successor of X (initial value). 
- using DefiningBlockMap = - std::map; - DefiningBlockMap DefMap; - - // all blocks with pending visits - std::unordered_set PendingUpdates; - - DivergencePropagator(const FunctionRPOT &FuncRPOT, - const MachineDominatorTree &DT, - const MachinePostDominatorTree &PDT, - const MachineLoopInfo &LI) - : FuncRPOT(FuncRPOT), DT(DT), PDT(PDT), LI(LI), - JoinBlocks(new ConstBlockSet) {} - - // set the definition at @block and mark @block as pending for a visit - void addPending(const MachineBasicBlock &Block, - const MachineBasicBlock &DefBlock) { - bool WasAdded = DefMap.emplace(&Block, &DefBlock).second; - if (WasAdded) - PendingUpdates.insert(&Block); - } - - void printDefs(raw_ostream &Out) { - Out << "Propagator::DefMap {\n"; - for (const auto *Block : FuncRPOT) { - auto It = DefMap.find(Block); - Out << Block->getName() << " : "; - if (It == DefMap.end()) { - Out << "\n"; - } else { - const auto *DefBlock = It->second; - Out << (DefBlock ? DefBlock->getName() : "") << "\n"; - } - } - Out << "}\n"; - } - - // process @succBlock with reaching definition @defBlock - // the original divergent branch was in @parentLoop (if any) - void visitSuccessor(const MachineBasicBlock &SuccBlock, - const MachineLoop *ParentLoop, - const MachineBasicBlock &DefBlock) { - - // @succBlock is a loop exit - if (ParentLoop && !ParentLoop->contains(&SuccBlock)) { - DefMap.emplace(&SuccBlock, &DefBlock); - ReachedLoopExits.insert(&SuccBlock); - return; - } - - // first reaching def? - auto ItLastDef = DefMap.find(&SuccBlock); - if (ItLastDef == DefMap.end()) { - addPending(SuccBlock, DefBlock); - return; - } - - // a join of at least two definitions - if (ItLastDef->second != &DefBlock) { - // do we know this join already? - if (!JoinBlocks->insert(&SuccBlock).second) - return; - - // update the definition - addPending(SuccBlock, SuccBlock); - } - } - - // find all blocks reachable by two disjoint paths from @rootTerm. - // This method works for both divergent terminators and loops with - // divergent exits. - // @rootBlock is either the block containing the branch or the header of the - // divergent loop. - // @nodeSuccessors is the set of successors of the node (MachineLoop or - // Terminator) headed by @rootBlock. - // @parentLoop is the parent loop of the MachineLoop or the loop that contains - // the Terminator. - template - std::unique_ptr computeJoinPoints( - const MachineBasicBlock &RootBlock, SuccessorIterable NodeSuccessors, - const MachineLoop *ParentLoop, const MachineBasicBlock *PdBoundBlock) { - assert(JoinBlocks); - - // bootstrap with branch targets - for (const auto *SuccBlock : NodeSuccessors) { - DefMap.emplace(SuccBlock, SuccBlock); - - if (ParentLoop && !ParentLoop->contains(SuccBlock)) { - // immediate loop exit from node. 
- ReachedLoopExits.insert(SuccBlock); - continue; - } else { - // regular successor - PendingUpdates.insert(SuccBlock); - } - } - - auto ItBeginRPO = FuncRPOT.begin(); - - // skip until term (TODO RPOT won't let us start at @term directly) - for (; *ItBeginRPO != &RootBlock; ++ItBeginRPO) { - } - - auto ItEndRPO = FuncRPOT.end(); - assert(ItBeginRPO != ItEndRPO); - - // propagate definitions at the immediate successors of the node in RPO - auto ItBlockRPO = ItBeginRPO; - while (++ItBlockRPO != ItEndRPO && *ItBlockRPO != PdBoundBlock) { - const auto *Block = *ItBlockRPO; - - // skip @block if not pending update - auto ItPending = PendingUpdates.find(Block); - if (ItPending == PendingUpdates.end()) - continue; - PendingUpdates.erase(ItPending); - - // propagate definition at @block to its successors - auto ItDef = DefMap.find(Block); - const auto *DefBlock = ItDef->second; - assert(DefBlock); - - auto *BlockLoop = LI.getLoopFor(Block); - if (ParentLoop && - (ParentLoop != BlockLoop && ParentLoop->contains(BlockLoop))) { - // if the successor is the header of a nested loop pretend its a - // single node with the loop's exits as successors - SmallVector BlockLoopExits; - BlockLoop->getExitBlocks(BlockLoopExits); - for (const auto *BlockLoopExit : BlockLoopExits) { - visitSuccessor(*BlockLoopExit, ParentLoop, *DefBlock); - } - - } else { - // the successors are either on the same loop level or loop exits - for (const auto *SuccBlock : Block->successors()) { - visitSuccessor(*SuccBlock, ParentLoop, *DefBlock); - } - } - } - - // We need to know the definition at the parent loop header to decide - // whether the definition at the header is different from the definition at - // the loop exits, which would indicate a divergent loop exits. - // - // A // loop header - // | - // B // nested loop header - // | - // C -> X (exit from B loop) -..-> (A latch) - // | - // D -> back to B (B latch) - // | - // proper exit from both loops - // - // D post-dominates B as it is the only proper exit from the "A loop". - // If C has a divergent branch, propagation will therefore stop at D. - // That implies that B will never receive a definition. - // But that definition can only be the same as at D (D itself in thise case) - // because all paths to anywhere have to pass through D. - // - const MachineBasicBlock *ParentLoopHeader = - ParentLoop ? ParentLoop->getHeader() : nullptr; - if (ParentLoop && ParentLoop->contains(PdBoundBlock)) { - DefMap[ParentLoopHeader] = DefMap[PdBoundBlock]; - } - - // analyze reached loop exits - if (!ReachedLoopExits.empty()) { - assert(ParentLoop); - const auto *HeaderDefBlock = DefMap[ParentLoopHeader]; - LLVM_DEBUG(printDefs(dbgs())); - - // AMDGPU CHANGE: Allow null HeaderDefBlock - // Because of the way they walk the blocks (a reverse post order traversal - // stopping at the immediate post dominator) it is possible that - // they will reach a loop exit, but not the loop header. - // - // We conservatively mark the exit blocks as divergent join points - // in this case. - // - // Problem CFG is below: - // - // +--> A - // | / \ - // | B C - // | | / | - // +--L P - // - // In this cfg, C is the RootBlock and P is C's post-dominator. - // It will only visit L and P and then stop because it hits the - // post dominator. Most loops do not hit this case because the - // loop exiting block (C) will branch directly back to the loop - // header. 
- // - if (HeaderDefBlock) { - for (const auto *ExitBlock : ReachedLoopExits) { - auto ItExitDef = DefMap.find(ExitBlock); - assert((ItExitDef != DefMap.end()) && - "no reaching def at reachable loop exit"); - if (ItExitDef->second != HeaderDefBlock) { - JoinBlocks->insert(ExitBlock); - } - } - } else { - for (const auto *ExitBlock : ReachedLoopExits) { - JoinBlocks->insert(ExitBlock); - } - } - } - - return std::move(JoinBlocks); - } -}; - -// AMDGPU change begin. -// For all join blocks caused by divergent RootBlock, the prevs of a join block -// which are in DefMap or the RootBlock are divergent join each other on the -// join block because of divergent RootBlock. -static void -updateJoinMap(const MachineBasicBlock *RootBlock, - DenseMap> &JoinMap, - DivergencePropagator::DefiningBlockMap &DefMap, - ConstBlockSet &JoinBlocks) { - for (const MachineBasicBlock *JoinBB : JoinBlocks) { - // makr divergent join for all pred pair which in DefMap. - for (auto predIt = JoinBB->pred_begin(); predIt != JoinBB->pred_end(); - predIt++) { - auto predIt2 = predIt; - const MachineBasicBlock *pred = *predIt; - if (DefMap.count(pred) == 0 && pred != RootBlock) - continue; - - for (predIt2++; predIt2 != JoinBB->pred_end(); predIt2++) { - const MachineBasicBlock *pred2 = *predIt2; - if (DefMap.count(pred2) == 0 && pred2 != RootBlock) - continue; - - JoinMap[pred].insert(pred2); - JoinMap[pred2].insert(pred); - LLVM_DEBUG(dbgs() << "joint_bb0: " << pred->getName() - << " joint_bb1: " << pred2->getName() << "\n";); - } - } - } -} -// AMDGPU change end. - -const ConstBlockSet & -SyncDependenceAnalysis::join_blocks(const MachineLoop &MachineLoop) { - using LoopExitVec = SmallVector; - LoopExitVec LoopExits; - MachineLoop.getExitBlocks(LoopExits); - if (LoopExits.size() < 1) { - return EmptyBlockSet; - } - - // already available in cache? - auto ItCached = CachedLoopExitJoins.find(&MachineLoop); - if (ItCached != CachedLoopExitJoins.end()) { - return *ItCached->second; - } - - // dont propagte beyond the immediate post dom of the loop - const auto *PdNode = - PDT.getNode(const_cast(MachineLoop.getHeader())); - const auto *IpdNode = PdNode->getIDom(); - const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; - while (PdBoundBlock && MachineLoop.contains(PdBoundBlock)) { - IpdNode = IpdNode->getIDom(); - PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; - } - - // compute all join points - DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI}; - auto JoinBlocks = Propagator.computeJoinPoints( - *MachineLoop.getHeader(), LoopExits, MachineLoop.getParentLoop(), - PdBoundBlock); - - // AMDGPU change begin. - // Save divergent join pairs. - updateJoinMap(MachineLoop.getHeader(), DivergentJoinMap, Propagator.DefMap, - *JoinBlocks.get()); - // AMDGPU change end. - - auto ItInserted = - CachedLoopExitJoins.emplace(&MachineLoop, std::move(JoinBlocks)); - assert(ItInserted.second); - return *ItInserted.first->second; -} - -const ConstBlockSet & -SyncDependenceAnalysis::join_blocks(const MachineInstr &Term) { - // trivial case - if (Term.getParent()->succ_size() < 1) { - return EmptyBlockSet; - } - - // already available in cache? - auto ItCached = CachedBranchJoins.find(&Term); - if (ItCached != CachedBranchJoins.end()) - return *ItCached->second; - - // dont propagate beyond the immediate post dominator of the branch - const auto *PdNode = - PDT.getNode(const_cast(Term.getParent())); - const auto *IpdNode = PdNode->getIDom(); - const auto *PdBoundBlock = IpdNode ? 
IpdNode->getBlock() : nullptr; - - // compute all join points - DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI}; - const auto &TermBlock = *Term.getParent(); - - // AMDGPU CHANGE - // Make sure the post-dominator is outside the loop for the loop header. - // Otherwise, we may not find all the join blocks in the loop - // because the search stops too early. Some join points can be reached - // after the post-dominator! - // - // Problem CFG is below: - // - // +--> A - // | / \ - // | B P - // | | / | - // +--L X - // - // In this cfg, A is the loop header and P is A's post-dominator. - // The algorithm to mark join points does an Reverse Post Order walk - // from A and stops when it reaches the post dominator. It would not - // mark the phi node in L as divergent even when A had a divergent branch. - // The fix we made was to make the join point search continue all the way - // to the loops post dominator (which is X in this example). - // - // NOTE: They already made this change for the loop case above, but for - // a different bug apparently. See - // SyncDependenceAnalysis::join_blocks(MachineLoop&) - // - const MachineLoop *MachineLoop = LI.getLoopFor(&TermBlock); - if (MachineLoop && (MachineLoop->getHeader() == &TermBlock)) { - while (PdBoundBlock && MachineLoop->contains(PdBoundBlock)) { - IpdNode = IpdNode->getIDom(); - PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; - } - } - - auto JoinBlocks = Propagator.computeJoinPoints( - TermBlock, Term.getParent()->successors(), MachineLoop, PdBoundBlock); - - // AMDGPU change begin. - // Save divergent join pairs. - updateJoinMap(&TermBlock, DivergentJoinMap, Propagator.DefMap, - *JoinBlocks.get()); - // AMDGPU change end. - - auto ItInserted = CachedBranchJoins.emplace(&Term, std::move(JoinBlocks)); - assert(ItInserted.second); - return *ItInserted.first->second; -} - -} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h deleted file mode 100644 index 92059d85b848a..0000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h +++ /dev/null @@ -1,101 +0,0 @@ -//===- MirSyncDependenceAnalysis.h - MirDivergent Branch Dependence -*- C++ -//-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// \file -// This file defines the SyncDependenceAnalysis class, which computes for -// every divergent branch the set of phi nodes that the branch will make -// divergent. -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFunction.h" -#include -#include - -namespace llvm { -class MachineBasicBlock; -class MachineDominatorTree; -class MachineLoop; -class MachinePostDominatorTree; -class MachineLoopInfo; -class MachineFunction; -class MachineInstr; - -using DivergentJoinMapTy = - llvm::DenseMap>; - -using ConstBlockSet = llvm::SmallPtrSet; - -/// \brief Relates points of divergent control to join points in -/// reducible CFGs. 
-/// -/// This analysis relates points of divergent control to points of converging -/// divergent control. The analysis requires all loops to be reducible. -class SyncDependenceAnalysis { - void visitSuccessor(const MachineBasicBlock &succBlock, - const MachineLoop *termLoop, - const MachineBasicBlock *defBlock); - -public: - bool inRegion(const MachineBasicBlock &BB) const; - - ~SyncDependenceAnalysis(); - SyncDependenceAnalysis(const MachineDominatorTree &DT, - const MachinePostDominatorTree &PDT, - const MachineLoopInfo &LI, - // AMDGPU change begin - DivergentJoinMapTy &JoinMap - // AMDGPU change end - ); - - /// \brief Computes divergent join points and loop exits caused by branch - /// divergence in \p Term. - /// - /// The set of blocks which are reachable by disjoint paths from \p Term. - /// The set also contains loop exits if there two disjoint paths: - /// one from \p Term to the loop exit and another from \p Term to the loop - /// header. Those exit blocks are added to the returned set. - /// If L is the parent loop of \p Term and an exit of L is in the returned - /// set then L is a divergent loop. - const ConstBlockSet &join_blocks(const MachineInstr &Term); - - /// \brief Computes divergent join points and loop exits (in the surrounding - /// loop) caused by the divergent loop exits of\p MachineLoop. - /// - /// The set of blocks which are reachable by disjoint paths from the - /// loop exits of \p MachineLoop. - /// This treats the loop as a single node in \p MachineLoop's parent loop. - /// The returned set has the same properties as for join_blocks(TermInst&). - const ConstBlockSet &join_blocks(const MachineLoop &MachineLoop); - -private: - static ConstBlockSet EmptyBlockSet; - - llvm::ReversePostOrderTraversal FuncRPOT; - const MachineDominatorTree &DT; - const MachinePostDominatorTree &PDT; - const MachineLoopInfo &LI; - // AMDGPU change begin. - DivergentJoinMapTy &DivergentJoinMap; - // AMDGPU change end. - std::map> - CachedLoopExitJoins; - std::map> - CachedBranchJoins; -}; - -} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index f089b210c8849..eac9b57dc9973 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -83,8 +83,6 @@ add_llvm_target(AMDGPUCodeGen AMDGPUMCInstLower.cpp AMDGPUMemoryUtils.cpp AMDGPUMIRUtils.cpp - AMDGPUMirDivergenceAnalysis.cpp - AMDGPUMirSyncDependenceAnalysis.cpp AMDGPUIGroupLP.cpp AMDGPUMCResourceInfo.cpp AMDGPUMarkLastScratchLoad.cpp From 6b011fbeac70b6f6e7473cf77109ac627fdda811 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Wed, 12 Mar 2025 13:14:10 -0700 Subject: [PATCH 11/25] Clang format and warnings. --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 34 +++++++------------ 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index e508ed2a6e2cd..591cfef570d74 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -339,7 +339,6 @@ unsigned CollectFnPressure(MachineFunction &MF, LiveIntervals *LIS, if (!LIS->hasInterval(Reg)) continue; - LaneBitmask LiveMask; const auto &LI = LIS->getInterval(Reg); // Skip local live interval to make live input/ouput faster. 
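// Illustrative sketch: a minimal, hypothetical driver for the sync-dependence
// interface declared in the AMDGPUMirSyncDependenceAnalysis.h shown above.
// It assumes the machine dominator tree, post-dominator tree, and loop info
// are already available from the pass manager; the function name and the
// dbgs() printing are illustrative only. The SyncDependenceAnalysis
// constructor, DivergentJoinMapTy, and join_blocks() are the pieces taken
// from that header; everything else is an assumption, not part of the patch.
#include "AMDGPUMirSyncDependenceAnalysis.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

static void dumpDivergentJoins(MachineFunction &MF,
                               const MachineDominatorTree &DT,
                               const MachinePostDominatorTree &PDT,
                               const MachineLoopInfo &LI) {
  DivergentJoinMapTy JoinMap;
  SyncDependenceAnalysis SDA(DT, PDT, LI, JoinMap);
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &Term : MBB.terminators()) {
      // join_blocks(Term) returns the blocks reachable from Term by two
      // disjoint paths, i.e. the blocks whose phi nodes become divergent if
      // Term is a divergent branch. As a side effect, the analysis records
      // divergent-join predecessor pairs in JoinMap.
      for (const MachineBasicBlock *Join : SDA.join_blocks(Term))
        dbgs() << "MBB" << MBB.getNumber() << " joins at MBB"
               << Join->getNumber() << "\n";
    }
  }
}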
@@ -506,15 +505,11 @@ struct RematNode { Clone, }; RematNode() - : Reg(0), DefMI(nullptr), Kind(RematKind::Candidate), - InsertPointMI(nullptr), InsertBlock(nullptr), Size(0) {} + : Reg(0), DefMI(nullptr), InsertBlock(nullptr), InsertPointMI(nullptr), + Kind(RematKind::Candidate), Size(0) {} RematNode(unsigned R, MachineInstr *MI, unsigned S) - : Reg(R), DefMI(MI), Kind(RematKind::Candidate), InsertPointMI(nullptr), - InsertBlock(nullptr), Size(S) {} - RematNode(const RematNode &N) - : Reg(N.Reg), DefMI(N.DefMI), Kind(N.Kind), - InsertPointMI(N.InsertPointMI), InsertBlock(N.InsertBlock), - Size(N.Size) {} + : Reg(R), DefMI(MI), InsertBlock(nullptr), InsertPointMI(nullptr), + Kind(RematKind::Candidate), Size(S) {} unsigned Reg; MachineInstr *DefMI; MachineBasicBlock *InsertBlock; @@ -528,10 +523,10 @@ struct RematNode { struct BlockLiveInfo { MachineBasicBlock *BB; - unsigned maxSReg; - unsigned maxVReg; + unsigned MaxSReg; + unsigned MaxVReg; // Input live is the live reg which cross block. - const GCNRPTracker::LiveRegSet inputLive; + const GCNRPTracker::LiveRegSet InputLive; }; // Skip live reg remated to other block. @@ -893,7 +888,7 @@ void AddCloneCandidate(std::vector &cloneList, // Group user in same blocks. std::vector UserSetList(cloneList.size()); - for (int i = 0; i < cloneList.size(); i++) { + for (size_t i = 0; i < cloneList.size(); i++) { auto *Node = cloneList[i]; unsigned Reg = Node->Reg; MachineInstr *DefMI = Node->DefMI; @@ -1010,7 +1005,7 @@ DenseMap reduceClonedMBBs( // Collect hot blocks which Exp is live in. DenseSet hotBlockSet; for (BlockLiveInfo &hotBlock : hotBlocks) { - if (hotBlock.inputLive.count(Reg)) { + if (hotBlock.InputLive.count(Reg)) { hotBlockSet.insert(hotBlock.BB); } } @@ -1411,7 +1406,7 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, // entry block. if (MBB != EntryMBB) hotBlocks.emplace_back(LiveInfo); - GCNRPTracker::LiveRegSet CandidateRegs = LiveInfo.inputLive; + GCNRPTracker::LiveRegSet CandidateRegs = LiveInfo.InputLive; // Update reg pressure based on remat list. InstSet VReducedInsts; @@ -1552,7 +1547,7 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, } // TODO: what to do when cannot reach target? if (newRematSCnt > 0) { - if (newRematSCnt <= NearTargetRegLimit) { + if ((unsigned)newRematSCnt <= NearTargetRegLimit) { bNearTarget = true; } else { if (!bSGPRSpill) @@ -2838,7 +2833,7 @@ collectUniformVgprs(Remat *Remat, MachineFunction &MF, MachineRegisterInfo &MRI, continue; unsigned dstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst); - if (dstIdx == -1) + if (dstIdx == (unsigned)-1) continue; MachineOperand &DstMO = MI.getOperand(dstIdx); if (DstMO.getSubReg() != 0) @@ -2899,8 +2894,6 @@ bool collectVToSCrossHotSpot( } // Try to make all possible vtos to reduce vpressure. 
- int VExtra = VPressure - VLimit; - const GCNRPTracker::LiveRegSet &CurLives = Tracker.getLiveRegs(); for (auto it : CurLives) { unsigned Reg = it.first; @@ -2908,7 +2901,6 @@ bool collectVToSCrossHotSpot( if (UniformIt == UniformMap.end()) continue; VToSMap[UniformIt->first] = UniformIt->second; - VExtra--; bUpdated = true; } } @@ -4252,7 +4244,7 @@ bool perBlockPassthruRemat(Remat *Remat, std::vector &hotBlocks, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { bool bUpdated = false; - bool bCanClone = EnableSubExpClone | EnableSubExpAggressive; + bool bCanClone = EnableSubExpClone || EnableSubExpAggressive; SlotIndexes *slotIndexes = LIS->getSlotIndexes(); // Sort hot blocks by pressure first. From eb4f8c19817be23c7e05f62b02c3b0320b840eb7 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Thu, 13 Mar 2025 13:29:19 -0700 Subject: [PATCH 12/25] First batch of formatting changes --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 1169 ++++++++--------- llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp | 60 +- llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h | 26 +- 3 files changed, 618 insertions(+), 637 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index 591cfef570d74..ed7093f85823d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -71,7 +71,7 @@ class AMDGPUHotBlockRematerialize : public MachineFunctionPass { DenseSet TotalUniformInsts; DenseSet SafeToRemoveInsts; DenseSet DivergentInsts; - void RemoveInst(const MachineInstr *MI) { + void removeInst(const MachineInstr *MI) { TotalUniformInsts.erase(MI); SafeToRemoveInsts.erase(MI); DivergentInsts.erase(MI); @@ -102,8 +102,8 @@ typedef AMDGPUHotBlockRematerialize Remat; // Util functions. namespace { -MachineBasicBlock *nearest_common_dominator(MachineDominatorTree *DT, - BlockSet &Blocks) { +MachineBasicBlock *NearestCommonDominator(MachineDominatorTree *DT, + BlockSet &Blocks) { auto I = Blocks.begin(), E = Blocks.end(); MachineBasicBlock *DomB = cast(*(I++)); @@ -150,9 +150,9 @@ MachineBasicBlock *nearest_common_dominator(MachineDominatorTree *DT, return DomB; } -MachineBasicBlock *find_non_loop_dominator(MachineBasicBlock *BB, - MachineDominatorTree *DT, - MachineLoopInfo *LI) { +MachineBasicBlock *findNonLoopDominator(MachineBasicBlock *BB, + MachineDominatorTree *DT, + MachineLoopInfo *LI) { while (LI->getLoopDepth(BB) > 0) { MachineDomTreeNode *N = DT->getNode(BB); if (N == nullptr) @@ -168,9 +168,9 @@ MachineBasicBlock *find_non_loop_dominator(MachineBasicBlock *BB, } MachineBasicBlock * -FindInsertBlock(MachineInstr &DefMI, unsigned Reg, MachineDominatorTree *DT, +findInsertBlock(MachineInstr &DefMI, unsigned Reg, MachineDominatorTree *DT, MachinePostDominatorTree *PDT, MachineLoopInfo *MLI, - const MachineRegisterInfo &MRI, bool bMemBound) { + const MachineRegisterInfo &MRI, bool MemBound) { BlockSet BBSet; for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { @@ -181,14 +181,14 @@ FindInsertBlock(MachineInstr &DefMI, unsigned Reg, MachineDominatorTree *DT, MachineBasicBlock *BB = *BBSet.begin(); if (BBSet.size() > 1) { - MachineBasicBlock *BDom = nearest_common_dominator(DT, BBSet); + MachineBasicBlock *BDom = NearestCommonDominator(DT, BBSet); if (!BDom) return nullptr; BB = BDom; } // Try to find non loop dominator. 
- if (!bMemBound) { - BB = find_non_loop_dominator(BB, DT, MLI); + if (!MemBound) { + BB = findNonLoopDominator(BB, DT, MLI); } if (!BB) return nullptr; @@ -204,7 +204,7 @@ FindInsertBlock(MachineInstr &DefMI, unsigned Reg, MachineDominatorTree *DT, } // Maybe expensive to be called all over the place -bool IsUsedByPhi(MachineInstr *DefMI, MachineRegisterInfo &MRI) { +bool isUsedByPhi(MachineInstr *DefMI, MachineRegisterInfo &MRI) { for (auto &Def : DefMI->defs()) { for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Def.getReg())) { if (UseMI.isPHI()) @@ -214,9 +214,9 @@ bool IsUsedByPhi(MachineInstr *DefMI, MachineRegisterInfo &MRI) { return false; } -bool IsSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) { +bool isSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) { // Do not move PHI nodes - if (IsUsedByPhi(DefMI, MRI)) + if (isUsedByPhi(DefMI, MRI)) return false; unsigned OpNum = DefMI->getNumOperands(); @@ -235,18 +235,18 @@ bool IsSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) { // SGPR has alignment requirment, cannot get accurate reg number. const unsigned NearTargetRegLimit = 10; -bool nearSgprSpill(unsigned maxSPressure, const GCNSubtarget *ST, +bool nearSgprSpill(unsigned MaxSPressure, const GCNSubtarget *ST, MachineFunction &MF) { - unsigned maxSGPR = ST->getAddressableNumSGPRs(); + unsigned MaxSGPR = ST->getAddressableNumSGPRs(); const SIMachineFunctionInfo *MFI = MF.getInfo(); - unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); + Register ScratchRSrcReg = MFI->getScratchRSrcReg(); if (ScratchRSrcReg) - maxSGPR -= 4; + MaxSGPR -= 4; const unsigned AlignmentDelta = 3; - maxSGPR -= AlignmentDelta; + MaxSGPR -= AlignmentDelta; - return maxSPressure > maxSGPR; + return MaxSPressure > MaxSGPR; } struct RematStatus { @@ -258,9 +258,9 @@ struct RematStatus { unsigned InputPhysicalVPressure; unsigned InputPhysicalSPressure; // More occupancy can help more than latency cost to reach it. - bool bMemBound; + bool MemBound; // abs(VTargetOcc-STargetOcc) > 1. 
- bool bNotBalance; + bool NotBalance; DenseMap MBBPressureMap; DenseMap MBBInputLiveMap; DenseMap MBBOutputLiveMap; @@ -270,10 +270,9 @@ struct RematStatus { DenseSet MemWriteMBBSet; }; -unsigned CollectMBBPressure(MachineBasicBlock &MBB, LiveIntervals *LIS, - const MachineRegisterInfo &MRI, - const GCNSubtarget *ST, unsigned &maxVPressure, - unsigned &maxSPressure, RematStatus &status) { +unsigned collectMBBPressure(MachineBasicBlock &MBB, LiveIntervals *LIS, + const GCNSubtarget *ST, unsigned &MaxVPressure, + unsigned &MaxSPressure, RematStatus &Status) { // Skip processing current block if it has only debug instructions if (MBB.getFirstNonDebugInstr() == MBB.end()) return ST->getOccupancyWithNumVGPRs(0); @@ -284,32 +283,32 @@ unsigned CollectMBBPressure(MachineBasicBlock &MBB, LiveIntervals *LIS, if (!llvm::GetNonDebugMBBEnd(BBEnd, MBB)) return ST->getOccupancyWithNumVGPRs(0); - GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[&MBB]; - RPTracker.reset(*BBEnd, &outputLive, true); + GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[&MBB]; + RPTracker.reset(*BBEnd, &OutputLive, true); for (auto I = MBB.rbegin(), B = MBB.rend(); I != B;) { MachineInstr &MI = (*I++); RPTracker.recede(MI); if (MI.mayStore() || (MI.isBarrier() && MI.getOpcode() != AMDGPU::S_BRANCH)) - status.MemWriteMBBSet.insert(&MBB); + Status.MemWriteMBBSet.insert(&MBB); } GCNRegPressure RP = RPTracker.getMaxPressureAndReset(); - unsigned sPressure = RP.getMaxSGPR(); - if (sPressure > maxSPressure) { - maxSPressure = sPressure; + unsigned SPressure = RP.getMaxSGPR(); + if (SPressure > MaxSPressure) { + MaxSPressure = SPressure; } - if (RP.getVGPRNum(ST->hasGFX90AInsts()) > maxVPressure) { - maxVPressure = RP.getVGPRNum(ST->hasGFX90AInsts()); + if (RP.getVGPRNum(ST->hasGFX90AInsts()) > MaxVPressure) { + MaxVPressure = RP.getVGPRNum(ST->hasGFX90AInsts()); } - status.MBBPressureMap[&MBB] = RP; + Status.MBBPressureMap[&MBB] = RP; return RP.getOccupancy(*ST); } -unsigned CollectFnPressure(MachineFunction &MF, LiveIntervals *LIS, +unsigned collectFnPressure(MachineFunction &MF, LiveIntervals *LIS, const MachineRegisterInfo &MRI, - const GCNSubtarget *ST, unsigned &maxVPressure, - unsigned &maxSPressure, RematStatus &status) { + const GCNSubtarget *ST, unsigned &MaxVPressure, + unsigned &MaxSPressure, RematStatus &Status) { unsigned TgtOcc = ST->getOccupancyWithWorkGroupSizes(MF).second; // If only have one block, input/ouput virtual live set are empty. 
if (MF.size() > 1) { @@ -345,22 +344,22 @@ unsigned CollectFnPressure(MachineFunction &MF, LiveIntervals *LIS, if (llvm::isLocalLiveInterval(LI, SlotIndexes)) continue; - for (auto inputIt : MBBInputSlotMap) { - MachineBasicBlock *MBB = inputIt.first; - auto SI = inputIt.second; + for (auto InputIt : MBBInputSlotMap) { + MachineBasicBlock *MBB = InputIt.first; + auto SI = InputIt.second; auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI); if (LiveMask.any()) - status.MBBInputLiveMap[MBB][Reg] |= LiveMask; + Status.MBBInputLiveMap[MBB][Reg] |= LiveMask; } - for (auto outputIt : MBBOutputSlotMap) { - MachineBasicBlock *MBB = outputIt.first; - auto SI = outputIt.second; + for (auto OutputIt : MBBOutputSlotMap) { + MachineBasicBlock *MBB = OutputIt.first; + auto SI = OutputIt.second; auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI); if (LiveMask.any()) - status.MBBOutputLiveMap[MBB][Reg] |= LiveMask; + Status.MBBOutputLiveMap[MBB][Reg] |= LiveMask; } } } @@ -368,70 +367,70 @@ unsigned CollectFnPressure(MachineFunction &MF, LiveIntervals *LIS, LLVM_DEBUG( const SIRegisterInfo *SIRI = ST->getRegisterInfo(); dbgs() << "output live"; for (auto &it - : status.MBBOutputLiveMap) { + : Status.MBBOutputLiveMap) { unsigned Idx = it.first->getNumber(); auto LiveReg = it.second; dbgs() << "MBB" << Idx << ":"; llvm::dumpLiveSet(LiveReg, SIRI); } dbgs() << "input live"; for (auto &it - : status.MBBInputLiveMap) { + : Status.MBBInputLiveMap) { unsigned Idx = it.first->getNumber(); auto LiveReg = it.second; dbgs() << "MBB" << Idx << ":"; llvm::dumpLiveSet(LiveReg, SIRI); }); - for (auto it = MF.begin(); it != MF.end(); ++it) { - MachineBasicBlock &MBB = *it; - unsigned Occ = CollectMBBPressure(MBB, LIS, MRI, ST, maxVPressure, - maxSPressure, status); + for (auto It = MF.begin(); It != MF.end(); ++It) { + MachineBasicBlock &MBB = *It; + unsigned Occ = + collectMBBPressure(MBB, LIS, ST, MaxVPressure, MaxSPressure, Status); if (TgtOcc > Occ) TgtOcc = Occ; } return TgtOcc; } -RematStatus GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, +RematStatus getRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS, const MachineRegisterInfo &MRI, const GCNSubtarget *ST) { - unsigned maxSPressure = 0; - unsigned maxVPressure = 0; - RematStatus status; + unsigned MaxSPressure = 0; + unsigned MaxVPressure = 0; + RematStatus Status; unsigned TgtOcc = - CollectFnPressure(MF, LIS, MRI, ST, maxVPressure, maxSPressure, status); + collectFnPressure(MF, LIS, MRI, ST, MaxVPressure, MaxSPressure, Status); const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second; if (TgtOcc >= MaxOcc) { - status.TargetOcc = TgtOcc; - status.TargetVLimit = 0; - status.TargetSLimit = 0; - status.MaxVPressure = 0; - status.MaxSPressure = 0; - status.InputPhysicalVPressure = 0; - status.InputPhysicalSPressure = 0; - status.bMemBound = false; - status.bNotBalance = false; - return status; + Status.TargetOcc = TgtOcc; + Status.TargetVLimit = 0; + Status.TargetSLimit = 0; + Status.MaxVPressure = 0; + Status.MaxSPressure = 0; + Status.InputPhysicalVPressure = 0; + Status.InputPhysicalSPressure = 0; + Status.MemBound = false; + Status.NotBalance = false; + return Status; } - maxSPressure += RegForVCC; - maxVPressure = std::min(maxVPressure, ST->getMaxNumVGPRs(MF)); - unsigned STgtOcc = ST->getOccupancyWithNumSGPRs(maxSPressure); - unsigned VTgtOcc = ST->getOccupancyWithNumVGPRs(maxVPressure); + MaxSPressure += RegForVCC; + MaxVPressure = std::min(MaxVPressure, ST->getMaxNumVGPRs(MF)); + unsigned STgtOcc = 
ST->getOccupancyWithNumSGPRs(MaxSPressure); + unsigned VTgtOcc = ST->getOccupancyWithNumVGPRs(MaxVPressure); - llvm::SchedScore totalScore = llvm::CollectLatency(MF, *ST, MLI); - bool bMemBound = - totalScore.isMemBound(TgtOcc, std::max(STgtOcc, VTgtOcc) - TgtOcc); + llvm::SchedScore TotalScore = llvm::CollectLatency(MF, *ST, MLI); + bool MemBound = + TotalScore.isMemBound(TgtOcc, std::max(STgtOcc, VTgtOcc) - TgtOcc); - bool bNotBalance = false; + bool NotBalance = false; const unsigned MaxOccupancy = ST->AMDGPUSubtarget::getMaxWavesPerEU(); // Currently, only sgpr bound can be fixed with remat. if (STgtOcc < VTgtOcc) { - unsigned bigOcc = std::max(STgtOcc, VTgtOcc); - // Change TgtOcc to bigOcc in case sgpr and vgpr is not balance. - if (bigOcc > TgtOcc) { - TgtOcc = bigOcc; - bNotBalance = true; + unsigned BigOcc = std::max(STgtOcc, VTgtOcc); + // Change TgtOcc to BigOcc in case sgpr and vgpr are not balanced. + if (BigOcc > TgtOcc) { + TgtOcc = BigOcc; + NotBalance = true; if (TgtOcc >= MaxOccupancy) TgtOcc = MaxOccupancy - 1; } @@ -440,34 +439,34 @@ RematStatus GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, // Collect input physical pressure. const SIRegisterInfo *SIRI = ST->getRegisterInfo(); - unsigned vInputPressure = 0; - uint64_t sInputMask = 0; - for (const auto &livein : MRI.liveins()) { - const Register Reg = livein.first; + unsigned VInputPressure = 0; + uint64_t SInputMask = 0; + for (const auto &Livein : MRI.liveins()) { + const Register Reg = Livein.first; const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg); assert(Reg.isPhysical() && "input must be physical reg"); unsigned RegSize = RC->getLaneMask().getNumLanes(); if (SIRI->isVGPR(MRI, Reg)) { - vInputPressure += RegSize; + VInputPressure += RegSize; } else { unsigned RegIndex = SIRI->getHWRegIndex(Reg); - uint64_t mask = ((1 << RegSize) - 1) << RegIndex; - sInputMask |= mask; + uint64_t Mask = ((1 << RegSize) - 1) << RegIndex; + SInputMask |= Mask; } } // SGPR need to align to 4 for the 4dowrd/8dword descriptors which cause high // pressure. - unsigned sInputPressure = 0; - uint64_t mask = 0xf; - while (mask != 0) { - if (mask & sInputMask) { - sInputPressure += 4; + unsigned SInputPressure = 0; + uint64_t Mask = 0xf; + while (Mask != 0) { + if (Mask & SInputMask) { + SInputPressure += 4; } - mask = mask << 4; + Mask = Mask << 4; } // If balanced, try next occupancy. - TgtOcc = bNotBalance ? TgtOcc : (TgtOcc + 1); + TgtOcc = NotBalance ?
TgtOcc : (TgtOcc + 1); auto CC = MF.getFunction().getCallingConv(); bool IsPsCs = CC == CallingConv::AMDGPU_CS || CC == CallingConv::AMDGPU_PS; @@ -481,16 +480,16 @@ RematStatus GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, unsigned SLimit = ST->getMaxNumSGPRs(TgtOcc, true); unsigned VLimit = ST->getMaxNumVGPRs(TgtOcc); - status.TargetOcc = TgtOcc; - status.TargetVLimit = VLimit; - status.TargetSLimit = SLimit; - status.MaxVPressure = maxVPressure; - status.MaxSPressure = maxSPressure; - status.InputPhysicalVPressure = vInputPressure; - status.InputPhysicalSPressure = sInputPressure; - status.bMemBound = bMemBound; - status.bNotBalance = bNotBalance; - return status; + Status.TargetOcc = TgtOcc; + Status.TargetVLimit = VLimit; + Status.TargetSLimit = SLimit; + Status.MaxVPressure = MaxVPressure; + Status.MaxSPressure = MaxSPressure; + Status.InputPhysicalVPressure = VInputPressure; + Status.InputPhysicalSPressure = SInputPressure; + Status.MemBound = MemBound; + Status.NotBalance = NotBalance; + return Status; } } // namespace @@ -530,22 +529,22 @@ struct BlockLiveInfo { }; // Skip live reg remated to other block. -void UpdateLiveInfo(MapVector &RematMap, +void updateLiveInfo(MapVector &RematMap, GCNRPTracker::LiveRegSet &LiveSet, - const GCNRPTracker::LiveRegSet &inputLive, + const GCNRPTracker::LiveRegSet &InputLive, MachineBasicBlock *CurBB, DenseMap &RPOTIndexMap) { - for (auto &it : RematMap) { - unsigned Reg = it.first; + for (auto &It : RematMap) { + unsigned Reg = It.first; // Skip reg not in live set. if (!LiveSet.count(Reg)) continue; // Skip reg already in input set. - // Input set will be taken care in GetReducedSize. - if (inputLive.count(Reg)) + // Input set will be taken care in getReducedSize. + if (InputLive.count(Reg)) continue; - auto &Node = it.second; + auto &Node = It.second; if (Node.Kind == RematNode::RematKind::OneDefOneUse) { MachineBasicBlock *InsertBB = Node.InsertBlock; // If LiveInfo.BB is after InsertBB in Reverse post order, the def is @@ -562,7 +561,7 @@ void UpdateLiveInfo(MapVector &RematMap, } } -int GetSharedReducedSize(InstSet &ReducedInsts, bool bVGPR, +int getSharedReducedSize(InstSet &ReducedInsts, bool IsVGPR, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) { @@ -586,8 +585,7 @@ int GetSharedReducedSize(InstSet &ReducedInsts, bool bVGPR, if (!Reg.isVirtual()) continue; - bool isVGPR = SIRI->isVGPR(MRI, MO.getReg()); - if (bVGPR != isVGPR) { + if (IsVGPR != SIRI->isVGPR(MRI, MO.getReg())) { // Not support mix of v and s when remat now. 
continue; } @@ -623,20 +621,19 @@ int GetSharedReducedSize(InstSet &ReducedInsts, bool bVGPR, return SharedSize; } -int GetReducedSize(MapVector &RematMap, bool bVGPR, +int getReducedSize(MapVector &RematMap, GCNRPTracker::LiveRegSet &CanidateSet, InstSet &ReducedInsts, - const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - BlockLiveInfo &LiveInfo, + const MachineRegisterInfo &MRI, BlockLiveInfo &LiveInfo, DenseMap &RPOTIndexMap) { int ReducedSize = 0; - for (auto &it : RematMap) { - unsigned Reg = it.first; + for (auto &It : RematMap) { + Register Reg = It.first; if (!CanidateSet.count(Reg)) continue; - bool bReduced = false; - auto &Node = it.second; + bool IsReduced = false; + auto &Node = It.second; if (Node.Kind == RematNode::RematKind::OneDefOneUse) { MachineBasicBlock *InsertBB = Node.InsertBlock; // If LiveInfo.BB is before InsertBB in Reverse post order, the def is @@ -644,19 +641,19 @@ int GetReducedSize(MapVector &RematMap, bool bVGPR, unsigned LiveBBIndex = RPOTIndexMap[LiveInfo.BB]; unsigned InsertBBIndex = RPOTIndexMap[InsertBB]; if (LiveBBIndex < InsertBBIndex) - bReduced = true; + IsReduced = true; } else { // Clone. - bReduced = true; + IsReduced = true; // If has use in LiveInfo.BB, could not reduce from input live. for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { if (UseMI.getParent() == LiveInfo.BB) { - bReduced = false; + IsReduced = false; break; } } } - if (bReduced) { + if (IsReduced) { ReducedSize += Node.Size; ReducedInsts.insert(Node.DefMI); } @@ -668,11 +665,9 @@ int GetReducedSize(MapVector &RematMap, bool bVGPR, return ReducedSize; } -int RematGain(MachineInstr *DefMI, unsigned Reg, - GCNRPTracker::LiveRegSet &CandidateRegSet, - const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - bool bVGPR) { - int rematSize = SIRI->getRegSizeInBits(*MRI.getRegClass(Reg)); +int rematGain(MachineInstr *DefMI, unsigned Reg, const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, bool IsVGPR) { + int RematSize = SIRI->getRegSizeInBits(*MRI.getRegClass(Reg)); for (MachineOperand &MO : DefMI->operands()) { if (MO.isImm()) continue; @@ -688,32 +683,31 @@ int RematGain(MachineInstr *DefMI, unsigned Reg, // Don't move user of VCC. if (MO.getReg() == AMDGPU::VCC) { - rematSize = 0; + RematSize = 0; break; } Register Reg = MO.getReg(); // Don't move physical register use. if (Reg.isPhysical()) { - rematSize = 0; + RematSize = 0; break; } - bool isVGPR = SIRI->isVGPR(MRI, Reg); - if (bVGPR != isVGPR) { + if (IsVGPR != SIRI->isVGPR(MRI, Reg)) { // Not support mix of v and s when remat now. // TODO: count possible pressure change here. - rematSize = 0; + RematSize = 0; break; } - bool bSingleDef = MRI.hasOneDef(Reg); - if (!bSingleDef) { - bSingleDef = llvm::IsSub0Sub1SingleDef(Reg, MRI); + bool IsSingleDef = MRI.hasOneDef(Reg); + if (!IsSingleDef) { + IsSingleDef = llvm::IsSub0Sub1SingleDef(Reg, MRI); } - if (bSingleDef) { - // The reg might share with other candidates, but not check it here. - // Count share reg in GetReducedSize. + if (IsSingleDef) { + // The reg might share with other candidates, but do not check it here. + // Count share reg in getReducedSize. if (EnableAggressive) { // In case of aggressive remat, treat multi use reg as shared reg and // ignore size of shared reg.
@@ -725,72 +719,71 @@ int RematGain(MachineInstr *DefMI, unsigned Reg, if (OpRC) OpRC = SIRI->getSubRegisterClass(OpRC, SubIdx); } - int inputSize = SIRI->getRegSizeInBits(*OpRC); + int InputSize = SIRI->getRegSizeInBits(*OpRC); // If input not live in hotspot, move it cross hotspot should have // less reg then DefMi. - if (rematSize > inputSize) { - rematSize -= inputSize; + if (RematSize > InputSize) { + RematSize -= InputSize; continue; } } - rematSize = 0; + RematSize = 0; break; } - return rematSize; + return RematSize; } -void BuildRematCandiates(std::vector &Candidates, +void buildRematCandiates(std::vector &Candidates, GCNRPTracker::LiveRegSet &CandidateRegSet, DenseSet &PinnedRegSet, const MachineRegisterInfo &MRI, const SIInstrInfo *SIII, const SIRegisterInfo *SIRI, - bool bVGPR) { + bool IsVGPR) { - for (auto liveRegIt : CandidateRegSet) { - unsigned Reg = liveRegIt.first; + for (auto LiveRegIt : CandidateRegSet) { + unsigned Reg = LiveRegIt.first; // Skip unsafe reg. if (PinnedRegSet.count(Reg)) continue; - bool isVGPR = SIRI->isVGPR(MRI, Reg); - if (isVGPR != bVGPR) + if (SIRI->isVGPR(MRI, Reg) != IsVGPR) continue; - bool bSafeCandidate = true; + bool IsSafeCandidate = true; MachineInstr *MI = MRI.getUniqueVRegDef(Reg); if (MI) { - if (bVGPR) { + if (IsVGPR) { // Only remat valu now. if (!SIII->isVALU(MI->getOpcode()) && MI->getOpcode() != AMDGPU::COPY) - bSafeCandidate = false; + IsSafeCandidate = false; if (MI->getOpcode() == AMDGPU::COPY) { // Make sure src is unique define. if (MI->getOperand(1).isReg() && nullptr == MRI.getUniqueVRegDef(MI->getOperand(1).getReg())) - bSafeCandidate = false; + IsSafeCandidate = false; } else { // Skip convergent valu. if (MI->isConvergent()) - bSafeCandidate = false; + IsSafeCandidate = false; } } // Skip inst has more than 1 def. if (MI->getDesc().NumDefs > 1) - bSafeCandidate = false; + IsSafeCandidate = false; } else { - bSafeCandidate = false; + IsSafeCandidate = false; } - if (bSafeCandidate) { - int gain = RematGain(MI, Reg, CandidateRegSet, MRI, SIRI, bVGPR); - if (gain > 0) { - Candidates.emplace_back(RematNode(Reg, MI, gain >> 5)); + if (IsSafeCandidate) { + int Gain = rematGain(MI, Reg, MRI, SIRI, IsVGPR); + if (Gain > 0) { + Candidates.emplace_back(RematNode(Reg, MI, Gain >> 5)); } else { - bSafeCandidate = false; + IsSafeCandidate = false; } } // Save unsafe reg. 
- if (!bSafeCandidate) + if (!IsSafeCandidate) PinnedRegSet.insert(Reg); } @@ -812,57 +805,57 @@ bool isImplicitDefUse(MachineInstr *DefMI, MachineInstr *UseMI) { return false; auto *TRI = DefMI->getMF()->getSubtarget().getRegisterInfo(); - for (MachineOperand &def : DefMI->implicit_operands()) { - if (!def.isReg()) + for (MachineOperand &Def : DefMI->implicit_operands()) { + if (!Def.isReg()) continue; - if (def.isUse()) + if (Def.isUse()) continue; - unsigned Reg = def.getReg(); + Register Reg = Def.getReg(); if (UseMI->readsRegister(Reg, TRI)) return true; } return false; } -void AddOneDefOneUseCandidate(RematNode &Node, +void addOneDefOneUseCandidate(RematNode &Node, std::vector &RematList, - MachineRegisterInfo &MRI, int &rematCnt, + MachineRegisterInfo &MRI, int &RematCnt, MachineDominatorTree *DT, MachinePostDominatorTree *PDT, - MachineLoopInfo *MLI, bool bVGPR, - bool bMemBound) { + MachineLoopInfo *MLI, bool IsVGPR, + bool MemBound) { unsigned Reg = Node.Reg; MachineInstr *DefMI = Node.DefMI; - unsigned size = Node.Size; + unsigned Size = Node.Size; MachineInstr *UseMI = &*MRI.use_nodbg_instructions(Reg).begin(); MachineBasicBlock *InsertBB = UseMI->getParent(); // For VGPR, always move next to the only user to avoid wqm or exec issue. - // But doing this will cause issue when DefMI is in wqm but single user not in + // But doing this will cause issues when DefMI is in wqm but the single user is not in // wqm. Disable VGPR remat for now. // TODO: make sure single user don't need wqm. - if (!bVGPR) { + if (!IsVGPR) { if (MachineBasicBlock *NewInsertBB = - FindInsertBlock(*DefMI, Reg, DT, PDT, MLI, MRI, bMemBound)) { + findInsertBlock(*DefMI, Reg, DT, PDT, MLI, MRI, MemBound)) { if (InsertBB != NewInsertBB) { InsertBB = NewInsertBB; // If can find a non-loop insert block, go to the insert block. if (DefMI->getParent() != InsertBB) { if (!InsertBB->empty()) { - auto it = InsertBB->getFirstNonPHI(); - it = skipDebugInstructionsForward(it, InsertBB->end()); - if (it == InsertBB->end()) + auto It = InsertBB->getFirstNonPHI(); + It = skipDebugInstructionsForward(It, InsertBB->end()); + if (It == InsertBB->end()) UseMI = nullptr; else - UseMI = &*it; + UseMI = &*It; } } } } } - if (bVGPR) { + if (IsVGPR) { // Don't count reg in same block for valu. if (UseMI->getParent() == DefMI->getParent()) return; @@ -877,27 +870,26 @@ void AddOneDefOneUseCandidate(RematNode &Node, Node.InsertPointMI = UseMI; Node.Kind = RematNode::RematKind::OneDefOneUse; RematList.emplace_back(Node); - rematCnt += size; + RematCnt += Size; } -void AddCloneCandidate(std::vector &cloneList, +void addCloneCandidate(std::vector &CloneList, std::vector &RematList, DenseSet &PinnedRegSet, - MachineRegisterInfo &MRI, int &rematCnt, - SlotIndexes *SlotIndexes, MachineFunction &MF) { + MachineRegisterInfo &MRI, int &RematCnt) { // Group user in same blocks. - std::vector UserSetList(cloneList.size()); + std::vector UserSetList(CloneList.size()); - for (size_t i = 0; i < cloneList.size(); i++) { - auto *Node = cloneList[i]; + for (size_t i = 0; i < CloneList.size(); i++) { + auto *Node = CloneList[i]; unsigned Reg = Node->Reg; MachineInstr *DefMI = Node->DefMI; // Group user in same blocks.
BlockSet &UserSet = UserSetList[i]; - for (auto useIt = MRI.use_instr_nodbg_begin(Reg); - useIt != MRI.use_instr_nodbg_end();) { - MachineInstr &UseMI = *(useIt++); + for (auto UseIt = MRI.use_instr_nodbg_begin(Reg); + UseIt != MRI.use_instr_nodbg_end();) { + MachineInstr &UseMI = *(UseIt++); UserSet.insert(UseMI.getParent()); } @@ -912,36 +904,34 @@ void AddCloneCandidate(std::vector &cloneList, } } - int size = Node->Size; - size <<= 16; + int Size = Node->Size; + Size <<= 16; // Pack userSet size to size. - size |= UserSet.size(); - Node->UserCount = size; + Size |= UserSet.size(); + Node->UserCount = Size; } - std::sort(cloneList.begin(), cloneList.end(), + std::sort(CloneList.begin(), CloneList.end(), // Sort based on userSet size. - [](const RematNode *a, const RematNode *b) { - static constexpr int mask = 0xffff; - return (a->UserCount & mask) < (b->UserCount & mask); + [](const RematNode *A, const RematNode *B) { + static constexpr int Mask = 0xffff; + return (A->UserCount & Mask) < (B->UserCount & Mask); }); - for (RematNode *Node : cloneList) { + for (RematNode *Node : CloneList) { Node->Kind = RematNode::RematKind::Clone; RematList.emplace_back(*Node); - rematCnt += Node->Size; + RematCnt += Node->Size; } } -int FilterRematCandiates(std::vector &Candidates, +int filterRematCandiates(std::vector &Candidates, std::vector &RematList, DenseSet &PinnedRegSet, MachineDominatorTree *DT, MachinePostDominatorTree *PDT, MachineLoopInfo *MLI, - MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - MachineFunction &MF, SlotIndexes *SlotIndexes, - bool bVGPR, bool bMemBound) { - int rematCnt = 0; + MachineRegisterInfo &MRI, bool IsVGPR, bool MemBound) { + int RematCnt = 0; // Work one def one use first. for (auto &Node : Candidates) { unsigned Reg = Node.Reg; @@ -949,17 +939,17 @@ int FilterRematCandiates(std::vector &Candidates, continue; } MachineInstr *DefMI = Node.DefMI; - if (!IsSafeToMove(DefMI, MRI)) { + if (!isSafeToMove(DefMI, MRI)) { PinnedRegSet.insert(Reg); continue; } - AddOneDefOneUseCandidate(Node, RematList, MRI, rematCnt, DT, PDT, MLI, - bVGPR, bMemBound); + addOneDefOneUseCandidate(Node, RematList, MRI, RematCnt, DT, PDT, MLI, + IsVGPR, MemBound); } - if (!bVGPR) { - std::vector cloneList; + if (!IsVGPR) { + std::vector CloneList; // Try multi use case. for (auto &Node : Candidates) { unsigned Reg = Node.Reg; @@ -967,23 +957,22 @@ int FilterRematCandiates(std::vector &Candidates, continue; } MachineInstr *DefMI = Node.DefMI; - if (!IsSafeToMove(DefMI, MRI)) { + if (!isSafeToMove(DefMI, MRI)) { PinnedRegSet.insert(Reg); continue; } // Clone for each user. 
- cloneList.emplace_back(&Node); + CloneList.emplace_back(&Node); } - AddCloneCandidate(cloneList, RematList, PinnedRegSet, MRI, rematCnt, - SlotIndexes, MF); + addCloneCandidate(CloneList, RematList, PinnedRegSet, MRI, RematCnt); } - return rematCnt; + return RematCnt; } -void updateUsers(unsigned Reg, unsigned NewReg, bool bSubRegDef, +void updateUsers(unsigned Reg, unsigned NewReg, bool IsSubRegDef, SmallVector &userMIs) { for (MachineInstr *UseMI : userMIs) { for (MachineOperand &MO : UseMI->operands()) { @@ -991,7 +980,7 @@ void updateUsers(unsigned Reg, unsigned NewReg, bool bSubRegDef, continue; if (MO.getReg() == Reg) { MO.setReg(NewReg); - if (bSubRegDef) + if (IsSubRegDef) MO.setSubReg(0); } } @@ -1001,7 +990,7 @@ void updateUsers(unsigned Reg, unsigned NewReg, bool bSubRegDef, DenseMap reduceClonedMBBs( unsigned Reg, BlockMap> &userBlocks, DenseSet &UserMBBSet, - std::vector &hotBlocks, MachineDominatorTree *pDT) { + std::vector &hotBlocks, MachineDominatorTree *DT) { // Collect hot blocks which Exp is live in. DenseSet hotBlockSet; for (BlockLiveInfo &hotBlock : hotBlocks) { @@ -1020,22 +1009,22 @@ DenseMap reduceClonedMBBs( if (hotBlockSet.count(MBB)) continue; - bool bDomAllHotBlocks = true; - bool bDomedByAllHotBlocks = true; + bool IsDomAllHotBlocks = true; + bool IsDomedByAllHotBlocks = true; for (MachineBasicBlock *hotMBB : hotBlockSet) { - if (!pDT->dominates(MBB, hotMBB)) { - bDomAllHotBlocks = false; + if (!DT->dominates(MBB, hotMBB)) { + IsDomAllHotBlocks = false; } - if (!pDT->dominates(hotMBB, MBB)) { - bDomedByAllHotBlocks = false; + if (!DT->dominates(hotMBB, MBB)) { + IsDomedByAllHotBlocks = false; } - if (!bDomAllHotBlocks && !bDomedByAllHotBlocks) { + if (!IsDomAllHotBlocks && !IsDomedByAllHotBlocks) { break; } } - if (bDomAllHotBlocks) { + if (IsDomAllHotBlocks) { userBlocks.erase(MBB); - } else if (bDomedByAllHotBlocks) { + } else if (IsDomedByAllHotBlocks) { afterHotRangeMBBs.insert(MBB); } } @@ -1049,7 +1038,7 @@ DenseMap reduceClonedMBBs( MachineBasicBlock *MBB2 = it2; if (MBB == MBB2) continue; - if (pDT->dominates(MBB, MBB2)) { + if (DT->dominates(MBB, MBB2)) { auto &Dom = DomMap[MBB]; Dom.insert(MBB2); auto &Dom2 = DomMap[MBB2]; @@ -1113,7 +1102,7 @@ static bool WillSmashSccAtLocation(MachineInstr *MI, MachineBasicBlock *MBB, void ApplyCloneRemat(Remat *Remat, RematNode &Node, std::vector &hotBlocks, - MachineDominatorTree *pDT, MachineRegisterInfo &MRI, + MachineDominatorTree *DT, MachineRegisterInfo &MRI, SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, MachineFunction &MF) { unsigned Reg = Node.Reg; @@ -1123,10 +1112,10 @@ void ApplyCloneRemat(Remat *Remat, RematNode &Node, const MCInstrDesc &Desc = DefMI->getDesc(); const TargetRegisterClass *RC = MRI.getRegClass(Reg); // When the unique def has subReg, just create newReg for the subReg part. 
- bool bSubRegDef = false; + bool IsSubRegDef = false; if (DefOp.getSubReg() != 0) { RC = SIRI->getSubRegisterClass(RC, DefOp.getSubReg()); - bSubRegDef = true; + IsSubRegDef = true; } const DebugLoc DL = DefMI->getDebugLoc(); unsigned OpNum = DefMI->getNumOperands(); @@ -1144,7 +1133,7 @@ void ApplyCloneRemat(Remat *Remat, RematNode &Node, } DenseMap DomMap = - reduceClonedMBBs(Reg, UserMap, UserMBBSet, hotBlocks, pDT); + reduceClonedMBBs(Reg, UserMap, UserMBBSet, hotBlocks, DT); for (auto useIt : UserMap) { MachineBasicBlock *MBB = useIt.first; @@ -1185,14 +1174,14 @@ void ApplyCloneRemat(Remat *Remat, RematNode &Node, SlotIndexes->insertMachineInstrInMaps(*NewDef); SmallVector &userMIs = useIt.second; - updateUsers(Reg, NewReg, bSubRegDef, userMIs); + updateUsers(Reg, NewReg, IsSubRegDef, userMIs); // update users in dom MBBs. auto domMapIt = DomMap.find(MBB); if (domMapIt != DomMap.end()) { for (MachineBasicBlock *UpdateMBB : domMapIt->second) { SmallVector &userMIs = UserMap[UpdateMBB]; - updateUsers(Reg, NewReg, bSubRegDef, userMIs); + updateUsers(Reg, NewReg, IsSubRegDef, userMIs); } } @@ -1200,7 +1189,7 @@ void ApplyCloneRemat(Remat *Remat, RematNode &Node, } if (MRI.use_empty(Reg)) { SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI); - Remat->RemoveInst(DefMI); + Remat->removeInst(DefMI); DefMI->eraseFromParent(); } } @@ -1235,9 +1224,9 @@ void ApplyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI, slotIndexes->insertMachineInstrInMaps(*DefMI); } -void ApplyRemat(Remat *Remat, MapVector &RematMap, +void ApplyRemat(Remat *Remat, MapVector &RematMap, std::vector &hotBlocks, - MachineDominatorTree *pDT, SlotIndexes *slotIndexes, + MachineDominatorTree *DT, SlotIndexes *slotIndexes, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, MachineFunction &MF) { std::vector UpdateList; @@ -1257,13 +1246,13 @@ void ApplyRemat(Remat *Remat, MapVector &RematMap, if (Node.Kind == RematNode::RematKind::OneDefOneUse) { ApplyOneDefOneUseRemat(Node, MRI, slotIndexes, SIRI, SIII); } else if (Node.Kind == RematNode::RematKind::Clone) { - ApplyCloneRemat(Remat, Node, hotBlocks, pDT, MRI, slotIndexes, SIRI, SIII, + ApplyCloneRemat(Remat, Node, hotBlocks, DT, MRI, slotIndexes, SIRI, SIII, MF); } } } -void dumpRematMap(MapVector &RematMap, +void dumpRematMap(MapVector &RematMap, const SIRegisterInfo *SIRI) { dbgs() << "\n rematMap: \n"; for (auto it : RematMap) { @@ -1276,8 +1265,8 @@ void dumpRematMap(MapVector &RematMap, int DebugBlockIndex = 42; void dumpHotBlock(const GCNRPTracker::LiveRegSet &LiveSet, - MapVector &VRematMap, - MapVector &SRematMap, int BlockIndex, + MapVector &VRematMap, + MapVector &SRematMap, int BlockIndex, const SIRegisterInfo *SIRI) { if (DebugBlockIndex != BlockIndex) return; @@ -1303,8 +1292,8 @@ void dumpCandidates(std::vector &RematCandidates, int BlockIndex, } // namespace bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, - LiveIntervals *LIS, MachineDominatorTree *pDT, - MachinePostDominatorTree *pPDT, bool &bNearTarget) { + LiveIntervals *LIS, MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, bool &IsNearTarget) { const GCNSubtarget *ST = &MF.getSubtarget(); const SIInstrInfo *SIII = ST->getInstrInfo(); @@ -1318,8 +1307,8 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, auto &MRI = MF.getRegInfo(); - bool bUpdated = false; - RematStatus status = GetRematStatus(MF, MLI, LIS, MRI, ST); + bool IsUpdated = false; + RematStatus status = getRematStatus(MF, MLI, LIS, MRI, 
ST); const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second; if (status.TargetOcc >= MaxOcc) @@ -1333,16 +1322,16 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, if (EnableAggressive) rematSCnt += NearTargetRegLimit; - bool bSGPRSpill = false; + bool IsSGPRSpill = false; if (rematSCnt > 0) { - bSGPRSpill = nearSgprSpill(status.MaxSPressure, ST, MF); + IsSGPRSpill = nearSgprSpill(status.MaxSPressure, ST, MF); } - bool bForceRematSgpr = bSGPRSpill | status.bNotBalance; + bool IsForceRematSgpr = IsSGPRSpill | status.NotBalance; // If bound by lds, skip. if (status.TargetOcc > ST->getOccupancyWithWorkGroupSizes(MF).second && - !bForceRematSgpr) + !IsForceRematSgpr) return false; MachineBasicBlock *EntryMBB = &MF.front(); @@ -1350,8 +1339,8 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, auto *SlotIndexes = LIS->getSlotIndexes(); // Reg which already marked remat. - MapVector VRematMap; - MapVector SRematMap; + MapVector VRematMap; + MapVector SRematMap; // Reg which cannot move around to remat. DenseSet PinnedRegSet; std::vector hotBlocks; @@ -1382,8 +1371,8 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, Tracker.advance(); auto LISLR = Tracker.getLiveRegs(); // Update live set for things already remated. - UpdateLiveInfo(VRematMap, LISLR, inputLive, MBB, RPOTIndexMap); - UpdateLiveInfo(SRematMap, LISLR, inputLive, MBB, RPOTIndexMap); + updateLiveInfo(VRematMap, LISLR, inputLive, MBB, RPOTIndexMap); + updateLiveInfo(SRematMap, LISLR, inputLive, MBB, RPOTIndexMap); const GCNRPTracker::LiveRegSet &liveSet = LISLR; unsigned VPressure = 0; @@ -1411,38 +1400,35 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, // Update reg pressure based on remat list. InstSet VReducedInsts; InstSet SReducedInsts; - int VReduced = - GetReducedSize(VRematMap, /*bVGPR*/ true, CandidateRegs, VReducedInsts, - MRI, SIRI, LiveInfo, RPOTIndexMap); - int SReduced = - GetReducedSize(SRematMap, /*bVGPR*/ false, CandidateRegs, SReducedInsts, - MRI, SIRI, LiveInfo, RPOTIndexMap); + int VReduced = getReducedSize(VRematMap, CandidateRegs, VReducedInsts, MRI, + LiveInfo, RPOTIndexMap); + int SReduced = getReducedSize(SRematMap, CandidateRegs, SReducedInsts, MRI, + LiveInfo, RPOTIndexMap); // Calculate size need to be remat. int rematVCnt = maxVPressure - VReduced - VLimit; int rematSCnt = maxSPressure - SReduced - SLimit; - bool bSGPRSpill = false; + bool IsSGPRSpill = false; if (rematSCnt > 0) { - bSGPRSpill = nearSgprSpill(maxSPressure, ST, MF); + IsSGPRSpill = nearSgprSpill(maxSPressure, ST, MF); } - bool bForceRematSgpr = bSGPRSpill | status.bNotBalance; + bool IsForceRematSgpr = IsSGPRSpill || status.NotBalance; // Try to add candidates into remat list. int newRematSCnt = 0; if (rematSCnt > 0) { // Build candidate nodes. std::vector SRematCandidates; - BuildRematCandiates(SRematCandidates, CandidateRegs, PinnedRegSet, MRI, - SIII, SIRI, /*bVGPR*/ false); + buildRematCandiates(SRematCandidates, CandidateRegs, PinnedRegSet, MRI, + SIII, SIRI, /*IsVGPR*/ false); LLVM_DEBUG(dumpCandidates(SRematCandidates, MBB->getNumber(), SIRI)); std::vector SRematList; // Filter candidates. 
- newRematSCnt = - FilterRematCandiates(SRematCandidates, SRematList, PinnedRegSet, pDT, - pPDT, MLI, MRI, SIRI, MF, SlotIndexes, - /*bVGPR*/ false, status.bMemBound); + newRematSCnt = filterRematCandiates(SRematCandidates, SRematList, + PinnedRegSet, DT, PDT, MLI, MRI, + /*IsVGPR*/ false, status.MemBound); if (newRematSCnt > rematSCnt) { // Has enough remat node to cover rematCnt. int rematCnt = 0; @@ -1460,51 +1446,49 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, } // Check shared size. int SharedReducedSize = - GetSharedReducedSize(SReducedInsts, /*bVGPR*/ false, MRI, SIRI); + getSharedReducedSize(SReducedInsts, /*IsVGPR*/ false, MRI, SIRI); if (((newRematSCnt + SharedReducedSize) + (int)NearTargetRegLimit) >= rematSCnt) { for (RematNode &Node : SRematList) { SRematMap[Node.Reg] = Node; } } else { - if (!bForceRematSgpr) { + if (!IsForceRematSgpr) return false; - } else { - for (RematNode &Node : SRematList) { - SRematMap[Node.Reg] = Node; - } - // Find local one def one use candidates. - for (MachineInstr &MI : *MBB) { - if (MI.isDebugInstr()) - continue; - if (MI.getDesc().NumDefs != 1) - continue; - MachineOperand &DstMO = MI.getOperand(0); - Register Reg = DstMO.getReg(); - if (!SIRI->isSGPRReg(MRI, Reg)) - continue; - if (!MRI.hasOneNonDBGUse(Reg)) - continue; - if (!MRI.hasOneDef(Reg)) - continue; - if (Reg.isPhysical()) - continue; - MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(Reg); - if (UseMI.getParent() != MBB) + for (RematNode &Node : SRematList) { + SRematMap[Node.Reg] = Node; + } + // Find local one def one use candidates. + for (MachineInstr &MI : *MBB) { + if (MI.isDebugInstr()) + continue; + if (MI.getDesc().NumDefs != 1) + continue; + MachineOperand &DstMO = MI.getOperand(0); + Register Reg = DstMO.getReg(); + if (!SIRI->isSGPRReg(MRI, Reg)) + continue; + if (!MRI.hasOneNonDBGUse(Reg)) + continue; + if (!MRI.hasOneDef(Reg)) + continue; + if (Reg.isPhysical()) + continue; + MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(Reg); + if (UseMI.getParent() != MBB) + continue; + int gain = rematGain(&MI, Reg, MRI, SIRI, + /*IsVGPR*/ false); + if (gain > 0) { + // Skip case when DefMI has implicit define which used by UseMI. + if (isImplicitDefUse(&MI, &UseMI)) { continue; - int gain = RematGain(&MI, Reg, CandidateRegs, MRI, SIRI, - /*bVGPR*/ false); - if (gain > 0) { - // Skip case when DefMI has implicit define which used by UseMI. - if (isImplicitDefUse(&MI, &UseMI)) { - continue; - } - RematNode Node = {Reg, &MI, (unsigned)gain >> 5}; - Node.InsertPointMI = &UseMI; - Node.Kind = RematNode::RematKind::OneDefOneUse; - SRematMap[Reg] = Node; - SharedReducedSize += Node.Size; } + RematNode Node = {Reg, &MI, (unsigned)gain >> 5}; + Node.InsertPointMI = &UseMI; + Node.Kind = RematNode::RematKind::OneDefOneUse; + SRematMap[Reg] = Node; + SharedReducedSize += Node.Size; } } } @@ -1518,57 +1502,57 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, // Remat these common live range. // Apply the remat. - int newRematVCnt = 0; + int NewRematVCnt = 0; if (rematVCnt > 0) { // TODO: V remat. } - bool bNeedSRemat = rematSCnt > 0; - bool bNeedVRemat = rematVCnt > 0; + bool NeedSRemat = rematSCnt > 0; + bool NeedVRemat = rematVCnt > 0; // If sgpr spill, always do remat. 
- bool bSRematOK = - (newRematSCnt <= 0 && !SRematMap.empty()) || bForceRematSgpr; - bool bVRematOK = - (status.bNotBalance || newRematVCnt <= 0) && !VRematMap.empty(); - if (bNeedSRemat && bNeedVRemat) { - if (bVRematOK && bSRematOK) { - bUpdated = true; - } else if (bSGPRSpill) { - bUpdated = true; + bool IsSRematOK = + (newRematSCnt <= 0 && !SRematMap.empty()) || IsForceRematSgpr; + bool IsVRematOK = + (status.NotBalance || NewRematVCnt <= 0) && !VRematMap.empty(); + if (NeedSRemat && NeedVRemat) { + if (IsVRematOK && IsSRematOK) { + IsUpdated = true; + } else if (IsSGPRSpill) { + IsUpdated = true; } - } else if (bNeedSRemat) { - if (bSRematOK) { - bUpdated = true; + } else if (NeedSRemat) { + if (IsSRematOK) { + IsUpdated = true; } - } else if (bNeedVRemat) { - if (bVRematOK) { - bUpdated = true; + } else if (NeedVRemat) { + if (IsVRematOK) { + IsUpdated = true; } } // TODO: what to do when cannot reach target? if (newRematSCnt > 0) { if ((unsigned)newRematSCnt <= NearTargetRegLimit) { - bNearTarget = true; + IsNearTarget = true; } else { - if (!bSGPRSpill) + if (!IsSGPRSpill) return false; } } } if (SRematMap.empty() && VRematMap.empty()) { - return bUpdated; + return IsUpdated; } if (!SRematMap.empty()) { - bUpdated = true; - ApplyRemat(Remat, SRematMap, hotBlocks, pDT, SlotIndexes, MRI, SIRI, SIII, + IsUpdated = true; + ApplyRemat(Remat, SRematMap, hotBlocks, DT, SlotIndexes, MRI, SIRI, SIII, MF); LLVM_DEBUG(llvm::dbgs() << "after hotremat"; MF.print(dbgs());); } // Balance between vector and scalar if possible. - return bUpdated; + return IsUpdated; } namespace { @@ -1622,10 +1606,10 @@ static bool isConvergent(Remat *Remat, const MachineInstr &MI) { bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, - bool bSink) { + bool IsSink) { if (Reg.isPhysical()) return false; - bool bVGPR = SIRI->isVGPR(MRI, Reg); + bool IsVGPR = SIRI->isVGPR(MRI, Reg); MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); if (!DefMI) @@ -1667,7 +1651,7 @@ bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI, } } - if (bVGPR && bSink) { + if (IsVGPR && IsSink) { // Skip mem related inst. 
if (DefMI->mayLoadOrStore()) { return false; @@ -1686,7 +1670,7 @@ std::vector buildSubExpFromCandidates( Remat *Remat, GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, const MachineRegisterInfo &MRI, SlotIndexes *slotIndexes, - GCNRPTracker::LiveRegSet &unUsedPassThrus, bool bAllowPartialUseInSubExp) { + GCNRPTracker::LiveRegSet &unUsedPassThrus, bool AllowPartialUseInSubExp) { InstSet CandidateDefs; DenseSet RemovedCandidates; std::vector CandidateRegs; @@ -1715,7 +1699,7 @@ std::vector buildSubExpFromCandidates( LLVM_DEBUG(dbgs() << "\nCandidate Defs:\n";); for (unsigned Reg : CandidateRegs) { MachineInstr *MI = MRI.getUniqueVRegDef(Reg); - bool bHasNoCandidatesSameBlockUser = false; + bool IsHasNoCandidatesSameBlockUser = false; for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { if (UseMI.getParent() == MI->getParent()) { if (UseMI.getNumExplicitDefs() == 1) { @@ -1725,14 +1709,14 @@ std::vector buildSubExpFromCandidates( RemovedCandidates.count(UserDefReg) == 0) continue; } - if (!bAllowPartialUseInSubExp) - bHasNoCandidatesSameBlockUser = true; + if (!AllowPartialUseInSubExp) + IsHasNoCandidatesSameBlockUser = true; else PartialCandidates.insert(MI); break; } } - if (bHasNoCandidatesSameBlockUser) { + if (IsHasNoCandidatesSameBlockUser) { RemovedCandidates.insert(Reg); continue; } @@ -1761,15 +1745,15 @@ std::vector buildSubExpFromCandidates( // Skip if MI is not safe to move. if (MI.getNumDefs() != 1) { // allow to move unused implicit def. - bool bDeadImplictDef = false; + bool IsDeadImplictDef = false; for (MachineOperand &MO : MI.implicit_operands()) { if (!MO.isReg()) continue; if (!MO.isDef()) continue; - bDeadImplictDef = MO.isDead(); + IsDeadImplictDef = MO.isDead(); } - if (!bDeadImplictDef) + if (!IsDeadImplictDef) continue; } @@ -1783,24 +1767,24 @@ std::vector buildSubExpFromCandidates( break; } - if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*bSink*/ true)) + if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*IsSink*/ true)) continue; // If all users of MI are in candidate defs, add MI into candidate defs. // If part of user of MI is in candidate defs, add MI into candidate defs // when allow partialUse. - bool bAllUserInCandidate = true; - bool bHasCandidateUser = false; + bool IsAllUserInCandidate = true; + bool IsHasCandidateUser = false; for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { if (CandidateDefs.count(&UseMI) == 0) - bAllUserInCandidate = false; + IsAllUserInCandidate = false; else - bHasCandidateUser = true; + IsHasCandidateUser = true; } - if (!bHasCandidateUser) + if (!IsHasCandidateUser) continue; - if (!bAllUserInCandidate) { - if (!bAllowPartialUseInSubExp) + if (!IsAllUserInCandidate) { + if (!AllowPartialUseInSubExp) continue; PartialCandidates.insert(&MI); } @@ -1834,10 +1818,9 @@ std::vector buildSubExpFromCandidates( std::vector defs; defs.reserve(CandidateDefs.size()); for (MachineInstr &MI : *MBB) { - MachineInstr *pMI = &MI; - if (CandidateDefs.count(pMI) == 0) + if (CandidateDefs.count(&MI) == 0) continue; - defs.emplace_back(pMI); + defs.emplace_back(&MI); } LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; for (MachineInstr *MI @@ -1847,13 +1830,13 @@ std::vector buildSubExpFromCandidates( // Build SubExp with CandidateDefs as Nodes, CandidateInput as input // Candidates as output. 
- ExpDag dag(MRI, SIRI, SIII, /*bJoinInput*/ true); + ExpDag dag(MRI, SIRI, SIII, /*IsJoinInput*/ true); dag.build(CandidateInput, Candidates, defs); - if (bAllowPartialUseInSubExp) { + if (AllowPartialUseInSubExp) { for (auto &subExp : dag.SubExps) { for (auto *MI : subExp.SUnits) { if (PartialCandidates.count(MI)) { - subExp.bCloneOnly = true; + subExp.IsCloneOnly = true; break; } } @@ -1881,7 +1864,7 @@ std::vector buildSubExpFromCandidatesTopBottom( continue; assert(UseMBB == MBB && "block mismatch"); // If all operands in CandidateRegs, add to candidateDefs. - bool bHasOpRegNotInCandidates = false; + bool IsHasOpRegNotInCandidates = false; for (MachineOperand &MO : UseMI.operands()) { if (!MO.isReg()) continue; @@ -1891,11 +1874,11 @@ std::vector buildSubExpFromCandidatesTopBottom( if (MO.isImplicit() && OpReg.isPhysical()) continue; if (Candidates.count(OpReg) == 0) { - bHasOpRegNotInCandidates = true; + IsHasOpRegNotInCandidates = true; break; } } - if (bHasOpRegNotInCandidates) + if (IsHasOpRegNotInCandidates) continue; LLVM_DEBUG(UseMI.dump()); @@ -1948,11 +1931,11 @@ std::vector buildSubExpFromCandidatesTopBottom( } // Still use bsink to skip mem load/store. - // if (!isSafeCandidate(Reg, MRI, SIRI, SIII, /*bSink*/true)) + // if (!isSafeCandidate(Reg, MRI, SIRI, SIII, /*IsSink*/true)) // continue; // If all user of MI is in candidate defs, add MI into candidate defs. - bool bAllOperandInCandidate = true; + bool IsAllOperandInCandidate = true; for (MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; @@ -1966,22 +1949,22 @@ std::vector buildSubExpFromCandidatesTopBottom( (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO)) continue; if (OpReg.isPhysical()) { - bAllOperandInCandidate = false; + IsAllOperandInCandidate = false; break; } MachineInstr *OpMI = MRI.getUniqueVRegDef(OpReg); if (!OpMI) { - bAllOperandInCandidate = false; + IsAllOperandInCandidate = false; break; } if (CandidateDefs.count(OpMI) == 0) { - bAllOperandInCandidate = false; + IsAllOperandInCandidate = false; break; } if (MO.isTied()) continue; } - if (!bAllOperandInCandidate) + if (!IsAllOperandInCandidate) continue; LLVM_DEBUG(llvm::dbgs() << "Add local candidates:"; pressure::print_reg(Reg, MRI, SIRI, llvm::dbgs());); @@ -2023,10 +2006,9 @@ std::vector buildSubExpFromCandidatesTopBottom( std::vector defs; defs.reserve(CandidateDefs.size()); for (MachineInstr &MI : *MBB) { - MachineInstr *pMI = &MI; - if (CandidateDefs.count(pMI) == 0) + if (CandidateDefs.count(&MI) == 0) continue; - defs.emplace_back(pMI); + defs.emplace_back(&MI); } LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; for (MachineInstr *MI @@ -2042,7 +2024,7 @@ std::vector buildSubExpFromCandidatesTopBottom( // Input is Candidates, output is? // Build SubExp with CandidateDefs as Nodes, CandidateInput as input // Candidates as output. 
- ExpDag dag(MRI, SIRI, SIII, /*bJoinInput*/ true); + ExpDag dag(MRI, SIRI, SIII, /*IsJoinInput*/ true); dag.build(Candidates, LocalCandidates, defs); return dag.SubExps; } @@ -2060,7 +2042,7 @@ void print_vreg(Register Reg, const MachineRegisterInfo &MRI) { MachineBasicBlock *FindTargetBlock(unsigned Reg, MachineBasicBlock *FromBB, const MachineRegisterInfo &MRI, - MachineDominatorTree *pDT) { + MachineDominatorTree *DT) { BlockSet userBlocks; for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { MachineBasicBlock *UserBB = UseMI.getParent(); @@ -2073,8 +2055,8 @@ MachineBasicBlock *FindTargetBlock(unsigned Reg, MachineBasicBlock *FromBB, } if (userBlocks.empty()) return nullptr; - MachineBasicBlock *userBlock = nearest_common_dominator(pDT, userBlocks); - if (!pDT->dominates(FromBB, userBlock)) { + MachineBasicBlock *userBlock = NearestCommonDominator(DT, userBlocks); + if (!DT->dominates(FromBB, userBlock)) { return nullptr; } if (userBlock == FromBB) @@ -2083,7 +2065,7 @@ MachineBasicBlock *FindTargetBlock(unsigned Reg, MachineBasicBlock *FromBB, } void ApplySubExpMoveNearUser(SubExp &Exp, const MachineRegisterInfo &MRI, - MachineDominatorTree *pDT, + MachineDominatorTree *DT, SlotIndexes *slotIndexes, const SIInstrInfo *SIII, const SIRegisterInfo *SIRI) { // Move from bottom. @@ -2094,7 +2076,7 @@ void ApplySubExpMoveNearUser(SubExp &Exp, const MachineRegisterInfo &MRI, continue; unsigned Reg = DefMI->getOperand(0).getReg(); - MachineBasicBlock *ToBB = FindTargetBlock(Reg, FromBB, MRI, pDT); + MachineBasicBlock *ToBB = FindTargetBlock(Reg, FromBB, MRI, DT); if (!ToBB) continue; @@ -2118,7 +2100,7 @@ void ApplySubExpMoveNearUser(SubExp &Exp, const MachineRegisterInfo &MRI, } void ApplySubExpMoveNearDefine(SubExp &Exp, MachineRegisterInfo &MRI, - MachineDominatorTree *pDT, + MachineDominatorTree *DT, SlotIndexes *slotIndexes, const SIInstrInfo *SIII, const SIRegisterInfo *SIRI) { @@ -2172,18 +2154,18 @@ DenseSet buildCloneSet(ExpDag &dag, continue; MachineInstr *MI = SU.getInstr(); if (dagBottoms.find(&SU) != dagBottoms.end()) { - bool bUsed = false; + bool IsUsed = false; // For bottom SU, if in usedOutput, add to copySet; for (MachineOperand &DefMO : MI->defs()) { if (!DefMO.isReg()) continue; unsigned Reg = DefMO.getReg(); if (usedOutput.count(Reg) > 0) { - bUsed = true; + IsUsed = true; break; } } - if (bUsed) { + if (IsUsed) { copySet.insert(MI); continue; } @@ -2192,16 +2174,16 @@ DenseSet buildCloneSet(ExpDag &dag, } // If any SuccNode is in copySet, add to copySet. - bool bSuccCopied = false; + bool IsSuccCopied = false; for (SDep &SucDep : SU.Succs) { SUnit *SucSU = SucDep.getSUnit(); MachineInstr *SuccMI = SucSU->getInstr(); if (copySet.count(SuccMI) > 0) { - bSuccCopied = true; + IsSuccCopied = true; break; } } - if (bSuccCopied) + if (IsSuccCopied) copySet.insert(MI); } return copySet; @@ -2237,7 +2219,7 @@ DenseMap reduceClonedMBBs( SubExp &Exp, MapVector> &userBlocks, DenseMap &userBlocksLiveRegs, - std::vector &hotBlocks, MachineDominatorTree *pDT) { + std::vector &hotBlocks, MachineDominatorTree *DT) { // Collect hot blocks which Exp is live in. 
DenseSet hotBlockSet; for (HotBlock &hotBlock : hotBlocks) { @@ -2260,22 +2242,22 @@ DenseMap reduceClonedMBBs( if (hotBlockSet.count(MBB)) continue; - bool bDomAllHotBlocks = true; - bool bDomedByAllHotBlocks = true; + bool IsDomAllHotBlocks = true; + bool IsDomedByAllHotBlocks = true; for (MachineBasicBlock *hotMBB : hotBlockSet) { - if (!pDT->dominates(MBB, hotMBB)) { - bDomAllHotBlocks = false; + if (!DT->dominates(MBB, hotMBB)) { + IsDomAllHotBlocks = false; } - if (!pDT->dominates(hotMBB, MBB)) { - bDomedByAllHotBlocks = false; + if (!DT->dominates(hotMBB, MBB)) { + IsDomedByAllHotBlocks = false; } - if (!bDomAllHotBlocks && !bDomedByAllHotBlocks) { + if (!IsDomAllHotBlocks && !IsDomedByAllHotBlocks) { break; } } - if (bDomAllHotBlocks) { + if (IsDomAllHotBlocks) { userBlocks.erase(MBB); - } else if (bDomedByAllHotBlocks) { + } else if (IsDomedByAllHotBlocks) { afterHotRangeMBBs.insert(MBB); } } @@ -2289,7 +2271,7 @@ DenseMap reduceClonedMBBs( MachineBasicBlock *MBB2 = it2; if (MBB == MBB2) continue; - if (pDT->dominates(MBB, MBB2)) { + if (DT->dominates(MBB, MBB2)) { auto &Dom = DomMap[MBB]; Dom.insert(MBB2); auto &Dom2 = DomMap[MBB2]; @@ -2315,7 +2297,7 @@ DenseMap reduceClonedMBBs( } void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, - MachineDominatorTree *pDT, + MachineDominatorTree *DT, MachineRegisterInfo &MRI, SlotIndexes *slotIndexes, const SIInstrInfo *SIII, const SIRegisterInfo *SIRI) { @@ -2341,7 +2323,7 @@ void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, } } // Build dag for SubExp to help remove unused inst when clone. - ExpDag dag(MRI, SIRI, SIII, /*bJoinInput*/ true); + ExpDag dag(MRI, SIRI, SIII, /*IsJoinInput*/ true); dag.build(Exp.inputLive, Exp.outputLive, Exp.SUnits); DenseSet dagBottoms; for (SUnit &SU : dag.SUnits) { @@ -2369,7 +2351,7 @@ void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, // For userBlocks which dominated by all hotBlocks, they could share clones // because once after hot block, the pressure is OK. DenseMap DomMap = - reduceClonedMBBs(Exp, userBlocks, userBlocksLiveRegs, hotBlocks, pDT); + reduceClonedMBBs(Exp, userBlocks, userBlocksLiveRegs, hotBlocks, DT); // Sort to make stable order. std::sort( @@ -2379,7 +2361,7 @@ void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, return it0.first->getNumber() < it1.first->getNumber(); }); - const bool bModifiesScc = Exp.modifiesRegister(AMDGPU::SCC, SIRI); + const bool IsModifiesScc = Exp.modifiesRegister(AMDGPU::SCC, SIRI); // Clone for each userBlocks. Not share clone thru dom tree which cannot help // reg pressure. @@ -2395,7 +2377,7 @@ void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, DenseMap RegMap; auto insertPtr = MBB->getFirstNonPHI(); // If Exp has scc read/write, make sure MBB not have scc in liveins. - if (bModifiesScc && llvm::IsSccLiveAt(MBB, insertPtr)) + if (IsModifiesScc && llvm::IsSccLiveAt(MBB, insertPtr)) continue; MachineFunction *MF = MBB->getParent(); for (auto it = Exp.SUnits.begin(); it != Exp.SUnits.end(); it++) { @@ -2484,7 +2466,7 @@ void ApplySubExpCloneNearUserInBlock( } SlotIndex hotSlot = slotIndexes->getInstructionIndex(*hotMI).getBaseIndex(); - const bool bModifiesScc = Exp.modifiesRegister(AMDGPU::SCC, SIRI); + const bool IsModifiesScc = Exp.modifiesRegister(AMDGPU::SCC, SIRI); for (unsigned Reg : Exp.BottomRegs) { @@ -2504,7 +2486,7 @@ void ApplySubExpCloneNearUserInBlock( continue; // Do not overwrite a live scc. 
- if (bModifiesScc && llvm::IsSccLiveAt(UserBB, &UseMI)) + if (IsModifiesScc && llvm::IsSccLiveAt(UserBB, &UseMI)) continue; useMIs.emplace_back(&UseMI); @@ -2677,7 +2659,7 @@ bool collectPacifist(MachineInstr &MI, return false; } - bool bHasDef = false; + bool IsHasDef = false; for (MachineOperand &MO : MI.defs()) { Register Reg = MO.getReg(); @@ -2688,10 +2670,10 @@ bool collectPacifist(MachineInstr &MI, getInBlockUniqueDef(Reg, MI.getParent(), inputLive, outputLive, MRI)) return false; - bHasDef = true; + IsHasDef = true; } // If no def, it will not increase pressure, don't mark it. - return bHasDef; + return IsHasDef; } static MachineInstr *findFirstAliasingLoadOrStoreInMBB(MachineInstr &MI, @@ -2769,7 +2751,7 @@ bool tryHoldPacifist(MachineBasicBlock &MBB, LiveIntervals *LIS, LLVM_DEBUG(dbgs() << "pacifist end\n"); SlotIndexes *slotIndexes = LIS->getSlotIndexes(); - bool bUpdated = false; + bool IsUpdated = false; // Move pacifist to its first user. // for (MachineInstr *MI : pacifistList) { @@ -2813,10 +2795,10 @@ bool tryHoldPacifist(MachineBasicBlock &MBB, LiveIntervals *LIS, MBB.insert(insertPoint, MI); LIS->handleMove(*MI); - bUpdated = true; + IsUpdated = true; } - return bUpdated; + return IsUpdated; } DenseMap @@ -2862,16 +2844,15 @@ collectUniformVgprs(Remat *Remat, MachineFunction &MF, MachineRegisterInfo &MRI, bool collectVToSCrossHotSpot( MachineBasicBlock &MBB, RematStatus &status, DenseMap &UniformMap, - SmallMapVector &VToSMap, LiveIntervals *LIS, - MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - const SIInstrInfo *SIII) { + SmallMapVector &VToSMap, LiveIntervals *LIS) +{ unsigned VLimit = status.TargetVLimit; unsigned SLimit = status.TargetSLimit; auto &ST = MBB.getParent()->getSubtarget(); GCNDownwardRPTracker Tracker(*LIS); - bool bUpdated = false; + bool IsUpdated = false; const auto inputLive = status.MBBInputLiveMap[&MBB]; Tracker.reset(*MBB.begin(), &inputLive); for (MachineInstr &MI : MBB) { @@ -2901,10 +2882,10 @@ bool collectVToSCrossHotSpot( if (UniformIt == UniformMap.end()) continue; VToSMap[UniformIt->first] = UniformIt->second; - bUpdated = true; + IsUpdated = true; } } - return bUpdated; + return IsUpdated; } // Return true if the user is outside of the def's loop. @@ -2927,8 +2908,7 @@ bool rematUniformVgprToSgpr( for (auto &hotBlock : hotBlocks) { MachineBasicBlock &MBB = *hotBlock.MBB; - collectVToSCrossHotSpot(MBB, status, UniformVgprMap, VToSMap, LIS, MRI, - SIRI, SIII); + collectVToSCrossHotSpot(MBB, status, UniformVgprMap, VToSMap, LIS); } if (VToSMap.empty()) @@ -2969,7 +2949,7 @@ bool rematUniformVgprToSgpr( for (MachineInstr *userMI : userMIs) { const auto &Desc = userMI->getDesc(); - bool bIllegal = false; + bool IsIllegal = false; for (unsigned i = 0; i < userMI->getNumOperands(); i++) { MachineOperand &MO = userMI->getOperand(i); if (!MO.isReg()) @@ -2979,7 +2959,7 @@ bool rematUniformVgprToSgpr( if (MO.getReg() != Reg) continue; if (i >= Desc.getNumOperands()) { - bIllegal = true; + IsIllegal = true; break; } @@ -2997,7 +2977,7 @@ bool rematUniformVgprToSgpr( // consider not have limit on reg class. 
} } - if (bIllegal) + if (IsIllegal) continue; auto rit = userMI->getReverseIterator(); @@ -3084,7 +3064,7 @@ bool collectRematableHotReg( } bool tryRemat(MachineBasicBlock &MBB, MachineInstr *hotMI, - std::vector &inBlockCloneSubExps, bool bVGPR, + std::vector &inBlockCloneSubExps, bool IsVGPR, const GCNRPTracker::LiveRegSet &inputLive, const GCNRPTracker::LiveRegSet &outputLive, DenseSet &hotSet, int vDistance, int sDistance, @@ -3138,7 +3118,7 @@ bool tryRemat(MachineBasicBlock &MBB, MachineInstr *hotMI, // If the def reg is in hot reg. // Add to output. if (hotLive.find(DefReg) != hotLive.end()) { - bool bUserIsHot = false; + bool IsUserIsHot = false; for (MachineInstr &UseMI : MRI.use_nodbg_instructions(DefReg)) { if (UseMI.getParent() != &MBB) continue; @@ -3148,12 +3128,12 @@ bool tryRemat(MachineBasicBlock &MBB, MachineInstr *hotMI, const auto &useSI = LIS->getInstructionIndex(UseMI).getBaseIndex(); // When has a hot user after hotMI, remat it may not help. if (useSI > SI) { - bUserIsHot = true; + IsUserIsHot = true; break; } } - if (bUserIsHot) + if (IsUserIsHot) continue; outputSet[DefReg]; LLVM_DEBUG(dbgs() << "hotRemat:"); @@ -3174,37 +3154,37 @@ bool tryRemat(MachineBasicBlock &MBB, MachineInstr *hotMI, // Build SubExp with pureHotRematList as Nodes, hotLive as input // rematHot as output. // Not join input when build ExpDag to get small subExps. - ExpDag dag(MRI, SIRI, SIII, /*bJoinInput*/ false); + ExpDag dag(MRI, SIRI, SIII, /*IsJoinInput*/ false); dag.build(hotLive, outputSet, pureHotRematList); // Find best subExp add to inBlockCloneSubExps. // Sort by size of subExp. std::sort(dag.SubExps.begin(), dag.SubExps.end(), - [](const SubExp &a, const SubExp &b) { - return a.SUnits.size() < b.SUnits.size(); + [](const SubExp &A, const SubExp &B) { + return A.SUnits.size() < B.SUnits.size(); }); std::vector cloneSubExps; - int distance = bVGPR ? vDistance : sDistance; + int distance = IsVGPR ? vDistance : sDistance; for (SubExp &subExp : dag.SubExps) { - if (subExp.bNotSafeToCopy) + if (subExp.IsNotSafeToCopy) continue; - if (bVGPR) { + if (IsVGPR) { if (subExp.vOutputSize == 0) continue; } else { if (subExp.sOutputSize == 0) continue; } - if (!subExp.isSafeToMove(MRI, /*bMoveUp*/ false)) + if (!subExp.isSafeToMove(MRI, /*IsMoveUp*/ false)) continue; - // Not clone big subExp. + // Do not clone a big subExp. if (subExp.SUnits.size() > 10) continue; // Do not allow remat in the block when the expression has a memory op and // the block has a write. We could allow this in some cases with better // analysis. - if (subExp.bHasMemInst && MemWriteMBBSet.count(&MBB)) + if (subExp.IsHasMemInst && MemWriteMBBSet.count(&MBB)) continue; - if (bVGPR) { + if (IsVGPR) { distance -= subExp.vOutputSize; } else { distance -= subExp.sOutputSize; @@ -3282,7 +3262,7 @@ bool tryRematInHotSpot( if (vDistance > 0 && hotVMI) { // Use hotVMI when apply. inBlockHotSInstMap[&MBB] = nullptr; - if (tryRemat(MBB, hotVMI, inBlockCloneSubExps, /*bVGPR*/ true, inputLive, + if (tryRemat(MBB, hotVMI, inBlockCloneSubExps, /*IsVGPR*/ true, inputLive, outputLive, hotSet, vDistance, sDistance, VLimit, SLimit, status.MemWriteMBBSet, LIS, MRI, SIRI, SIII)) return true; @@ -3292,7 +3272,7 @@ bool tryRematInHotSpot( // Use hotSMI when apply.
inBlockHotSInstMap[&MBB] = hotSMI; inBlockHotVInstMap[&MBB] = nullptr; - return tryRemat(MBB, hotSMI, inBlockCloneSubExps, /*bVGPR*/ false, + return tryRemat(MBB, hotSMI, inBlockCloneSubExps, /*IsVGPR*/ false, inputLive, outputLive, hotSet, vDistance, sDistance, VLimit, SLimit, status.MemWriteMBBSet, LIS, MRI, SIRI, SIII); } @@ -3308,7 +3288,7 @@ void sortSubExpCandidates(std::vector &subExpCandidates) { struct SortNode { SubExp Exp; unsigned Depth; - bool bDepthDirty; + bool IsDepthDirty; SmallDenseSet Preds; SmallDenseSet Succs; }; @@ -3342,10 +3322,10 @@ void sortSubExpCandidates(std::vector &subExpCandidates) { auto &outExps = outIt->second; for (SubExp *inExp : inExps) { for (SubExp *outExp : outExps) { - if (inExp->bHoist != outExp->bHoist) { + if (inExp->IsHoist != outExp->IsHoist) { // Different direction. // If output (def) move up, input(use) move down, nothing happens. - if (outExp->bHoist) + if (outExp->IsHoist) continue; // Canot input(use) move up, output(def) move down. // Choose the exp which save more. @@ -3359,7 +3339,7 @@ void sortSubExpCandidates(std::vector &subExpCandidates) { continue; } // Link outExp to inExp. - if (inExp->bHoist) { + if (inExp->IsHoist) { sortMap[outExp].Preds.insert(inExp); sortMap[inExp].Succs.insert(outExp); } else { @@ -3378,8 +3358,8 @@ void sortSubExpCandidates(std::vector &subExpCandidates) { SortNode &Node = sortMap[&Exp]; Node.Depth = 0; Node.Exp = Exp; - Node.bDepthDirty = !Node.Preds.empty(); - if (!Node.bDepthDirty) + Node.IsDepthDirty = !Node.Preds.empty(); + if (!Node.IsDepthDirty) WorkList.emplace_back(&Exp); } // Calc depth. @@ -3389,16 +3369,16 @@ void sortSubExpCandidates(std::vector &subExpCandidates) { for (SubExp *Succ : Node.Succs) { SortNode &SuccNode = sortMap[Succ]; SuccNode.Depth = std::max(SuccNode.Depth, Node.Depth + 1); - bool bAllPrevClean = true; + bool IsAllPrevClean = true; for (SubExp *Prev : SuccNode.Preds) { SortNode &PrevNode = sortMap[Prev]; - if (PrevNode.bDepthDirty) { - bAllPrevClean = false; + if (PrevNode.IsDepthDirty) { + IsAllPrevClean = false; break; } } - if (bAllPrevClean) { - SuccNode.bDepthDirty = false; + if (IsAllPrevClean) { + SuccNode.IsDepthDirty = false; WorkList.push_back(Succ); } } @@ -3435,12 +3415,12 @@ bool pressureHigher(unsigned maxV0, unsigned maxS0, unsigned maxV1, unsigned STgtOcc1 = ST->getOccupancyWithNumSGPRs(maxS1); unsigned Occ0 = std::min(VTgtOcc0, STgtOcc0); unsigned Occ1 = std::min(VTgtOcc1, STgtOcc1); - // big occupancy is low pressure. + // Big occupancy means low pressure. if (Occ0 > Occ1) return false; if (Occ0 < Occ1) return true; - // When sgpr bound, big sgpr is high pressure. + // When sgpr bound, a bigger sgpr count means higher pressure.
if (VTgtOcc0 > STgtOcc0 && VTgtOcc1 > STgtOcc1) { return maxS0 > maxS1; } @@ -3453,9 +3433,9 @@ bool canHelpPressureWhenSink( SubExp &subExp, const GCNRPTracker::LiveRegSet &passThrus, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, const MachineLoopInfo *MLI, - MachineDominatorTree *pDT, bool bCanClone, bool bSgprBound) { + MachineDominatorTree *DT, bool IsCanClone, bool IsSgprBound) { LLVM_DEBUG(subExp.dump(MRI, SIRI)); - if (!subExp.isSafeToMove(MRI, /*bMoveUp*/ false)) + if (!subExp.isSafeToMove(MRI, /*IsMoveUp*/ false)) return false; // Update input size to ignore lives in which already in @@ -3475,7 +3455,7 @@ bool canHelpPressureWhenSink( if (subExp.vInputSize > subExp.vOutputSize) return false; - if (subExp.sInputSize > subExp.sOutputSize && bSgprBound) + if (subExp.sInputSize > subExp.sOutputSize && IsSgprBound) return false; if (subExp.sInputSize >= subExp.sOutputSize && @@ -3496,20 +3476,20 @@ bool canHelpPressureWhenSink( } if (userBlocks.empty()) return false; - MachineBasicBlock *userBlock = nearest_common_dominator(pDT, userBlocks); - if (!pDT->dominates(subExp.FromBB, userBlock)) { + MachineBasicBlock *userBlock = NearestCommonDominator(DT, userBlocks); + if (!DT->dominates(subExp.FromBB, userBlock)) { return false; } if (userBlock == subExp.FromBB && // When allow clone, could go clone path if cannot move subExp. - !bCanClone) + !IsCanClone) return false; subExp.ToBB = userBlock; if (auto *toLoop = MLI->getLoopFor(userBlock)) { auto *fromLoop = MLI->getLoopFor(subExp.FromBB); if (!fromLoop || fromLoop->getLoopDepth() < toLoop->getLoopDepth()) - subExp.bMoveIntoLoop = true; + subExp.IsMoveIntoLoop = true; } else if (auto *fromLoop = MLI->getLoopFor(subExp.FromBB)) { auto *toLoop = MLI->getLoopFor(userBlock); // not safe to move out of loop. @@ -3523,12 +3503,12 @@ bool canHelpPressureWhenSink( bool canHelpPressureWhenHoist(SubExp &subExp, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, - const MachineLoopInfo *MLI, bool bSgprBound) { - if (!subExp.isSafeToMove(MRI, /*bMoveUp*/ true)) + const MachineLoopInfo *MLI, bool IsSgprBound) { + if (!subExp.isSafeToMove(MRI, /*IsMoveUp*/ true)) return false; if (subExp.vInputSize < subExp.vOutputSize) return false; - if (subExp.sInputSize < subExp.sOutputSize && bSgprBound) + if (subExp.sInputSize < subExp.sOutputSize && IsSgprBound) return false; if (subExp.sInputSize <= subExp.sOutputSize && @@ -3584,7 +3564,7 @@ groupPassThruByDefBlock(Remat *Remat, const GCNRPTracker::LiveRegSet &passThrus, LLVM_DEBUG(print_vreg(Reg, MRI)); LLVM_DEBUG(if (SIRI->isSGPRReg(MRI, Reg)) dbgs() << " sgpr "; else dbgs() << " vgpr ";); - if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*bSink*/ true)) { + if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*IsSink*/ true)) { LLVM_DEBUG(dbgs() << " is not safe\n"); continue; } @@ -3631,7 +3611,7 @@ collectPassThrus(MachineBasicBlock *MBB, const GCNRPTracker::LiveRegSet &outputLive, const GCNRPTracker::LiveRegSet &usedPassThrus, const GCNRPTracker::LiveRegSet &liveRegCandidates, - MachineRegisterInfo &MRI, bool bCanClone) { + MachineRegisterInfo &MRI, bool IsCanClone) { GCNRPTracker::LiveRegSet passThrus; llvm::mergeLiveRegSet(passThrus, inputLive); llvm::andLiveRegSet(passThrus, outputLive); @@ -3655,17 +3635,17 @@ collectPassThrus(MachineBasicBlock *MBB, } DenseSet UseMBBs; // Allow use for pass thru if clone is OK. 
- if (!bCanClone) { + if (!IsCanClone) { for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { MachineBasicBlock *UserMBB = UseMI.getParent(); UseMBBs.insert(UserMBB); } } - bool bW = DefMBBs.count(MBB) > 0; - bool bR = UseMBBs.count(MBB) > 0; + bool IsW = DefMBBs.count(MBB) > 0; + bool IsR = UseMBBs.count(MBB) > 0; - bool bPassThru = !bW && !bR; - if (!bPassThru) + bool IsPassThru = !IsW && !IsR; + if (!IsPassThru) passThrus.erase(Reg); } return passThrus; @@ -3682,7 +3662,7 @@ SubExp buildFreeSubExp(Remat *Remat, SubExp &subExp, SmallDenseSet freeInstUseRegs; SmallVector freeInsts; for (MachineInstr *MI : subExp.SUnits) { - bool bIsFree = true; + bool IsFree = true; // Check all use regs are free. for (MachineOperand &MO : MI->uses()) { if (!MO.isReg()) @@ -3691,7 +3671,7 @@ SubExp buildFreeSubExp(Remat *Remat, SubExp &subExp, if (MO.isImplicit() && Reg == AMDGPU::EXEC) continue; if (MRI.getUniqueVRegDef(Reg) == nullptr) { - bIsFree = false; + IsFree = false; break; } // Skip local pass thrus unless it is free. @@ -3699,18 +3679,18 @@ SubExp buildFreeSubExp(Remat *Remat, SubExp &subExp, continue; if (freeRegs.count(Reg)) continue; - bIsFree = false; + IsFree = false; break; } // Check def is unique. for (MachineOperand &MO : MI->defs()) { unsigned Reg = MO.getReg(); if (MRI.getUniqueVRegDef(Reg) == nullptr) { - bIsFree = false; + IsFree = false; break; } } - if (!bIsFree) + if (!IsFree) continue; // Save inst as free inst. freeInsts.emplace_back(MI); @@ -3730,20 +3710,20 @@ SubExp buildFreeSubExp(Remat *Remat, SubExp &subExp, } // Then remove local inst has no output use. for (MachineInstr *MI : freeInsts) { - bool bIsFreeUsed = false; + bool IsFreeUsed = false; for (MachineOperand &MO : MI->defs()) { unsigned Reg = MO.getReg(); // Used as freeInst or output. - bIsFreeUsed |= + IsFreeUsed |= freeInstUseRegs.count(Reg) > 0 || subExp.BottomRegs.count(Reg); } - if (!bIsFreeUsed) + if (!IsFreeUsed) continue; freeExp.SUnits.emplace_back(MI); } if (freeExp.SUnits.empty()) { // mark has terminator to make it unsafe. - freeExp.bHasTerminatorInst = true; + freeExp.IsHasTerminatorInst = true; return freeExp; } // Build BottomRegs and TopRegs for freeExp. @@ -3760,7 +3740,7 @@ SubExp buildFreeSubExp(Remat *Remat, SubExp &subExp, freeExp.FromBB = subExp.FromBB; freeExp.ToBB = subExp.ToBB; // must be clone since is partial of subExp. - freeExp.bCloneOnly = true; + freeExp.IsCloneOnly = true; // Calc reg for freeExp. for (unsigned Reg : freeExp.TopRegs) { @@ -3785,10 +3765,10 @@ std::vector buildSubExpCandidates( GCNRPTracker::LiveRegSet &passThrus, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, const MachineLoopInfo *MLI, SlotIndexes *slotIndexes, - MachineDominatorTree *pDT, bool bCanClone, bool bSgprBound, + MachineDominatorTree *DT, bool IsCanClone, bool IsSgprBound, GCNRPTracker::LiveRegSet &unUsedPassThrus, DenseSet &MemWriteMBBSet, - bool bAllowPartialUseInSubExp) { + bool AllowPartialUseInSubExp) { std::vector subExpCandidates; // Build exp dag on define blocks. // Save profit candidates into list. @@ -3799,40 +3779,40 @@ std::vector buildSubExpCandidates( // Go up on the dag until reach share node. auto subExps = buildSubExpFromCandidates( Remat, DefInMBB, DefMBB, SIRI, SIII, MRI, slotIndexes, unUsedPassThrus, - bAllowPartialUseInSubExp); + AllowPartialUseInSubExp); for (SubExp &subExp : subExps) { - if (subExp.bHasMemInst) { + if (subExp.IsHasMemInst) { // Skip when memory ld/st inst need to cross MBB which write memory. 
// TODO: check all MBBs in between FromBB and ToBB not write memory. // Currently just skip when any memory write exist. if (!MemWriteMBBSet.empty()) { MachineBasicBlock *FromBB = subExp.FromBB; MachineBasicBlock *ToBB = subExp.ToBB; - if (subExp.bHoist) { + if (subExp.IsHoist) { FromBB = subExp.ToBB; ToBB = subExp.FromBB; } - bool bCrossMemWriteMBB = false; + bool IsCrossMemWriteMBB = false; for (MachineBasicBlock *MemMBB : MemWriteMBBSet) { - if (pDT->dominates(ToBB, MemMBB)) + if (DT->dominates(ToBB, MemMBB)) continue; - if (pDT->dominates(MemMBB, FromBB)) + if (DT->dominates(MemMBB, FromBB)) continue; - bCrossMemWriteMBB = true; + IsCrossMemWriteMBB = true; break; } - if (bCrossMemWriteMBB) + if (IsCrossMemWriteMBB) continue; } } - if (!canHelpPressureWhenSink(subExp, passThrus, MRI, SIRI, SIII, MLI, pDT, - bCanClone, bSgprBound)) { - if (bAllowPartialUseInSubExp && - subExp.isSafeToMove(MRI, /*bMoveUp*/ false)) { + if (!canHelpPressureWhenSink(subExp, passThrus, MRI, SIRI, SIII, MLI, DT, + IsCanClone, IsSgprBound)) { + if (AllowPartialUseInSubExp && + subExp.isSafeToMove(MRI, /*IsMoveUp*/ false)) { SubExp freeSubExp = buildFreeSubExp(Remat, subExp, passThrus, MRI, SIRI); if (canHelpPressureWhenSink(freeSubExp, passThrus, MRI, SIRI, SIII, - MLI, pDT, bCanClone, bSgprBound)) { + MLI, DT, IsCanClone, IsSgprBound)) { subExpCandidates.emplace_back(freeSubExp); } } @@ -3848,28 +3828,28 @@ std::vector buildSubExpCandidates( std::pair calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates, GCNRPTracker::LiveRegSet &inputLive, - GCNRPTracker::LiveRegSet &outputLive, bool bVOutBound, - bool bSOutBound, bool bCanClone, MachineDominatorTree *pDT, + GCNRPTracker::LiveRegSet &outputLive, bool IsVOutBound, + bool IsSOutBound, bool IsCanClone, MachineDominatorTree *DT, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) { int vgpr = 0; int sgpr = 0; MachineBasicBlock *MBB = hotBB.MBB; // Sink saving. for (SubExp &Exp : subExpCandidates) { - if (Exp.bHoist) { + if (Exp.IsHoist) { // ToMBB -> MBB -> FromMBB. // If ToMBB not dom hot block, reg will not live in MBB. - if (!pDT->dominates(Exp.ToBB, MBB)) + if (!DT->dominates(Exp.ToBB, MBB)) continue; } else { // If FromBB not dom hot block, reg will not live in MBB. - if (!pDT->dominates(Exp.FromBB, MBB)) + if (!DT->dominates(Exp.FromBB, MBB)) continue; // When subExp is from hotBB, check output instead of input. if (Exp.FromBB == MBB) { - if (bVOutBound && Exp.vOutputSize < Exp.vInputSize) + if (IsVOutBound && Exp.vOutputSize < Exp.vInputSize) continue; - if (bSOutBound && Exp.sOutputSize < Exp.sInputSize) + if (IsSOutBound && Exp.sOutputSize < Exp.sInputSize) continue; vgpr += Exp.vInputSize; vgpr -= Exp.vOutputSize; @@ -3884,18 +3864,18 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates, // If subExp is to hotBB, it is crossing output instead of input. GCNRPTracker::LiveRegSet &crossLive = MBB == ToMBB ? outputLive : inputLive; - bool bClone = false; + bool IsClone = false; GCNRPTracker::LiveRegSet newInput; - if (!Exp.bMoveIntoLoop) { - if (Exp.bHoist) { + if (!Exp.IsMoveIntoLoop) { + if (Exp.IsHoist) { // If FromBB dom hot block, it will not change live for MBB. - if (Exp.FromBB != MBB && pDT->dominates(Exp.FromBB, MBB)) + if (Exp.FromBB != MBB && DT->dominates(Exp.FromBB, MBB)) continue; } else { // If ToBB dom hot block, it will not change live for MBB. 
- if (ToMBB != MBB && pDT->dominates(ToMBB, MBB)) {
- if (bCanClone && !Exp.bNotSafeToCopy) {
- bClone = true;
+ if (ToMBB != MBB && DT->dominates(ToMBB, MBB)) {
+ if (IsCanClone && !Exp.IsNotSafeToCopy) {
+ IsClone = true;
 } else {
 continue;
 }
@@ -3909,27 +3889,27 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates,
 if (crossLive.find(Reg) != crossLive.end())
 MBBBeginMask = crossLive[Reg];
 // Check mask which live in both BeginSlot and exp output when sink to
- // kill the output. Check mask which not live in BeginSlot but live in
+ // kill the output. Check mask which not live in BeginSlot but live in
 // exp output when hoist to live the output.
- LaneBitmask profitMask =
- Exp.bHoist ? (outMask & (~MBBBeginMask)) : (outMask & MBBBeginMask);
+ LaneBitmask profitMask = Exp.IsHoist ? (outMask & (~MBBBeginMask))
+ : (outMask & MBBBeginMask);
 if (MBBBeginMask.any()) {
 unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI);
 LLVM_DEBUG(std::string movStr =
- Exp.bHoist ? "output hoist:" : "output sink:";
+ Exp.IsHoist ? "output hoist:" : "output sink:";
 dbgs() << movStr << Register::virtReg2Index(Reg) << " " << Size);
 // Exp out live at block input.
 // It will descrease live for MBB when sink and increase when hoist.
 if (SIRI->isVGPR(MRI, Reg)) {
 LLVM_DEBUG(dbgs() << "v\n");
- if (Exp.bHoist)
+ if (Exp.IsHoist)
 vgprDiff += Size;
 else
 vgprDiff -= Size;
 } else {
 LLVM_DEBUG(dbgs() << "s\n");
- if (Exp.bHoist)
+ if (Exp.IsHoist)
 sgprDiff += Size;
 else
 sgprDiff -= Size;
@@ -3943,11 +3923,11 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates,
 LaneBitmask MBBBeginMask;
 if (crossLive.find(Reg) != crossLive.end())
 MBBBeginMask = crossLive[Reg];
- // Check mask which not live in BeginSlot but live in exp input when
+ // Check mask which not live in BeginSlot but live in exp input when
 // sink to live the input. Check mask which live in both BeginSlot and
 // exp output when hoist to kill the input.
 LaneBitmask profitMask =
- Exp.bHoist ? (inMask & MBBBeginMask) : (inMask & (~MBBBeginMask));
+ Exp.IsHoist ? (inMask & MBBBeginMask) : (inMask & (~MBBBeginMask));
 if (profitMask.any()) {
 // Update input live to avoid count same input more than once.
 newInput[Reg] |= inMask;
@@ -3956,17 +3936,17 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates,
 unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI);
 LLVM_DEBUG(
- std::string movStr = Exp.bHoist ? "input hoist:" : "input sink:";
+ std::string movStr = Exp.IsHoist ? "input hoist:" : "input sink:";
 dbgs() << movStr << Register::virtReg2Index(Reg) << " " << Size);
 if (SIRI->isVGPR(MRI, Reg)) {
 LLVM_DEBUG(dbgs() << "v\n");
- if (Exp.bHoist)
+ if (Exp.IsHoist)
 vgprDiff -= Size;
 else
 vgprDiff += Size;
 } else {
 LLVM_DEBUG(dbgs() << "s\n");
- if (Exp.bHoist)
+ if (Exp.IsHoist)
 sgprDiff -= Size;
 else
 sgprDiff += Size;
@@ -3981,15 +3961,15 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates,
 // Hoist into loop is not supported now.
for (auto outIt : Exp.outputLive) {
 unsigned Reg = outIt.first;
- bool bDomUser = false;
+ bool IsDomUser = false;
 for (MachineInstr &MI : MRI.use_nodbg_instructions(Reg)) {
 MachineBasicBlock *UserMBB = MI.getParent();
- if (pDT->dominates(MBB, UserMBB)) {
- bDomUser = true;
+ if (DT->dominates(MBB, UserMBB)) {
+ IsDomUser = true;
 break;
 }
 }
- if (bDomUser)
+ if (IsDomUser)
 continue;
 LaneBitmask outMask = outIt.second;
@@ -4019,7 +3999,7 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates,
 LaneBitmask MBBBeginMask;
 if (inputLive.find(Reg) != inputLive.end())
 MBBBeginMask = inputLive[Reg];
- // Check mask which not live in BeginSlot but live in exp input.
+ // Check mask which not live in BeginSlot but live in exp input.
 LaneBitmask profitMask = inMask & (~MBBBeginMask);
 if (profitMask.any()) {
 // Update input live to avoid count same input more than once.
@@ -4041,16 +4021,16 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates,
 }
 }
- if (bVOutBound && vgprDiff > 0)
+ if (IsVOutBound && vgprDiff > 0)
 continue;
- if (bSOutBound && sgprDiff > 0)
+ if (IsSOutBound && sgprDiff > 0)
 continue;
 llvm::mergeLiveRegSet(crossLive, newInput);
 vgpr += vgprDiff;
 sgpr += sgprDiff;
- if (bClone)
- Exp.bCloneOnly = true;
+ if (IsClone)
+ Exp.IsCloneOnly = true;
 }
 return std::make_pair(vgpr, sgpr);
@@ -4062,7 +4042,7 @@ void addExpCandidates(std::vector &subExpCandidates,
 subExpCandidates.insert(subExpCandidates.end(), subExps.begin(),
 subExps.end());
 for (auto &Exp : subExps) {
- if (Exp.bHoist) {
+ if (Exp.IsHoist) {
 for (auto &Reg : Exp.TopRegs) {
 usedRegs[Reg];
 }
@@ -4087,19 +4067,19 @@ bool tryToAddSubExps(
 GCNRPTracker::LiveRegSet &passThrus, GCNRPTracker::LiveRegSet &usedRegs,
 MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
 const SIInstrInfo *SIII, const MachineLoopInfo *MLI,
- SlotIndexes *slotIndexes, LiveIntervals *LIS, MachineDominatorTree *pDT,
- bool bCanClone, bool bVOutBound, bool bSOutBound,
- GCNRPTracker::LiveRegSet &unUsedPassThrus, bool bAllowPartialUseInSubExp) {
+ SlotIndexes *slotIndexes, LiveIntervals *LIS, MachineDominatorTree *DT,
+ bool IsCanClone, bool IsVOutBound, bool IsSOutBound,
+ GCNRPTracker::LiveRegSet &unUsedPassThrus, bool AllowPartialUseInSubExp) {
 std::vector partialSubExps = buildSubExpCandidates(
- Remat, Candidates, passThrus, MRI, SIRI, SIII, MLI, slotIndexes, pDT,
- bCanClone, bSOutBound, unUsedPassThrus, status.MemWriteMBBSet,
- bAllowPartialUseInSubExp);
+ Remat, Candidates, passThrus, MRI, SIRI, SIII, MLI, slotIndexes, DT,
+ IsCanClone, IsSOutBound, unUsedPassThrus, status.MemWriteMBBSet,
+ AllowPartialUseInSubExp);
 GCNRPTracker::LiveRegSet tmpSavingInputLive = savingInputLive;
 GCNRPTracker::LiveRegSet tmpSavingOutputLive = savingOutputLive;
 std::pair curSaving = calculateSaving(
 hotBB, partialSubExps, tmpSavingInputLive, tmpSavingOutputLive,
- bVOutBound, bSOutBound, bCanClone, pDT, MRI, SIRI);
+ IsVOutBound, IsSOutBound, IsCanClone, DT, MRI, SIRI);
 const int VLimit = status.TargetVLimit;
 const int SLimit = status.TargetSLimit;
@@ -4114,7 +4094,7 @@ bool tryToAddSubExps(
 }
 if (EnableSubExpAggressive) {
- // Build candidates from passThrus but not used in partialSubExps.
+ // Build candidates from passThrus but not used in partialSubExps.
 GCNRPTracker::LiveRegSet sinkUsedRegs;
 for (auto &Exp : partialSubExps) {
 for (auto &Reg : Exp.BottomRegs) {
@@ -4130,7 +4110,7 @@ bool tryToAddSubExps(
 if (usedRegs.count(Reg))
 continue;
 // Skip unsafe reg.
- if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*bSink*/ false)) {
+ if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*IsSink*/ false)) {
 LLVM_DEBUG(dbgs() << " is not safe to hoist\n");
 continue;
 }
@@ -4165,16 +4145,17 @@ bool tryToAddSubExps(
 auto subExps = buildSubExpFromCandidatesTopBottom(
 Remat, UseInMBB, UseMBB, SIRI, SIII, MRI, slotIndexes);
 for (SubExp &subExp : subExps) {
- if (!canHelpPressureWhenHoist(subExp, MRI, SIRI, SIII, MLI, bSOutBound))
+ if (!canHelpPressureWhenHoist(subExp, MRI, SIRI, SIII, MLI,
+ IsSOutBound))
 continue;
- subExp.bHoist = true;
+ subExp.IsHoist = true;
 hoistSubExpCandidates.emplace_back(subExp);
 }
 }
 std::pair hoistSaving = calculateSaving(
 hotBB, hoistSubExpCandidates, tmpSavingInputLive, tmpSavingOutputLive,
- bVOutBound, bSOutBound, bCanClone, pDT, MRI, SIRI);
+ IsVOutBound, IsSOutBound, IsCanClone, DT, MRI, SIRI);
 int hoistVgpr = vgpr + hoistSaving.first;
 int hoistSgpr = sgpr + hoistSaving.second;
@@ -4182,7 +4163,7 @@ bool tryToAddSubExps(
 if ((hoistVgpr <= VLimit && hoistSgpr <= SLimit) ||
 // If status not balance, do the remat even cannot reach target.
 // TODO: check the result not help even one occupancy.
- (!hoistSubExpCandidates.empty() && !status.bNotBalance &&
+ (!hoistSubExpCandidates.empty() && !status.NotBalance &&
 TargetOccupancy != 0)) {
 // nrmSubExps can help reach target occupancy, add it to
 // subExpCandidates.
@@ -4195,8 +4176,8 @@ bool tryToAddSubExps(
 if (EnableVmemDegree &&
 // Only expect vmem when last tryToAddSubExps.
- // If not, bAllowPartialUseInSubExp will no chance to be true.
- (bAllowPartialUseInSubExp || !EnableSubExpAggressive)) {
+ // If not, AllowPartialUseInSubExp will have no chance to be true.
+ (AllowPartialUseInSubExp || !EnableSubExpAggressive)) {
 // Assume vmemLdSize could be optimized by not parallel.
 if (((vgpr - hotBB.vmemLdInputSize) <= VLimit ||
 (vgpr - hotBB.vmemLdOutputSize) <= VLimit) &&
@@ -4218,11 +4199,11 @@ bool tryToAddSubExps(
 inBlockHotSInstMap, LIS, MRI, SIRI, SIII)) {
 // return false always when not allow partialUseInSubExp, it will try again
 // with partialUseInSubExp enabled.
- if (!bAllowPartialUseInSubExp)
+ if (!AllowPartialUseInSubExp)
 return false;
 // If status not balance, do the remat even cannot reach target.
 // TODO: check the result not help even one occupancy.
- if (!status.bNotBalance && TargetOccupancy == 0)
+ if (!status.NotBalance && TargetOccupancy == 0)
 return false;
 }
 // nrmSubExps can help reach target occupancy, add it to
@@ -4234,17 +4215,17 @@ bool tryToAddSubExps(
 // Remat passthru regs per hot block.
 // Reason to do it per block is to make sure passthru reuse is precise.
 // If try remat on all hot blocks together, the passthru might be on one block,
-// but the reuse in on another block which the reg is not passthru there.
+// but the reuse is on another block where the reg is not passthru.
 bool perBlockPassthruRemat(Remat *Remat, std::vector &hotBlocks,
 RematStatus &status,
 GCNRPTracker::LiveRegSet &liveRegCandidates,
 const GCNSubtarget *ST, LiveIntervals *LIS,
 const MachineLoopInfo *MLI,
- MachineDominatorTree *pDT, MachineRegisterInfo &MRI,
+ MachineDominatorTree *DT, MachineRegisterInfo &MRI,
 const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
- bool bUpdated = false;
- bool bCanClone = EnableSubExpClone || EnableSubExpAggressive;
+ bool IsUpdated = false;
+ bool IsCanClone = EnableSubExpClone || EnableSubExpAggressive;
 SlotIndexes *slotIndexes = LIS->getSlotIndexes();
 // Sort hot blocks by pressure first.
@@ -4285,15 +4266,15 @@ bool perBlockPassthruRemat(Remat *Remat, std::vector &hotBlocks,
 const int PressureDelta = -1;
 int vgpr = it.maxPressures.first - PressureDelta;
 int sgpr = it.maxPressures.second;
- bool bVOutBound = vgpr > VLimit;
- bool bSOutBound = sgpr > SLimit;
+ bool IsVOutBound = vgpr > VLimit;
+ bool IsSOutBound = sgpr > SLimit;
 // savingInputLive is used to calculate saving which will be modified to
 // avoid count same input multiple times.
 GCNRPTracker::LiveRegSet savingInputLive = inputLive;
 GCNRPTracker::LiveRegSet savingOutputLive = outputLive;
 std::pair curSaving =
 calculateSaving(it, subExpCandidates, savingInputLive, savingOutputLive,
- bVOutBound, bSOutBound, bCanClone, pDT, MRI, SIRI);
+ IsVOutBound, IsSOutBound, IsCanClone, DT, MRI, SIRI);
 vgpr += curSaving.first;
 sgpr += curSaving.second;
@@ -4304,7 +4285,7 @@ bool perBlockPassthruRemat(Remat *Remat, std::vector &hotBlocks,
 // Collect pass thru regs.
 GCNRPTracker::LiveRegSet passThrus =
 collectPassThrus(MBB, inputLive, outputLive, usedPassThrus,
- liveRegCandidates, MRI, bCanClone);
+ liveRegCandidates, MRI, IsCanClone);
 // Group pass thru regs by def MBB.
 SmallVector>
@@ -4314,34 +4295,34 @@ bool perBlockPassthruRemat(Remat *Remat, std::vector &hotBlocks,
 // subExp.
 GCNRPTracker::LiveRegSet unusedPassThrus;
 // Build exp dag on define blocks.
- bool bAllowPartialUseInSubExp = false;
+ bool AllowPartialUseInSubExp = false;
 if (tryToAddSubExps(
 Remat, it, status, subExpCandidates, inBlockCloneSubExps,
 inBlockHotVInstMap, inBlockHotSInstMap, Candidates, vgpr, sgpr,
 savingInputLive, savingOutputLive, passThrus, usedRegs, MRI, SIRI,
- SIII, MLI, slotIndexes, LIS, pDT, bCanClone, bVOutBound, bSOutBound,
- unusedPassThrus, bAllowPartialUseInSubExp)) {
+ SIII, MLI, slotIndexes, LIS, DT, IsCanClone, IsVOutBound,
+ IsSOutBound, unusedPassThrus, AllowPartialUseInSubExp)) {
 // Remove unusedPassThrus from passThrus first.
 llvm::andNotLiveRegSet(passThrus, unusedPassThrus);
 llvm::mergeLiveRegSet(usedPassThrus, passThrus);
 continue;
 }
 // If cannot clone, don't need to try partialUseInSubExp which must clone.
- if (!bCanClone)
+ if (!IsCanClone)
 return false;
- // Partial use subExp may result big alu count caused by clone.
+ // Partial use subExp may result in a big alu count caused by clone.
 // Only try it when enable aggressive remat.
 if (!EnableSubExpAggressive)
 return false;
- bAllowPartialUseInSubExp = true;
+ AllowPartialUseInSubExp = true;
 if (!tryToAddSubExps(
 Remat, it, status, subExpCandidates, inBlockCloneSubExps,
 inBlockHotVInstMap, inBlockHotSInstMap, Candidates, vgpr, sgpr,
 savingInputLive, savingOutputLive, passThrus, usedRegs, MRI, SIRI,
- SIII, MLI, slotIndexes, LIS, pDT, bCanClone, bVOutBound, bSOutBound,
- unusedPassThrus, bAllowPartialUseInSubExp)) {
+ SIII, MLI, slotIndexes, LIS, DT, IsCanClone, IsVOutBound,
+ IsSOutBound, unusedPassThrus, AllowPartialUseInSubExp)) {
 return false;
 }
 // Just merge all passThrus after tryToAddSubExps allow partialUseInSubExp.
@@ -4360,14 +4341,14 @@ bool perBlockPassthruRemat(Remat *Remat, std::vector &hotBlocks, if (Exp.SUnits.empty()) continue; LLVM_DEBUG(Exp.dump(MRI, SIRI)); - if (Exp.bHoist) { - ApplySubExpMoveNearDefine(Exp, MRI, pDT, slotIndexes, SIII, SIRI); + if (Exp.IsHoist) { + ApplySubExpMoveNearDefine(Exp, MRI, DT, slotIndexes, SIII, SIRI); } else { - if (Exp.bCloneOnly) - ApplySubExpCloneNearUser(Exp, hotBlocks, pDT, MRI, slotIndexes, SIII, + if (Exp.IsCloneOnly) + ApplySubExpCloneNearUser(Exp, hotBlocks, DT, MRI, slotIndexes, SIII, SIRI); else - ApplySubExpMoveNearUser(Exp, MRI, pDT, slotIndexes, SIII, SIRI); + ApplySubExpMoveNearUser(Exp, MRI, DT, slotIndexes, SIII, SIRI); } } @@ -4378,10 +4359,10 @@ bool perBlockPassthruRemat(Remat *Remat, std::vector &hotBlocks, } // Try to see possible occupancy could reach, then dicide a target. // Apply remat. - bUpdated = subExpCandidates.size(); + IsUpdated = subExpCandidates.size(); } - return bUpdated; + return IsUpdated; } int getVMemLdSize(MachineBasicBlock &MBB, const SIInstrInfo *SIII, @@ -4389,8 +4370,8 @@ int getVMemLdSize(MachineBasicBlock &MBB, const SIInstrInfo *SIII, int vmemLdSize = 0; // Collect vmemLd when enable split. for (MachineInstr &MI : MBB) { - bool bIsHighLatency = SIII->isHighLatencyInstruction(MI); - if (!bIsHighLatency) + bool IsHighLatency = SIII->isHighLatencyInstruction(MI); + if (!IsHighLatency) continue; if (!(MI.mayLoad() && // Skip case like atomic which not return value. @@ -4408,8 +4389,8 @@ int getVMemLdSize(MachineBasicBlock &MBB, const SIInstrInfo *SIII, } // namespace bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, - LiveIntervals *LIS, MachineDominatorTree *pDT, - MachinePostDominatorTree *pPDT, AliasAnalysis *AA) { + LiveIntervals *LIS, MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, AliasAnalysis *AA) { if (MF.size() < 2) return false; const GCNSubtarget *ST = &MF.getSubtarget(); @@ -4419,7 +4400,7 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, auto &MRI = MF.getRegInfo(); - RematStatus status = GetRematStatus(MF, MLI, LIS, MRI, ST); + RematStatus status = getRematStatus(MF, MLI, LIS, MRI, ST); const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second; if (status.TargetOcc >= MaxOcc) @@ -4431,20 +4412,20 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, int rematVCnt = status.MaxVPressure - VLimit; int rematSCnt = status.MaxSPressure - SLimit; - bool bSGPRSpill = false; + bool IsSGPRSpill = false; if (rematSCnt > 0) { - bSGPRSpill = nearSgprSpill(status.MaxSPressure, ST, MF); + IsSGPRSpill = nearSgprSpill(status.MaxSPressure, ST, MF); } // If bound by lds, skip. if ((status.TargetOcc + 1) > ST->getOccupancyWithWorkGroupSizes(MF).second && - !bSGPRSpill) + !IsSGPRSpill) return false; - bool bBothOutLimit = rematVCnt > 0 && rematSCnt > 0; + bool IsBothOutLimit = rematVCnt > 0 && rematSCnt > 0; // TODO: use check wqm and support vreg remat. - bool bCheckWQM = MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS; - rematVCnt = bCheckWQM & false; + bool IsCheckWQM = MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS; + rematVCnt = IsCheckWQM & false; // Remat on every hot block. 
@@ -4467,8 +4448,8 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, if (tryHoldPacifist(MBB, LIS, MRI, SIRI, SIII, AA, status)) { maxLocalVPressure = 0; maxLocalSPressure = 0; - CollectMBBPressure(MBB, LIS, MRI, ST, maxLocalVPressure, - maxLocalSPressure, status); + collectMBBPressure(MBB, LIS, ST, maxLocalVPressure, maxLocalSPressure, + status); maxLocalSPressure += RegForVCC; } @@ -4476,7 +4457,7 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, continue; // When both vgpr sgpr out limit, only help vgpr. - if (bBothOutLimit && maxLocalVPressure <= VLimit) + if (IsBothOutLimit && maxLocalVPressure <= VLimit) continue; GCNRPTracker::LiveRegSet liveSet; hotBlocks.push_back({&MBB, liveSet, @@ -4513,8 +4494,8 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, continue; if (Pred->empty()) continue; - bool bIsHighLatency = SIII->isHighLatencyInstruction(Pred->back()); - if (!bIsHighLatency) + bool IsHighLatency = SIII->isHighLatencyInstruction(Pred->back()); + if (!IsHighLatency) continue; int vmemLdSize = getVMemLdSize(*Pred, SIII, SIRI, MRI); it.vmemLdInputSize = vmemLdSize; @@ -4527,14 +4508,14 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, hotBlocks, LIS, MRI, SIRI, SIII, MLI)) { // Rebuild LIS. LIS->reanalyze(MF); - status = GetRematStatus(MF, MLI, LIS, MRI, ST); - bool bSgprSpilled = nearSgprSpill(status.MaxSPressure, ST, MF); - if (bSgprSpilled) { - bool bNearTarget = false; - hotBlockRemat(Remat, MF, MLI, LIS, pDT, pPDT, bNearTarget); + status = getRematStatus(MF, MLI, LIS, MRI, ST); + bool IsSgprSpilled = nearSgprSpill(status.MaxSPressure, ST, MF); + if (IsSgprSpilled) { + bool IsNearTarget = false; + hotBlockRemat(Remat, MF, MLI, LIS, DT, PDT, IsNearTarget); // Rebuild LIS. LIS->reanalyze(MF); - status = GetRematStatus(MF, MLI, LIS, MRI, ST); + status = getRematStatus(MF, MLI, LIS, MRI, ST); } for (auto &it : hotBlocks) { @@ -4586,11 +4567,11 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, } } - bool bUpdated = + bool IsUpdated = perBlockPassthruRemat(Remat, hotBlocks, status, liveRegCandidates, ST, - LIS, MLI, pDT, MRI, SIRI, SIII); + LIS, MLI, DT, MRI, SIRI, SIII); - return bUpdated; + return IsUpdated; } bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) { @@ -4623,21 +4604,21 @@ bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) { // LLVM_DEBUG(pressure::write_pressure(MF, LIS, R"(D:\Temp\d.json)")); // For non-cs/ps, set target occ as 4. - bool bNearTarget = false; - bool bFinalUpdated = false; - bool bUpdated = hotBlockRemat(this, MF, MLI, LIS, DT, PDT, bNearTarget); - bFinalUpdated |= bUpdated; + bool IsNearTarget = false; + bool IsFinalUpdated = false; + bool IsUpdated = hotBlockRemat(this, MF, MLI, LIS, DT, PDT, IsNearTarget); + IsFinalUpdated |= IsUpdated; if (EnableSubExp) { - if (bUpdated) { + if (IsUpdated) { // Rebuild LIS. 
LIS->reanalyze(MF); } - bUpdated = GroupRemat(this, MF, MLI, LIS, DT, PDT, AA); + IsUpdated = GroupRemat(this, MF, MLI, LIS, DT, PDT, AA); - bFinalUpdated |= bUpdated; + IsFinalUpdated |= IsUpdated; } - return bFinalUpdated; + return IsFinalUpdated; } INITIALIZE_PASS_BEGIN(AMDGPUHotBlockRematerialize, DEBUG_TYPE, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp index b133659d8fb66..be24bfce2851c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp @@ -102,7 +102,7 @@ void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI, Register Reg = MO.getReg(); if (!Reg.isVirtual()) { if (Reg == AMDGPU::SCC) - bTouchSCC = true; + IsTouchSCC = true; continue; } @@ -132,12 +132,12 @@ void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI, } } -bool SubExp::isSafeToMove(const MachineRegisterInfo &MRI, bool bMoveUp) const { - if (bMultiDefOutput) +bool SubExp::isSafeToMove(const MachineRegisterInfo &MRI, bool IsMoveUp) const { + if (IsMultiDefOutput) return false; - if (bHasTerminatorInst) + if (IsHasTerminatorInst) return false; - if (bUseIncomingReg) + if (IsUseIncomingReg) return false; // Input should be single def. @@ -150,8 +150,8 @@ bool SubExp::isSafeToMove(const MachineRegisterInfo &MRI, bool bMoveUp) const { ExpDag::ExpDag(const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI, const SIInstrInfo *SIII, - const bool bJoinInput) - : MRI(MRI), SIRI(SIRI), SIII(SIII), bJoinInputToSubExp(bJoinInput) {} + const bool IsJoinInput) + : MRI(MRI), SIRI(SIRI), SIII(SIII), IsJoinInputToSubExp(IsJoinInput) {} template void ExpDag::initNodes(const LiveSet &InputLiveReg, T &insts) { @@ -209,12 +209,12 @@ void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, passThruInputs.emplace_back(SU.NodeNum); continue; } - if (!bJoinInputToSubExp && !SU.isInstr()) + if (!IsJoinInputToSubExp && !SU.isInstr()) continue; // Join prev. for (SDep &PreDep : SU.Preds) { SUnit *PreSU = PreDep.getSUnit(); - if (!bJoinInputToSubExp && !PreSU->isInstr()) + if (!IsJoinInputToSubExp && !PreSU->isInstr()) continue; SubtreeClasses.join(SU.NodeNum, PreSU->NodeNum); } @@ -266,7 +266,7 @@ void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, continue; unsigned Reg = MO.getReg(); if (MRI.getLiveInPhysReg(Reg) || MRI.getLiveInVirtReg(Reg)) { - Exp.bUseIncomingReg = true; + Exp.IsUseIncomingReg = true; } } @@ -274,13 +274,13 @@ void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, if (SU.NumSuccsLeft == 0) { Exp.BottomRoots.insert(MI); if (MI->isTerminator()) - Exp.bHasTerminatorInst = true; + Exp.IsHasTerminatorInst = true; } if (MI->isNotDuplicable()) - Exp.bNotSafeToCopy = true; + Exp.IsNotSafeToCopy = true; // Skip Scalar mem access since no scalar store. if (MI->mayLoadOrStore() && !SIII->isSMRD(*MI)) { - Exp.bHasMemInst = true; + Exp.IsHasMemInst = true; } // Add bottom regs. for (MachineOperand &MO : MI->operands()) { @@ -295,16 +295,16 @@ void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, if (SU.NumSuccsLeft) { // For SU which has used in current blk. // Check if used in other blks or subExps. - bool bUsedInOtherBlk = false; + bool IsUsedInOtherBlk = false; for (auto &UserMI : MRI.use_nodbg_instructions(Reg)) { if (UserMI.getParent() != MBB) { - bUsedInOtherBlk = true; + IsUsedInOtherBlk = true; break; } auto suIt = MISUnitMap.find(&UserMI); // When UserMI is not in dag, treat it as other block. 
if (suIt == MISUnitMap.end()) { - bUsedInOtherBlk = true; + IsUsedInOtherBlk = true; break; } SUnit *UseSU = suIt->second; @@ -318,12 +318,12 @@ void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, break; } } - if (!bUsedInOtherBlk) + if (!IsUsedInOtherBlk) continue; } Exp.BottomRegs.insert(Reg); if (!MRI.getUniqueVRegDef(Reg)) { - Exp.bMultiDefOutput = true; + Exp.IsMultiDefOutput = true; } } } @@ -435,7 +435,7 @@ BlockExpDag::BlockExpDag(llvm::MachineBasicBlock *B, llvm::LiveIntervals *LIS, const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI, const llvm::SIInstrInfo *SIII) - : ExpDag(MRI, SIRI, SIII, /*bJoinInput*/ true), LIS(LIS), MBB(B) {} + : ExpDag(MRI, SIRI, SIII, /*IsJoinInput*/ true), LIS(LIS), MBB(B) {} void BlockExpDag::build() { auto *SlotIndexes = LIS->getSlotIndexes(); @@ -503,7 +503,7 @@ void BlockExpDag::buildAvail(const LiveSet &passThruSet, } } while (!WorkList.empty()) { - bool bUpdated = false; + bool IsUpdated = false; SmallVector ReadyNodes; for (SUnit *SU : WorkList) { if (SU->NumPredsLeft > 0) @@ -511,7 +511,7 @@ void BlockExpDag::buildAvail(const LiveSet &passThruSet, ReadyNodes.emplace_back(SU); // Ready, move it to Processed. Processed.insert(SU); - bUpdated = true; + IsUpdated = true; // Only update 1 node once. // Order of schedle here should not affect pressure. break; @@ -613,7 +613,7 @@ void BlockExpDag::buildPressure(const LiveSet &StartLiveReg, } while (!WorkList.empty()) { - bool bUpdated = false; + bool IsUpdated = false; SmallVector ReadyNodes; for (SUnit *SU : WorkList) { if (SU->NumSuccsLeft > 0) @@ -621,7 +621,7 @@ void BlockExpDag::buildPressure(const LiveSet &StartLiveReg, ReadyNodes.emplace_back(SU); // Ready, move it to Processed. Processed.insert(SU); - bUpdated = true; + IsUpdated = true; // Only update 1 node once. // Order of schedle here should not affect pressure. break; @@ -977,7 +977,7 @@ void HRB::buildLinear(std::vector &SUnits) { continue; if (ChainedNodes.count(SU) > 0) continue; - bRecomputeHeight = false; + IsRecomputeHeight = false; Lineage lineage = buildChain(SU, SUnits); // Remove chained nodes from worklist. @@ -992,7 +992,7 @@ void HRB::buildLinear(std::vector &SUnits) { Lineages.emplace_back(lineage); - if (bRecomputeHeight) { + if (IsRecomputeHeight) { // Update height from tail. SUnit *tail = lineage.Nodes.back(); tail->setDepthDirty(); @@ -1111,7 +1111,7 @@ SUnit *HRB::findHeir(SUnit *SU, std::vector &SUnits) { // Update height if need. 
unsigned Height = Succ->getHeight(); if (Height <= HeriHeight) { - bRecomputeHeight = true; + IsRecomputeHeight = true; } } return Heir; @@ -1345,9 +1345,9 @@ bool HRB::tryFuse(Lineage &a, Lineage &b, std::vector &SUnits) { void HRB::fusionLineages(std::vector &SUnits) { if (Lineages.empty()) return; - bool bUpdated = true; - while (bUpdated) { - bUpdated = false; + bool IsUpdated = true; + while (IsUpdated) { + IsUpdated = false; int size = Lineages.size(); for (int i = 0; i < size; i++) { Lineage &a = Lineages[i]; @@ -1359,7 +1359,7 @@ void HRB::fusionLineages(std::vector &SUnits) { if (b.length() == 0) continue; if (tryFuse(a, b, SUnits)) { - bUpdated = true; + IsUpdated = true; if (a.length() == 0) break; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h index a7d29430b4276..952126798b1de 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h @@ -28,17 +28,17 @@ struct SubExp { llvm::DenseSet TopRegs; llvm::DenseSet BottomRoots; llvm::DenseSet BottomRegs; - bool bMultiDefOutput = false; - bool bHasTerminatorInst = false; - bool bUseIncomingReg = false; - bool bMoveIntoLoop = false; - bool bNotSafeToCopy = false; - bool bHasMemInst = false; - bool bHoist = false; + bool IsMultiDefOutput = false; + bool IsHasTerminatorInst = false; + bool IsUseIncomingReg = false; + bool IsMoveIntoLoop = false; + bool IsNotSafeToCopy = false; + bool IsHasMemInst = false; + bool IsHoist = false; // If temp/out reg is used by inst not in the subExp, cannot move since not // all users will be move. But OK to clone. - bool bCloneOnly = false; - bool bTouchSCC = false; + bool IsCloneOnly = false; + bool IsTouchSCC = false; llvm::MachineBasicBlock *FromBB; llvm::MachineBasicBlock *ToBB; unsigned sInputSize; @@ -49,7 +49,7 @@ struct SubExp { unsigned vMaxSize; LiveSet inputLive; LiveSet outputLive; - bool isSafeToMove(const llvm::MachineRegisterInfo &MRI, bool bMoveUp) const; + bool isSafeToMove(const llvm::MachineRegisterInfo &MRI, bool IsMoveUp) const; void calcMaxPressure(const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI); void dump(const llvm::MachineRegisterInfo &MRI, @@ -59,11 +59,11 @@ struct SubExp { struct ExpDag { ExpDag(const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI, - const llvm::SIInstrInfo *SIII, const bool bJoinInput); + const llvm::SIInstrInfo *SIII, const bool IsJoinInput); const llvm::MachineRegisterInfo &MRI; const llvm::SIRegisterInfo *SIRI; const llvm::SIInstrInfo *SIII; - const bool bJoinInputToSubExp; + const bool IsJoinInputToSubExp; std::vector SUnits; ///< The scheduling units. 
llvm::DenseMap MISUnitMap; @@ -181,7 +181,7 @@ class HRB { llvm::DenseSet ChainedNodes; llvm::DenseMap> ReachMap; - bool bRecomputeHeight = false; + bool IsRecomputeHeight = false; std::vector Lineages; ColorResult Color; const llvm::MachineRegisterInfo &MRI; From 78ab7f34417b7207547f84c3a02ec7dbc939b0e8 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Thu, 13 Mar 2025 15:53:20 -0700 Subject: [PATCH 13/25] Batch 2 --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 2244 ++++++++--------- 1 file changed, 1105 insertions(+), 1139 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index ed7093f85823d..4c46cee69a038 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -102,7 +102,7 @@ typedef AMDGPUHotBlockRematerialize Remat; // Util functions. namespace { -MachineBasicBlock *NearestCommonDominator(MachineDominatorTree *DT, +MachineBasicBlock *nearestCommonDominator(MachineDominatorTree *DT, BlockSet &Blocks) { auto I = Blocks.begin(), E = Blocks.end(); @@ -181,7 +181,7 @@ findInsertBlock(MachineInstr &DefMI, unsigned Reg, MachineDominatorTree *DT, MachineBasicBlock *BB = *BBSet.begin(); if (BBSet.size() > 1) { - MachineBasicBlock *BDom = NearestCommonDominator(DT, BBSet); + MachineBasicBlock *BDom = nearestCommonDominator(DT, BBSet); if (!BDom) return nullptr; BB = BDom; @@ -194,7 +194,7 @@ findInsertBlock(MachineInstr &DefMI, unsigned Reg, MachineDominatorTree *DT, return nullptr; // If BB is already a hot block, move to BB will not help. - // hotBlockRemat will fail it when process BB. + // hotBlockRemat will fail It when process BB. // Must reachable from DefMI. if (!llvm::reach_block(DefMI.getParent(), DT, PDT, MLI, BB)) @@ -221,8 +221,8 @@ bool isSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) { unsigned OpNum = DefMI->getNumOperands(); // Only move DefMI which all operand is unique def. - for (unsigned i = 0; i < OpNum; i++) { - MachineOperand &Op = DefMI->getOperand(i); + for (unsigned I = 0; I < OpNum; I++) { + MachineOperand &Op = DefMI->getOperand(I); if (!Op.isReg()) continue; if (!MRI.getUniqueVRegDef(Op.getReg()) && @@ -257,7 +257,7 @@ struct RematStatus { unsigned MaxSPressure; unsigned InputPhysicalVPressure; unsigned InputPhysicalSPressure; - // More occupancy can help more than latency cost to reach it. + // More occupancy can help more than latency cost to reach It. bool MemBound; // abs(VTargetOcc-STargetOcc) > 1. 
bool NotBalance; @@ -273,7 +273,7 @@ struct RematStatus { unsigned collectMBBPressure(MachineBasicBlock &MBB, LiveIntervals *LIS, const GCNSubtarget *ST, unsigned &MaxVPressure, unsigned &MaxSPressure, RematStatus &Status) { - // Skip processing current block if it has only debug instructions + // Skip processing current block if It has only debug instructions if (MBB.getFirstNonDebugInstr() == MBB.end()) return ST->getOccupancyWithNumVGPRs(0); auto BBEnd = MBB.rbegin(); @@ -366,17 +366,17 @@ unsigned collectFnPressure(MachineFunction &MF, LiveIntervals *LIS, LLVM_DEBUG( const SIRegisterInfo *SIRI = ST->getRegisterInfo(); - dbgs() << "output live"; for (auto &it + dbgs() << "output live"; for (auto &It : Status.MBBOutputLiveMap) { - unsigned Idx = it.first->getNumber(); - auto LiveReg = it.second; + unsigned Idx = It.first->getNumber(); + auto LiveReg = It.second; dbgs() << "MBB" << Idx << ":"; llvm::dumpLiveSet(LiveReg, SIRI); } dbgs() << "input live"; - for (auto &it + for (auto &It : Status.MBBInputLiveMap) { - unsigned Idx = it.first->getNumber(); - auto LiveReg = it.second; + unsigned Idx = It.first->getNumber(); + auto LiveReg = It.second; dbgs() << "MBB" << Idx << ":"; llvm::dumpLiveSet(LiveReg, SIRI); }); @@ -548,7 +548,7 @@ void updateLiveInfo(MapVector &RematMap, if (Node.Kind == RematNode::RematKind::OneDefOneUse) { MachineBasicBlock *InsertBB = Node.InsertBlock; // If LiveInfo.BB is after InsertBB in Reverse post order, the def is - // still before LiveInfo.BB, it is still live. + // still before LiveInfo.BB, It is still live. unsigned LiveBBIndex = RPOTIndexMap[CurBB]; unsigned InsertBBIndex = RPOTIndexMap[InsertBB]; if (LiveBBIndex > InsertBBIndex) { @@ -607,8 +607,8 @@ int getSharedReducedSize(InstSet &ReducedInsts, bool IsVGPR, unsigned PrevMask = SharedRegIt->second.getAsInteger(); if (unsigned SharedMask = (PrevMask & Mask)) { // Some thing is shared. - for (int i = 0; i < MOSize; i++) { - if (SharedMask & (1 << i)) { + for (int I = 0; I < MOSize; I++) { + if (SharedMask & (1 << I)) { SharedSize += 1; } } @@ -637,7 +637,7 @@ int getReducedSize(MapVector &RematMap, if (Node.Kind == RematNode::RematKind::OneDefOneUse) { MachineBasicBlock *InsertBB = Node.InsertBlock; // If LiveInfo.BB is before InsertBB in Reverse post order, the def is - // moved after LiveInfo.BB, it is not live anymore. + // moved after LiveInfo.BB, It is not live anymore. unsigned LiveBBIndex = RPOTIndexMap[LiveInfo.BB]; unsigned InsertBBIndex = RPOTIndexMap[InsertBB]; if (LiveBBIndex < InsertBBIndex) @@ -706,7 +706,7 @@ int rematGain(MachineInstr *DefMI, unsigned Reg, const MachineRegisterInfo &MRI, } if (IsSingleDef) { - // The reg might share with other candidates, check it here. + // The reg might share with other candidates, check It here. // Count share reg in getReducedSize. if (EnableAggressive) { // In case of aggressive remat, treat multi use reg as shared reg and @@ -720,7 +720,7 @@ int rematGain(MachineInstr *DefMI, unsigned Reg, const MachineRegisterInfo &MRI, OpRC = SIRI->getSubRegisterClass(OpRC, SubIdx); } int InputSize = SIRI->getRegSizeInBits(*OpRC); - // If input not live in hotspot, move it cross hotspot should have + // If input not live in hotspot, move It cross hotspot should have // less reg then DefMi. if (RematSize > InputSize) { RematSize -= InputSize; @@ -789,7 +789,7 @@ void buildRematCandiates(std::vector &Candidates, // Sort by gain. 
std::sort(Candidates.begin(), Candidates.end(), - [](RematNode &i, RematNode &j) { return i.Size > j.Size; }); + [](RematNode &I, RematNode &J) { return I.Size > J.Size; }); } // For case like @@ -799,7 +799,7 @@ void buildRematCandiates(std::vector &Candidates, // xb.uniform %2489:sreg_32_xm0 = S_CSELECT_B32 %477:sreg_32_xm0, 16, implicit // killed $scc; xb.uniform // Sink S_AND right before S_CSELECT will overwrite SCC. -// To avoid it, skip case when DefMI and UseMI has implicit define use. +// To avoid It, skip case when DefMI and UseMI has implicit define use. bool isImplicitDefUse(MachineInstr *DefMI, MachineInstr *UseMI) { if (DefMI->getDesc().NumImplicitDefs == 0) return false; @@ -880,12 +880,12 @@ void addCloneCandidate(std::vector &CloneList, // Group user in same blocks. std::vector UserSetList(CloneList.size()); - for (size_t i = 0; i < CloneList.size(); i++) { - auto *Node = CloneList[i]; + for (size_t I = 0; I < CloneList.size(); I++) { + auto *Node = CloneList[I]; unsigned Reg = Node->Reg; MachineInstr *DefMI = Node->DefMI; // Group user in same blocks. - BlockSet &UserSet = UserSetList[i]; + BlockSet &UserSet = UserSetList[I]; for (auto UseIt = MRI.use_instr_nodbg_begin(Reg); UseIt != MRI.use_instr_nodbg_end();) { @@ -973,8 +973,8 @@ int filterRematCandiates(std::vector &Candidates, } void updateUsers(unsigned Reg, unsigned NewReg, bool IsSubRegDef, - SmallVector &userMIs) { - for (MachineInstr *UseMI : userMIs) { + SmallVector &UserMIs) { + for (MachineInstr *UseMI : UserMIs) { for (MachineOperand &MO : UseMI->operands()) { if (!MO.isReg()) continue; @@ -988,14 +988,14 @@ void updateUsers(unsigned Reg, unsigned NewReg, bool IsSubRegDef, } DenseMap reduceClonedMBBs( - unsigned Reg, BlockMap> &userBlocks, + unsigned Reg, BlockMap> &UserBlocks, DenseSet &UserMBBSet, - std::vector &hotBlocks, MachineDominatorTree *DT) { + std::vector &HotBlocks, MachineDominatorTree *DT) { // Collect hot blocks which Exp is live in. - DenseSet hotBlockSet; - for (BlockLiveInfo &hotBlock : hotBlocks) { - if (hotBlock.InputLive.count(Reg)) { - hotBlockSet.insert(hotBlock.BB); + DenseSet HotBlockSet; + for (BlockLiveInfo &HotBlock : HotBlocks) { + if (HotBlock.InputLive.count(Reg)) { + HotBlockSet.insert(HotBlock.BB); } } @@ -1003,19 +1003,19 @@ DenseMap reduceClonedMBBs( // the value not cross hotBlocks when later blocks are cloned. // For userBlocks which dominated by all hotBlocks, they could share clones // because once after hot block, the pressure is OK. - DenseSet afterHotRangeMBBs; + DenseSet AfterHotRangeMBBs; for (MachineBasicBlock *MBB : UserMBBSet) { // Always clone in hot block. - if (hotBlockSet.count(MBB)) + if (HotBlockSet.count(MBB)) continue; bool IsDomAllHotBlocks = true; bool IsDomedByAllHotBlocks = true; - for (MachineBasicBlock *hotMBB : hotBlockSet) { - if (!DT->dominates(MBB, hotMBB)) { + for (MachineBasicBlock *HotMBB : HotBlockSet) { + if (!DT->dominates(MBB, HotMBB)) { IsDomAllHotBlocks = false; } - if (!DT->dominates(hotMBB, MBB)) { + if (!DT->dominates(HotMBB, MBB)) { IsDomedByAllHotBlocks = false; } if (!IsDomAllHotBlocks && !IsDomedByAllHotBlocks) { @@ -1023,19 +1023,17 @@ DenseMap reduceClonedMBBs( } } if (IsDomAllHotBlocks) { - userBlocks.erase(MBB); + UserBlocks.erase(MBB); } else if (IsDomedByAllHotBlocks) { - afterHotRangeMBBs.insert(MBB); + AfterHotRangeMBBs.insert(MBB); } } // Split after hotRange block set by domtree. 
DenseMap DomMap; - if (!afterHotRangeMBBs.empty()) { - for (auto it : afterHotRangeMBBs) { - MachineBasicBlock *MBB = it; - for (auto it2 : afterHotRangeMBBs) { - MachineBasicBlock *MBB2 = it2; + if (!AfterHotRangeMBBs.empty()) { + for (MachineBasicBlock *MBB : AfterHotRangeMBBs) { + for (MachineBasicBlock *MBB2 : AfterHotRangeMBBs) { if (MBB == MBB2) continue; if (DT->dominates(MBB, MBB2)) { @@ -1046,13 +1044,12 @@ DenseMap reduceClonedMBBs( } } } - for (auto it : afterHotRangeMBBs) { - MachineBasicBlock *MBB = it; + for (MachineBasicBlock *MBB : AfterHotRangeMBBs) { auto &Dom = DomMap[MBB]; - for (MachineBasicBlock *domedMBB : Dom) { + for (MachineBasicBlock *DomedMBB : Dom) { // Remove domedMBB. - DomMap.erase(domedMBB); - UserMBBSet.erase(domedMBB); + DomMap.erase(DomedMBB); + UserMBBSet.erase(DomedMBB); } } } @@ -1062,7 +1059,7 @@ DenseMap reduceClonedMBBs( // Look for an earlier insert point if the InstructionToMove // writes to scc and scc is live at the CurrentInsertPoint. -static MachineBasicBlock::iterator AdjustInsertPointToAvoidSccSmash( +static MachineBasicBlock::iterator adjustInsertPointToAvoidSccSmash( MachineInstr *InstructionToMove, MachineBasicBlock *MBB, MachineBasicBlock::iterator CurrentInsertPoint, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { @@ -1078,7 +1075,7 @@ static MachineBasicBlock::iterator AdjustInsertPointToAvoidSccSmash( // Look for an earlier insert point if the SubExp // writes to scc and scc is live at the CurrentInsertPoint. -static MachineBasicBlock::iterator AdjustInsertPointForSubExpToAvoidSccSmash( +static MachineBasicBlock::iterator adjustInsertPointForSubExpToAvoidSccSmash( const SubExp &SubExpToMove, MachineBasicBlock *MBB, MachineBasicBlock::iterator CurrentInsertPoint, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { @@ -1092,7 +1089,7 @@ static MachineBasicBlock::iterator AdjustInsertPointForSubExpToAvoidSccSmash( } // Return trun if moving MI to Location will smash a live scc value. -static bool WillSmashSccAtLocation(MachineInstr *MI, MachineBasicBlock *MBB, +static bool willSmashSccAtLocation(MachineInstr *MI, MachineBasicBlock *MBB, MachineBasicBlock::iterator Location) { // It is ok to pass nullptr to `modifiesRegister` for TRI here since // SCC has no subreg/suprereg relationships. @@ -1100,8 +1097,8 @@ static bool WillSmashSccAtLocation(MachineInstr *MI, MachineBasicBlock *MBB, llvm::IsSccLiveAt(MBB, Location); } -void ApplyCloneRemat(Remat *Remat, RematNode &Node, - std::vector &hotBlocks, +void applyCloneRemat(Remat *Remat, RematNode &Node, + std::vector &HotBlocks, MachineDominatorTree *DT, MachineRegisterInfo &MRI, SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, MachineFunction &MF) { @@ -1125,18 +1122,18 @@ void ApplyCloneRemat(Remat *Remat, RematNode &Node, // Group user in same blocks. 
BlockMap> UserMap; DenseSet UserMBBSet; - for (auto useIt = MRI.use_instr_nodbg_begin(Reg); - useIt != MRI.use_instr_nodbg_end();) { - MachineInstr &UseMI = *(useIt++); + for (auto UseIt = MRI.use_instr_nodbg_begin(Reg); + UseIt != MRI.use_instr_nodbg_end();) { + MachineInstr &UseMI = *(UseIt++); UserMap[UseMI.getParent()].emplace_back(&UseMI); UserMBBSet.insert(UseMI.getParent()); } DenseMap DomMap = - reduceClonedMBBs(Reg, UserMap, UserMBBSet, hotBlocks, DT); + reduceClonedMBBs(Reg, UserMap, UserMBBSet, HotBlocks, DT); - for (auto useIt : UserMap) { - MachineBasicBlock *MBB = useIt.first; + for (auto UseIt : UserMap) { + MachineBasicBlock *MBB = UseIt.first; // Skip same block uses. if (MBB == DefMI->getParent()) { continue; @@ -1145,24 +1142,24 @@ void ApplyCloneRemat(Remat *Remat, RematNode &Node, if (UserMBBSet.count(MBB) == 0) continue; - unsigned NewReg = MRI.createVirtualRegister(RC); + Register NewReg = MRI.createVirtualRegister(RC); auto NewDef = BuildMI(MF, DL, Desc).addDef(NewReg); - for (unsigned i = 1; i < OpNum; i++) { - NewDef = NewDef.add(DefMI->getOperand(i)); + for (unsigned I = 1; I < OpNum; I++) { + NewDef = NewDef.add(DefMI->getOperand(I)); } - MachineInstr *InsertPointMI = useIt.second.front(); - SlotIndex lastSlot = SlotIndexes->getInstructionIndex(*InsertPointMI); + MachineInstr *InsertPointMI = UseIt.second.front(); + SlotIndex LastSlot = SlotIndexes->getInstructionIndex(*InsertPointMI); - for (MachineInstr *UseMI : useIt.second) { - SlotIndex slot = SlotIndexes->getInstructionIndex(*UseMI); - if (lastSlot > slot) { - lastSlot = slot; + for (MachineInstr *UseMI : UseIt.second) { + SlotIndex Slot = SlotIndexes->getInstructionIndex(*UseMI); + if (LastSlot > Slot) { + LastSlot = Slot; InsertPointMI = UseMI; } } - MachineBasicBlock::iterator InsertPoint = AdjustInsertPointToAvoidSccSmash( + MachineBasicBlock::iterator InsertPoint = adjustInsertPointToAvoidSccSmash( DefMI, InsertPointMI->getParent(), InsertPointMI, MRI, SIRI, SIII); for (MachineMemOperand *MO : DefMI->memoperands()) { @@ -1173,15 +1170,15 @@ void ApplyCloneRemat(Remat *Remat, RematNode &Node, SlotIndexes->insertMachineInstrInMaps(*NewDef); - SmallVector &userMIs = useIt.second; - updateUsers(Reg, NewReg, IsSubRegDef, userMIs); + SmallVector &UserMIs = UseIt.second; + updateUsers(Reg, NewReg, IsSubRegDef, UserMIs); // update users in dom MBBs. - auto domMapIt = DomMap.find(MBB); - if (domMapIt != DomMap.end()) { - for (MachineBasicBlock *UpdateMBB : domMapIt->second) { - SmallVector &userMIs = UserMap[UpdateMBB]; - updateUsers(Reg, NewReg, IsSubRegDef, userMIs); + auto DomMapIt = DomMap.find(MBB); + if (DomMapIt != DomMap.end()) { + for (MachineBasicBlock *UpdateMBB : DomMapIt->second) { + SmallVector &UserMIs = UserMap[UpdateMBB]; + updateUsers(Reg, NewReg, IsSubRegDef, UserMIs); } } @@ -1194,8 +1191,8 @@ void ApplyCloneRemat(Remat *Remat, RematNode &Node, } } -void ApplyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI, - SlotIndexes *slotIndexes, +void applyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI, + SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { MachineInstr *DefMI = Node.DefMI; @@ -1212,7 +1209,7 @@ void ApplyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI, MBB = Node.InsertBlock; } - InsertPoint = AdjustInsertPointToAvoidSccSmash(DefMI, MBB, InsertPoint, MRI, + InsertPoint = adjustInsertPointToAvoidSccSmash(DefMI, MBB, InsertPoint, MRI, SIRI, SIII); // Move instruction to new location. 
@@ -1220,33 +1217,33 @@ void ApplyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI, InsertPoint->getParent()->insert(InsertPoint, DefMI); // Update slot index. - slotIndexes->removeSingleMachineInstrFromMaps(*DefMI); - slotIndexes->insertMachineInstrInMaps(*DefMI); + SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI); + SlotIndexes->insertMachineInstrInMaps(*DefMI); } -void ApplyRemat(Remat *Remat, MapVector &RematMap, - std::vector &hotBlocks, - MachineDominatorTree *DT, SlotIndexes *slotIndexes, - MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - const SIInstrInfo *SIII, MachineFunction &MF) { +void applyRemat(Remat *Remat, MapVector &RematMap, + std::vector &HotBlocks, MachineDominatorTree *DT, + SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, + MachineFunction &MF) { std::vector UpdateList; - for (auto &it : RematMap) { - UpdateList.emplace_back(it.second); + for (auto &It : RematMap) { + UpdateList.emplace_back(It.second); } // Sort update list with slotIndex to make sure def moved before use. - // If use moved before def, it might not be the first use anymore. + // If use moved before def, It might not be the first use anymore. std::sort(UpdateList.begin(), UpdateList.end(), - [&slotIndexes](RematNode &i, RematNode &j) { - SlotIndex a = slotIndexes->getInstructionIndex(*i.DefMI); - SlotIndex b = slotIndexes->getInstructionIndex(*j.DefMI); - return a < b; + [&SlotIndexes](RematNode &I, RematNode &J) { + SlotIndex A = SlotIndexes->getInstructionIndex(*I.DefMI); + SlotIndex B = SlotIndexes->getInstructionIndex(*J.DefMI); + return A < B; }); for (RematNode &Node : UpdateList) { if (Node.Kind == RematNode::RematKind::OneDefOneUse) { - ApplyOneDefOneUseRemat(Node, MRI, slotIndexes, SIRI, SIII); + applyOneDefOneUseRemat(Node, MRI, SlotIndexes, SIRI, SIII); } else if (Node.Kind == RematNode::RematKind::Clone) { - ApplyCloneRemat(Remat, Node, hotBlocks, DT, MRI, slotIndexes, SIRI, SIII, + applyCloneRemat(Remat, Node, HotBlocks, DT, MRI, SlotIndexes, SIRI, SIII, MF); } } @@ -1255,8 +1252,8 @@ void ApplyRemat(Remat *Remat, MapVector &RematMap, void dumpRematMap(MapVector &RematMap, const SIRegisterInfo *SIRI) { dbgs() << "\n rematMap: \n"; - for (auto it : RematMap) { - int Reg = it.first; + for (auto It : RematMap) { + int Reg = It.first; dbgs() << printReg(Reg, SIRI); dbgs() << "\n"; } @@ -1308,29 +1305,29 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, auto &MRI = MF.getRegInfo(); bool IsUpdated = false; - RematStatus status = getRematStatus(MF, MLI, LIS, MRI, ST); + RematStatus Status = getRematStatus(MF, MLI, LIS, MRI, ST); const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second; - if (status.TargetOcc >= MaxOcc) + if (Status.TargetOcc >= MaxOcc) return false; - unsigned VLimit = status.TargetVLimit; - unsigned SLimit = status.TargetSLimit; + unsigned VLimit = Status.TargetVLimit; + unsigned SLimit = Status.TargetSLimit; - int rematSCnt = status.MaxSPressure - SLimit; + int RematSCnt = Status.MaxSPressure - SLimit; // when agressive sgpr remat, reserve some for allocation lost. 
if (EnableAggressive) - rematSCnt += NearTargetRegLimit; + RematSCnt += NearTargetRegLimit; bool IsSGPRSpill = false; - if (rematSCnt > 0) { - IsSGPRSpill = nearSgprSpill(status.MaxSPressure, ST, MF); + if (RematSCnt > 0) { + IsSGPRSpill = nearSgprSpill(Status.MaxSPressure, ST, MF); } - bool IsForceRematSgpr = IsSGPRSpill | status.NotBalance; + const bool IsForceRematSgpr = IsSGPRSpill || Status.NotBalance; // If bound by lds, skip. - if (status.TargetOcc > ST->getOccupancyWithWorkGroupSizes(MF).second && + if (Status.TargetOcc > ST->getOccupancyWithWorkGroupSizes(MF).second && !IsForceRematSgpr) return false; @@ -1343,27 +1340,27 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, MapVector SRematMap; // Reg which cannot move around to remat. DenseSet PinnedRegSet; - std::vector hotBlocks; - for (auto it = po_begin(EntryMBB); it != po_end(EntryMBB); it++) { - MachineBasicBlock *MBB = *it; - auto &RP = status.MBBPressureMap[MBB]; + std::vector HotBlocks; + for (auto It = po_begin(EntryMBB); It != po_end(EntryMBB); It++) { + MachineBasicBlock *MBB = *It; + auto &RP = Status.MBBPressureMap[MBB]; // ignore block not hot. - if (RP.getVGPRNum(ST->hasGFX90AInsts()) < status.TargetVLimit && - (RP.getMaxSGPR() + RegForVCC + status.InputPhysicalSPressure) < - status.TargetSLimit) + if (RP.getVGPRNum(ST->hasGFX90AInsts()) < Status.TargetVLimit && + (RP.getMaxSGPR() + RegForVCC + Status.InputPhysicalSPressure) < + Status.TargetSLimit) continue; // Collect reg pressure. - unsigned maxVPressure = 0; - unsigned maxSPressure = 0; - const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[MBB]; + unsigned MaxVPressure = 0; + unsigned MaxSPressure = 0; + const GCNRPTracker::LiveRegSet InputLive = Status.MBBInputLiveMap[MBB]; - const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[MBB]; + const GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[MBB]; LLVM_DEBUG( - dumpHotBlock(inputLive, VRematMap, SRematMap, MBB->getNumber(), SIRI)); + dumpHotBlock(InputLive, VRematMap, SRematMap, MBB->getNumber(), SIRI)); GCNDownwardRPTracker Tracker(*LIS); - Tracker.reset(*MBB->begin(), &inputLive); + Tracker.reset(*MBB->begin(), &InputLive); for (MachineInstr &MI : *MBB) { if (MI.isDebugInstr()) @@ -1371,30 +1368,30 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, Tracker.advance(); auto LISLR = Tracker.getLiveRegs(); // Update live set for things already remated. - updateLiveInfo(VRematMap, LISLR, inputLive, MBB, RPOTIndexMap); - updateLiveInfo(SRematMap, LISLR, inputLive, MBB, RPOTIndexMap); + updateLiveInfo(VRematMap, LISLR, InputLive, MBB, RPOTIndexMap); + updateLiveInfo(SRematMap, LISLR, InputLive, MBB, RPOTIndexMap); - const GCNRPTracker::LiveRegSet &liveSet = LISLR; + const GCNRPTracker::LiveRegSet &LiveSet = LISLR; unsigned VPressure = 0; unsigned SPressure = 0; - CollectLiveSetPressure(liveSet, MRI, SIRI, VPressure, SPressure); - if (maxVPressure < VPressure) - maxVPressure = VPressure; - if (maxSPressure < SPressure) - maxSPressure = SPressure; - } - maxSPressure += RegForVCC + status.InputPhysicalSPressure; - if (maxVPressure <= VLimit && maxSPressure <= SLimit) + CollectLiveSetPressure(LiveSet, MRI, SIRI, VPressure, SPressure); + if (MaxVPressure < VPressure) + MaxVPressure = VPressure; + if (MaxSPressure < SPressure) + MaxSPressure = SPressure; + } + MaxSPressure += RegForVCC + Status.InputPhysicalSPressure; + if (MaxVPressure <= VLimit && MaxSPressure <= SLimit) continue; // Build block live info. 
// Use outputLive for EntryMBB. - BlockLiveInfo LiveInfo = {MBB, maxSPressure, maxVPressure, - MBB != EntryMBB ? inputLive : outputLive}; + BlockLiveInfo LiveInfo = {MBB, MaxSPressure, MaxVPressure, + MBB != EntryMBB ? InputLive : OutputLive}; // Skip entry block when save hotBlock to reduce clone because not clone in // entry block. if (MBB != EntryMBB) - hotBlocks.emplace_back(LiveInfo); + HotBlocks.emplace_back(LiveInfo); GCNRPTracker::LiveRegSet CandidateRegs = LiveInfo.InputLive; // Update reg pressure based on remat list. @@ -1406,18 +1403,18 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveInfo, RPOTIndexMap); // Calculate size need to be remat. - int rematVCnt = maxVPressure - VReduced - VLimit; - int rematSCnt = maxSPressure - SReduced - SLimit; + int RematVCnt = MaxVPressure - VReduced - VLimit; + int RematSCnt = MaxSPressure - SReduced - SLimit; bool IsSGPRSpill = false; - if (rematSCnt > 0) { - IsSGPRSpill = nearSgprSpill(maxSPressure, ST, MF); + if (RematSCnt > 0) { + IsSGPRSpill = nearSgprSpill(MaxSPressure, ST, MF); } - bool IsForceRematSgpr = IsSGPRSpill || status.NotBalance; + bool IsForceRematSgpr = IsSGPRSpill || Status.NotBalance; // Try to add candidates into remat list. - int newRematSCnt = 0; - if (rematSCnt > 0) { + int NewRematSCnt = 0; + if (RematSCnt > 0) { // Build candidate nodes. std::vector SRematCandidates; buildRematCandiates(SRematCandidates, CandidateRegs, PinnedRegSet, MRI, @@ -1426,19 +1423,19 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LLVM_DEBUG(dumpCandidates(SRematCandidates, MBB->getNumber(), SIRI)); std::vector SRematList; // Filter candidates. - newRematSCnt = filterRematCandiates(SRematCandidates, SRematList, + NewRematSCnt = filterRematCandiates(SRematCandidates, SRematList, PinnedRegSet, DT, PDT, MLI, MRI, - /*IsVGPR*/ false, status.MemBound); - if (newRematSCnt > rematSCnt) { + /*IsVGPR*/ false, Status.MemBound); + if (NewRematSCnt > RematSCnt) { // Has enough remat node to cover rematCnt. - int rematCnt = 0; + int RematCnt = 0; for (RematNode &Node : SRematList) { SRematMap[Node.Reg] = Node; - rematCnt += Node.Size; - if (rematCnt > rematSCnt && !EnableAggressive) + RematCnt += Node.Size; + if (RematCnt > RematSCnt && !EnableAggressive) break; } - newRematSCnt = 0; + NewRematSCnt = 0; } else { for (RematNode &Node : SRematList) { @@ -1447,8 +1444,8 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, // Check shared size. int SharedReducedSize = getSharedReducedSize(SReducedInsts, /*IsVGPR*/ false, MRI, SIRI); - if (((newRematSCnt + SharedReducedSize) + (int)NearTargetRegLimit) >= - rematSCnt) { + if (((NewRematSCnt + SharedReducedSize) + (int)NearTargetRegLimit) >= + RematSCnt) { for (RematNode &Node : SRematList) { SRematMap[Node.Reg] = Node; } @@ -1477,14 +1474,14 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(Reg); if (UseMI.getParent() != MBB) continue; - int gain = rematGain(&MI, Reg, MRI, SIRI, + int Gain = rematGain(&MI, Reg, MRI, SIRI, /*IsVGPR*/ false); - if (gain > 0) { + if (Gain > 0) { // Skip case when DefMI has implicit define which used by UseMI. 
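// A small self-contained sketch of the per-block budget math and the greedy
// fill above, with made-up numbers (the pressure/limit values are illustrative
// only, and the EnableAggressive path, which keeps adding nodes instead of
// breaking early, is omitted). ToyRematNode is a stand-in, not RematNode.
#include <vector>

namespace remat_budget_sketch {

struct ToyRematNode {
  unsigned Reg;
  int Size; // number of 32-bit registers this node frees in the hot block
};

// Mirrors the loop that fills SRematMap until RematCnt exceeds RematSCnt.
inline std::vector<ToyRematNode>
pickNodes(const std::vector<ToyRematNode> &Candidates, int RematSCnt) {
  std::vector<ToyRematNode> Picked;
  int RematCnt = 0;
  for (const ToyRematNode &Node : Candidates) {
    Picked.push_back(Node);
    RematCnt += Node.Size;
    if (RematCnt > RematSCnt) // enough pressure saved, stop early
      break;
  }
  return Picked;
}

inline void example() {
  // Say the block peaks at 112 SGPRs, already-remated values save 4, and the
  // target limit is 104: RematSCnt = 112 - 4 - 104 = 4 registers to recover.
  int MaxSPressure = 112, SReduced = 4, SLimit = 104;
  int RematSCnt = MaxSPressure - SReduced - SLimit;
  std::vector<ToyRematNode> Candidates = {{10, 2}, {11, 2}, {12, 1}};
  auto Picked = pickNodes(Candidates, RematSCnt); // picks all three: 2+2+1 > 4
  (void)Picked;
}

} // namespace remat_budget_sketch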
if (isImplicitDefUse(&MI, &UseMI)) { continue; } - RematNode Node = {Reg, &MI, (unsigned)gain >> 5}; + RematNode Node = {Reg, &MI, (unsigned)Gain >> 5}; Node.InsertPointMI = &UseMI; Node.Kind = RematNode::RematKind::OneDefOneUse; SRematMap[Reg] = Node; @@ -1492,7 +1489,7 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, } } } - newRematSCnt = rematSCnt - newRematSCnt - SharedReducedSize; + NewRematSCnt = RematSCnt - NewRematSCnt - SharedReducedSize; } } // If works, continue. @@ -1503,17 +1500,17 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, // Apply the remat. int NewRematVCnt = 0; - if (rematVCnt > 0) { + if (RematVCnt > 0) { // TODO: V remat. } - bool NeedSRemat = rematSCnt > 0; - bool NeedVRemat = rematVCnt > 0; + bool NeedSRemat = RematSCnt > 0; + bool NeedVRemat = RematVCnt > 0; // If sgpr spill, always do remat. bool IsSRematOK = - (newRematSCnt <= 0 && !SRematMap.empty()) || IsForceRematSgpr; + (NewRematSCnt <= 0 && !SRematMap.empty()) || IsForceRematSgpr; bool IsVRematOK = - (status.NotBalance || NewRematVCnt <= 0) && !VRematMap.empty(); + (Status.NotBalance || NewRematVCnt <= 0) && !VRematMap.empty(); if (NeedSRemat && NeedVRemat) { if (IsVRematOK && IsSRematOK) { IsUpdated = true; @@ -1530,8 +1527,8 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, } } // TODO: what to do when cannot reach target? - if (newRematSCnt > 0) { - if ((unsigned)newRematSCnt <= NearTargetRegLimit) { + if (NewRematSCnt > 0) { + if ((unsigned)NewRematSCnt <= NearTargetRegLimit) { IsNearTarget = true; } else { if (!IsSGPRSpill) @@ -1546,7 +1543,7 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, if (!SRematMap.empty()) { IsUpdated = true; - ApplyRemat(Remat, SRematMap, hotBlocks, DT, SlotIndexes, MRI, SIRI, SIII, + applyRemat(Remat, SRematMap, HotBlocks, DT, SlotIndexes, MRI, SIRI, SIII, MF); LLVM_DEBUG(llvm::dbgs() << "after hotremat"; MF.print(dbgs());); } @@ -1567,7 +1564,7 @@ bool isPhyRegUniqueDef(unsigned Reg, const MachineRegisterInfo &MRI) { return DefMIs.size() == 1; } -static bool IsImplicitUseOfReg(const MachineOperand &MO, unsigned Reg) { +static bool isImplicitUseOfReg(const MachineOperand &MO, unsigned Reg) { if (!MO.isImplicit() || !MO.isUse() || !MO.isReg()) { return false; } @@ -1575,15 +1572,7 @@ static bool IsImplicitUseOfReg(const MachineOperand &MO, unsigned Reg) { return MO.getReg() == Reg; } -static bool IsImplicitDefOfReg(const MachineOperand &MO, unsigned Reg) { - if (!MO.isImplicit() || !MO.isDef() || !MO.isReg()) { - return false; - } - - return MO.getReg() == Reg; -} - -static bool IsSafeRematCandidateUser(const MachineInstr *UseMI, +static bool isSafeRematCandidateUser(const MachineInstr *UseMI, const SIInstrInfo *SIII) { // Make sure UseMI is not wqm like sample. if (SIII->isWQM(UseMI->getOpcode())) @@ -1628,17 +1617,17 @@ bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI, unsigned OpNum = DefMI->getNumOperands(); // Only move DefMI which all operand is unique def. 
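// Worked example of the `(unsigned)Gain >> 5` size used for the one-def-one-use
// node above. This assumes rematGain() reports its saving in bits, which is
// what the shift by 5 (a divide by 32) suggests; if that assumption is wrong,
// the constant below is only illustrative.
#include <cassert>

namespace remat_gain_sketch {

inline void gainExample() {
  unsigned GainInBits = 128;             // e.g. a 4 x 32-bit register tuple
  unsigned SizeInRegs = GainInBits >> 5; // 128 / 32 = 4 registers
  assert(SizeInRegs == 4);
}

} // namespace remat_gain_sketch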
- for (unsigned i = 0; i < OpNum; i++) { - MachineOperand &Op = DefMI->getOperand(i); + for (unsigned I = 0; I < OpNum; I++) { + MachineOperand &Op = DefMI->getOperand(I); if (!Op.isReg()) continue; Register OpReg = Op.getReg(); - if (IsImplicitUseOfReg(Op, AMDGPU::EXEC) || - IsImplicitUseOfReg(Op, AMDGPU::EXEC_LO)) + if (isImplicitUseOfReg(Op, AMDGPU::EXEC) || + isImplicitUseOfReg(Op, AMDGPU::EXEC_LO)) continue; - if (IsImplicitUseOfReg(Op, AMDGPU::MODE)) + if (isImplicitUseOfReg(Op, AMDGPU::MODE)) continue; - if (IsImplicitUseOfReg(Op, AMDGPU::M0) && isPhyRegUniqueDef(OpReg, MRI)) + if (isImplicitUseOfReg(Op, AMDGPU::M0) && isPhyRegUniqueDef(OpReg, MRI)) continue; // Alow unused scc define. if (Op.isImplicit() && Op.isDead() && Op.isDef()) @@ -1658,7 +1647,7 @@ bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI, } for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { - if (!IsSafeRematCandidateUser(&UseMI, SIII)) + if (!isSafeRematCandidateUser(&UseMI, SIII)) return false; } } @@ -1669,30 +1658,30 @@ bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI, std::vector buildSubExpFromCandidates( Remat *Remat, GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, - const MachineRegisterInfo &MRI, SlotIndexes *slotIndexes, - GCNRPTracker::LiveRegSet &unUsedPassThrus, bool AllowPartialUseInSubExp) { + const MachineRegisterInfo &MRI, SlotIndexes *SlotIndexes, + GCNRPTracker::LiveRegSet &UnusedPassThrus, bool AllowPartialUseInSubExp) { InstSet CandidateDefs; DenseSet RemovedCandidates; std::vector CandidateRegs; CandidateRegs.reserve(Candidates.size()); - for (auto it : Candidates) { - unsigned Reg = it.first; + for (auto It : Candidates) { + unsigned Reg = It.first; CandidateRegs.emplace_back(Reg); } // Sort candidate by defMI order to make sure defMI has dependent check after // all its dependent node. std::sort(CandidateRegs.begin(), CandidateRegs.end(), - [&MRI, &slotIndexes](const unsigned a, unsigned b) { - MachineInstr *MIa = MRI.getUniqueVRegDef(a); + [&MRI, &SlotIndexes](const unsigned A, unsigned B) { + MachineInstr *MIa = MRI.getUniqueVRegDef(A); - MachineInstr *MIb = MRI.getUniqueVRegDef(b); + MachineInstr *MIb = MRI.getUniqueVRegDef(B); // Later instr first. return !SlotIndex::isEarlierInstr( - slotIndexes->getInstructionIndex(*MIa), - slotIndexes->getInstructionIndex(*MIb)); + SlotIndexes->getInstructionIndex(*MIa), + SlotIndexes->getInstructionIndex(*MIb)); }); - // If Candidate def has user in MBB, add it when allow partial candidates. + // If Candidate def has user in MBB, add It when allow partial candidates. // And the subExp has the define could only be clone, cannot move cross blocks // because user in MBB. DenseSet PartialCandidates; @@ -1704,7 +1693,7 @@ std::vector buildSubExpFromCandidates( if (UseMI.getParent() == MI->getParent()) { if (UseMI.getNumExplicitDefs() == 1) { // Skip user which already in Candidates. - unsigned UserDefReg = UseMI.getOperand(0).getReg(); + Register UserDefReg = UseMI.getOperand(0).getReg(); if (Candidates.count(UserDefReg) > 0 && RemovedCandidates.count(UserDefReg) == 0) continue; @@ -1728,14 +1717,14 @@ std::vector buildSubExpFromCandidates( if (CandidateDefs.empty()) return std::vector(); for (unsigned Reg : RemovedCandidates) { - unUsedPassThrus[Reg] = Candidates[Reg]; + UnusedPassThrus[Reg] = Candidates[Reg]; Candidates.erase(Reg); } // iterate MBB backward. // add inst which only used for candidate defines. 
- for (auto it = MBB->rbegin(); it != MBB->rend(); it++) { - MachineInstr &MI = *it; + for (auto It = MBB->rbegin(); It != MBB->rend(); It++) { + MachineInstr &MI = *It; if (CandidateDefs.count(&MI) > 0) { continue; } @@ -1815,45 +1804,45 @@ std::vector buildSubExpFromCandidates( } // Build defs in order. - std::vector defs; - defs.reserve(CandidateDefs.size()); + std::vector Defs; + Defs.reserve(CandidateDefs.size()); for (MachineInstr &MI : *MBB) { if (CandidateDefs.count(&MI) == 0) continue; - defs.emplace_back(&MI); + Defs.emplace_back(&MI); } LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; for (MachineInstr *MI - : defs) { + : Defs) { MI->dump(); } dbgs() << "\nFinished Candidate Defs End\n";); // Build SubExp with CandidateDefs as Nodes, CandidateInput as input // Candidates as output. - ExpDag dag(MRI, SIRI, SIII, /*IsJoinInput*/ true); - dag.build(CandidateInput, Candidates, defs); + ExpDag Dag(MRI, SIRI, SIII, /*IsJoinInput*/ true); + Dag.build(CandidateInput, Candidates, Defs); if (AllowPartialUseInSubExp) { - for (auto &subExp : dag.SubExps) { - for (auto *MI : subExp.SUnits) { + for (auto &SubExp : Dag.SubExps) { + for (auto *MI : SubExp.SUnits) { if (PartialCandidates.count(MI)) { - subExp.IsCloneOnly = true; + SubExp.IsCloneOnly = true; break; } } } } - return dag.SubExps; + return Dag.SubExps; } std::vector buildSubExpFromCandidatesTopBottom( Remat *Remat, GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, - const MachineRegisterInfo &MRI, SlotIndexes *slotIndexes) { + const MachineRegisterInfo &MRI) { InstSet CandidateDefs; LLVM_DEBUG(dbgs() << "\nCandidate Defs:\n";); - for (auto it : Candidates) { - unsigned Reg = it.first; + for (auto It : Candidates) { + unsigned Reg = It.first; MachineInstr *MI = MRI.getUniqueVRegDef(Reg); for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { @@ -1893,8 +1882,8 @@ std::vector buildSubExpFromCandidatesTopBottom( // iterate MBB. GCNRPTracker::LiveRegSet LocalCandidates = Candidates; // add inst which only used by candidate defines. - for (auto it = MBB->begin(); it != MBB->end(); it++) { - MachineInstr &MI = *it; + for (auto It = MBB->begin(); It != MBB->end(); It++) { + MachineInstr &MI = *It; if (CandidateDefs.count(&MI) > 0) { for (MachineOperand &MO : MI.operands()) { if (!MO.isReg()) @@ -2003,33 +1992,33 @@ std::vector buildSubExpFromCandidatesTopBottom( } // Build defs in order. - std::vector defs; - defs.reserve(CandidateDefs.size()); + std::vector Defs; + Defs.reserve(CandidateDefs.size()); for (MachineInstr &MI : *MBB) { if (CandidateDefs.count(&MI) == 0) continue; - defs.emplace_back(&MI); + Defs.emplace_back(&MI); } LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; for (MachineInstr *MI - : defs) { + : Defs) { MI->dump(); } dbgs() << "\nFinished Candidate Defs End\n";); - LLVM_DEBUG(dbgs() << "\nLocalCandidates:\n"; for (auto it + LLVM_DEBUG(dbgs() << "\nLocalCandidates:\n"; for (auto It : LocalCandidates) { - pressure::print_reg(it.first, MRI, SIRI, llvm::dbgs()); + pressure::print_reg(It.first, MRI, SIRI, llvm::dbgs()); } dbgs() << "\nLocalCandidates End\n";); // Make sure all input reg are uniqueDef. // Input is Candidates, output is? // Build SubExp with CandidateDefs as Nodes, CandidateInput as input // Candidates as output. 
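// A minimal sketch of the backward "only used by already-collected candidate
// defs" scan above, on a toy instruction list (ToyInst is a hypothetical
// stand-in, not MachineInstr; physical registers, lane masks, and the partial
// use / IsCloneOnly handling of the real code are omitted). An instruction
// joins the candidate-def set when every user of every value it defines is
// already in the set, so the whole group can later move or clone as one subExp.
#include <map>
#include <set>
#include <vector>

namespace subexp_scan_sketch {

struct ToyInst {
  int Id;
  std::vector<int> Defs; // values this instruction defines
  std::vector<int> Uses; // values it reads
};

inline std::set<int> growCandidateDefs(const std::vector<ToyInst> &Block,
                                       std::set<int> CandidateDefs) {
  // Map each value to the ids of instructions that read it.
  std::multimap<int, int> UsersOf;
  for (const ToyInst &I : Block)
    for (int V : I.Uses)
      UsersOf.insert({V, I.Id});

  // Walk the block backward, mirroring the rbegin()/rend() loop above, so a
  // feeder is only considered after all of its users have been classified.
  for (auto It = Block.rbegin(); It != Block.rend(); ++It) {
    if (CandidateDefs.count(It->Id))
      continue;
    bool OnlyFeedsCandidates = !It->Defs.empty();
    for (int V : It->Defs)
      for (auto R = UsersOf.equal_range(V); R.first != R.second; ++R.first)
        if (!CandidateDefs.count(R.first->second))
          OnlyFeedsCandidates = false;
    if (OnlyFeedsCandidates)
      CandidateDefs.insert(It->Id);
  }
  return CandidateDefs;
}

} // namespace subexp_scan_sketch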
- ExpDag dag(MRI, SIRI, SIII, /*IsJoinInput*/ true); - dag.build(Candidates, LocalCandidates, defs); - return dag.SubExps; + ExpDag Dag(MRI, SIRI, SIII, /*IsJoinInput*/ true); + Dag.build(Candidates, LocalCandidates, Defs); + return Dag.SubExps; } -void print_vreg(Register Reg, const MachineRegisterInfo &MRI) { +void printVreg(Register Reg, const MachineRegisterInfo &MRI) { if (Reg.isVirtual()) { StringRef Name = MRI.getVRegName(Reg); if (Name != "") { @@ -2040,50 +2029,49 @@ void print_vreg(Register Reg, const MachineRegisterInfo &MRI) { } } -MachineBasicBlock *FindTargetBlock(unsigned Reg, MachineBasicBlock *FromBB, +MachineBasicBlock *findTargetBlock(unsigned Reg, MachineBasicBlock *FromBB, const MachineRegisterInfo &MRI, MachineDominatorTree *DT) { - BlockSet userBlocks; + BlockSet UserBlocks; for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { MachineBasicBlock *UserBB = UseMI.getParent(); // Skip current BB. if (UserBB != FromBB) - userBlocks.insert(UserBB); + UserBlocks.insert(UserBB); else // When has user in FromBB, userBlock will be FromBB. return nullptr; } - if (userBlocks.empty()) + if (UserBlocks.empty()) return nullptr; - MachineBasicBlock *userBlock = NearestCommonDominator(DT, userBlocks); - if (!DT->dominates(FromBB, userBlock)) { + MachineBasicBlock *UserBlock = nearestCommonDominator(DT, UserBlocks); + if (!DT->dominates(FromBB, UserBlock)) { return nullptr; } - if (userBlock == FromBB) + if (UserBlock == FromBB) return nullptr; - return userBlock; + return UserBlock; } -void ApplySubExpMoveNearUser(SubExp &Exp, const MachineRegisterInfo &MRI, +void applySubExpMoveNearUser(SubExp &Exp, const MachineRegisterInfo &MRI, MachineDominatorTree *DT, - SlotIndexes *slotIndexes, const SIInstrInfo *SIII, - const SIRegisterInfo *SIRI) { + SlotIndexes *SlotIndexes) { // Move from bottom. MachineBasicBlock *FromBB = Exp.FromBB; - for (auto it = Exp.SUnits.rbegin(); it != Exp.SUnits.rend(); it++) { - MachineInstr *DefMI = *it; + for (auto It = Exp.SUnits.rbegin(); It != Exp.SUnits.rend(); It++) { + MachineInstr *DefMI = *It; if (DefMI->getNumExplicitDefs() != 1) continue; - unsigned Reg = DefMI->getOperand(0).getReg(); - MachineBasicBlock *ToBB = FindTargetBlock(Reg, FromBB, MRI, DT); + Register Reg = DefMI->getOperand(0).getReg(); + MachineBasicBlock *ToBB = findTargetBlock(Reg, FromBB, MRI, DT); if (!ToBB) continue; // Do not overwrite a live scc. MachineBasicBlock::iterator InsertPoint = ToBB->SkipPHIsAndLabels(ToBB->begin()); - if (WillSmashSccAtLocation(DefMI, ToBB, InsertPoint)) + if (willSmashSccAtLocation(DefMI, ToBB, InsertPoint)) continue; DefMI->removeFromParent(); @@ -2094,14 +2082,13 @@ void ApplySubExpMoveNearUser(SubExp &Exp, const MachineRegisterInfo &MRI, if (DefMI->isDebugInstr()) continue; // Update slot index. - slotIndexes->removeSingleMachineInstrFromMaps(*DefMI); - slotIndexes->insertMachineInstrInMaps(*DefMI); + SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI); + SlotIndexes->insertMachineInstrInMaps(*DefMI); } } -void ApplySubExpMoveNearDefine(SubExp &Exp, MachineRegisterInfo &MRI, - MachineDominatorTree *DT, - SlotIndexes *slotIndexes, +void applySubExpMoveNearDefine(SubExp &Exp, MachineRegisterInfo &MRI, + SlotIndexes *SlotIndexes, const SIInstrInfo *SIII, const SIRegisterInfo *SIRI) { // Move from top. 
@@ -2119,11 +2106,11 @@ void ApplySubExpMoveNearDefine(SubExp &Exp, MachineRegisterInfo &MRI, Terminator = ToBB->end(); } - Terminator = AdjustInsertPointForSubExpToAvoidSccSmash(Exp, ToBB, Terminator, + Terminator = adjustInsertPointForSubExpToAvoidSccSmash(Exp, ToBB, Terminator, MRI, SIRI, SIII); - for (auto it = Exp.SUnits.begin(); it != Exp.SUnits.end(); it++) { - MachineInstr *DefMI = *it; + for (auto It = Exp.SUnits.begin(); It != Exp.SUnits.end(); It++) { + MachineInstr *DefMI = *It; if (DefMI->getNumExplicitDefs() != 1) continue; if (SIII->isEXP(DefMI->getOpcode())) @@ -2138,38 +2125,38 @@ void ApplySubExpMoveNearDefine(SubExp &Exp, MachineRegisterInfo &MRI, if (DefMI->isDebugInstr()) continue; // Update slot index. - slotIndexes->removeSingleMachineInstrFromMaps(*DefMI); - slotIndexes->insertMachineInstrInMaps(*DefMI); + SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI); + SlotIndexes->insertMachineInstrInMaps(*DefMI); } } -DenseSet buildCloneSet(ExpDag &dag, - DenseSet &dagBottoms, - GCNRPTracker::LiveRegSet &usedOutput) { - DenseSet copySet; - for (auto it = dag.SUnits.rbegin(); it != dag.SUnits.rend(); it++) { - SUnit &SU = *it; +DenseSet buildCloneSet(ExpDag &Dag, + DenseSet &DagBottoms, + GCNRPTracker::LiveRegSet &UsedOutput) { + DenseSet CopySet; + for (auto It = Dag.SUnits.rbegin(); It != Dag.SUnits.rend(); It++) { + SUnit &SU = *It; // Skip non-inst node. if (!SU.isInstr()) continue; MachineInstr *MI = SU.getInstr(); - if (dagBottoms.find(&SU) != dagBottoms.end()) { + if (DagBottoms.find(&SU) != DagBottoms.end()) { bool IsUsed = false; // For bottom SU, if in usedOutput, add to copySet; for (MachineOperand &DefMO : MI->defs()) { if (!DefMO.isReg()) continue; - unsigned Reg = DefMO.getReg(); - if (usedOutput.count(Reg) > 0) { + Register Reg = DefMO.getReg(); + if (UsedOutput.count(Reg) > 0) { IsUsed = true; break; } } if (IsUsed) { - copySet.insert(MI); + CopySet.insert(MI); continue; } - // bottom SU may still have succNode when it used both inExp and outExp. + // bottom SU may still have succNode when It used both inExp and outExp. // So continue check succNode. } @@ -2178,29 +2165,29 @@ DenseSet buildCloneSet(ExpDag &dag, for (SDep &SucDep : SU.Succs) { SUnit *SucSU = SucDep.getSUnit(); MachineInstr *SuccMI = SucSU->getInstr(); - if (copySet.count(SuccMI) > 0) { + if (CopySet.count(SuccMI) > 0) { IsSuccCopied = true; break; } } if (IsSuccCopied) - copySet.insert(MI); + CopySet.insert(MI); } - return copySet; + return CopySet; } -void updateUsers(SmallVector &userMIs, +void updateUsers(SmallVector &UserMIs, DenseMap &RegMap) { - for (MachineInstr *UserMI : userMIs) { + for (MachineInstr *UserMI : UserMIs) { for (MachineOperand &MO : UserMI->uses()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - auto it = RegMap.find(Reg); - if (it == RegMap.end()) + Register Reg = MO.getReg(); + auto It = RegMap.find(Reg); + if (It == RegMap.end()) continue; - unsigned NewReg = it->second; + unsigned NewReg = It->second; MO.setReg(NewReg); } } @@ -2208,24 +2195,24 @@ void updateUsers(SmallVector &userMIs, struct HotBlock { MachineBasicBlock *MBB = nullptr; - GCNRPTracker::LiveRegSet inputLive; - std::pair maxPressures; + GCNRPTracker::LiveRegSet InputLive; + std::pair MaxPressures; // Info about vmemLd. 
- int vmemLdInputSize; - int vmemLdOutputSize; + int VmemLdInputSize; + int VmemLdOutputSize; }; DenseMap reduceClonedMBBs( SubExp &Exp, - MapVector> &userBlocks, - DenseMap &userBlocksLiveRegs, - std::vector &hotBlocks, MachineDominatorTree *DT) { + MapVector> &UserBlocks, + DenseMap &UserBlocksLiveRegs, + std::vector &HotBlocks, MachineDominatorTree *DT) { // Collect hot blocks which Exp is live in. - DenseSet hotBlockSet; - for (HotBlock &hotBlock : hotBlocks) { + DenseSet HotBlockSet; + for (HotBlock &HotBlock : HotBlocks) { for (unsigned Reg : Exp.BottomRegs) { - if (hotBlock.inputLive.count(Reg)) { - hotBlockSet.insert(hotBlock.MBB); + if (HotBlock.InputLive.count(Reg)) { + HotBlockSet.insert(HotBlock.MBB); break; } } @@ -2235,20 +2222,20 @@ DenseMap reduceClonedMBBs( // the value not cross hotBlocks when later blocks are cloned. // For userBlocks which dominated by all hotBlocks, they could share clones // because once after hot block, the pressure is OK. - DenseSet afterHotRangeMBBs; - for (auto it : userBlocksLiveRegs) { - MachineBasicBlock *MBB = it.first; + DenseSet AfterHotRangeMBBs; + for (auto It : UserBlocksLiveRegs) { + MachineBasicBlock *MBB = It.first; // Always clone in hot block. - if (hotBlockSet.count(MBB)) + if (HotBlockSet.count(MBB)) continue; bool IsDomAllHotBlocks = true; bool IsDomedByAllHotBlocks = true; - for (MachineBasicBlock *hotMBB : hotBlockSet) { - if (!DT->dominates(MBB, hotMBB)) { + for (MachineBasicBlock *HotMBB : HotBlockSet) { + if (!DT->dominates(MBB, HotMBB)) { IsDomAllHotBlocks = false; } - if (!DT->dominates(hotMBB, MBB)) { + if (!DT->dominates(HotMBB, MBB)) { IsDomedByAllHotBlocks = false; } if (!IsDomAllHotBlocks && !IsDomedByAllHotBlocks) { @@ -2256,19 +2243,17 @@ DenseMap reduceClonedMBBs( } } if (IsDomAllHotBlocks) { - userBlocks.erase(MBB); + UserBlocks.erase(MBB); } else if (IsDomedByAllHotBlocks) { - afterHotRangeMBBs.insert(MBB); + AfterHotRangeMBBs.insert(MBB); } } // Split after hotRange block set by domtree. DenseMap DomMap; - if (!afterHotRangeMBBs.empty()) { - for (auto it : afterHotRangeMBBs) { - MachineBasicBlock *MBB = it; - for (auto it2 : afterHotRangeMBBs) { - MachineBasicBlock *MBB2 = it2; + if (!AfterHotRangeMBBs.empty()) { + for (MachineBasicBlock *MBB : AfterHotRangeMBBs) { + for (MachineBasicBlock *MBB2 : AfterHotRangeMBBs) { if (MBB == MBB2) continue; if (DT->dominates(MBB, MBB2)) { @@ -2279,16 +2264,15 @@ DenseMap reduceClonedMBBs( } } } - for (auto it : afterHotRangeMBBs) { - MachineBasicBlock *MBB = it; - auto &usedOutput = userBlocksLiveRegs[MBB]; + for (MachineBasicBlock *MBB : AfterHotRangeMBBs) { + auto &UsedOutput = UserBlocksLiveRegs[MBB]; auto &Dom = DomMap[MBB]; - for (MachineBasicBlock *domedMBB : Dom) { + for (MachineBasicBlock *DomedMBB : Dom) { // Merge domed use to MBB use. - mergeLiveRegSet(usedOutput, userBlocksLiveRegs[domedMBB]); + mergeLiveRegSet(UsedOutput, UserBlocksLiveRegs[DomedMBB]); // Remove domedMBB. 
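// A compact sketch of the user-block classification performed by
// reduceClonedMBBs above, with a stand-in dominates() relation (plain block
// ids and a caller-provided predicate instead of the real
// MachineDominatorTree). User blocks that dominate every hot block need no
// clone at all, because the pressure spike only happens later; user blocks
// dominated by every hot block sit past the hot range and may share one clone.
#include <functional>
#include <set>

namespace clone_reduce_sketch {

enum class CloneAction { NoCloneNeeded, CloneHere, ShareAfterHotRange };

inline CloneAction
classifyUserBlock(int UserBB, const std::set<int> &HotBlocks,
                  const std::function<bool(int, int)> &Dominates) {
  if (HotBlocks.count(UserBB))
    return CloneAction::CloneHere; // always clone inside a hot block
  bool DomAllHot = true, DomedByAllHot = true;
  for (int Hot : HotBlocks) {
    if (!Dominates(UserBB, Hot))
      DomAllHot = false;
    if (!Dominates(Hot, UserBB))
      DomedByAllHot = false;
  }
  if (DomAllHot)
    return CloneAction::NoCloneNeeded; // before the hot range; keep the def
  if (DomedByAllHot)
    return CloneAction::ShareAfterHotRange; // past the hot range; clones merge
  return CloneAction::CloneHere;
}

} // namespace clone_reduce_sketch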
- DomMap.erase(domedMBB); - userBlocksLiveRegs.erase(domedMBB); + DomMap.erase(DomedMBB); + UserBlocksLiveRegs.erase(DomedMBB); } } } @@ -2296,13 +2280,13 @@ DenseMap reduceClonedMBBs( return DomMap; } -void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, +void applySubExpCloneNearUser(SubExp &Exp, std::vector &HotBlocks, MachineDominatorTree *DT, MachineRegisterInfo &MRI, - SlotIndexes *slotIndexes, const SIInstrInfo *SIII, + SlotIndexes *SlotIndexes, const SIInstrInfo *SIII, const SIRegisterInfo *SIRI) { - MapVector> userBlocks; - DenseMap userBlocksLiveRegs; + MapVector> UserBlocks; + DenseMap UserBlocksLiveRegs; for (unsigned Reg : Exp.BottomRegs) { for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { MachineBasicBlock *UserBB = UseMI.getParent(); @@ -2310,36 +2294,36 @@ void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, if (UserBB == Exp.FromBB) continue; - userBlocks[UserBB].emplace_back(&UseMI); - auto &userLives = userBlocksLiveRegs[UserBB]; + UserBlocks[UserBB].emplace_back(&UseMI); + auto &UserLives = UserBlocksLiveRegs[UserBB]; for (MachineOperand &MO : UseMI.uses()) { if (!MO.isReg()) continue; - unsigned UseReg = MO.getReg(); + Register UseReg = MO.getReg(); if (Reg != UseReg) continue; - userLives[Reg] |= getRegMask(MO, MRI); + UserLives[Reg] |= getRegMask(MO, MRI); } } } // Build dag for SubExp to help remove unused inst when clone. - ExpDag dag(MRI, SIRI, SIII, /*IsJoinInput*/ true); - dag.build(Exp.inputLive, Exp.outputLive, Exp.SUnits); - DenseSet dagBottoms; - for (SUnit &SU : dag.SUnits) { + ExpDag Dag(MRI, SIRI, SIII, /*IsJoinInput*/ true); + Dag.build(Exp.inputLive, Exp.outputLive, Exp.SUnits); + DenseSet DagBottoms; + for (SUnit &SU : Dag.SUnits) { if (!SU.isInstr()) continue; if (SU.NumSuccs == 0) { - dagBottoms.insert(&SU); + DagBottoms.insert(&SU); } else { MachineInstr *MI = SU.getInstr(); // Add SU which def value in Exp.outputLive. for (MachineOperand &DefMO : MI->defs()) { if (!DefMO.isReg()) continue; - unsigned Reg = DefMO.getReg(); + Register Reg = DefMO.getReg(); if (Exp.BottomRegs.count(Reg) > 0) { - dagBottoms.insert(&SU); + DagBottoms.insert(&SU); break; } } @@ -2351,46 +2335,46 @@ void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, // For userBlocks which dominated by all hotBlocks, they could share clones // because once after hot block, the pressure is OK. DenseMap DomMap = - reduceClonedMBBs(Exp, userBlocks, userBlocksLiveRegs, hotBlocks, DT); + reduceClonedMBBs(Exp, UserBlocks, UserBlocksLiveRegs, HotBlocks, DT); // Sort to make stable order. std::sort( - userBlocks.begin(), userBlocks.end(), - [](std::pair> &it0, - std::pair> &it1) { - return it0.first->getNumber() < it1.first->getNumber(); + UserBlocks.begin(), UserBlocks.end(), + [](std::pair> &It0, + std::pair> &It1) { + return It0.first->getNumber() < It1.first->getNumber(); }); const bool IsModifiesScc = Exp.modifiesRegister(AMDGPU::SCC, SIRI); // Clone for each userBlocks. Not share clone thru dom tree which cannot help // reg pressure. - for (auto it : userBlocks) { - MachineBasicBlock *MBB = it.first; + for (auto It : UserBlocks) { + MachineBasicBlock *MBB = It.first; // Skip MBB which share clone from other MBBs. - if (userBlocksLiveRegs.count(MBB) == 0) + if (UserBlocksLiveRegs.count(MBB) == 0) continue; - auto &usedOutput = userBlocksLiveRegs[MBB]; - auto copySet = buildCloneSet(dag, dagBottoms, usedOutput); + auto &UsedOutput = UserBlocksLiveRegs[MBB]; + auto CopySet = buildCloneSet(Dag, DagBottoms, UsedOutput); // Clone to MBB. 
// Create new regs first. DenseMap RegMap; - auto insertPtr = MBB->getFirstNonPHI(); + auto InsertPtr = MBB->getFirstNonPHI(); // If Exp has scc read/write, make sure MBB not have scc in liveins. - if (IsModifiesScc && llvm::IsSccLiveAt(MBB, insertPtr)) + if (IsModifiesScc && llvm::IsSccLiveAt(MBB, InsertPtr)) continue; MachineFunction *MF = MBB->getParent(); - for (auto it = Exp.SUnits.begin(); it != Exp.SUnits.end(); it++) { - MachineInstr *DefMI = *it; + for (auto It = Exp.SUnits.begin(); It != Exp.SUnits.end(); It++) { + MachineInstr *DefMI = *It; // Not clone if already in MBB. if (DefMI->getParent() == MBB) continue; // Not clone if not used for MBB. - if (copySet.count(DefMI) == 0) + if (CopySet.count(DefMI) == 0) continue; auto ClonedMI = - BuildMI(*MBB, insertPtr, DefMI->getDebugLoc(), DefMI->getDesc()); + BuildMI(*MBB, InsertPtr, DefMI->getDebugLoc(), DefMI->getDesc()); for (MachineOperand &Def : DefMI->defs()) { Register Reg = Def.getReg(); @@ -2399,7 +2383,7 @@ void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, continue; ClonedMI.addDef(Reg, 0, Def.getSubReg()); } else { - unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg)); + Register NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg)); RegMap[Reg] = NewReg; ClonedMI.addDef(NewReg, 0, Def.getSubReg()); } @@ -2413,11 +2397,11 @@ void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, continue; ClonedMI.addReg(Reg, 0, MO.getSubReg()); } else { - auto it = RegMap.find(Reg); - if (it == RegMap.end()) { + auto It = RegMap.find(Reg); + if (It == RegMap.end()) { ClonedMI.addReg(Reg, 0, MO.getSubReg()); } else { - ClonedMI.addReg(it->second, 0, MO.getSubReg()); + ClonedMI.addReg(It->second, 0, MO.getSubReg()); } } } else { @@ -2426,7 +2410,7 @@ void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, } MachineInstr *NewDef = ClonedMI.getInstr(); - slotIndexes->insertMachineInstrInMaps(*NewDef); + SlotIndexes->insertMachineInstrInMaps(*NewDef); // Set mem operand for (MachineMemOperand *MO : DefMI->memoperands()) { NewDef->addMemOperand(*MF, MO); @@ -2434,43 +2418,43 @@ void ApplySubExpCloneNearUser(SubExp &Exp, std::vector &hotBlocks, } // update users in MBB. - SmallVector &userMIs = it.second; - updateUsers(userMIs, RegMap); + SmallVector &UserMIs = It.second; + updateUsers(UserMIs, RegMap); // update users in dom MBBs. - auto domMapIt = DomMap.find(MBB); - if (domMapIt != DomMap.end()) { - for (MachineBasicBlock *UpdateMBB : domMapIt->second) { - SmallVector &userMIs = userBlocks[UpdateMBB]; - updateUsers(userMIs, RegMap); + auto DomMapIt = DomMap.find(MBB); + if (DomMapIt != DomMap.end()) { + for (MachineBasicBlock *UpdateMBB : DomMapIt->second) { + SmallVector &UserMIs = UserBlocks[UpdateMBB]; + updateUsers(UserMIs, RegMap); } } } } -void ApplySubExpCloneNearUserInBlock( +void applySubExpCloneNearUserInBlock( SubExp &Exp, - DenseMap &inBlockHotVInstMap, - DenseMap &inBlockHotSInstMap, - MachineRegisterInfo &MRI, SlotIndexes *slotIndexes, const SIInstrInfo *SIII, + DenseMap &InBlockHotVInstMap, + DenseMap &InBlockHotSInstMap, + MachineRegisterInfo &MRI, SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI) { MachineBasicBlock *MBB = Exp.FromBB; MachineFunction *MF = MBB->getParent(); - MachineInstr *hotVMI = inBlockHotVInstMap[MBB]; - MachineInstr *hotSMI = inBlockHotSInstMap[MBB]; + MachineInstr *HotVMI = InBlockHotVInstMap[MBB]; + MachineInstr *HotSMI = InBlockHotSInstMap[MBB]; // Exp is build with hotVMI or hotSMI, cannot mix. 
- assert(!(hotVMI && hotSMI) && "cannot mix hot MI"); - MachineInstr *hotMI = hotVMI; - if (!hotMI) { - hotMI = hotSMI; + assert(!(HotVMI && HotSMI) && "cannot mix hot MI"); + MachineInstr *HotMI = HotVMI; + if (!HotMI) { + HotMI = HotSMI; } - SlotIndex hotSlot = slotIndexes->getInstructionIndex(*hotMI).getBaseIndex(); + SlotIndex HotSlot = SlotIndexes->getInstructionIndex(*HotMI).getBaseIndex(); const bool IsModifiesScc = Exp.modifiesRegister(AMDGPU::SCC, SIRI); for (unsigned Reg : Exp.BottomRegs) { - SmallVector useMIs; + SmallVector UseMIs; for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { MachineBasicBlock *UserBB = UseMI.getParent(); // Skip current BB. @@ -2479,40 +2463,40 @@ void ApplySubExpCloneNearUserInBlock( // Skip inst in Exp. if (Exp.BottomRoots.find(&UseMI) != Exp.BottomRoots.end()) continue; - SlotIndex useSlot = - slotIndexes->getInstructionIndex(UseMI).getBaseIndex(); + SlotIndex UseSlot = + SlotIndexes->getInstructionIndex(UseMI).getBaseIndex(); // Only clone for use after hot slot. - if (useSlot < hotSlot) + if (UseSlot < HotSlot) continue; // Do not overwrite a live scc. if (IsModifiesScc && llvm::IsSccLiveAt(UserBB, &UseMI)) continue; - useMIs.emplace_back(&UseMI); + UseMIs.emplace_back(&UseMI); } - if (useMIs.empty()) + if (UseMIs.empty()) continue; DenseMap RegMap; - std::sort(useMIs.begin(), useMIs.end(), - [&slotIndexes](const MachineInstr *MIa, const MachineInstr *MIb) { - return slotIndexes->getInstructionIndex(*MIa).getBaseIndex() < - slotIndexes->getInstructionIndex(*MIb).getBaseIndex(); + std::sort(UseMIs.begin(), UseMIs.end(), + [&SlotIndexes](const MachineInstr *MIa, const MachineInstr *MIb) { + return SlotIndexes->getInstructionIndex(*MIa).getBaseIndex() < + SlotIndexes->getInstructionIndex(*MIb).getBaseIndex(); }); - auto insertPtr = useMIs.front()->getIterator(); + auto InsertPtr = UseMIs.front()->getIterator(); - for (auto it = Exp.SUnits.begin(); it != Exp.SUnits.end(); it++) { - MachineInstr *DefMI = *it; + for (auto It = Exp.SUnits.begin(); It != Exp.SUnits.end(); It++) { + MachineInstr *DefMI = *It; auto ClonedMI = - BuildMI(*MBB, insertPtr, DefMI->getDebugLoc(), DefMI->getDesc()); + BuildMI(*MBB, InsertPtr, DefMI->getDebugLoc(), DefMI->getDesc()); for (MachineOperand &Def : DefMI->defs()) { Register Reg = Def.getReg(); if (Reg.isPhysical()) { ClonedMI.addDef(Reg, 0, Def.getSubReg()); } else { - unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg)); + Register NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg)); RegMap[Reg] = NewReg; ClonedMI.addDef(NewReg, 0, Def.getSubReg()); } @@ -2527,11 +2511,11 @@ void ApplySubExpCloneNearUserInBlock( if (Reg.isPhysical()) { ClonedMI.addReg(Reg, 0, MO.getSubReg()); } else { - auto it = RegMap.find(Reg); - if (it == RegMap.end()) { + auto It = RegMap.find(Reg); + if (It == RegMap.end()) { ClonedMI.addReg(Reg, 0, MO.getSubReg()); } else { - ClonedMI.addReg(it->second, 0, MO.getSubReg()); + ClonedMI.addReg(It->second, 0, MO.getSubReg()); } } } else { @@ -2540,55 +2524,55 @@ void ApplySubExpCloneNearUserInBlock( } MachineInstr *NewDef = ClonedMI.getInstr(); - slotIndexes->insertMachineInstrInMaps(*NewDef); + SlotIndexes->insertMachineInstrInMaps(*NewDef); // Set mem operand for (MachineMemOperand *MO : DefMI->memoperands()) { NewDef->addMemOperand(*MF, MO); } } // TODO: only clone to cross hot range. 
- for (MachineInstr *UseMI : useMIs) { + for (MachineInstr *UseMI : UseMIs) { for (MachineOperand &MO : UseMI->uses()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - auto it = RegMap.find(Reg); - if (it == RegMap.end()) + Register Reg = MO.getReg(); + auto It = RegMap.find(Reg); + if (It == RegMap.end()) continue; - unsigned NewReg = it->second; + Register NewReg = It->second; MO.setReg(NewReg); } } } } -bool isInLiveSet(unsigned Reg, LaneBitmask mask, - const GCNRPTracker::LiveRegSet &live) { - auto it = live.find(Reg); - if (it == live.end()) +bool isInLiveSet(unsigned Reg, LaneBitmask Mask, + const GCNRPTracker::LiveRegSet &Live) { + auto It = Live.find(Reg); + if (It == Live.end()) return false; - LaneBitmask liveMask = it->second; - return (liveMask | mask) == liveMask; + LaneBitmask LiveMask = It->second; + return (LiveMask | Mask) == LiveMask; } unsigned getPacifistLevel(unsigned Reg, - DenseMap &pacifistLevels, + DenseMap &PacifistLevels, const MachineRegisterInfo &MRI) { - unsigned level = 0; + unsigned Level = 0; for (MachineInstr &MI : MRI.def_instructions(Reg)) { - auto it = pacifistLevels.find(&MI); - if (it == pacifistLevels.end()) + auto It = PacifistLevels.find(&MI); + if (It == PacifistLevels.end()) continue; - level = it->second; + Level = It->second; } - return level; + return Level; } bool hasInBlockDef(unsigned Reg, MachineBasicBlock *MBB, const MachineRegisterInfo &MRI) { - for (MachineInstr &def : MRI.def_instructions(Reg)) { - if (def.getParent() != MBB) + for (MachineInstr &Def : MRI.def_instructions(Reg)) { + if (Def.getParent() != MBB) continue; return true; } @@ -2596,38 +2580,36 @@ bool hasInBlockDef(unsigned Reg, MachineBasicBlock *MBB, } MachineInstr *getInBlockUniqueDef(unsigned Reg, MachineBasicBlock *MBB, - const GCNRPTracker::LiveRegSet &inputLive, - const GCNRPTracker::LiveRegSet &outputLive, + const GCNRPTracker::LiveRegSet &InputLive, const MachineRegisterInfo &MRI) { MachineInstr *DefMI = nullptr; // If live as input for MBB, cannot be unique def. - if (inputLive.count(Reg)) + if (InputLive.count(Reg)) return DefMI; - for (MachineInstr &def : MRI.def_instructions(Reg)) { - if (def.getParent() != MBB) + for (MachineInstr &Def : MRI.def_instructions(Reg)) { + if (Def.getParent() != MBB) continue; if (DefMI) { // Not unique. DefMI = nullptr; break; } - DefMI = &def; + DefMI = &Def; } return DefMI; } -bool isPassThru(unsigned Reg, const GCNRPTracker::LiveRegSet &inputLive, - const GCNRPTracker::LiveRegSet &outputLive) { - return inputLive.count(Reg) && outputLive.count(Reg); +bool isPassThru(unsigned Reg, const GCNRPTracker::LiveRegSet &InputLive, + const GCNRPTracker::LiveRegSet &OutputLive) { + return InputLive.count(Reg) && OutputLive.count(Reg); } // Instructions which only use imm/passThru reg/output only reg will not kill // any live reg, so name them pacifist here. bool collectPacifist(MachineInstr &MI, - const GCNRPTracker::LiveRegSet &inputLive, - const GCNRPTracker::LiveRegSet &outputLive, - const MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI) { + const GCNRPTracker::LiveRegSet &InputLive, + const GCNRPTracker::LiveRegSet &OutputLive, + const MachineRegisterInfo &MRI) { // If has implicit def, not move. if (MI.getDesc().NumImplicitDefs != 0) return false; @@ -2645,16 +2627,15 @@ bool collectPacifist(MachineInstr &MI, if (Reg.isPhysical()) return false; // The def for reg must be unique def in block or pass thru which not has - // def in block. If not, it is not safe to move. 
- if (!(nullptr != getInBlockUniqueDef(Reg, MI.getParent(), inputLive, - outputLive, MRI) || - (isPassThru(Reg, inputLive, outputLive) && + // def in block. If not, It is not safe to move. + if (!(nullptr != getInBlockUniqueDef(Reg, MI.getParent(), InputLive, MRI) || + (isPassThru(Reg, InputLive, OutputLive) && !hasInBlockDef(Reg, MI.getParent(), MRI)))) return false; - LaneBitmask mask = llvm::getRegMask(MO, MRI); + LaneBitmask Mask = llvm::getRegMask(MO, MRI); - if (isInLiveSet(Reg, mask, outputLive)) + if (isInLiveSet(Reg, Mask, OutputLive)) continue; return false; @@ -2666,13 +2647,12 @@ bool collectPacifist(MachineInstr &MI, if (Reg.isPhysical()) return false; - if (nullptr == - getInBlockUniqueDef(Reg, MI.getParent(), inputLive, outputLive, MRI)) + if (nullptr == getInBlockUniqueDef(Reg, MI.getParent(), InputLive, MRI)) return false; IsHasDef = true; } - // If no def, it will not increase pressure, don't mark it. + // If no def, It will not increase pressure, don't mark It. return IsHasDef; } @@ -2696,103 +2676,102 @@ static MachineInstr *findPacifistInsertPoint(MachineInstr &MI, MachineBasicBlock &MBB, MachineRegisterInfo &MRI, AliasAnalysis *AA, - SlotIndexes *slotIndexes) { + SlotIndexes *SlotIndexes) { - SmallVector users; + SmallVector Users; // We cannot move the pacifist instruction past any memory - // op with which it aliases. Find the first instruction - // that aliases the pacifist MI (if any) and add it to the list + // op with which It aliases. Find the first instruction + // that aliases the pacifist MI (if any) and add It to the list // of users. The sort() below will select the earliest user instruction. if (MachineInstr *AliasMI = findFirstAliasingLoadOrStoreInMBB(MI, MBB, AA)) { - users.push_back(AliasMI); + Users.push_back(AliasMI); } for (MachineOperand &MO : MI.defs()) { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { if (&MBB != UseMI.getParent()) continue; - users.emplace_back(&UseMI); + Users.emplace_back(&UseMI); } } - if (users.empty()) + if (Users.empty()) return nullptr; - std::sort(users.begin(), users.end(), - [&slotIndexes](const MachineInstr *MIa, MachineInstr *MIb) { + std::sort(Users.begin(), Users.end(), + [&SlotIndexes](const MachineInstr *MIa, MachineInstr *MIb) { // Early instr first. return SlotIndex::isEarlierInstr( - slotIndexes->getInstructionIndex(*MIa), - slotIndexes->getInstructionIndex(*MIb)); + SlotIndexes->getInstructionIndex(*MIa), + SlotIndexes->getInstructionIndex(*MIb)); }); - return users.front(); + return Users.front(); } // Pacifist inst will only add pressure since they don't kill. // Try to hold them as late as possible in a MBB to help pressure. 
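// A minimal sketch of the sinking idea just described, on a toy block
// (std::list of ToyInst stand-ins instead of a MachineBasicBlock; the alias
// check via findFirstAliasingLoadOrStoreInMBB and the EXEC/SCC handling of the
// real pass are omitted). A pacifist instruction reads only values that stay
// live anyway, so delaying it until just before its first in-block user
// shortens the live range of its own def without killing anything else.
#include <iterator>
#include <list>
#include <vector>

namespace pacifist_sketch {

struct ToyInst {
  int Id;
  int DefValue;          // value this instruction defines (-1 if none)
  std::vector<int> Uses; // values it reads
};

inline void sinkPacifistToFirstUser(std::list<ToyInst> &Block,
                                    std::list<ToyInst>::iterator Pacifist) {
  // Find the first later instruction that reads the pacifist's def.
  for (auto It = std::next(Pacifist); It != Block.end(); ++It) {
    bool UsesDef = false;
    for (int V : It->Uses)
      if (V == Pacifist->DefValue)
        UsesDef = true;
    if (!UsesDef)
      continue;
    if (It == std::next(Pacifist))
      return; // already adjacent, nothing to gain
    Block.splice(It, Block, Pacifist); // move it right before its first user
    return;
  }
}

} // namespace pacifist_sketch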
bool tryHoldPacifist(MachineBasicBlock &MBB, LiveIntervals *LIS, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - const SIInstrInfo *SIII, AliasAnalysis *AA, - RematStatus &status) { - const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[&MBB]; - const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[&MBB]; + AliasAnalysis *AA, RematStatus &Status) { + const GCNRPTracker::LiveRegSet InputLive = Status.MBBInputLiveMap[&MBB]; + const GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[&MBB]; - SmallVector pacifistList; + SmallVector PacifistList; LLVM_DEBUG(dbgs() << "pacifist begin\n"); for (MachineInstr &MI : MBB) { if (MI.isDebugInstr()) continue; - if (collectPacifist(MI, inputLive, outputLive, MRI, SIRI)) { - pacifistList.emplace_back(&MI); + if (collectPacifist(MI, InputLive, OutputLive, MRI)) { + PacifistList.emplace_back(&MI); LLVM_DEBUG(MI.dump()); } } LLVM_DEBUG(dbgs() << "pacifist end\n"); - SlotIndexes *slotIndexes = LIS->getSlotIndexes(); + SlotIndexes *SlotIndexes = LIS->getSlotIndexes(); bool IsUpdated = false; // Move pacifist to its first user. // for (MachineInstr *MI : pacifistList) { - for (auto it = pacifistList.rbegin(); it != pacifistList.rend(); it++) { - MachineInstr *MI = *it; - MachineInstr *firstUser = - findPacifistInsertPoint(*MI, MBB, MRI, AA, slotIndexes); - if (firstUser == MI) + for (auto It = PacifistList.rbegin(); It != PacifistList.rend(); It++) { + MachineInstr *MI = *It; + MachineInstr *FirstUser = + findPacifistInsertPoint(*MI, MBB, MRI, AA, SlotIndexes); + if (FirstUser == MI) continue; - if (firstUser == MI->getNextNode()) + if (FirstUser == MI->getNextNode()) continue; - auto insertPoint = MBB.getFirstInstrTerminator(); - if (firstUser) { - insertPoint = firstUser->getIterator(); + auto InsertPoint = MBB.getFirstInstrTerminator(); + if (FirstUser) { + InsertPoint = FirstUser->getIterator(); } else { // When there's no terminator. - if (insertPoint == MBB.end()) - insertPoint--; + if (InsertPoint == MBB.end()) + InsertPoint--; else - // BRANCH may have exec update before it. - insertPoint--; - - insertPoint = - llvm::skipDebugInstructionsBackward(insertPoint, MBB.instr_begin()); - - while ((insertPoint->definesRegister(AMDGPU::EXEC, SIRI) || - insertPoint->definesRegister(AMDGPU::EXEC_LO, SIRI)) && - insertPoint != MI->getIterator()) { - insertPoint--; - insertPoint = - llvm::skipDebugInstructionsBackward(insertPoint, MBB.instr_begin()); + // BRANCH may have exec update before It. + InsertPoint--; + + InsertPoint = + llvm::skipDebugInstructionsBackward(InsertPoint, MBB.instr_begin()); + + while ((InsertPoint->definesRegister(AMDGPU::EXEC, SIRI) || + InsertPoint->definesRegister(AMDGPU::EXEC_LO, SIRI)) && + InsertPoint != MI->getIterator()) { + InsertPoint--; + InsertPoint = + llvm::skipDebugInstructionsBackward(InsertPoint, MBB.instr_begin()); } - if (insertPoint == MI->getIterator()) + if (InsertPoint == MI->getIterator()) continue; } // Do not overwrite a live scc. 
- if (WillSmashSccAtLocation(MI, &MBB, insertPoint)) + if (willSmashSccAtLocation(MI, &MBB, InsertPoint)) continue; MI->removeFromParent(); - MBB.insert(insertPoint, MI); + MBB.insert(InsertPoint, MI); LIS->handleMove(*MI); IsUpdated = true; @@ -2813,16 +2792,16 @@ collectUniformVgprs(Remat *Remat, MachineFunction &MF, MachineRegisterInfo &MRI, continue; if (MI.getNumDefs() != 1) continue; - unsigned dstIdx = + unsigned DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst); - if (dstIdx == (unsigned)-1) + if (DstIdx == (unsigned)-1) continue; - MachineOperand &DstMO = MI.getOperand(dstIdx); + MachineOperand &DstMO = MI.getOperand(DstIdx); if (DstMO.getSubReg() != 0) continue; if (DstMO.isTied()) continue; - unsigned Reg = DstMO.getReg(); + Register Reg = DstMO.getReg(); if (MRI.getUniqueVRegDef(Reg) == nullptr) continue; @@ -2839,22 +2818,21 @@ collectUniformVgprs(Remat *Remat, MachineFunction &MF, MachineRegisterInfo &MRI, return UniformMap; } -// Try insert readfirstlane on uniform vgpr to turn it in sgpr and save vgpr +// Try insert readfirstlane on uniform vgpr to turn It in sgpr and save vgpr // pressure. bool collectVToSCrossHotSpot( - MachineBasicBlock &MBB, RematStatus &status, + MachineBasicBlock &MBB, RematStatus &Status, DenseMap &UniformMap, - SmallMapVector &VToSMap, LiveIntervals *LIS) -{ - unsigned VLimit = status.TargetVLimit; - unsigned SLimit = status.TargetSLimit; + SmallMapVector &VToSMap, LiveIntervals *LIS) { + unsigned VLimit = Status.TargetVLimit; + unsigned SLimit = Status.TargetSLimit; auto &ST = MBB.getParent()->getSubtarget(); GCNDownwardRPTracker Tracker(*LIS); bool IsUpdated = false; - const auto inputLive = status.MBBInputLiveMap[&MBB]; - Tracker.reset(*MBB.begin(), &inputLive); + const auto InputLive = Status.MBBInputLiveMap[&MBB]; + Tracker.reset(*MBB.begin(), &InputLive); for (MachineInstr &MI : MBB) { if (MI.isDebugInstr()) { continue; @@ -2876,8 +2854,8 @@ bool collectVToSCrossHotSpot( // Try to make all possible vtos to reduce vpressure. const GCNRPTracker::LiveRegSet &CurLives = Tracker.getLiveRegs(); - for (auto it : CurLives) { - unsigned Reg = it.first; + for (auto It : CurLives) { + unsigned Reg = It.first; auto UniformIt = UniformMap.find(Reg); if (UniformIt == UniformMap.end()) continue; @@ -2889,53 +2867,53 @@ bool collectVToSCrossHotSpot( } // Return true if the user is outside of the def's loop. 
-static bool IsCrossLoopUse(MachineInstr *Def, MachineInstr *User, +static bool isCrossLoopUse(MachineInstr *Def, MachineInstr *User, MachineLoopInfo *MLI) { MachineLoop *L = MLI->getLoopFor(Def->getParent()); return L && !L->contains(User->getParent()); } -bool rematUniformVgprToSgpr( - Remat *Remat, MachineFunction &MF, RematStatus &status, - DenseMap &MBBPressureMap, - std::vector &hotBlocks, LiveIntervals *LIS, - MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - const SIInstrInfo *SIII, MachineLoopInfo *MLI) { +bool rematUniformVgprToSgpr(Remat *Remat, MachineFunction &MF, + RematStatus &Status, + std::vector &HotBlocks, + LiveIntervals *LIS, MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, + MachineLoopInfo *MLI) { DenseMap UniformVgprMap = collectUniformVgprs(Remat, MF, MRI, SIRI); SmallMapVector VToSMap; - for (auto &hotBlock : hotBlocks) { - MachineBasicBlock &MBB = *hotBlock.MBB; - collectVToSCrossHotSpot(MBB, status, UniformVgprMap, VToSMap, LIS); + for (auto &HotBlock : HotBlocks) { + MachineBasicBlock &MBB = *HotBlock.MBB; + collectVToSCrossHotSpot(MBB, Status, UniformVgprMap, VToSMap, LIS); } if (VToSMap.empty()) return false; - SlotIndexes *slotIndexes = LIS->getSlotIndexes(); + SlotIndexes *SlotIndexes = LIS->getSlotIndexes(); const MCInstrDesc &ReadFirstLaneDesc = SIII->get(AMDGPU::V_READFIRSTLANE_B32); - for (auto it : VToSMap) { - unsigned Reg = it.first; - MachineInstr *MI = it.second; + for (auto It : VToSMap) { + unsigned Reg = It.first; + MachineInstr *MI = It.second; auto *VRC = SIRI->getRegClassForReg(MRI, Reg); // TODO: support bigger vgpr to sgpr. if (VRC != &AMDGPU::VGPR_32RegClass) continue; auto *NewRC = SIRI->getEquivalentSGPRClass(VRC); - unsigned newDst = MRI.createVirtualRegister(NewRC); + Register NewDst = MRI.createVirtualRegister(NewRC); auto ReadFirstLane = - BuildMI(MF, MI->getDebugLoc(), ReadFirstLaneDesc, newDst); - SmallVector userMIs; - for (MachineInstr &userMI : MRI.use_nodbg_instructions(Reg)) { + BuildMI(MF, MI->getDebugLoc(), ReadFirstLaneDesc, NewDst); + SmallVector UserMIs; + for (MachineInstr &UserMI : MRI.use_nodbg_instructions(Reg)) { // Do not replace v->s across loops. Even if the value is uniform // branch divergence can cause a uniform value in a loop to be // non-uniform when used outside a loop. 
- if (IsSafeRematCandidateUser(&userMI, SIII) && - !IsCrossLoopUse(MI, &userMI, MLI)) - userMIs.emplace_back(&userMI); + if (isSafeRematCandidateUser(&UserMI, SIII) && + !isCrossLoopUse(MI, &UserMI, MLI)) + UserMIs.emplace_back(&UserMI); } // Finish readfirstlane @@ -2945,32 +2923,32 @@ bool rematUniformVgprToSgpr( Remat->SafeToRemoveInsts.insert(VToSMI); MachineBasicBlock *MBB = MI->getParent(); MBB->insertAfter(MI->getIterator(), VToSMI); - slotIndexes->insertMachineInstrInMaps(*VToSMI); + SlotIndexes->insertMachineInstrInMaps(*VToSMI); - for (MachineInstr *userMI : userMIs) { - const auto &Desc = userMI->getDesc(); + for (MachineInstr *UserMI : UserMIs) { + const auto &Desc = UserMI->getDesc(); bool IsIllegal = false; - for (unsigned i = 0; i < userMI->getNumOperands(); i++) { - MachineOperand &MO = userMI->getOperand(i); + for (unsigned I = 0; I < UserMI->getNumOperands(); I++) { + MachineOperand &MO = UserMI->getOperand(I); if (!MO.isReg()) continue; if (MO.isDef()) continue; if (MO.getReg() != Reg) continue; - if (i >= Desc.getNumOperands()) { + if (I >= Desc.getNumOperands()) { IsIllegal = true; break; } - MO.setReg(newDst); - if (userMI->getDesc().operands()[i].RegClass != -1) { - if (!SIII->isOperandLegal(*userMI, i, &MO)) { - SIII->legalizeOperands(*userMI); + MO.setReg(NewDst); + if (UserMI->getDesc().operands()[I].RegClass != -1) { + if (!SIII->isOperandLegal(*UserMI, I, &MO)) { + SIII->legalizeOperands(*UserMI); // In case legalizeOperands not help, just legalize with mov. - if (userMI->getDesc().operands()[i].RegClass != -1 && - !SIII->isOperandLegal(*userMI, i)) { - SIII->legalizeOpWithMove(*userMI, i); + if (UserMI->getDesc().operands()[I].RegClass != -1 && + !SIII->isOperandLegal(*UserMI, I)) { + SIII->legalizeOpWithMove(*UserMI, I); } } } else { @@ -2980,12 +2958,12 @@ bool rematUniformVgprToSgpr( if (IsIllegal) continue; - auto rit = userMI->getReverseIterator(); - rit++; - auto endIt = userMI->getParent()->rend(); - while (rit != endIt && !rit->isDebugInstr() && - !slotIndexes->hasIndex(*rit)) - slotIndexes->insertMachineInstrInMaps(*(rit++)); + auto RIt = UserMI->getReverseIterator(); + RIt++; + auto EndIt = UserMI->getParent()->rend(); + while (RIt != EndIt && !RIt->isDebugInstr() && + !SlotIndexes->hasIndex(*RIt)) + SlotIndexes->insertMachineInstrInMaps(*(RIt++)); } } @@ -2993,19 +2971,17 @@ bool rematUniformVgprToSgpr( } bool collectRematableHotReg( - MachineInstr &MI, const GCNRPTracker::LiveRegSet &hotLive, - GCNRPTracker::LiveRegSet &pureHotRematSet, - DenseMap &pureHotRematLevels, unsigned &DefReg, - const GCNRPTracker::LiveRegSet &inputLive, - const GCNRPTracker::LiveRegSet &outputLive, const MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI) { + MachineInstr &MI, const GCNRPTracker::LiveRegSet &HotLive, + GCNRPTracker::LiveRegSet &PureHotRematSet, + DenseMap &PureHotRematLevels, unsigned &DefReg, + const GCNRPTracker::LiveRegSet &InputLive, const MachineRegisterInfo &MRI) { // Ignore inst not have def or more than 1 def. if (MI.getDesc().getNumDefs() != 1) return false; DefReg = MI.defs().begin()->getReg(); - unsigned level = 0; + unsigned Level = 0; for (MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; @@ -3016,7 +2992,7 @@ bool collectRematableHotReg( // If user is in same MI like // %4:vgpr_32 = V_MAD_LEGACY_F32 %2:vgpr_32, %3:vgpr_32, %4:vgpr_32 - // remat it will not help. + // remat It will not help. 
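// A simplified model of the use-rewrite step in rematUniformVgprToSgpr above.
// ToyOperand/ToyUser are hypothetical stand-ins: each use of the old uniform
// VGPR is redirected to the new SGPR copy only when that operand slot can take
// an SGPR; the real pass instead checks isOperandLegal and falls back to
// legalizeOperands / legalizeOpWithMove, which this sketch does not model.
#include <vector>

namespace vtos_sketch {

struct ToyOperand {
  int Reg;
  bool AcceptsSgpr;
};

struct ToyUser {
  std::vector<ToyOperand> Operands;
};

// Returns how many operands were redirected to the SGPR copy.
inline unsigned rewriteUniformUses(std::vector<ToyUser> &Users, int OldVgpr,
                                   int NewSgpr) {
  unsigned Rewritten = 0;
  for (ToyUser &U : Users)
    for (ToyOperand &Op : U.Operands) {
      if (Op.Reg != OldVgpr)
        continue;
      if (!Op.AcceptsSgpr)
        continue; // would need extra legalization; skipped in this sketch
      Op.Reg = NewSgpr;
      ++Rewritten;
    }
  return Rewritten;
}

} // namespace vtos_sketch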
if (Reg == DefReg) { return false; } @@ -3026,18 +3002,17 @@ bool collectRematableHotReg( if (Reg.isPhysical()) return false; - if (nullptr == - getInBlockUniqueDef(Reg, MI.getParent(), inputLive, outputLive, MRI)) + if (nullptr == getInBlockUniqueDef(Reg, MI.getParent(), InputLive, MRI)) return false; - LaneBitmask mask = llvm::getRegMask(MO, MRI); + LaneBitmask Mask = llvm::getRegMask(MO, MRI); - if (isInLiveSet(Reg, mask, hotLive)) + if (isInLiveSet(Reg, Mask, HotLive)) continue; - if (isInLiveSet(Reg, mask, pureHotRematSet)) { - unsigned regLevel = getPacifistLevel(Reg, pureHotRematLevels, MRI); - level = std::max(level, regLevel); + if (isInLiveSet(Reg, Mask, PureHotRematSet)) { + unsigned RegLevel = getPacifistLevel(Reg, PureHotRematLevels, MRI); + Level = std::max(Level, RegLevel); continue; } @@ -3050,46 +3025,44 @@ bool collectRematableHotReg( if (Reg.isPhysical()) return false; - if (nullptr == - getInBlockUniqueDef(Reg, MI.getParent(), inputLive, outputLive, MRI)) + if (nullptr == getInBlockUniqueDef(Reg, MI.getParent(), InputLive, MRI)) return false; - LaneBitmask mask = llvm::getRegMask(MO, MRI); - pureHotRematSet[Reg] |= mask; + LaneBitmask Mask = llvm::getRegMask(MO, MRI); + PureHotRematSet[Reg] |= Mask; } - pureHotRematLevels[&MI] = level + 1; - // If no def, it will not increase pressure, don't mark it. + PureHotRematLevels[&MI] = Level + 1; + // If no def, It will not increase pressure, don't mark It. return true; } -bool tryRemat(MachineBasicBlock &MBB, MachineInstr *hotMI, - std::vector &inBlockCloneSubExps, bool IsVGPR, - const GCNRPTracker::LiveRegSet &inputLive, - const GCNRPTracker::LiveRegSet &outputLive, - DenseSet &hotSet, int vDistance, int sDistance, +bool tryRemat(MachineBasicBlock &MBB, MachineInstr *HotMi, + std::vector &InBlockCloneSubExps, bool IsVGPR, + const GCNRPTracker::LiveRegSet &InputLive, + DenseSet &HotSet, int VDistance, int SDistance, unsigned VLimit, unsigned SLimit, const DenseSet &MemWriteMBBSet, LiveIntervals *LIS, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { auto &ST = MBB.getParent()->getSubtarget(); - const auto &SI = LIS->getInstructionIndex(*hotMI).getBaseIndex(); + const auto &SI = LIS->getInstructionIndex(*HotMi).getBaseIndex(); const auto LISLR = llvm::getLiveRegs(SI, *LIS, MRI); - GCNRPTracker::LiveRegSet hotLive = LISLR; + GCNRPTracker::LiveRegSet HotLive = LISLR; - GCNRPTracker::LiveRegSet pureHotRematSet; - std::vector pureHotRematList; - DenseMap pureHotRematLevels; + GCNRPTracker::LiveRegSet PureHotRematSet; + std::vector PureHotRematList; + DenseMap PureHotRematLevels; - GCNRPTracker::LiveRegSet outputSet; + GCNRPTracker::LiveRegSet OutputSet; LLVM_DEBUG(dbgs() << "pure hot remat begin\n"); // Find reg which could remat from other reg in liveSet. - const unsigned kMaxRematLevel = 6; + const unsigned KMaxRematLevel = 6; GCNDownwardRPTracker Tracker(*LIS); - Tracker.reset(*MBB.begin(), &inputLive); - for (auto it = MBB.begin(); it != MBB.end(); it++) { - MachineInstr &MI = *it; + Tracker.reset(*MBB.begin(), &InputLive); + for (auto It = MBB.begin(); It != MBB.end(); It++) { + MachineInstr &MI = *It; const GCNRegPressure &RP = Tracker.getPressure(); if (MI.isDebugInstr()) @@ -3103,31 +3076,31 @@ bool tryRemat(MachineBasicBlock &MBB, MachineInstr *hotMI, } // Stop at hotMI. 
- if (&MI == hotMI) + if (&MI == HotMi) break; Tracker.advance(); unsigned DefReg = 0; - if (collectRematableHotReg(MI, hotLive, pureHotRematSet, pureHotRematLevels, - DefReg, inputLive, outputLive, MRI, SIRI)) { - unsigned level = pureHotRematLevels[&MI]; - if (level >= kMaxRematLevel) + if (collectRematableHotReg(MI, HotLive, PureHotRematSet, PureHotRematLevels, + DefReg, InputLive, MRI)) { + unsigned Level = PureHotRematLevels[&MI]; + if (Level >= KMaxRematLevel) continue; // If the def reg is in hot reg. // Add to output. - if (hotLive.find(DefReg) != hotLive.end()) { + if (HotLive.find(DefReg) != HotLive.end()) { bool IsUserIsHot = false; for (MachineInstr &UseMI : MRI.use_nodbg_instructions(DefReg)) { if (UseMI.getParent() != &MBB) continue; - if (0 == hotSet.count(&UseMI)) + if (0 == HotSet.count(&UseMI)) continue; - const auto &useSI = LIS->getInstructionIndex(UseMI).getBaseIndex(); - // When has a hot user after hotMI, remat it may not help. - if (useSI > SI) { + const auto &UseSI = LIS->getInstructionIndex(UseMI).getBaseIndex(); + // When has a hot user after hotMI, remat It may not help. + if (UseSI > SI) { IsUserIsHot = true; break; } @@ -3135,14 +3108,14 @@ bool tryRemat(MachineBasicBlock &MBB, MachineInstr *hotMI, if (IsUserIsHot) continue; - outputSet[DefReg]; + OutputSet[DefReg]; LLVM_DEBUG(dbgs() << "hotRemat:"); LLVM_DEBUG(MI.getOperand(0).dump()); - // remove it from hotLive to avoid it as input when build dag. - hotLive.erase(DefReg); + // remove It from hotLive to avoid It as input when build dag. + HotLive.erase(DefReg); } - pureHotRematList.emplace_back(&MI); - LLVM_DEBUG(dbgs() << "level:" << level); + PureHotRematList.emplace_back(&MI); + LLVM_DEBUG(dbgs() << "level:" << Level); LLVM_DEBUG(MI.dump()); } } @@ -3154,82 +3127,82 @@ bool tryRemat(MachineBasicBlock &MBB, MachineInstr *hotMI, // Build SubExp with pureHotRematList as Nodes, hotLive as input // rematHot as output. // Not join input when build ExpDag to get small subExps. - ExpDag dag(MRI, SIRI, SIII, /*IsJoinInput*/ false); - dag.build(hotLive, outputSet, pureHotRematList); + ExpDag Dag(MRI, SIRI, SIII, /*IsJoinInput*/ false); + Dag.build(HotLive, OutputSet, PureHotRematList); // Find best subExp add to inBlockCloneSubExps. // Sort by size of subExp. - std::sort(dag.SubExps.begin(), dag.SubExps.end(), + std::sort(Dag.SubExps.begin(), Dag.SubExps.end(), [](const SubExp &A, const SubExp &B) { return A.SUnits.size() < B.SUnits.size(); }); - std::vector cloneSubExps; - int distance = IsVGPR ? vDistance : sDistance; - for (SubExp &subExp : dag.SubExps) { - if (subExp.IsNotSafeToCopy) + std::vector CloneSubExps; + int Distance = IsVGPR ? VDistance : SDistance; + for (SubExp &SubExp : Dag.SubExps) { + if (SubExp.IsNotSafeToCopy) continue; if (IsVGPR) { - if (subExp.vOutputSize == 0) + if (SubExp.vOutputSize == 0) continue; } else { - if (subExp.sOutputSize == 0) + if (SubExp.sOutputSize == 0) continue; } - if (!subExp.isSafeToMove(MRI, /*IsMoveUp*/ false)) + if (!SubExp.isSafeToMove(MRI, /*IsMoveUp*/ false)) continue; // Not clone . - if (subExp.SUnits.size() > 10) + if (SubExp.SUnits.size() > 10) continue; // Do not allow remat in the block when the expression has a memory op and // the block has a write. We could allow this in some cases with better // analysis. 
- if (subExp.IsHasMemInst && MemWriteMBBSet.count(&MBB)) + if (SubExp.IsHasMemInst && MemWriteMBBSet.count(&MBB)) continue; if (IsVGPR) { - distance -= subExp.vOutputSize; + Distance -= SubExp.vOutputSize; } else { - distance -= subExp.sOutputSize; + Distance -= SubExp.sOutputSize; } - cloneSubExps.emplace_back(subExp); - if (distance <= 0) + CloneSubExps.emplace_back(SubExp); + if (Distance <= 0) break; } - if (distance <= 0) { - inBlockCloneSubExps.insert(inBlockCloneSubExps.end(), cloneSubExps.begin(), - cloneSubExps.end()); + if (Distance <= 0) { + InBlockCloneSubExps.insert(InBlockCloneSubExps.end(), CloneSubExps.begin(), + CloneSubExps.end()); } - return distance <= 0; + return Distance <= 0; } // Try to remat live reg in hot spot from other live reg in hot spot. // bool tryRematInHotSpot( - MachineBasicBlock &MBB, RematStatus &status, int vDistance, int sDistance, - int vSaved, int sSaved, std::vector &inBlockCloneSubExps, - DenseMap &inBlockHotVInstMap, - DenseMap &inBlockHotSInstMap, + MachineBasicBlock &MBB, RematStatus &Status, int VDistance, int SDistance, + int VSaved, int SSaved, std::vector &InBlockCloneSubExps, + DenseMap &InBlockHotVInstMap, + DenseMap &InBlockHotSInstMap, LiveIntervals *LIS, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { - unsigned VLimit = status.TargetVLimit; - unsigned SLimit = status.TargetSLimit; + unsigned VLimit = Status.TargetVLimit; + unsigned SLimit = Status.TargetSLimit; auto &ST = MBB.getParent()->getSubtarget(); - const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[&MBB]; + const GCNRPTracker::LiveRegSet InputLive = Status.MBBInputLiveMap[&MBB]; - const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[&MBB]; + const GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[&MBB]; // Collect reg pressure. - unsigned maxLocalVPressure = 0; - unsigned maxLocalSPressure = 0; + unsigned MaxLocalVPressure = 0; + unsigned MaxLocalSPressure = 0; // Build a DAG or only on demand? - MachineInstr *hotVMI = nullptr; - MachineInstr *hotSMI = nullptr; - DenseSet hotSet; + MachineInstr *HotVMI = nullptr; + MachineInstr *HotSMI = nullptr; + DenseSet HotSet; GCNDownwardRPTracker Tracker(*LIS); - Tracker.reset(*MBB.begin(), &inputLive); - for (auto it = MBB.begin(); it != MBB.end(); it++) { - MachineInstr &MI = *it; + Tracker.reset(*MBB.begin(), &InputLive); + for (auto It = MBB.begin(); It != MBB.end(); It++) { + MachineInstr &MI = *It; if (MI.isDebugInstr()) { continue; } @@ -3239,42 +3212,42 @@ bool tryRematInHotSpot( SPressure += RegForVCC; - VPressure -= vSaved; - SPressure -= sSaved; + VPressure -= VSaved; + SPressure -= SSaved; Tracker.advance(); if (VPressure <= VLimit && SPressure <= SLimit) { continue; } - hotSet.insert(&MI); - if (maxLocalVPressure < VPressure) { - maxLocalVPressure = VPressure; - hotVMI = &MI; + HotSet.insert(&MI); + if (MaxLocalVPressure < VPressure) { + MaxLocalVPressure = VPressure; + HotVMI = &MI; } - if (maxLocalSPressure < SPressure) { - maxLocalSPressure = SPressure; - hotSMI = &MI; + if (MaxLocalSPressure < SPressure) { + MaxLocalSPressure = SPressure; + HotSMI = &MI; } } - inBlockHotVInstMap[&MBB] = hotVMI; - inBlockHotSInstMap[&MBB] = hotSMI; - if (vDistance > 0 && hotVMI) { + InBlockHotVInstMap[&MBB] = HotVMI; + InBlockHotSInstMap[&MBB] = HotSMI; + if (VDistance > 0 && HotVMI) { // Use hotVMI when apply. 
-    inBlockHotSInstMap[&MBB] = nullptr;
-    if (tryRemat(MBB, hotVMI, inBlockCloneSubExps, /*IsVGPR*/ true, inputLive,
-                 outputLive, hotSet, vDistance, sDistance, VLimit, SLimit,
-                 status.MemWriteMBBSet, LIS, MRI, SIRI, SIII))
+    InBlockHotSInstMap[&MBB] = nullptr;
+    if (tryRemat(MBB, HotVMI, InBlockCloneSubExps, /*IsVGPR*/ true, InputLive,
+                 HotSet, VDistance, SDistance, VLimit, SLimit,
+                 Status.MemWriteMBBSet, LIS, MRI, SIRI, SIII))
       return true;
   }
-  if (sDistance > 0 && hotSMI) {
+  if (SDistance > 0 && HotSMI) {
     // Use hotSMI when apply.
-    inBlockHotSInstMap[&MBB] = hotSMI;
-    inBlockHotVInstMap[&MBB] = nullptr;
-    return tryRemat(MBB, hotSMI, inBlockCloneSubExps, /*IsVGPR*/ false,
-                    inputLive, outputLive, hotSet, vDistance, sDistance, VLimit,
-                    SLimit, status.MemWriteMBBSet, LIS, MRI, SIRI, SIII);
+    InBlockHotSInstMap[&MBB] = HotSMI;
+    InBlockHotVInstMap[&MBB] = nullptr;
+    return tryRemat(MBB, HotSMI, InBlockCloneSubExps, /*IsVGPR*/ false,
+                    InputLive, HotSet, VDistance, SDistance, VLimit, SLimit,
+                    Status.MemWriteMBBSet, LIS, MRI, SIRI, SIII);
   }
   return false;
 }
@@ -3282,9 +3255,9 @@ bool tryRematInHotSpot(
 // If subExp0 use result of subExp1, subExp0 is deeper than subExp1.
 // When apply subExp1 before subExp0, new clone of subExp0 which use result of
 // subExp1 will have old reg of subExp1. And reg pressure will not be reduced.
-void sortSubExpCandidates(std::vector &subExpCandidates) {
-  MapVector> inputMap;
-  MapVector> outputMap;
+void sortSubExpCandidates(std::vector &SubExpCandidates) {
+  MapVector> InputMap;
+  MapVector> OutputMap;
   struct SortNode {
     SubExp Exp;
     unsigned Depth;
@@ -3295,67 +3268,67 @@ void sortSubExpCandidates(std::vector &subExpCandidates) {
   {
     SmallVector RegSortStorage;
-    for (SubExp &Exp : subExpCandidates) {
+    for (SubExp &Exp : SubExpCandidates) {
       RegSortStorage.assign(Exp.TopRegs.begin(), Exp.TopRegs.end());
       std::sort(RegSortStorage.begin(), RegSortStorage.end());
-      for (auto it : RegSortStorage) {
-        unsigned Reg = it;
-        inputMap[Reg].insert(&Exp);
+      for (auto It : RegSortStorage) {
+        unsigned Reg = It;
+        InputMap[Reg].insert(&Exp);
       }
       RegSortStorage.assign(Exp.BottomRegs.begin(), Exp.BottomRegs.end());
       std::sort(RegSortStorage.begin(), RegSortStorage.end());
-      for (auto it : RegSortStorage) {
-        unsigned Reg = it;
-        outputMap[Reg].insert(&Exp);
+      for (auto It : RegSortStorage) {
+        unsigned Reg = It;
+        OutputMap[Reg].insert(&Exp);
       }
     }
   }
-  MapVector sortMap;
-  for (auto it : inputMap) {
-    unsigned Reg = it.first;
-    auto outIt = outputMap.find(Reg);
-    if (outIt == outputMap.end())
+  MapVector SortMap;
+  for (auto It : InputMap) {
+    unsigned Reg = It.first;
+    auto OutIt = OutputMap.find(Reg);
+    if (OutIt == OutputMap.end())
       continue;
-    auto &inExps = it.second;
-    auto &outExps = outIt->second;
-    for (SubExp *inExp : inExps) {
-      for (SubExp *outExp : outExps) {
-        if (inExp->IsHoist != outExp->IsHoist) {
+    auto &InExps = It.second;
+    auto &OutExps = OutIt->second;
+    for (SubExp *InExp : InExps) {
+      for (SubExp *OutExp : OutExps) {
+        if (InExp->IsHoist != OutExp->IsHoist) {
           // Different direction.
           // If output (def) move up, input(use) move down, nothing happens.
-          if (outExp->IsHoist)
+          if (OutExp->IsHoist)
             continue;
           // Canot input(use) move up, output(def) move down.
          // Choose the exp which save more.
- int inExpGain = inExp->vOutputSize - inExp->vInputSize; - int outExpGain = outExp->vInputSize - inExp->vOutputSize; - if (inExpGain >= outExpGain) { - outExp->SUnits.clear(); + int InExpGain = InExp->vOutputSize - InExp->vInputSize; + int OutExpGain = OutExp->vInputSize - InExp->vOutputSize; + if (InExpGain >= OutExpGain) { + OutExp->SUnits.clear(); } else { - inExp->SUnits.clear(); + InExp->SUnits.clear(); } continue; } // Link outExp to inExp. - if (inExp->IsHoist) { - sortMap[outExp].Preds.insert(inExp); - sortMap[inExp].Succs.insert(outExp); + if (InExp->IsHoist) { + SortMap[OutExp].Preds.insert(InExp); + SortMap[InExp].Succs.insert(OutExp); } else { - sortMap[inExp].Preds.insert(outExp); - sortMap[outExp].Succs.insert(inExp); + SortMap[InExp].Preds.insert(OutExp); + SortMap[OutExp].Succs.insert(InExp); } } } } - if (sortMap.empty()) + if (SortMap.empty()) return; SmallVector WorkList; - for (SubExp &Exp : subExpCandidates) { - SortNode &Node = sortMap[&Exp]; + for (SubExp &Exp : SubExpCandidates) { + SortNode &Node = SortMap[&Exp]; Node.Depth = 0; Node.Exp = Exp; Node.IsDepthDirty = !Node.Preds.empty(); @@ -3365,13 +3338,13 @@ void sortSubExpCandidates(std::vector &subExpCandidates) { // Calc depth. while (!WorkList.empty()) { SubExp *Exp = WorkList.pop_back_val(); - SortNode &Node = sortMap[Exp]; + SortNode &Node = SortMap[Exp]; for (SubExp *Succ : Node.Succs) { - SortNode &SuccNode = sortMap[Succ]; + SortNode &SuccNode = SortMap[Succ]; SuccNode.Depth = std::max(SuccNode.Depth, Node.Depth + 1); bool IsAllPrevClean = true; for (SubExp *Prev : SuccNode.Preds) { - SortNode &PrevNode = sortMap[Prev]; + SortNode &PrevNode = SortMap[Prev]; if (PrevNode.IsDepthDirty) { IsAllPrevClean = false; break; @@ -3384,35 +3357,35 @@ void sortSubExpCandidates(std::vector &subExpCandidates) { } } - std::vector nodes; - for (auto &it : sortMap) { - SortNode &node = it.second; - nodes.emplace_back(&node); + std::vector Nodes; + for (auto &It : SortMap) { + SortNode &Node = It.second; + Nodes.emplace_back(&Node); } - struct sorter { - bool operator()(const SortNode *a, const SortNode *b) { - return a->Depth > b->Depth; + struct Sorter { + bool operator()(const SortNode *A, const SortNode *B) { + return A->Depth > B->Depth; } }; // subExp deeper should be apply first. - std::sort(nodes.begin(), nodes.end(), sorter()); + std::sort(Nodes.begin(), Nodes.end(), Sorter()); - subExpCandidates.clear(); - for (auto &node : nodes) { - subExpCandidates.emplace_back(node->Exp); + SubExpCandidates.clear(); + for (auto &Node : Nodes) { + SubExpCandidates.emplace_back(Node->Exp); } } // Compare pressure, return ture if maxV0/maxS0 pressure is higher than // maxV1/maxS1. -bool pressureHigher(unsigned maxV0, unsigned maxS0, unsigned maxV1, - unsigned maxS1, const GCNSubtarget *ST) { - unsigned VTgtOcc0 = ST->getOccupancyWithNumVGPRs(maxV0); - unsigned VTgtOcc1 = ST->getOccupancyWithNumVGPRs(maxV1); - unsigned STgtOcc0 = ST->getOccupancyWithNumSGPRs(maxS0); - unsigned STgtOcc1 = ST->getOccupancyWithNumSGPRs(maxS1); +bool pressureHigher(unsigned MaxV0, unsigned MaxS0, unsigned MaxV1, + unsigned MaxS1, const GCNSubtarget *ST) { + unsigned VTgtOcc0 = ST->getOccupancyWithNumVGPRs(MaxV0); + unsigned VTgtOcc1 = ST->getOccupancyWithNumVGPRs(MaxV1); + unsigned STgtOcc0 = ST->getOccupancyWithNumSGPRs(MaxS0); + unsigned STgtOcc1 = ST->getOccupancyWithNumSGPRs(MaxS1); unsigned Occ0 = std::min(VTgtOcc0, STgtOcc0); unsigned Occ1 = std::min(VTgtOcc1, STgtOcc1); // is low pressure. 
@@ -3422,146 +3395,146 @@ bool pressureHigher(unsigned maxV0, unsigned maxS0, unsigned maxV1, return true; // When sgpr bound, is high pressure. if (VTgtOcc0 > STgtOcc0 && VTgtOcc1 > STgtOcc1) { - return maxS0 > maxS1; + return MaxS0 > MaxS1; } // When vgpr bound or mix, vgpr higher is higher pressure. - return maxV0 > maxV1; + return MaxV0 > MaxV1; } // Return true if the subExp can help pressure for passThrus. -bool canHelpPressureWhenSink( - SubExp &subExp, const GCNRPTracker::LiveRegSet &passThrus, - const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - const SIInstrInfo *SIII, const MachineLoopInfo *MLI, - MachineDominatorTree *DT, bool IsCanClone, bool IsSgprBound) { - LLVM_DEBUG(subExp.dump(MRI, SIRI)); - if (!subExp.isSafeToMove(MRI, /*IsMoveUp*/ false)) +bool canHelpPressureWhenSink(SubExp &SubExp, + const GCNRPTracker::LiveRegSet &PassThrus, + const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, + const MachineLoopInfo *MLI, + MachineDominatorTree *DT, bool IsCanClone, + bool IsSgprBound) { + LLVM_DEBUG(SubExp.dump(MRI, SIRI)); + if (!SubExp.isSafeToMove(MRI, /*IsMoveUp*/ false)) return false; // Update input size to ignore lives in which already in // passThrus. - for (auto it : subExp.inputLive) { - unsigned Reg = it.first; - if (passThrus.count(Reg) == 0) + for (auto It : SubExp.inputLive) { + unsigned Reg = It.first; + if (PassThrus.count(Reg) == 0) continue; - unsigned Size = getRegSize(Reg, it.second, MRI, SIRI); + unsigned Size = getRegSize(Reg, It.second, MRI, SIRI); if (SIRI->isVGPR(MRI, Reg)) { - subExp.vInputSize -= Size; + SubExp.vInputSize -= Size; } else { - subExp.sInputSize -= Size; + SubExp.sInputSize -= Size; } } - if (subExp.vInputSize > subExp.vOutputSize) + if (SubExp.vInputSize > SubExp.vOutputSize) return false; - if (subExp.sInputSize > subExp.sOutputSize && IsSgprBound) + if (SubExp.sInputSize > SubExp.sOutputSize && IsSgprBound) return false; - if (subExp.sInputSize >= subExp.sOutputSize && - subExp.vInputSize == subExp.vOutputSize) + if (SubExp.sInputSize >= SubExp.sOutputSize && + SubExp.vInputSize == SubExp.vOutputSize) return false; // Try to find a Insert Block. // Skip multi def output sub exp. // Collect user blocks, find common dom. - BlockSet userBlocks; - for (unsigned Reg : subExp.BottomRegs) { + BlockSet UserBlocks; + for (unsigned Reg : SubExp.BottomRegs) { for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { MachineBasicBlock *UserBB = UseMI.getParent(); // Skip current BB. - if (UserBB != subExp.FromBB) - userBlocks.insert(UserBB); + if (UserBB != SubExp.FromBB) + UserBlocks.insert(UserBB); } } - if (userBlocks.empty()) + if (UserBlocks.empty()) return false; - MachineBasicBlock *userBlock = NearestCommonDominator(DT, userBlocks); - if (!DT->dominates(subExp.FromBB, userBlock)) { + MachineBasicBlock *UserBlock = nearestCommonDominator(DT, UserBlocks); + if (!DT->dominates(SubExp.FromBB, UserBlock)) { return false; } - if (userBlock == subExp.FromBB && + if (UserBlock == SubExp.FromBB && // When allow clone, could go clone path if cannot move subExp. 
!IsCanClone) return false; - subExp.ToBB = userBlock; - if (auto *toLoop = MLI->getLoopFor(userBlock)) { - auto *fromLoop = MLI->getLoopFor(subExp.FromBB); - if (!fromLoop || fromLoop->getLoopDepth() < toLoop->getLoopDepth()) - subExp.IsMoveIntoLoop = true; - } else if (auto *fromLoop = MLI->getLoopFor(subExp.FromBB)) { - auto *toLoop = MLI->getLoopFor(userBlock); + SubExp.ToBB = UserBlock; + if (auto *ToLoop = MLI->getLoopFor(UserBlock)) { + auto *FromLoop = MLI->getLoopFor(SubExp.FromBB); + if (!FromLoop || FromLoop->getLoopDepth() < ToLoop->getLoopDepth()) + SubExp.IsMoveIntoLoop = true; + } else if (auto *FromLoop = MLI->getLoopFor(SubExp.FromBB)) { + auto *ToLoop = MLI->getLoopFor(UserBlock); // not safe to move out of loop. - if (!toLoop || fromLoop->getLoopDepth() > toLoop->getLoopDepth() || - toLoop != fromLoop) + if (!ToLoop || FromLoop->getLoopDepth() > ToLoop->getLoopDepth() || + ToLoop != FromLoop) return false; } return true; } -bool canHelpPressureWhenHoist(SubExp &subExp, const MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI, - const SIInstrInfo *SIII, +bool canHelpPressureWhenHoist(SubExp &SubExp, const MachineRegisterInfo &MRI, const MachineLoopInfo *MLI, bool IsSgprBound) { - if (!subExp.isSafeToMove(MRI, /*IsMoveUp*/ true)) + if (!SubExp.isSafeToMove(MRI, /*IsMoveUp*/ true)) return false; - if (subExp.vInputSize < subExp.vOutputSize) + if (SubExp.vInputSize < SubExp.vOutputSize) return false; - if (subExp.sInputSize < subExp.sOutputSize && IsSgprBound) + if (SubExp.sInputSize < SubExp.sOutputSize && IsSgprBound) return false; - if (subExp.sInputSize <= subExp.sOutputSize && - subExp.vInputSize == subExp.vOutputSize) + if (SubExp.sInputSize <= SubExp.sOutputSize && + SubExp.vInputSize == SubExp.vOutputSize) return false; // Try to find a Insert Block. // Skip multi def output sub exp. // Collect user blocks, find common dom. - BlockSet defBlocks; - for (unsigned Reg : subExp.TopRegs) { + BlockSet DefBlocks; + for (unsigned Reg : SubExp.TopRegs) { MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); if (!DefMI) continue; - defBlocks.insert(DefMI->getParent()); + DefBlocks.insert(DefMI->getParent()); } - if (defBlocks.size() != 1) + if (DefBlocks.size() != 1) return false; - MachineBasicBlock *defBlock = *defBlocks.begin(); - subExp.ToBB = defBlock; + MachineBasicBlock *DefBlock = *DefBlocks.begin(); + SubExp.ToBB = DefBlock; // Not do same block hoist. - if (subExp.ToBB == subExp.FromBB) + if (SubExp.ToBB == SubExp.FromBB) return false; - if (auto *toLoop = MLI->getLoopFor(defBlock)) { - auto *fromLoop = MLI->getLoopFor(subExp.FromBB); + if (auto *ToLoop = MLI->getLoopFor(DefBlock)) { + auto *FromLoop = MLI->getLoopFor(SubExp.FromBB); // TODO: enable move into loop when hoist. - if (!fromLoop || fromLoop->getLoopDepth() < toLoop->getLoopDepth()) + if (!FromLoop || FromLoop->getLoopDepth() < ToLoop->getLoopDepth()) return false; - } else if (auto *fromLoop = MLI->getLoopFor(subExp.FromBB)) { - auto *toLoop = MLI->getLoopFor(defBlock); + } else if (auto *FromLoop = MLI->getLoopFor(SubExp.FromBB)) { + auto *ToLoop = MLI->getLoopFor(DefBlock); // not safe to move out of loop. 
- if (!toLoop || fromLoop->getLoopDepth() > toLoop->getLoopDepth() || - toLoop != fromLoop) + if (!ToLoop || FromLoop->getLoopDepth() > ToLoop->getLoopDepth() || + ToLoop != FromLoop) return false; } return true; } SmallVector> -groupPassThruByDefBlock(Remat *Remat, const GCNRPTracker::LiveRegSet &passThrus, - GCNRPTracker::LiveRegSet &usedPassThrus, +groupPassThruByDefBlock(Remat *Remat, const GCNRPTracker::LiveRegSet &PassThrus, + GCNRPTracker::LiveRegSet &UsedPassThrus, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { MapVector Candidates; // Group safe candidates by define block. - for (auto it : passThrus) { - unsigned Reg = it.first; - // Skip used pass thru reg to avoid count it twice for different hot block. - if (usedPassThrus.count(Reg)) + for (auto It : PassThrus) { + Register Reg = It.first; + // Skip used pass thru reg to avoid count It twice for different hot block. + if (UsedPassThrus.count(Reg)) continue; - LLVM_DEBUG(print_vreg(Reg, MRI)); + LLVM_DEBUG(printVreg(Reg, MRI)); LLVM_DEBUG(if (SIRI->isSGPRReg(MRI, Reg)) dbgs() << " sgpr "; else dbgs() << " vgpr ";); if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*IsSink*/ true)) { @@ -3573,61 +3546,60 @@ groupPassThruByDefBlock(Remat *Remat, const GCNRPTracker::LiveRegSet &passThrus, MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); GCNRPTracker::LiveRegSet &DefInMBB = Candidates[DefMI->getParent()]; - DefInMBB[Reg] = it.second; + DefInMBB[Reg] = It.second; } llvm::SmallVector> - result = Candidates.takeVector(); + Result = Candidates.takeVector(); - LLVM_DEBUG(llvm::dbgs() << "Before sort candidates\n"; for (auto it - : result) { - MachineBasicBlock *MBB = it.first; - auto &defInMBB = it.second; + LLVM_DEBUG(llvm::dbgs() << "Before sort candidates\n"; for (auto It + : Result) { + MachineBasicBlock *MBB = It.first; + auto &defInMBB = It.second; MBB->dump(); llvm::dumpLiveSet(defInMBB, SIRI); } llvm::dbgs() << "end of candidates\n";); - std::sort(result.begin(), result.end(), - [](std::pair &it0, - std::pair &it1) { - return it0.first->getNumber() < it1.first->getNumber(); + std::sort(Result.begin(), Result.end(), + [](std::pair &It0, + std::pair &It1) { + return It0.first->getNumber() < It1.first->getNumber(); }); - LLVM_DEBUG(llvm::dbgs() << "After sort candidates\n"; for (auto it - : result) { - MachineBasicBlock *MBB = it.first; - auto &defInMBB = it.second; + LLVM_DEBUG(llvm::dbgs() << "After sort candidates\n"; for (auto It + : Result) { + MachineBasicBlock *MBB = It.first; + auto &defInMBB = It.second; MBB->dump(); llvm::dumpLiveSet(defInMBB, SIRI); } llvm::dbgs() << "end of candidates\n";); - return result; + return Result; } // collect pass thru regs of MBB. GCNRPTracker::LiveRegSet collectPassThrus(MachineBasicBlock *MBB, - const GCNRPTracker::LiveRegSet &inputLive, - const GCNRPTracker::LiveRegSet &outputLive, - const GCNRPTracker::LiveRegSet &usedPassThrus, - const GCNRPTracker::LiveRegSet &liveRegCandidates, + const GCNRPTracker::LiveRegSet &InputLive, + const GCNRPTracker::LiveRegSet &OutputLive, + const GCNRPTracker::LiveRegSet &LiveRegCandidates, MachineRegisterInfo &MRI, bool IsCanClone) { - GCNRPTracker::LiveRegSet passThrus; - llvm::mergeLiveRegSet(passThrus, inputLive); - llvm::andLiveRegSet(passThrus, outputLive); + GCNRPTracker::LiveRegSet PassThrus; + llvm::mergeLiveRegSet(PassThrus, InputLive); + llvm::andLiveRegSet(PassThrus, OutputLive); // Remove reg which not in liveRegCandidates. 
- GCNRPTracker::LiveRegSet tmpPassThrus = passThrus; - for (auto it : tmpPassThrus) { - unsigned Reg = it.first; - if (!liveRegCandidates.count(Reg)) { - passThrus.erase(Reg); + GCNRPTracker::LiveRegSet TmpPassThrus = PassThrus; + for (auto It : TmpPassThrus) { + unsigned Reg = It.first; + if (!LiveRegCandidates.count(Reg)) { + PassThrus.erase(Reg); } } - tmpPassThrus = passThrus; + TmpPassThrus = PassThrus; // Remove reg which has read/write in MBB. - for (auto it : tmpPassThrus) { - unsigned Reg = it.first; + for (auto It : TmpPassThrus) { + unsigned Reg = It.first; DenseSet DefMBBs; for (MachineInstr &DefMI : MRI.def_instructions(Reg)) { MachineBasicBlock *MBB = DefMI.getParent(); @@ -3646,45 +3618,45 @@ collectPassThrus(MachineBasicBlock *MBB, bool IsPassThru = !IsW && !IsR; if (!IsPassThru) - passThrus.erase(Reg); + PassThrus.erase(Reg); } - return passThrus; + return PassThrus; } // Try to build a free subExp which all input is passThrus. -SubExp buildFreeSubExp(Remat *Remat, SubExp &subExp, - GCNRPTracker::LiveRegSet &passThrus, +SubExp buildFreeSubExp(SubExp &Exp, + GCNRPTracker::LiveRegSet &PassThrus, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) { - SubExp freeExp; + SubExp FreeExp; // Try to split the subExp to find a help case. // Scan all inst in subExp, propagate free inst which input is from // passThrus. - SmallDenseSet freeRegs; - SmallDenseSet freeInstUseRegs; - SmallVector freeInsts; - for (MachineInstr *MI : subExp.SUnits) { + SmallDenseSet FreeRegs; + SmallDenseSet FreeInstUseRegs; + SmallVector FreeInsts; + for (MachineInstr *MI : Exp.SUnits) { bool IsFree = true; // Check all use regs are free. for (MachineOperand &MO : MI->uses()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (MO.isImplicit() && Reg == AMDGPU::EXEC) continue; if (MRI.getUniqueVRegDef(Reg) == nullptr) { IsFree = false; break; } - // Skip local pass thrus unless it is free. - if (passThrus.count(Reg) && subExp.TopRegs.count(Reg)) + // Skip local pass thrus unless It is free. + if (PassThrus.count(Reg) && Exp.TopRegs.count(Reg)) continue; - if (freeRegs.count(Reg)) + if (FreeRegs.count(Reg)) continue; IsFree = false; break; } // Check def is unique. for (MachineOperand &MO : MI->defs()) { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (MRI.getUniqueVRegDef(Reg) == nullptr) { IsFree = false; break; @@ -3693,104 +3665,103 @@ SubExp buildFreeSubExp(Remat *Remat, SubExp &subExp, if (!IsFree) continue; // Save inst as free inst. - freeInsts.emplace_back(MI); + FreeInsts.emplace_back(MI); // Save def as free reg. for (MachineOperand &MO : MI->defs()) { - unsigned Reg = MO.getReg(); - freeRegs.insert(Reg); + Register Reg = MO.getReg(); + FreeRegs.insert(Reg); } // Save use regs as free use reg. for (MachineOperand &MO : MI->uses()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); - freeInstUseRegs.insert(Reg); + FreeInstUseRegs.insert(Reg); } } // Then remove local inst has no output use. - for (MachineInstr *MI : freeInsts) { + for (MachineInstr *MI : FreeInsts) { bool IsFreeUsed = false; for (MachineOperand &MO : MI->defs()) { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); // Used as freeInst or output. 
- IsFreeUsed |= - freeInstUseRegs.count(Reg) > 0 || subExp.BottomRegs.count(Reg); + IsFreeUsed |= FreeInstUseRegs.count(Reg) > 0 || Exp.BottomRegs.count(Reg); } if (!IsFreeUsed) continue; - freeExp.SUnits.emplace_back(MI); + FreeExp.SUnits.emplace_back(MI); } - if (freeExp.SUnits.empty()) { - // mark has terminator to make it unsafe. - freeExp.IsHasTerminatorInst = true; - return freeExp; + if (FreeExp.SUnits.empty()) { + // mark has terminator to make It unsafe. + FreeExp.IsHasTerminatorInst = true; + return FreeExp; } // Build BottomRegs and TopRegs for freeExp. // BottomRegs is freeRegs in subExp.BottomRegs. - for (unsigned freeReg : freeRegs) { - if (subExp.BottomRegs.count(freeReg)) - freeExp.BottomRegs.insert(freeReg); + for (Register FreeReg : FreeRegs) { + if (Exp.BottomRegs.count(FreeReg)) + FreeExp.BottomRegs.insert(FreeReg); } // TopRegs is freeInstUseRegs in subExp.TopRegs. - for (unsigned freeInstUseReg : freeInstUseRegs) { - if (subExp.TopRegs.count(freeInstUseReg)) - freeExp.TopRegs.insert(freeInstUseReg); + for (Register FreeInstUseReg : FreeInstUseRegs) { + if (Exp.TopRegs.count(FreeInstUseReg)) + FreeExp.TopRegs.insert(FreeInstUseReg); } - freeExp.FromBB = subExp.FromBB; - freeExp.ToBB = subExp.ToBB; + FreeExp.FromBB = Exp.FromBB; + FreeExp.ToBB = Exp.ToBB; // must be clone since is partial of subExp. - freeExp.IsCloneOnly = true; + FreeExp.IsCloneOnly = true; // Calc reg for freeExp. - for (unsigned Reg : freeExp.TopRegs) { - freeExp.inputLive[Reg]; + for (unsigned Reg : FreeExp.TopRegs) { + FreeExp.inputLive[Reg]; } - for (unsigned Reg : freeExp.BottomRegs) { - freeExp.outputLive[Reg]; + for (unsigned Reg : FreeExp.BottomRegs) { + FreeExp.outputLive[Reg]; } - CollectLiveSetPressure(freeExp.inputLive, MRI, SIRI, freeExp.vInputSize, - freeExp.sInputSize); - CollectLiveSetPressure(freeExp.outputLive, MRI, SIRI, freeExp.vOutputSize, - freeExp.sOutputSize); - return freeExp; + CollectLiveSetPressure(FreeExp.inputLive, MRI, SIRI, FreeExp.vInputSize, + FreeExp.sInputSize); + CollectLiveSetPressure(FreeExp.outputLive, MRI, SIRI, FreeExp.vOutputSize, + FreeExp.sOutputSize); + return FreeExp; } std::vector buildSubExpCandidates( Remat *Remat, SmallVector> &Candidates, - GCNRPTracker::LiveRegSet &passThrus, MachineRegisterInfo &MRI, + GCNRPTracker::LiveRegSet &PassThrus, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, - const MachineLoopInfo *MLI, SlotIndexes *slotIndexes, + const MachineLoopInfo *MLI, SlotIndexes *SlotIndexes, MachineDominatorTree *DT, bool IsCanClone, bool IsSgprBound, - GCNRPTracker::LiveRegSet &unUsedPassThrus, + GCNRPTracker::LiveRegSet &UnusedPassThrus, DenseSet &MemWriteMBBSet, bool AllowPartialUseInSubExp) { - std::vector subExpCandidates; + std::vector SubExpCandidates; // Build exp dag on define blocks. // Save profit candidates into list. - for (auto &it : Candidates) { - MachineBasicBlock *DefMBB = it.first; + for (auto &It : Candidates) { + MachineBasicBlock *DefMBB = It.first; // Try to remove out reg def sub exp from DefMBB. - GCNRPTracker::LiveRegSet &DefInMBB = it.second; + GCNRPTracker::LiveRegSet &DefInMBB = It.second; // Go up on the dag until reach share node. 
- auto subExps = buildSubExpFromCandidates( - Remat, DefInMBB, DefMBB, SIRI, SIII, MRI, slotIndexes, unUsedPassThrus, + auto SubExps = buildSubExpFromCandidates( + Remat, DefInMBB, DefMBB, SIRI, SIII, MRI, SlotIndexes, UnusedPassThrus, AllowPartialUseInSubExp); - for (SubExp &subExp : subExps) { - if (subExp.IsHasMemInst) { + for (SubExp &Exp : SubExps) { + if (Exp.IsHasMemInst) { // Skip when memory ld/st inst need to cross MBB which write memory. // TODO: check all MBBs in between FromBB and ToBB not write memory. // Currently just skip when any memory write exist. if (!MemWriteMBBSet.empty()) { - MachineBasicBlock *FromBB = subExp.FromBB; - MachineBasicBlock *ToBB = subExp.ToBB; - if (subExp.IsHoist) { - FromBB = subExp.ToBB; - ToBB = subExp.FromBB; + MachineBasicBlock *FromBB = Exp.FromBB; + MachineBasicBlock *ToBB = Exp.ToBB; + if (Exp.IsHoist) { + FromBB = Exp.ToBB; + ToBB = Exp.FromBB; } bool IsCrossMemWriteMBB = false; for (MachineBasicBlock *MemMBB : MemWriteMBBSet) { @@ -3805,37 +3776,36 @@ std::vector buildSubExpCandidates( continue; } } - if (!canHelpPressureWhenSink(subExp, passThrus, MRI, SIRI, SIII, MLI, DT, + if (!canHelpPressureWhenSink(Exp, PassThrus, MRI, SIRI, MLI, DT, IsCanClone, IsSgprBound)) { if (AllowPartialUseInSubExp && - subExp.isSafeToMove(MRI, /*IsMoveUp*/ false)) { - SubExp freeSubExp = - buildFreeSubExp(Remat, subExp, passThrus, MRI, SIRI); - if (canHelpPressureWhenSink(freeSubExp, passThrus, MRI, SIRI, SIII, - MLI, DT, IsCanClone, IsSgprBound)) { - subExpCandidates.emplace_back(freeSubExp); + Exp.isSafeToMove(MRI, /*IsMoveUp*/ false)) { + SubExp FreeSubExp = buildFreeSubExp(Exp, PassThrus, MRI, SIRI); + if (canHelpPressureWhenSink(FreeSubExp, PassThrus, MRI, SIRI, MLI, DT, + IsCanClone, IsSgprBound)) { + SubExpCandidates.emplace_back(FreeSubExp); } } continue; } - subExpCandidates.emplace_back(subExp); + SubExpCandidates.emplace_back(Exp); } } - return subExpCandidates; + return SubExpCandidates; } std::pair -calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates, - GCNRPTracker::LiveRegSet &inputLive, - GCNRPTracker::LiveRegSet &outputLive, bool IsVOutBound, +calculateSaving(HotBlock &HotBb, std::vector &SubExpCandidates, + GCNRPTracker::LiveRegSet &InputLive, + GCNRPTracker::LiveRegSet &OutputLive, bool IsVOutBound, bool IsSOutBound, bool IsCanClone, MachineDominatorTree *DT, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) { - int vgpr = 0; - int sgpr = 0; - MachineBasicBlock *MBB = hotBB.MBB; + int Vgpr = 0; + int Sgpr = 0; + MachineBasicBlock *MBB = HotBb.MBB; // Sink saving. - for (SubExp &Exp : subExpCandidates) { + for (SubExp &Exp : SubExpCandidates) { if (Exp.IsHoist) { // ToMBB -> MBB -> FromMBB. // If ToMBB not dom hot block, reg will not live in MBB. @@ -3851,28 +3821,28 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates, continue; if (IsSOutBound && Exp.sOutputSize < Exp.sInputSize) continue; - vgpr += Exp.vInputSize; - vgpr -= Exp.vOutputSize; - sgpr += Exp.sInputSize; - sgpr -= Exp.sOutputSize; + Vgpr += Exp.vInputSize; + Vgpr -= Exp.vOutputSize; + Sgpr += Exp.sInputSize; + Sgpr -= Exp.sOutputSize; continue; } } - int vgprDiff = 0; - int sgprDiff = 0; + int VgprDiff = 0; + int SgprDiff = 0; MachineBasicBlock *ToMBB = Exp.ToBB; - // If subExp is to hotBB, it is crossing output instead of input. - GCNRPTracker::LiveRegSet &crossLive = MBB == ToMBB ? outputLive : inputLive; + // If subExp is to hotBB, It is crossing output instead of input. + GCNRPTracker::LiveRegSet &CrossLive = MBB == ToMBB ? 
OutputLive : InputLive; bool IsClone = false; - GCNRPTracker::LiveRegSet newInput; + GCNRPTracker::LiveRegSet NewInput; if (!Exp.IsMoveIntoLoop) { if (Exp.IsHoist) { - // If FromBB dom hot block, it will not change live for MBB. + // If FromBB dom hot block, It will not change live for MBB. if (Exp.FromBB != MBB && DT->dominates(Exp.FromBB, MBB)) continue; } else { - // If ToBB dom hot block, it will not change live for MBB. + // If ToBB dom hot block, It will not change live for MBB. if (ToMBB != MBB && DT->dominates(ToMBB, MBB)) { if (IsCanClone && !Exp.IsNotSafeToCopy) { IsClone = true; @@ -3882,19 +3852,19 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates, } } - for (auto outIt : Exp.outputLive) { - unsigned Reg = outIt.first; - LaneBitmask outMask = outIt.second; + for (auto OutIt : Exp.outputLive) { + unsigned Reg = OutIt.first; + LaneBitmask OutMask = OutIt.second; LaneBitmask MBBBeginMask; - if (crossLive.find(Reg) != crossLive.end()) - MBBBeginMask = crossLive[Reg]; + if (CrossLive.find(Reg) != CrossLive.end()) + MBBBeginMask = CrossLive[Reg]; // Check mask which live in both BeginSlot and exp output when sink to // kill the output. Check mask which not live in BeginSlot in // exp output when hoist to live the output. - LaneBitmask profitMask = Exp.IsHoist ? (outMask & (~MBBBeginMask)) - : (outMask & MBBBeginMask); + LaneBitmask ProfitMask = Exp.IsHoist ? (OutMask & (~MBBBeginMask)) + : (OutMask & MBBBeginMask); if (MBBBeginMask.any()) { - unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI); + unsigned Size = getRegSize(Reg, ProfitMask, MRI, SIRI); LLVM_DEBUG(std::string movStr = Exp.IsHoist ? "output hoist:" : "output sink:"; dbgs() @@ -3904,36 +3874,36 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates, if (SIRI->isVGPR(MRI, Reg)) { LLVM_DEBUG(dbgs() << "v\n"); if (Exp.IsHoist) - vgprDiff += Size; + VgprDiff += Size; else - vgprDiff -= Size; + VgprDiff -= Size; } else { LLVM_DEBUG(dbgs() << "s\n"); if (Exp.IsHoist) - sgprDiff += Size; + SgprDiff += Size; else - sgprDiff -= Size; + SgprDiff -= Size; } } } - for (auto inIt : Exp.inputLive) { - unsigned Reg = inIt.first; - LaneBitmask inMask = inIt.second; + for (auto InIt : Exp.inputLive) { + unsigned Reg = InIt.first; + LaneBitmask InMask = InIt.second; LaneBitmask MBBBeginMask; - if (crossLive.find(Reg) != crossLive.end()) - MBBBeginMask = crossLive[Reg]; + if (CrossLive.find(Reg) != CrossLive.end()) + MBBBeginMask = CrossLive[Reg]; // Check mask which not live in BeginSlot in exp input when // sink to live the input. Check mask which live in both BeginSlot and // exp output when hoist to kill the input. - LaneBitmask profitMask = - Exp.IsHoist ? (inMask & MBBBeginMask) : (inMask & (~MBBBeginMask)); - if (profitMask.any()) { + LaneBitmask ProfitMask = + Exp.IsHoist ? (InMask & MBBBeginMask) : (InMask & (~MBBBeginMask)); + if (ProfitMask.any()) { // Update input live to avoid count same input more than once. - newInput[Reg] |= inMask; + NewInput[Reg] |= InMask; // Exp in not live at block input. // It will increase live for MBB. - unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI); + unsigned Size = getRegSize(Reg, ProfitMask, MRI, SIRI); LLVM_DEBUG( std::string movStr = Exp.IsHoist ? 
"input hoist:" : "input sink:"; @@ -3941,26 +3911,26 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates, if (SIRI->isVGPR(MRI, Reg)) { LLVM_DEBUG(dbgs() << "v\n"); if (Exp.IsHoist) - vgprDiff -= Size; + VgprDiff -= Size; else - vgprDiff += Size; + VgprDiff += Size; } else { LLVM_DEBUG(dbgs() << "s\n"); if (Exp.IsHoist) - sgprDiff -= Size; + SgprDiff -= Size; else - sgprDiff += Size; + SgprDiff += Size; } } } } else { // When sink into loop, the input will live for every block inside loop. // The output will only lived between to blocks and the use blocks. - // If MBB dominate any user of output live reg, it will still live in + // If MBB dominate any user of output live reg, It will still live in // MBB. So cannot count that output live reg as profit. // Hoist into loop is not supported now. - for (auto outIt : Exp.outputLive) { - unsigned Reg = outIt.first; + for (auto OutIt : Exp.outputLive) { + unsigned Reg = OutIt.first; bool IsDomUser = false; for (MachineInstr &MI : MRI.use_nodbg_instructions(Reg)) { MachineBasicBlock *UserMBB = MI.getParent(); @@ -3972,142 +3942,142 @@ calculateSaving(HotBlock &hotBB, std::vector &subExpCandidates, if (IsDomUser) continue; - LaneBitmask outMask = outIt.second; + LaneBitmask OutMask = OutIt.second; LaneBitmask MBBBeginMask; - if (inputLive.find(Reg) != inputLive.end()) - MBBBeginMask = inputLive[Reg]; - LaneBitmask profitMask = outMask & MBBBeginMask; + if (InputLive.find(Reg) != InputLive.end()) + MBBBeginMask = InputLive[Reg]; + LaneBitmask ProfitMask = OutMask & MBBBeginMask; if (MBBBeginMask.any()) { - unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI); + unsigned Size = getRegSize(Reg, ProfitMask, MRI, SIRI); LLVM_DEBUG(dbgs() << "move:" << Register::virtReg2Index(Reg) << " " << Size); // Exp out live at block input. // It will descrease live for MBB. if (SIRI->isVGPR(MRI, Reg)) { LLVM_DEBUG(dbgs() << "v\n"); - vgprDiff -= Size; + VgprDiff -= Size; } else { LLVM_DEBUG(dbgs() << "s\n"); - sgprDiff -= Size; + SgprDiff -= Size; } } } - for (auto inIt : Exp.inputLive) { - unsigned Reg = inIt.first; - LaneBitmask inMask = inIt.second; + for (auto InIt : Exp.inputLive) { + unsigned Reg = InIt.first; + LaneBitmask InMask = InIt.second; LaneBitmask MBBBeginMask; - if (inputLive.find(Reg) != inputLive.end()) - MBBBeginMask = inputLive[Reg]; + if (InputLive.find(Reg) != InputLive.end()) + MBBBeginMask = InputLive[Reg]; // Check mask which not live in BeginSlot in exp input. - LaneBitmask profitMask = inMask & (~MBBBeginMask); - if (profitMask.any()) { + LaneBitmask ProfitMask = InMask & (~MBBBeginMask); + if (ProfitMask.any()) { // Update input live to avoid count same input more than once. - newInput[Reg] |= inMask; + NewInput[Reg] |= InMask; // Exp in not live at block input. // It will increase live for MBB. 
- unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI); + unsigned Size = getRegSize(Reg, ProfitMask, MRI, SIRI); LLVM_DEBUG(dbgs() << "add:" << Register::virtReg2Index(Reg) << " " << Size); if (SIRI->isVGPR(MRI, Reg)) { LLVM_DEBUG(dbgs() << "v\n"); - vgprDiff += Size; + VgprDiff += Size; } else { LLVM_DEBUG(dbgs() << "s\n"); - sgprDiff += Size; + SgprDiff += Size; } } } } - if (IsVOutBound && vgprDiff > 0) + if (IsVOutBound && VgprDiff > 0) continue; - if (IsSOutBound && sgprDiff > 0) + if (IsSOutBound && SgprDiff > 0) continue; - llvm::mergeLiveRegSet(crossLive, newInput); - vgpr += vgprDiff; - sgpr += sgprDiff; + llvm::mergeLiveRegSet(CrossLive, NewInput); + Vgpr += VgprDiff; + Sgpr += SgprDiff; if (IsClone) Exp.IsCloneOnly = true; } - return std::make_pair(vgpr, sgpr); + return std::make_pair(Vgpr, Sgpr); } -void addExpCandidates(std::vector &subExpCandidates, - std::vector &subExps, - GCNRPTracker::LiveRegSet &usedRegs) { - subExpCandidates.insert(subExpCandidates.end(), subExps.begin(), - subExps.end()); - for (auto &Exp : subExps) { +void addExpCandidates(std::vector &SubExpCandidates, + std::vector &SubExps, + GCNRPTracker::LiveRegSet &UsedRegs) { + SubExpCandidates.insert(SubExpCandidates.end(), SubExps.begin(), + SubExps.end()); + for (auto &Exp : SubExps) { if (Exp.IsHoist) { for (auto &Reg : Exp.TopRegs) { - usedRegs[Reg]; + UsedRegs[Reg]; } } else { for (auto &Reg : Exp.BottomRegs) { - usedRegs[Reg]; + UsedRegs[Reg]; } } } } bool tryToAddSubExps( - Remat *Remat, HotBlock &hotBB, RematStatus &status, - std::vector &subExpCandidates, - std::vector &inBlockCloneSubExps, - DenseMap &inBlockHotVInstMap, - DenseMap &inBlockHotSInstMap, + Remat *Remat, HotBlock &HotBB, RematStatus &Status, + std::vector &SubExpCandidates, + std::vector &InBlockCloneSubExps, + DenseMap &InBlockHotVInstMap, + DenseMap &InBlockHotSInstMap, SmallVector> Candidates, - int vgpr, int sgpr, const GCNRPTracker::LiveRegSet &savingInputLive, - const GCNRPTracker::LiveRegSet &savingOutputLive, - GCNRPTracker::LiveRegSet &passThrus, GCNRPTracker::LiveRegSet &usedRegs, + int Vgpr, int Sgpr, const GCNRPTracker::LiveRegSet &SavingInputLive, + const GCNRPTracker::LiveRegSet &SavingOutputLive, + GCNRPTracker::LiveRegSet &PassThrus, GCNRPTracker::LiveRegSet &UsedRegs, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - const SIInstrInfo *SIII, const MachineLoopInfo *MLI, - SlotIndexes *slotIndexes, LiveIntervals *LIS, MachineDominatorTree *DT, - bool IsCanClone, bool IsVOutBound, bool IsSOutBound, - GCNRPTracker::LiveRegSet &unUsedPassThrus, bool AllowPartialUseInSubExp) { - std::vector partialSubExps = buildSubExpCandidates( - Remat, Candidates, passThrus, MRI, SIRI, SIII, MLI, slotIndexes, DT, - IsCanClone, IsSOutBound, unUsedPassThrus, status.MemWriteMBBSet, - AllowPartialUseInSubExp); - - GCNRPTracker::LiveRegSet tmpSavingInputLive = savingInputLive; - GCNRPTracker::LiveRegSet tmpSavingOutputLive = savingOutputLive; - std::pair curSaving = calculateSaving( - hotBB, partialSubExps, tmpSavingInputLive, tmpSavingOutputLive, + const SIInstrInfo *SIII, const MachineLoopInfo *MLI, SlotIndexes *SI, + LiveIntervals *LIS, MachineDominatorTree *DT, bool IsCanClone, + bool IsVOutBound, bool IsSOutBound, + GCNRPTracker::LiveRegSet &UnusedPassThrus, bool AllowPartialUseInSubExp) { + std::vector PartialSubExps = + buildSubExpCandidates(Remat, Candidates, PassThrus, MRI, SIRI, SIII, MLI, + SI, DT, IsCanClone, IsSOutBound, UnusedPassThrus, + Status.MemWriteMBBSet, AllowPartialUseInSubExp); + + GCNRPTracker::LiveRegSet 
TmpSavingInputLive = SavingInputLive; + GCNRPTracker::LiveRegSet TmpSavingOutputLive = SavingOutputLive; + std::pair CurSaving = calculateSaving( + HotBB, PartialSubExps, TmpSavingInputLive, TmpSavingOutputLive, IsVOutBound, IsSOutBound, IsCanClone, DT, MRI, SIRI); - const int VLimit = status.TargetVLimit; - const int SLimit = status.TargetSLimit; + const int VLimit = Status.TargetVLimit; + const int SLimit = Status.TargetSLimit; - vgpr += curSaving.first; - sgpr += curSaving.second; + Vgpr += CurSaving.first; + Sgpr += CurSaving.second; - if (vgpr <= VLimit && sgpr <= SLimit) { - // nrmSubExps can help reach target occupancy, add it to + if (Vgpr <= VLimit && Sgpr <= SLimit) { + // nrmSubExps can help reach target occupancy, add It to // subExpCandidates. - addExpCandidates(subExpCandidates, partialSubExps, usedRegs); + addExpCandidates(SubExpCandidates, PartialSubExps, UsedRegs); return true; } if (EnableSubExpAggressive) { // Build candidates from passThrus used in partialSubExps. - GCNRPTracker::LiveRegSet sinkUsedRegs; - for (auto &Exp : partialSubExps) { + GCNRPTracker::LiveRegSet SinkUsedRegs; + for (auto &Exp : PartialSubExps) { for (auto &Reg : Exp.BottomRegs) { - sinkUsedRegs[Reg]; + SinkUsedRegs[Reg]; } } MapVector HoistCandidates; - for (auto &it : hotBB.inputLive) { - unsigned Reg = it.first; + for (auto &It : HotBB.InputLive) { + unsigned Reg = It.first; // Skip reg which already used for sink exp. - if (sinkUsedRegs.count(Reg)) + if (SinkUsedRegs.count(Reg)) continue; - if (usedRegs.count(Reg)) + if (UsedRegs.count(Reg)) continue; // Skip unsafe reg. if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*IsSink*/ false)) { @@ -4133,42 +4103,40 @@ bool tryToAddSubExps( UseInMBB[Reg] = getRegMask(DefMI->getOperand(0), MRI); } - SlotIndexes *slotIndexes = LIS->getSlotIndexes(); // Build exp dag on define blocks. - std::vector hoistSubExpCandidates; + std::vector HoistSubExpCandidates; // Save profit candidates into list. - for (auto it : HoistCandidates) { - MachineBasicBlock *UseMBB = it.first; + for (auto It : HoistCandidates) { + MachineBasicBlock *UseMBB = It.first; // Try to remove out reg def sub exp from DefMBB. - GCNRPTracker::LiveRegSet &UseInMBB = it.second; + GCNRPTracker::LiveRegSet &UseInMBB = It.second; // Go up on the dag until reach share node. - auto subExps = buildSubExpFromCandidatesTopBottom( - Remat, UseInMBB, UseMBB, SIRI, SIII, MRI, slotIndexes); - for (SubExp &subExp : subExps) { - if (!canHelpPressureWhenHoist(subExp, MRI, SIRI, SIII, MLI, - IsSOutBound)) + auto SubExps = buildSubExpFromCandidatesTopBottom(Remat, UseInMBB, UseMBB, + SIRI, SIII, MRI); + for (SubExp &SubExp : SubExps) { + if (!canHelpPressureWhenHoist(SubExp, MRI, MLI, IsSOutBound)) continue; - subExp.IsHoist = true; - hoistSubExpCandidates.emplace_back(subExp); + SubExp.IsHoist = true; + HoistSubExpCandidates.emplace_back(SubExp); } } - std::pair hoistSaving = calculateSaving( - hotBB, hoistSubExpCandidates, tmpSavingInputLive, tmpSavingOutputLive, + std::pair HoistSaving = calculateSaving( + HotBB, HoistSubExpCandidates, TmpSavingInputLive, TmpSavingOutputLive, IsVOutBound, IsSOutBound, IsCanClone, DT, MRI, SIRI); - int hoistVgpr = vgpr + hoistSaving.first; - int hoistSgpr = sgpr + hoistSaving.second; + int HoistVgpr = Vgpr + HoistSaving.first; + int HoistSgpr = Sgpr + HoistSaving.second; - if ((hoistVgpr <= VLimit && hoistSgpr <= SLimit) || + if ((HoistVgpr <= VLimit && HoistSgpr <= SLimit) || // If status not balance, do the remat even cannot reach target. 
// TODO: check the result not help even one occupancy. - (!hoistSubExpCandidates.empty() && !status.NotBalance && + (!HoistSubExpCandidates.empty() && !Status.NotBalance && TargetOccupancy != 0)) { - // nrmSubExps can help reach target occupancy, add it to + // nrmSubExps can help reach target occupancy, add It to // subExpCandidates. - addExpCandidates(subExpCandidates, partialSubExps, usedRegs); - addExpCandidates(subExpCandidates, hoistSubExpCandidates, usedRegs); + addExpCandidates(SubExpCandidates, PartialSubExps, UsedRegs); + addExpCandidates(SubExpCandidates, HoistSubExpCandidates, UsedRegs); return true; } @@ -4179,132 +4147,131 @@ bool tryToAddSubExps( // If not, AllowPartialUseInSubExp will no chance to be true. (AllowPartialUseInSubExp || !EnableSubExpAggressive)) { // Assume vmemLdSize could be optimized by not parallel. - if (((vgpr - hotBB.vmemLdInputSize) <= VLimit || - (vgpr - hotBB.vmemLdOutputSize) <= VLimit) && - sgpr <= SLimit) { - // nrmSubExps can help reach target occupancy, add it to + if (((Vgpr - HotBB.VmemLdInputSize) <= VLimit || + (Vgpr - HotBB.VmemLdOutputSize) <= VLimit) && + Sgpr <= SLimit) { + // nrmSubExps can help reach target occupancy, add It to // subExpCandidates. - addExpCandidates(subExpCandidates, partialSubExps, usedRegs); + addExpCandidates(SubExpCandidates, PartialSubExps, UsedRegs); return true; } } - int vDistance = vgpr - (int)VLimit; - int sDistance = status.TargetOcc > 4 ? (sgpr - (int)SLimit) : 0; - int vSaved = hotBB.maxPressures.first - vgpr; - int sSaved = hotBB.maxPressures.second - sgpr; + int VDistance = Vgpr - (int)VLimit; + int SDistance = Status.TargetOcc > 4 ? (Sgpr - (int)SLimit) : 0; + int VSaved = HotBB.MaxPressures.first - Vgpr; + int SSaved = HotBB.MaxPressures.second - Sgpr; // Try to add inBlockCloneSubExps. - if (!tryRematInHotSpot(*hotBB.MBB, status, vDistance, sDistance, vSaved, - sSaved, inBlockCloneSubExps, inBlockHotVInstMap, - inBlockHotSInstMap, LIS, MRI, SIRI, SIII)) { - // return false always when not allow partialUseInSubExp, it will try again + if (!tryRematInHotSpot(*HotBB.MBB, Status, VDistance, SDistance, VSaved, + SSaved, InBlockCloneSubExps, InBlockHotVInstMap, + InBlockHotSInstMap, LIS, MRI, SIRI, SIII)) { + // return false always when not allow partialUseInSubExp, It will try again // with partialUseInSubExp enabled. if (!AllowPartialUseInSubExp) return false; // If status not balance, do the remat even cannot reach target. // TODO: check the result not help even one occupancy. - if (!status.NotBalance && TargetOccupancy == 0) + if (!Status.NotBalance && TargetOccupancy == 0) return false; } - // nrmSubExps can help reach target occupancy, add it to + // nrmSubExps can help reach target occupancy, add It to // subExpCandidates. - addExpCandidates(subExpCandidates, partialSubExps, usedRegs); + addExpCandidates(SubExpCandidates, PartialSubExps, UsedRegs); return true; } // Remat passthru regs per hot block. -// Reason to do it per block is to make sure passthru reuse is precise. +// Reason to do It per block is to make sure passthru reuse is precise. // If try remat on all hot blocks together, the passthru might be on one block, // reuse in on another block which the reg is not passthru there. 
-bool perBlockPassthruRemat(Remat *Remat, std::vector &hotBlocks, - RematStatus &status, - GCNRPTracker::LiveRegSet &liveRegCandidates, +bool perBlockPassthruRemat(Remat *Remat, std::vector &HotBlocks, + RematStatus &Status, + GCNRPTracker::LiveRegSet &LiveRegCandidates, const GCNSubtarget *ST, LiveIntervals *LIS, - const MachineLoopInfo *MLI, - MachineDominatorTree *DT, MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI, + const MachineLoopInfo *MLI, MachineDominatorTree *DT, + MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { bool IsUpdated = false; bool IsCanClone = EnableSubExpClone || EnableSubExpAggressive; - SlotIndexes *slotIndexes = LIS->getSlotIndexes(); + SlotIndexes *SlotIndexes = LIS->getSlotIndexes(); // Sort hot blocks by pressure first. // The hot block with higher pressure is easier to fail. - // If fail, fail fast. It it works, save the subExpCandidates. The + // If fail, fail fast. It It works, save the subExpCandidates. The // subExpCandidates may help other hotblocks. - std::sort(hotBlocks.begin(), hotBlocks.end(), - [&ST](const HotBlock &a, const HotBlock &b) { - return pressureHigher(a.maxPressures.first, a.maxPressures.second, - b.maxPressures.first, b.maxPressures.second, + std::sort(HotBlocks.begin(), HotBlocks.end(), + [&ST](const HotBlock &A, const HotBlock &B) { + return pressureHigher(A.MaxPressures.first, A.MaxPressures.second, + B.MaxPressures.first, B.MaxPressures.second, ST); }); - std::vector subExpCandidates; + std::vector SubExpCandidates; // For inBlock remat clone. - std::vector inBlockCloneSubExps; - DenseMap inBlockHotVInstMap; - DenseMap inBlockHotSInstMap; + std::vector InBlockCloneSubExps; + DenseMap InBlockHotVInstMap; + DenseMap InBlockHotSInstMap; // Save used passThrus to avoid use same reg on different MBB. - GCNRPTracker::LiveRegSet usedPassThrus; + GCNRPTracker::LiveRegSet UsedPassThrus; // Save moved regs to avoid use same reg hoist and sink. - GCNRPTracker::LiveRegSet usedRegs; + GCNRPTracker::LiveRegSet UsedRegs; - const int VLimit = status.TargetVLimit; - const int SLimit = status.TargetSLimit; + const int VLimit = Status.TargetVLimit; + const int SLimit = Status.TargetSLimit; // Collect passthru for hot block. - // Try remat on it. - for (auto &it : hotBlocks) { - MachineBasicBlock *MBB = it.MBB; + // Try remat on It. + for (auto &It : HotBlocks) { + MachineBasicBlock *MBB = It.MBB; - const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[MBB]; - const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[MBB]; + const GCNRPTracker::LiveRegSet InputLive = Status.MBBInputLiveMap[MBB]; + const GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[MBB]; - it.inputLive = inputLive; + It.InputLive = InputLive; // Add pressure by 1 to consider spill to vgpr. const int PressureDelta = -1; - int vgpr = it.maxPressures.first - PressureDelta; - int sgpr = it.maxPressures.second; - bool IsVOutBound = vgpr > VLimit; - bool IsSOutBound = sgpr > SLimit; + int Vgpr = It.MaxPressures.first - PressureDelta; + int Sgpr = It.MaxPressures.second; + bool IsVOutBound = Vgpr > VLimit; + bool IsSOutBound = Sgpr > SLimit; // savingInputLive is used to calculate saving which will be modified to // avoid count same input multiple times. 
- GCNRPTracker::LiveRegSet savingInputLive = inputLive; - GCNRPTracker::LiveRegSet savingOutputLive = outputLive; - std::pair curSaving = - calculateSaving(it, subExpCandidates, savingInputLive, savingOutputLive, + GCNRPTracker::LiveRegSet SavingInputLive = InputLive; + GCNRPTracker::LiveRegSet SavingOutputLive = OutputLive; + std::pair CurSaving = + calculateSaving(It, SubExpCandidates, SavingInputLive, SavingOutputLive, IsVOutBound, IsSOutBound, IsCanClone, DT, MRI, SIRI); - vgpr += curSaving.first; - sgpr += curSaving.second; + Vgpr += CurSaving.first; + Sgpr += CurSaving.second; - if (vgpr <= VLimit && sgpr <= SLimit) + if (Vgpr <= VLimit && Sgpr <= SLimit) continue; // Collect pass thru regs. - GCNRPTracker::LiveRegSet passThrus = - collectPassThrus(MBB, inputLive, outputLive, usedPassThrus, - liveRegCandidates, MRI, IsCanClone); + GCNRPTracker::LiveRegSet PassThrus = + collectPassThrus(MBB, InputLive, OutputLive, + LiveRegCandidates, MRI, IsCanClone); // Group pass thru regs by def MBB. SmallVector> - Candidates = groupPassThruByDefBlock(Remat, passThrus, usedPassThrus, + Candidates = groupPassThruByDefBlock(Remat, PassThrus, UsedPassThrus, MRI, SIRI, SIII); // unUsedPassThrus used to collect passThru which is skipped when build // subExp. - GCNRPTracker::LiveRegSet unusedPassThrus; + GCNRPTracker::LiveRegSet UnusedPassThrus; // Build exp dag on define blocks. bool AllowPartialUseInSubExp = false; if (tryToAddSubExps( - Remat, it, status, subExpCandidates, inBlockCloneSubExps, - inBlockHotVInstMap, inBlockHotSInstMap, Candidates, vgpr, sgpr, - savingInputLive, savingOutputLive, passThrus, usedRegs, MRI, SIRI, - SIII, MLI, slotIndexes, LIS, DT, IsCanClone, IsVOutBound, - IsSOutBound, unusedPassThrus, AllowPartialUseInSubExp)) { + Remat, It, Status, SubExpCandidates, InBlockCloneSubExps, + InBlockHotVInstMap, InBlockHotSInstMap, Candidates, Vgpr, Sgpr, + SavingInputLive, SavingOutputLive, PassThrus, UsedRegs, MRI, SIRI, + SIII, MLI, SlotIndexes, LIS, DT, IsCanClone, IsVOutBound, + IsSOutBound, UnusedPassThrus, AllowPartialUseInSubExp)) { // Remove unusedPassThrus from passThrus first. - llvm::andNotLiveRegSet(passThrus, unusedPassThrus); - llvm::mergeLiveRegSet(usedPassThrus, passThrus); + llvm::andNotLiveRegSet(PassThrus, UnusedPassThrus); + llvm::mergeLiveRegSet(UsedPassThrus, PassThrus); continue; } // If cannot clone, don't need to try partialUseInSubExp which must clone. @@ -4312,54 +4279,53 @@ bool perBlockPassthruRemat(Remat *Remat, std::vector &hotBlocks, return false; // Partial use subExp may result count caused by clone. - // Only try it when enable aggressive remat. + // Only try It when enable aggressive remat. if (!EnableSubExpAggressive) return false; AllowPartialUseInSubExp = true; if (!tryToAddSubExps( - Remat, it, status, subExpCandidates, inBlockCloneSubExps, - inBlockHotVInstMap, inBlockHotSInstMap, Candidates, vgpr, sgpr, - savingInputLive, savingOutputLive, passThrus, usedRegs, MRI, SIRI, - SIII, MLI, slotIndexes, LIS, DT, IsCanClone, IsVOutBound, - IsSOutBound, unusedPassThrus, AllowPartialUseInSubExp)) { + Remat, It, Status, SubExpCandidates, InBlockCloneSubExps, + InBlockHotVInstMap, InBlockHotSInstMap, Candidates, Vgpr, Sgpr, + SavingInputLive, SavingOutputLive, PassThrus, UsedRegs, MRI, SIRI, + SIII, MLI, SlotIndexes, LIS, DT, IsCanClone, IsVOutBound, + IsSOutBound, UnusedPassThrus, AllowPartialUseInSubExp)) { return false; } // Just merge all passThrus after tryToAddSubExps allow partialUseInSubExp. 
- llvm::mergeLiveRegSet(usedPassThrus, passThrus); + llvm::mergeLiveRegSet(UsedPassThrus, PassThrus); } // Apply changes. { // sort subExpCandidates to make sure input use apply before output use if a // reg is input and output of subExps. - LLVM_DEBUG(for (SubExp &Exp : subExpCandidates) { Exp.dump(MRI, SIRI); }); - sortSubExpCandidates(subExpCandidates); + LLVM_DEBUG(for (SubExp &Exp : SubExpCandidates) { Exp.dump(MRI, SIRI); }); + sortSubExpCandidates(SubExpCandidates); - for (SubExp &Exp : subExpCandidates) { + for (SubExp &Exp : SubExpCandidates) { // Skip exp which is cleared in sort for hoist sink conflict. if (Exp.SUnits.empty()) continue; LLVM_DEBUG(Exp.dump(MRI, SIRI)); if (Exp.IsHoist) { - ApplySubExpMoveNearDefine(Exp, MRI, DT, slotIndexes, SIII, SIRI); + applySubExpMoveNearDefine(Exp, MRI, SlotIndexes, SIII, SIRI); } else { if (Exp.IsCloneOnly) - ApplySubExpCloneNearUser(Exp, hotBlocks, DT, MRI, slotIndexes, SIII, + applySubExpCloneNearUser(Exp, HotBlocks, DT, MRI, SlotIndexes, SIII, SIRI); else - ApplySubExpMoveNearUser(Exp, MRI, DT, slotIndexes, SIII, SIRI); + applySubExpMoveNearUser(Exp, MRI, DT, SlotIndexes); } } - for (SubExp &Exp : inBlockCloneSubExps) { - ApplySubExpCloneNearUserInBlock(Exp, inBlockHotVInstMap, - inBlockHotSInstMap, MRI, slotIndexes, - SIII, SIRI); + for (SubExp &Exp : InBlockCloneSubExps) { + applySubExpCloneNearUserInBlock( + Exp, InBlockHotVInstMap, InBlockHotSInstMap, MRI, SlotIndexes, SIRI); } // Try to see possible occupancy could reach, then dicide a target. // Apply remat. - IsUpdated = subExpCandidates.size(); + IsUpdated = SubExpCandidates.size(); } return IsUpdated; @@ -4367,7 +4333,7 @@ bool perBlockPassthruRemat(Remat *Remat, std::vector &hotBlocks, int getVMemLdSize(MachineBasicBlock &MBB, const SIInstrInfo *SIII, const SIRegisterInfo *SIRI, const MachineRegisterInfo &MRI) { - int vmemLdSize = 0; + int VmemLdSize = 0; // Collect vmemLd when enable split. for (MachineInstr &MI : MBB) { bool IsHighLatency = SIII->isHighLatencyInstruction(MI); @@ -4379,16 +4345,16 @@ int getVMemLdSize(MachineBasicBlock &MBB, const SIInstrInfo *SIII, continue; // a vmem ld. 
MachineOperand &Dst = MI.getOperand(0); - LaneBitmask mask = llvm::getRegMask(Dst, MRI); - unsigned size = llvm::getRegSize(Dst.getReg(), mask, MRI, SIRI); - vmemLdSize += size; + LaneBitmask Mask = llvm::getRegMask(Dst, MRI); + unsigned Size = llvm::getRegSize(Dst.getReg(), Mask, MRI, SIRI); + VmemLdSize += Size; } - return vmemLdSize; + return VmemLdSize; } } // namespace -bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, +bool groupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS, MachineDominatorTree *DT, MachinePostDominatorTree *PDT, AliasAnalysis *AA) { if (MF.size() < 2) @@ -4400,95 +4366,95 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, auto &MRI = MF.getRegInfo(); - RematStatus status = getRematStatus(MF, MLI, LIS, MRI, ST); + RematStatus Status = getRematStatus(MF, MLI, LIS, MRI, ST); const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second; - if (status.TargetOcc >= MaxOcc) + if (Status.TargetOcc >= MaxOcc) return false; - unsigned VLimit = status.TargetVLimit; - unsigned SLimit = status.TargetSLimit; + unsigned VLimit = Status.TargetVLimit; + unsigned SLimit = Status.TargetSLimit; - int rematVCnt = status.MaxVPressure - VLimit; - int rematSCnt = status.MaxSPressure - SLimit; + int RematVCnt = Status.MaxVPressure - VLimit; + int RematSCnt = Status.MaxSPressure - SLimit; bool IsSGPRSpill = false; - if (rematSCnt > 0) { - IsSGPRSpill = nearSgprSpill(status.MaxSPressure, ST, MF); + if (RematSCnt > 0) { + IsSGPRSpill = nearSgprSpill(Status.MaxSPressure, ST, MF); } // If bound by lds, skip. - if ((status.TargetOcc + 1) > ST->getOccupancyWithWorkGroupSizes(MF).second && + if ((Status.TargetOcc + 1) > ST->getOccupancyWithWorkGroupSizes(MF).second && !IsSGPRSpill) return false; - bool IsBothOutLimit = rematVCnt > 0 && rematSCnt > 0; + bool IsBothOutLimit = RematVCnt > 0 && RematSCnt > 0; // TODO: use check wqm and support vreg remat. bool IsCheckWQM = MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS; - rematVCnt = IsCheckWQM & false; + RematVCnt = IsCheckWQM & false; // Remat on every hot block. // Collect all hot blocks. - std::vector hotBlocks; + std::vector HotBlocks; for (MachineBasicBlock &MBB : MF) { // Collect reg pressure. - auto &RP = status.MBBPressureMap[&MBB]; - unsigned maxLocalVPressure = RP.getVGPRNum(ST->hasGFX90AInsts()); - unsigned maxLocalSPressure = RP.getMaxSGPR(); + auto &RP = Status.MBBPressureMap[&MBB]; + unsigned MaxLocalVPressure = RP.getVGPRNum(ST->hasGFX90AInsts()); + unsigned MaxLocalSPressure = RP.getMaxSGPR(); - maxLocalSPressure += RegForVCC; + MaxLocalSPressure += RegForVCC; if (!EnableInBlockRemat) { - if (maxLocalVPressure <= VLimit && maxLocalSPressure <= SLimit) + if (MaxLocalVPressure <= VLimit && MaxLocalSPressure <= SLimit) continue; } // Move inst which input is imm/pass thru reg/out reg to help pressure. 
- if (tryHoldPacifist(MBB, LIS, MRI, SIRI, SIII, AA, status)) { - maxLocalVPressure = 0; - maxLocalSPressure = 0; - collectMBBPressure(MBB, LIS, ST, maxLocalVPressure, maxLocalSPressure, - status); + if (tryHoldPacifist(MBB, LIS, MRI, SIRI, AA, Status)) { + MaxLocalVPressure = 0; + MaxLocalSPressure = 0; + collectMBBPressure(MBB, LIS, ST, MaxLocalVPressure, MaxLocalSPressure, + Status); - maxLocalSPressure += RegForVCC; + MaxLocalSPressure += RegForVCC; } - if (maxLocalVPressure <= VLimit && maxLocalSPressure <= SLimit) + if (MaxLocalVPressure <= VLimit && MaxLocalSPressure <= SLimit) continue; // When both vgpr sgpr out limit, only help vgpr. - if (IsBothOutLimit && maxLocalVPressure <= VLimit) + if (IsBothOutLimit && MaxLocalVPressure <= VLimit) continue; - GCNRPTracker::LiveRegSet liveSet; - hotBlocks.push_back({&MBB, liveSet, - std::make_pair(maxLocalVPressure, maxLocalSPressure), + GCNRPTracker::LiveRegSet LiveSet; + HotBlocks.push_back({&MBB, LiveSet, + std::make_pair(MaxLocalVPressure, MaxLocalSPressure), 0, 0}); } // Collect vmemLdInput/OutputSize. if (EnableVmemDegree) { - DenseMap outputVMemLdSizeMap; - for (auto it : hotBlocks) { - MachineBasicBlock *MBB = it.MBB; + DenseMap OutputVMemLdSizeMap; + for (auto It : HotBlocks) { + MachineBasicBlock *MBB = It.MBB; // Collect vmemLd when enable split. - int vmemLdSize = getVMemLdSize(*MBB, SIII, SIRI, MRI); - if (vmemLdSize) { - outputVMemLdSizeMap[MBB] = vmemLdSize; + int VmemLdSize = getVMemLdSize(*MBB, SIII, SIRI, MRI); + if (VmemLdSize) { + OutputVMemLdSizeMap[MBB] = VmemLdSize; } } - for (auto &it : hotBlocks) { - MachineBasicBlock *MBB = it.MBB; + for (auto &It : HotBlocks) { + MachineBasicBlock *MBB = It.MBB; - auto oit = outputVMemLdSizeMap.find(MBB); - if (oit != outputVMemLdSizeMap.end()) - it.vmemLdOutputSize = oit->second; + auto OIt = OutputVMemLdSizeMap.find(MBB); + if (OIt != OutputVMemLdSizeMap.end()) + It.VmemLdOutputSize = OIt->second; if (MBB->pred_size() != 1) continue; MachineBasicBlock *Pred = *MBB->pred_begin(); - oit = outputVMemLdSizeMap.find(Pred); - if (oit != outputVMemLdSizeMap.end()) { - it.vmemLdInputSize = oit->second; + OIt = OutputVMemLdSizeMap.find(Pred); + if (OIt != OutputVMemLdSizeMap.end()) { + It.VmemLdInputSize = OIt->second; } else { if (Pred->getFirstTerminator() != Pred->end()) continue; @@ -4497,60 +4463,60 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, bool IsHighLatency = SIII->isHighLatencyInstruction(Pred->back()); if (!IsHighLatency) continue; - int vmemLdSize = getVMemLdSize(*Pred, SIII, SIRI, MRI); - it.vmemLdInputSize = vmemLdSize; + int VmemLdSize = getVMemLdSize(*Pred, SIII, SIRI, MRI); + It.VmemLdInputSize = VmemLdSize; } } } if (EnableUniformVectorToScalar) { - if (rematUniformVgprToSgpr(Remat, MF, status, status.MBBPressureMap, - hotBlocks, LIS, MRI, SIRI, SIII, MLI)) { + if (rematUniformVgprToSgpr(Remat, MF, Status, HotBlocks, LIS, MRI, SIRI, + SIII, MLI)) { // Rebuild LIS. LIS->reanalyze(MF); - status = getRematStatus(MF, MLI, LIS, MRI, ST); - bool IsSgprSpilled = nearSgprSpill(status.MaxSPressure, ST, MF); + Status = getRematStatus(MF, MLI, LIS, MRI, ST); + bool IsSgprSpilled = nearSgprSpill(Status.MaxSPressure, ST, MF); if (IsSgprSpilled) { bool IsNearTarget = false; hotBlockRemat(Remat, MF, MLI, LIS, DT, PDT, IsNearTarget); // Rebuild LIS. 
LIS->reanalyze(MF); - status = getRematStatus(MF, MLI, LIS, MRI, ST); + Status = getRematStatus(MF, MLI, LIS, MRI, ST); } - for (auto &it : hotBlocks) { - MachineBasicBlock *MBB = it.MBB; + for (auto &It : HotBlocks) { + MachineBasicBlock *MBB = It.MBB; // Update pressure. - auto &RP = status.MBBPressureMap[MBB]; - unsigned maxLocalVPressure = RP.getVGPRNum(ST->hasGFX90AInsts()); - unsigned maxLocalSPressure = RP.getMaxSGPR(); + auto &RP = Status.MBBPressureMap[MBB]; + unsigned MaxLocalVPressure = RP.getVGPRNum(ST->hasGFX90AInsts()); + unsigned MaxLocalSPressure = RP.getMaxSGPR(); - maxLocalSPressure += RegForVCC; - it.maxPressures.first = maxLocalVPressure; - it.maxPressures.second = maxLocalSPressure; + MaxLocalSPressure += RegForVCC; + It.MaxPressures.first = MaxLocalVPressure; + It.MaxPressures.second = MaxLocalSPressure; } } } // Collect all live reg which cross hot blocks. - GCNRPTracker::LiveRegSet liveRegCandidates; - for (auto it : hotBlocks) { - MachineBasicBlock *MBB = it.MBB; + GCNRPTracker::LiveRegSet LiveRegCandidates; + for (auto It : HotBlocks) { + MachineBasicBlock *MBB = It.MBB; - const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[MBB]; + const GCNRPTracker::LiveRegSet InputLive = Status.MBBInputLiveMap[MBB]; - const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[MBB]; + const GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[MBB]; - llvm::mergeLiveRegSet(liveRegCandidates, inputLive); - llvm::mergeLiveRegSet(liveRegCandidates, outputLive); + llvm::mergeLiveRegSet(LiveRegCandidates, InputLive); + llvm::mergeLiveRegSet(LiveRegCandidates, OutputLive); } // Check min VGPR bound. BlockSet PressureUnderLimitSet; if (EnableSubExpMinReg) { - for (auto &it : hotBlocks) { - MachineBasicBlock *MBB = it.MBB; + for (auto &It : HotBlocks) { + MachineBasicBlock *MBB = It.MBB; unsigned MaxLocalVGPR = 0; unsigned MaxLocalSGPR = 0; llvm::getRegBound(MBB, MRI, SIRI, SIII, LIS, MaxLocalVGPR, MaxLocalSGPR); @@ -4558,17 +4524,17 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, if (MaxLocalVGPR < VLimit && MaxLocalSGPR < SLimit) { PressureUnderLimitSet.insert(MBB); } else { - if (MaxLocalVGPR < it.maxPressures.first) - it.maxPressures = - std::make_pair(MaxLocalVGPR, it.maxPressures.second); - if (MaxLocalSGPR < it.maxPressures.second) - it.maxPressures = std::make_pair(it.maxPressures.first, MaxLocalSGPR); + if (MaxLocalVGPR < It.MaxPressures.first) + It.MaxPressures = + std::make_pair(MaxLocalVGPR, It.MaxPressures.second); + if (MaxLocalSGPR < It.MaxPressures.second) + It.MaxPressures = std::make_pair(It.MaxPressures.first, MaxLocalSGPR); } } } bool IsUpdated = - perBlockPassthruRemat(Remat, hotBlocks, status, liveRegCandidates, ST, + perBlockPassthruRemat(Remat, HotBlocks, Status, LiveRegCandidates, ST, LIS, MLI, DT, MRI, SIRI, SIII); return IsUpdated; @@ -4614,7 +4580,7 @@ bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) { LIS->reanalyze(MF); } - IsUpdated = GroupRemat(this, MF, MLI, LIS, DT, PDT, AA); + IsUpdated = groupRemat(this, MF, MLI, LIS, DT, PDT, AA); IsFinalUpdated |= IsUpdated; } From d8b6711de941bfa483a82f41bc47eff7e23ac16d Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Fri, 14 Mar 2025 12:34:53 -0700 Subject: [PATCH 14/25] More cleanup --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 74 +++++----- llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h | 39 ++--- .../AMDGPUOccupancyAndLatencyHelper.cpp | 139 +++++++++--------- 
.../AMDGPU/AMDGPUOccupancyAndLatencyHelper.h | 18 +-- llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp | 56 ++++--- llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h | 20 +-- 7 files changed, 175 insertions(+), 173 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index 4c46cee69a038..46d182ffd9e29 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -2308,7 +2308,7 @@ void applySubExpCloneNearUser(SubExp &Exp, std::vector &HotBlocks, } // Build dag for SubExp to help remove unused inst when clone. ExpDag Dag(MRI, SIRI, SIII, /*IsJoinInput*/ true); - Dag.build(Exp.inputLive, Exp.outputLive, Exp.SUnits); + Dag.build(Exp.InputLive, Exp.OutputLive, Exp.SUnits); DenseSet DagBottoms; for (SUnit &SU : Dag.SUnits) { if (!SU.isInstr()) @@ -3141,10 +3141,10 @@ bool tryRemat(MachineBasicBlock &MBB, MachineInstr *HotMi, if (SubExp.IsNotSafeToCopy) continue; if (IsVGPR) { - if (SubExp.vOutputSize == 0) + if (SubExp.VOutputSize == 0) continue; } else { - if (SubExp.sOutputSize == 0) + if (SubExp.SOutputSize == 0) continue; } if (!SubExp.isSafeToMove(MRI, /*IsMoveUp*/ false)) @@ -3158,9 +3158,9 @@ bool tryRemat(MachineBasicBlock &MBB, MachineInstr *HotMi, if (SubExp.IsHasMemInst && MemWriteMBBSet.count(&MBB)) continue; if (IsVGPR) { - Distance -= SubExp.vOutputSize; + Distance -= SubExp.VOutputSize; } else { - Distance -= SubExp.sOutputSize; + Distance -= SubExp.SOutputSize; } CloneSubExps.emplace_back(SubExp); if (Distance <= 0) @@ -3256,8 +3256,8 @@ bool tryRematInHotSpot( // When apply subExp1 before subExp0, new clone of subExp0 which use result of // subExp1 will have old reg of subExp1. And reg pressure will not be reduced. void sortSubExpCandidates(std::vector &SubExpCandidates) { - MapVector> InputMap; - MapVector> OutputMap; + MapVector> InputMap; + MapVector> OutputMap; struct SortNode { SubExp Exp; unsigned Depth; @@ -3288,7 +3288,7 @@ void sortSubExpCandidates(std::vector &SubExpCandidates) { MapVector SortMap; for (auto It : InputMap) { unsigned Reg = It.first; - auto OutIt = OutputMap.find(Reg); + MapVector>::iterator OutIt = OutputMap.find(Reg); if (OutIt == OutputMap.end()) continue; auto &InExps = It.second; @@ -3302,8 +3302,8 @@ void sortSubExpCandidates(std::vector &SubExpCandidates) { continue; // Canot input(use) move up, output(def) move down. // Choose the exp which save more. - int InExpGain = InExp->vOutputSize - InExp->vInputSize; - int OutExpGain = OutExp->vInputSize - InExp->vOutputSize; + int InExpGain = InExp->VOutputSize - InExp->VInputSize; + int OutExpGain = OutExp->VInputSize - InExp->VOutputSize; if (InExpGain >= OutExpGain) { OutExp->SUnits.clear(); } else { @@ -3415,26 +3415,26 @@ bool canHelpPressureWhenSink(SubExp &SubExp, // Update input size to ignore lives in which already in // passThrus. 
- for (auto It : SubExp.inputLive) { + for (auto It : SubExp.InputLive) { unsigned Reg = It.first; if (PassThrus.count(Reg) == 0) continue; unsigned Size = getRegSize(Reg, It.second, MRI, SIRI); if (SIRI->isVGPR(MRI, Reg)) { - SubExp.vInputSize -= Size; + SubExp.VInputSize -= Size; } else { - SubExp.sInputSize -= Size; + SubExp.SInputSize -= Size; } } - if (SubExp.vInputSize > SubExp.vOutputSize) + if (SubExp.VInputSize > SubExp.VOutputSize) return false; - if (SubExp.sInputSize > SubExp.sOutputSize && IsSgprBound) + if (SubExp.SInputSize > SubExp.SOutputSize && IsSgprBound) return false; - if (SubExp.sInputSize >= SubExp.sOutputSize && - SubExp.vInputSize == SubExp.vOutputSize) + if (SubExp.SInputSize >= SubExp.SOutputSize && + SubExp.VInputSize == SubExp.VOutputSize) return false; // Try to find a Insert Block. @@ -3479,13 +3479,13 @@ bool canHelpPressureWhenHoist(SubExp &SubExp, const MachineRegisterInfo &MRI, const MachineLoopInfo *MLI, bool IsSgprBound) { if (!SubExp.isSafeToMove(MRI, /*IsMoveUp*/ true)) return false; - if (SubExp.vInputSize < SubExp.vOutputSize) + if (SubExp.VInputSize < SubExp.VOutputSize) return false; - if (SubExp.sInputSize < SubExp.sOutputSize && IsSgprBound) + if (SubExp.SInputSize < SubExp.SOutputSize && IsSgprBound) return false; - if (SubExp.sInputSize <= SubExp.sOutputSize && - SubExp.vInputSize == SubExp.vOutputSize) + if (SubExp.SInputSize <= SubExp.SOutputSize && + SubExp.VInputSize == SubExp.VOutputSize) return false; // Try to find a Insert Block. @@ -3715,17 +3715,17 @@ SubExp buildFreeSubExp(SubExp &Exp, // Calc reg for freeExp. for (unsigned Reg : FreeExp.TopRegs) { - FreeExp.inputLive[Reg]; + FreeExp.InputLive[Reg]; } for (unsigned Reg : FreeExp.BottomRegs) { - FreeExp.outputLive[Reg]; + FreeExp.OutputLive[Reg]; } - CollectLiveSetPressure(FreeExp.inputLive, MRI, SIRI, FreeExp.vInputSize, - FreeExp.sInputSize); - CollectLiveSetPressure(FreeExp.outputLive, MRI, SIRI, FreeExp.vOutputSize, - FreeExp.sOutputSize); + CollectLiveSetPressure(FreeExp.InputLive, MRI, SIRI, FreeExp.VInputSize, + FreeExp.SInputSize); + CollectLiveSetPressure(FreeExp.OutputLive, MRI, SIRI, FreeExp.VOutputSize, + FreeExp.SOutputSize); return FreeExp; } @@ -3817,14 +3817,14 @@ calculateSaving(HotBlock &HotBb, std::vector &SubExpCandidates, continue; // When subExp is from hotBB, check output instead of input. if (Exp.FromBB == MBB) { - if (IsVOutBound && Exp.vOutputSize < Exp.vInputSize) + if (IsVOutBound && Exp.VOutputSize < Exp.VInputSize) continue; - if (IsSOutBound && Exp.sOutputSize < Exp.sInputSize) + if (IsSOutBound && Exp.SOutputSize < Exp.SInputSize) continue; - Vgpr += Exp.vInputSize; - Vgpr -= Exp.vOutputSize; - Sgpr += Exp.sInputSize; - Sgpr -= Exp.sOutputSize; + Vgpr += Exp.VInputSize; + Vgpr -= Exp.VOutputSize; + Sgpr += Exp.SInputSize; + Sgpr -= Exp.SOutputSize; continue; } } @@ -3852,7 +3852,7 @@ calculateSaving(HotBlock &HotBb, std::vector &SubExpCandidates, } } - for (auto OutIt : Exp.outputLive) { + for (auto OutIt : Exp.OutputLive) { unsigned Reg = OutIt.first; LaneBitmask OutMask = OutIt.second; LaneBitmask MBBBeginMask; @@ -3887,7 +3887,7 @@ calculateSaving(HotBlock &HotBb, std::vector &SubExpCandidates, } } - for (auto InIt : Exp.inputLive) { + for (auto InIt : Exp.InputLive) { unsigned Reg = InIt.first; LaneBitmask InMask = InIt.second; LaneBitmask MBBBeginMask; @@ -3929,7 +3929,7 @@ calculateSaving(HotBlock &HotBb, std::vector &SubExpCandidates, // If MBB dominate any user of output live reg, It will still live in // MBB. 
So cannot count that output live reg as profit. // Hoist into loop is not supported now. - for (auto OutIt : Exp.outputLive) { + for (auto OutIt : Exp.OutputLive) { unsigned Reg = OutIt.first; bool IsDomUser = false; for (MachineInstr &MI : MRI.use_nodbg_instructions(Reg)) { @@ -3963,7 +3963,7 @@ calculateSaving(HotBlock &HotBb, std::vector &SubExpCandidates, } } - for (auto InIt : Exp.inputLive) { + for (auto InIt : Exp.InputLive) { unsigned Reg = InIt.first; LaneBitmask InMask = InIt.second; LaneBitmask MBBBeginMask; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp index 365fb058bf6b3..63651ab82fcdb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp @@ -453,7 +453,7 @@ bool reduceChannel(unsigned offset, MachineInstr &MI, const MCInstrDesc &desc, .addImm(offset * LaneSize); MachineInstr *OffsetAddMI = OffsetAdd.getInstr(); MachineBasicBlock::iterator InsertPoint = - llvm::FindOrCreateInsertionPointForSccDef(MI.getParent(), MI, SIRI, + llvm::findOrCreateInsertionPointForSccDef(MI.getParent(), MI, SIRI, SIII, &MRI); MI.getParent()->insert(InsertPoint, OffsetAddMI); SIII->legalizeOperands(*OffsetAddMI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h index 1e9f0bad12d19..04b4b74fbd726 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h @@ -1,3 +1,6 @@ +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMIRUTILS_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMIRUTILS_H + #pragma once #include "llvm/ADT/DenseMap.h" @@ -37,14 +40,14 @@ using LiveSet = llvm::DenseMap; unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask, const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI); -void CollectLiveSetPressure(const LiveSet &liveSet, +void collectLiveSetPressure(const LiveSet &liveSet, const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI, unsigned &VPressure, unsigned &SPressure); bool isExecUpdateForControlFlow(llvm::MachineInstr &MI); -bool IsSub0Sub1SingleDef(unsigned Reg, const llvm::MachineRegisterInfo &MRI); +bool isSub0Sub1SingleDef(unsigned Reg, const llvm::MachineRegisterInfo &MRI); llvm::LaneBitmask getRegMask(const llvm::MachineOperand &MO, const llvm::MachineRegisterInfo &MRI); @@ -68,40 +71,40 @@ bool reach_block(llvm::MachineBasicBlock *FromBB, void viewCFGWithPhi(llvm::MachineFunction &MF); void write_contribution_list(llvm::MachineFunction &MF, const char *Filename); -llvm::MachineBasicBlock *CreateNullExportBlock(llvm::MachineFunction &MF, +llvm::MachineBasicBlock *createNullExportBlock(llvm::MachineFunction &MF, const llvm::SIInstrInfo *TII); -bool GetNonDebugMBBEnd(llvm::MachineBasicBlock::reverse_iterator &BBEnd, +bool getNonDebugMBBEnd(llvm::MachineBasicBlock::reverse_iterator &BBEnd, llvm::MachineBasicBlock &MBB); -void UpdatePhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB, +void updatePhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB, const llvm::MachineRegisterInfo *MRI); -void BuildPhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB, +void buildPhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB, llvm::SmallDenseSet &LiveOutSet, const llvm::MachineRegisterInfo *MRI); -MachineReg CreateVirtualRegForOperand(MachineOpcode Opcode, unsigned Operand, +MachineReg createVirtualRegForOperand(MachineOpcode Opcode, unsigned Operand, llvm::MachineFunction &MF); -MachineReg CreateVirtualDstReg(MachineOpcode Opcode, llvm::MachineFunction &MF); +MachineReg 
createVirtualDstReg(MachineOpcode Opcode, llvm::MachineFunction &MF); -bool IsExecCopy(const llvm::MachineInstr &MI, MachineReg Exec, +bool isExecCopy(const llvm::MachineInstr &MI, MachineReg Exec, MachineReg *pDst); struct MachineRegWithSubReg { MachineReg Reg = /*NoRegister*/ 0; unsigned SubReg = /*NoSubRegister*/ 0; }; -MachineRegWithSubReg GetWqmEntryActiveMask(llvm::MachineFunction &MF); -llvm::MachineInstr *GetWqmEntryActiveMaskInst(llvm::MachineFunction &MF); +MachineRegWithSubReg getWqmEntryActiveMask(llvm::MachineFunction &MF); +llvm::MachineInstr *getWqmEntryActiveMaskInst(llvm::MachineFunction &MF); // Return true if this machine instruction represents a call to the fetch // shader. We curently have two mechanisims for calling fetch shader: // 1. The AMDGPU_CALL_FETCH_SHADER pseudo-instruction // 2. A CALL instruction with the `FetchShaderCall` flag set to true. -bool IsFetchShaderCall(const llvm::MachineInstr *MI); +bool isFetchShaderCall(const llvm::MachineInstr *MI); -bool IsSccLiveAt(llvm::MachineBasicBlock *MBB, +bool isSccLiveAt(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator MI); // An enum used to pass additional constraints to @@ -126,7 +129,7 @@ enum SccDefInsertPointConstraintFlags { // scc around BeforeInst. This way BeforeInst can safely be used // as the new insert location. // -llvm::MachineBasicBlock::iterator FindOrCreateInsertionPointForSccDef( +llvm::MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef( llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator BeforeInst, const llvm::TargetRegisterInfo *TRI, const llvm::SIInstrInfo *TII, llvm::MachineRegisterInfo *MRI, @@ -149,9 +152,9 @@ void buildEndLiveMap( void dumpLiveSet(const LiveSet &LiveSet, const llvm::SIRegisterInfo *SIRI); -unsigned GetCurrentVGPRCount(llvm::MachineFunction &MF, +unsigned getCurrentVGPRCount(llvm::MachineFunction &MF, const llvm::SIRegisterInfo *SIRI); -unsigned GetCurrentSGPRCount(llvm::MachineFunction &MF, +unsigned getCurrentSGPRCount(llvm::MachineFunction &MF, const llvm::SIRegisterInfo *SIRI); bool isFastMathInst(llvm::MachineInstr &MI); @@ -169,7 +172,7 @@ void write_pressure(llvm::MachineFunction &MF, llvm::LiveIntervals *LIS, // Look for the successor `Succ` of the given `MBB`. // Returns MBB->succ_end() if `Succ` is not a successor of MBB. llvm::MachineBasicBlock::succ_iterator -FindSuccessor(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock *Succ); +findSuccessor(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock *Succ); // The enum and helper function for v_perm selection mask. // @@ -210,3 +213,5 @@ constexpr int buildVPermSelectMask(V_PERM_IN_BYTE_POS Sel_0, (int)Sel_0); } } // namespace llvm + +#endif diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp index 2e48ec44f979c..a8eef88ac2af8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp @@ -32,40 +32,39 @@ float SchedScore::computeScore() const { LatencyHide; } float SchedScore::computeScore2() const { - float cycles = 0; - cycles = (MixAlu * Occupancy + MemLatency); - cycles /= Occupancy; - return cycles; + float Cycles = 0; + Cycles = (MixAlu * Occupancy + MemLatency); + Cycles /= Occupancy; + return Cycles; } -void SchedScore::sum(const SchedScore &s, unsigned loopDepth) { - unsigned loopCount = loopDepth > 0 ? 
std::pow(3, loopDepth) : 1; - LatencyHide += loopCount * s.LatencyHide; - MemLatency += loopCount * s.MemLatency; - MixAlu += loopCount * s.MixAlu; - Alu += loopCount * s.Alu; - Lds += loopCount * s.Lds; - SgprSpill |= s.SgprSpill; +void SchedScore::sum(const SchedScore &S, unsigned LoopDepth) { + unsigned LoopCount = LoopDepth > 0 ? std::pow(3, LoopDepth) : 1; + LatencyHide += LoopCount * S.LatencyHide; + MemLatency += LoopCount * S.MemLatency; + MixAlu += LoopCount * S.MixAlu; + Alu += LoopCount * S.Alu; + Lds += LoopCount * S.Lds; + SgprSpill |= S.SgprSpill; } -bool SchedScore::isBetter(const SchedScore &s) const { - float score = computeScore(); - float newScore = s.computeScore(); - bool spillBetter = !SgprSpill && s.SgprSpill; - return spillBetter ? true : newScore >= score; +bool SchedScore::isBetter(const SchedScore &S) const { + float Score = computeScore(); + float NewScore = S.computeScore(); + bool SpillBetter = !SgprSpill && S.SgprSpill; + return SpillBetter ? true : NewScore >= Score; } // Does more occupancy give more perf. bool SchedScore::isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc) const { - unsigned gain = latencyGain(TargetOccupancy, ExtraOcc); + unsigned Gain = latencyGain(TargetOccupancy, ExtraOcc); // 10% is good enough. - if ((10 * gain) >= Alu) + if ((10 * Gain) >= Alu) return true; - else - return false; + return false; } unsigned SchedScore::latencyGain(unsigned TgtOcc, unsigned ExtraOcc) const { - unsigned latency = MemLatency; - return (latency / (TgtOcc)) - (latency / (TgtOcc + ExtraOcc)); + unsigned Latency = MemLatency; + return (Latency / (TgtOcc)) - (Latency / (TgtOcc + ExtraOcc)); } // AMDGPULatencyTracker @@ -75,113 +74,113 @@ AMDGPULatencyTracker::AMDGPULatencyTracker(const GCNSubtarget &ST) void AMDGPULatencyTracker::scan(const MachineInstr &MI) { if (MI.isDebugInstr()) return; - int latency = SIII->getInstrLatency(ItinerayData, MI); + int Latency = SIII->getInstrLatency(ItinerayData, MI); // If inside latency hide. if (!LatencyMIs.empty()) { - bool bWaitCnt = false; + bool IsWaitCnt = false; for (auto &MO : MI.operands()) { if (MO.isReg()) { - unsigned reg = MO.getReg(); - auto it = LatencyMIs.find(reg); - if (it != LatencyMIs.end()) { - bWaitCnt = true; + Register Reg = MO.getReg(); + auto It = LatencyMIs.find(Reg); + if (It != LatencyMIs.end()) { + IsWaitCnt = true; // If MI use mem result, update latency to mem latency. - int cycle = it->second; - if (cycle > latency) - latency = cycle; + int Cycle = It->second; + if (Cycle > Latency) + Latency = Cycle; } } } // Update latency for each mem latency inst. - for (auto it = LatencyMIs.begin(); it != LatencyMIs.end();) { - auto prev = it; - auto l = (it++); - int cycle = l->second; - if (cycle <= latency) { + for (auto It = LatencyMIs.begin(); It != LatencyMIs.end();) { + auto Prev = It; + auto L = (It++); + int Cycle = L->second; + if (Cycle <= Latency) { // Only left cycles. // Remove the reg. - LatencyMIs.erase(prev); - if (bWaitCnt && cycle == latency) { - score.MemLatency += cycle; + LatencyMIs.erase(Prev); + if (IsWaitCnt && Cycle == Latency) { + Score.MemLatency += Cycle; // Only count memLatency once, the rest is hide. - bWaitCnt = false; + IsWaitCnt = false; } else { // Hide cycle or count mem latency? - score.LatencyHide += cycle; + Score.LatencyHide += Cycle; } } else { - l->second -= latency; + L->second -= Latency; // Hide latency. - score.LatencyHide += latency; + Score.LatencyHide += Latency; } } } else { // TODO: check branch/lds? // TODO: check prevVAlu? 
- auto getAluStatus = [](const MachineInstr &MI, + auto GetAluStatus = [](const MachineInstr &MI, const llvm::SIInstrInfo *SIII) { - AluStatus status = AluStatus::Nothing; + AluStatus Status = AluStatus::Nothing; if (SIII->isVALU(MI.getOpcode())) { - status = AluStatus::Vector; + Status = AluStatus::Vector; } else if (SIII->isSALU(MI.getOpcode())) { - status = AluStatus::Scalar; + Status = AluStatus::Scalar; } - return status; + return Status; }; - AluStatus status = getAluStatus(MI, SIII); + AluStatus Status = GetAluStatus(MI, SIII); - switch (prevStatus) { + switch (PrevStatus) { case AluStatus::Nothing: { - score.Alu += latency; - score.MixAlu += latency; - prevStatus = status; + Score.Alu += Latency; + Score.MixAlu += Latency; + PrevStatus = Status; } break; case AluStatus::Vector: case AluStatus::Scalar: { - score.Alu += latency; + Score.Alu += Latency; // Ignore mix alu. - if (prevStatus != status) { - prevStatus = AluStatus::Nothing; + if (PrevStatus != Status) { + PrevStatus = AluStatus::Nothing; } else { - score.MixAlu += latency; + Score.MixAlu += Latency; } } break; } } // Update latency inst. if (SIII->isHighLatencyInstruction(MI) && MI.mayLoad()) { - unsigned reg = MI.getOperand(0).getReg(); + Register Reg = MI.getOperand(0).getReg(); // TODO: get correct latency. // SIII->getInstrLatency(ItinerayData, MI); constexpr unsigned kHighLetency = 180; - LatencyMIs[reg] = kHighLetency; + LatencyMIs[Reg] = kHighLetency; } else if (SIII->isLowLatencyInstruction(MI) && MI.mayLoad()) { - unsigned reg = MI.getOperand(0).getReg(); + Register Reg = MI.getOperand(0).getReg(); // TODO: get correct latency. // SIII->getInstrLatency(ItinerayData, MI); constexpr unsigned kLowLetency = 35; - LatencyMIs[reg] = kLowLetency; + LatencyMIs[Reg] = kLowLetency; } } -SchedScore CollectLatency(MachineFunction &MF, const llvm::GCNSubtarget &ST, +SchedScore collectLatency(MachineFunction &MF, const llvm::GCNSubtarget &ST, const llvm::MachineLoopInfo *MLI) { - SchedScore totalScore; + SchedScore TotalScore; for (auto &MFI : MF) { MachineBasicBlock &MBB = MFI; MachineBasicBlock::iterator Next; - AMDGPULatencyTracker latencyTracker(ST); + AMDGPULatencyTracker LatencyTracker(ST); for (auto &MI : MBB) { - latencyTracker.scan(MI); + LatencyTracker.scan(MI); } - unsigned loopDepth = 0; + unsigned LoopDepth = 0; if (MLI) { - loopDepth = MLI->getLoopDepth(&MBB); + LoopDepth = MLI->getLoopDepth(&MBB); } - totalScore.sum(latencyTracker.score, loopDepth); + TotalScore.sum(LatencyTracker.Score, LoopDepth); } - return totalScore; + return TotalScore; } } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h index a9a15f7538a58..c04afe61c9809 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h @@ -40,8 +40,8 @@ struct SchedScore { float computeScore() const; float computeScore2() const; - void sum(const SchedScore &s, unsigned loopDepth = 0); - bool isBetter(const SchedScore &s) const; + void sum(const SchedScore &S, unsigned LoopDepth = 0); + bool isBetter(const SchedScore &S) const; bool isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc = 1) const; // More latency can be hiden with ExtraOcc. unsigned latencyGain(unsigned TargetOccupancy, unsigned ExtraOcc) const; @@ -53,23 +53,23 @@ struct AMDGPULatencyTracker { const llvm::InstrItineraryData *ItinerayData; // Latency MI dst reg to cycle map. 
llvm::DenseMap LatencyMIs; - SchedScore score; + SchedScore Score; // Low latency MI not wait. - unsigned hideLatency = 0; - unsigned memLatency = 0; + unsigned HideLatency = 0; + unsigned MemLatency = 0; // For simple, only consider mixture as one valu one salu. // Not group now. - unsigned prevSAlu = 0; - unsigned prevVAlu = 0; + unsigned PrevSAlu = 0; + unsigned PrevVAlu = 0; enum class AluStatus { Nothing, Vector, Scalar, - } prevStatus = AluStatus::Nothing; + } PrevStatus = AluStatus::Nothing; void scan(const llvm::MachineInstr &MI); }; -SchedScore CollectLatency(llvm::MachineFunction &MF, +SchedScore collectLatency(llvm::MachineFunction &MF, const llvm::GCNSubtarget &ST, const llvm::MachineLoopInfo *MLI = nullptr); } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp index be24bfce2851c..fec8ac9546a4a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp @@ -31,12 +31,12 @@ void SubExp::dump(const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) const { dbgs() << "\nSubExp:\n"; dbgs() << "input regs:\n"; - for (auto &input : inputLive) { + for (auto &input : InputLive) { pressure::print_reg(input.first, MRI, SIRI, llvm::dbgs()); dbgs() << "\n"; } dbgs() << "output regs:\n"; - for (auto &output : outputLive) { + for (auto &output : OutputLive) { pressure::print_reg(output.first, MRI, SIRI, llvm::dbgs()); dbgs() << "\n"; } @@ -60,8 +60,8 @@ bool SubExp::modifiesRegister(unsigned Reg, const SIRegisterInfo *SIRI) const { void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) { - sMaxSize = std::max(sInputSize, sOutputSize); - vMaxSize = std::max(vInputSize, vOutputSize); + SMaxSize = std::max(SInputSize, SOutputSize); + VMaxSize = std::max(VInputSize, VOutputSize); DenseMap LiveRegs; GCNRegPressure CurPressure; @@ -125,10 +125,10 @@ void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI, unsigned sSize = CurPressure.getSGPRNum(); unsigned vSize = CurPressure.getVGPRNum(ST->hasGFX90AInsts()); - if (sSize > sMaxSize) - sMaxSize = sSize; - if (vSize > vMaxSize) - vMaxSize = vSize; + if (sSize > SMaxSize) + SMaxSize = sSize; + if (vSize > VMaxSize) + VMaxSize = vSize; } } @@ -185,8 +185,8 @@ template void ExpDag::initNodes>( template void ExpDag::build(const LiveSet &InputLiveReg, const LiveSet &OutputLiveReg, - T &insts) { - initNodes(InputLiveReg, insts); + T &Insts) { + initNodes(InputLiveReg, Insts); addDataDep(SIRI); addCtrlDep(); buildSubExp(InputLiveReg, OutputLiveReg, SIRI, SIII); @@ -336,7 +336,7 @@ void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, auto it = StartLiveReg.find(Reg); assert(it != StartLiveReg.end() && "cannot find input reg in block start live"); - Exp.inputLive[Reg] |= it->second; + Exp.InputLive[Reg] |= it->second; } for (unsigned Reg : Exp.BottomRegs) { @@ -349,13 +349,13 @@ void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, // outputLive which will affect profit count. 
continue; } - Exp.outputLive[Reg] |= it->second; + Exp.OutputLive[Reg] |= it->second; } - CollectLiveSetPressure(Exp.inputLive, MRI, SIRI, Exp.vInputSize, - Exp.sInputSize); - CollectLiveSetPressure(Exp.outputLive, MRI, SIRI, Exp.vOutputSize, - Exp.sOutputSize); + CollectLiveSetPressure(Exp.InputLive, MRI, SIRI, Exp.VInputSize, + Exp.SInputSize); + CollectLiveSetPressure(Exp.OutputLive, MRI, SIRI, Exp.VOutputSize, + Exp.SOutputSize); } } @@ -415,8 +415,8 @@ void ExpDag::addDataDep(const SIRegisterInfo *SIRI) { auto curDefIt = curDefMI.find(Reg); // Check def inst first. if (curDefIt != curDefMI.end()) { - MachineInstr *curDef = curDefIt->second; - DefSU = MISUnitMap[curDef]; + MachineInstr *CurDef = curDefIt->second; + DefSU = MISUnitMap[CurDef]; // Add link between different defs. SU.addPred(SDep(DefSU, SDep::Data, Reg)); } @@ -445,12 +445,12 @@ void BlockExpDag::build() { const auto EndIdx = SlotIndexes->getMBBEndIdx(MBB); const auto EndLiveReg = llvm::getLiveRegs(EndIdx, *LIS, MRI); - std::vector insts; + std::vector Insts; for (MachineInstr &MI : *MBB) { - insts.emplace_back(&MI); + Insts.emplace_back(&MI); } - ExpDag::build(StartLiveReg, EndLiveReg, insts); + ExpDag::build(StartLiveReg, EndLiveReg, Insts); } void BlockExpDag::buildWithPressure() { @@ -461,17 +461,17 @@ void BlockExpDag::buildWithPressure() { const auto EndIdx = SlotIndexes->getMBBEndIdx(MBB); const auto EndLiveReg = llvm::getLiveRegs(EndIdx, *LIS, MRI); - std::vector insts; + std::vector Insts; for (MachineInstr &MI : *MBB) { - insts.emplace_back(&MI); + Insts.emplace_back(&MI); } - ExpDag::build(StartLiveReg, EndLiveReg, insts); + ExpDag::build(StartLiveReg, EndLiveReg, Insts); // Build pressure. buildPressure(StartLiveReg, EndLiveReg); } -void BlockExpDag::buildAvail(const LiveSet &passThruSet, +void BlockExpDag::buildAvail(const LiveSet &PassThruSet, DenseMap &DagAvailRegMap) { DenseSet Processed; @@ -485,10 +485,10 @@ void BlockExpDag::buildAvail(const LiveSet &passThruSet, for (SUnit &SU : SUnits) { if (SU.NumPredsLeft == 0) { GCNDownwardRPTracker RP(*LIS); - RP.reset(BeginMI, &passThruSet); + RP.reset(BeginMI, &PassThruSet); MachineInstr *MI = SU.getInstr(); if (MI) { - RP.reset(*MI, &passThruSet); + RP.reset(*MI, &PassThruSet); RP.advance(); } DagAvailRegMap[&SU] = RP.getLiveRegs(); @@ -503,7 +503,6 @@ void BlockExpDag::buildAvail(const LiveSet &passThruSet, } } while (!WorkList.empty()) { - bool IsUpdated = false; SmallVector ReadyNodes; for (SUnit *SU : WorkList) { if (SU->NumPredsLeft > 0) @@ -511,7 +510,6 @@ void BlockExpDag::buildAvail(const LiveSet &passThruSet, ReadyNodes.emplace_back(SU); // Ready, move it to Processed. Processed.insert(SU); - IsUpdated = true; // Only update 1 node once. // Order of schedle here should not affect pressure. 
break; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h index 952126798b1de..c447750e17f1d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h @@ -41,14 +41,14 @@ struct SubExp { bool IsTouchSCC = false; llvm::MachineBasicBlock *FromBB; llvm::MachineBasicBlock *ToBB; - unsigned sInputSize; - unsigned vInputSize; - unsigned sOutputSize; - unsigned vOutputSize; - unsigned sMaxSize; - unsigned vMaxSize; - LiveSet inputLive; - LiveSet outputLive; + unsigned SInputSize; + unsigned VInputSize; + unsigned SOutputSize; + unsigned VOutputSize; + unsigned SMaxSize; + unsigned VMaxSize; + LiveSet InputLive; + LiveSet OutputLive; bool isSafeToMove(const llvm::MachineRegisterInfo &MRI, bool IsMoveUp) const; void calcMaxPressure(const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI); @@ -73,7 +73,7 @@ struct ExpDag { std::vector SubExps; template void build(const LiveSet &InputLiveReg, const LiveSet &OutputLiveReg, - T &insts); + T &Insts); void dump(); void viewGraph(const llvm::Twine &Name, const llvm::Twine &Title) const; /// Returns a label for an SUnit node in a visualization of the ScheduleDAG. @@ -104,7 +104,7 @@ struct BlockExpDag : public ExpDag { void buildWithPressure(); private: - void buildAvail(const LiveSet &passThruSet, + void buildAvail(const LiveSet &PassThruSet, llvm::DenseMap &DagAvailRegMap); void buildPressure(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg); }; From f8eb7fb3a7f0d25b6773ec6d0598cd325138cbb8 Mon Sep 17 00:00:00 2001 From: Adam Yang <31109344+adam-yang@users.noreply.github.com> Date: Mon, 17 Mar 2025 16:10:34 -0700 Subject: [PATCH 15/25] More cleanups --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 40 +- llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp | 405 +++++---- llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp | 771 +++++++++--------- llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h | 29 +- 4 files changed, 619 insertions(+), 626 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index 46d182ffd9e29..853a212ac5bf3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -226,7 +226,7 @@ bool isSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) { if (!Op.isReg()) continue; if (!MRI.getUniqueVRegDef(Op.getReg()) && - !llvm::IsSub0Sub1SingleDef(Op.getReg(), MRI)) { + !llvm::isSub0Sub1SingleDef(Op.getReg(), MRI)) { return false; } } @@ -280,7 +280,7 @@ unsigned collectMBBPressure(MachineBasicBlock &MBB, LiveIntervals *LIS, GCNUpwardRPTracker RPTracker(*LIS); // R.End doesn't point to the boundary instruction. // Skip Debug instr. - if (!llvm::GetNonDebugMBBEnd(BBEnd, MBB)) + if (!llvm::getNonDebugMBBEnd(BBEnd, MBB)) return ST->getOccupancyWithNumVGPRs(0); GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[&MBB]; @@ -327,7 +327,7 @@ unsigned collectFnPressure(MachineFunction &MF, LiveIntervals *LIS, // R.End doesn't point to the boundary instruction. // Skip Debug instr. 
- if (llvm::GetNonDebugMBBEnd(BBEnd, MBB)) { + if (llvm::getNonDebugMBBEnd(BBEnd, MBB)) { auto SI = SlotIndexes->getInstructionIndex(*BBEnd); MBBOutputSlotMap[&MBB] = SI; } @@ -417,7 +417,7 @@ RematStatus getRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, unsigned STgtOcc = ST->getOccupancyWithNumSGPRs(MaxSPressure); unsigned VTgtOcc = ST->getOccupancyWithNumVGPRs(MaxVPressure); - llvm::SchedScore TotalScore = llvm::CollectLatency(MF, *ST, MLI); + llvm::SchedScore TotalScore = llvm::collectLatency(MF, *ST, MLI); bool MemBound = TotalScore.isMemBound(TgtOcc, std::max(STgtOcc, VTgtOcc) - TgtOcc); @@ -702,7 +702,7 @@ int rematGain(MachineInstr *DefMI, unsigned Reg, const MachineRegisterInfo &MRI, } bool IsSingleDef = MRI.hasOneDef(Reg); if (!IsSingleDef) { - IsSingleDef = llvm::IsSub0Sub1SingleDef(Reg, MRI); + IsSingleDef = llvm::isSub0Sub1SingleDef(Reg, MRI); } if (IsSingleDef) { @@ -1066,7 +1066,7 @@ static MachineBasicBlock::iterator adjustInsertPointToAvoidSccSmash( const bool WillSmashScc = InstructionToMove->modifiesRegister(AMDGPU::SCC, SIRI); if (WillSmashScc) { - CurrentInsertPoint = llvm::FindOrCreateInsertionPointForSccDef( + CurrentInsertPoint = llvm::findOrCreateInsertionPointForSccDef( MBB, CurrentInsertPoint, SIRI, SIII, &MRI); } @@ -1081,7 +1081,7 @@ static MachineBasicBlock::iterator adjustInsertPointForSubExpToAvoidSccSmash( const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { const bool WillSmashScc = SubExpToMove.modifiesRegister(AMDGPU::SCC, SIRI); if (WillSmashScc) { - CurrentInsertPoint = llvm::FindOrCreateInsertionPointForSccDef( + CurrentInsertPoint = llvm::findOrCreateInsertionPointForSccDef( MBB, CurrentInsertPoint, SIRI, SIII, &MRI); } @@ -1094,7 +1094,7 @@ static bool willSmashSccAtLocation(MachineInstr *MI, MachineBasicBlock *MBB, // It is ok to pass nullptr to `modifiesRegister` for TRI here since // SCC has no subreg/suprereg relationships. 
return MI->modifiesRegister(AMDGPU::SCC, nullptr) && - llvm::IsSccLiveAt(MBB, Location); + llvm::isSccLiveAt(MBB, Location); } void applyCloneRemat(Remat *Remat, RematNode &Node, @@ -1374,7 +1374,7 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, const GCNRPTracker::LiveRegSet &LiveSet = LISLR; unsigned VPressure = 0; unsigned SPressure = 0; - CollectLiveSetPressure(LiveSet, MRI, SIRI, VPressure, SPressure); + collectLiveSetPressure(LiveSet, MRI, SIRI, VPressure, SPressure); if (MaxVPressure < VPressure) MaxVPressure = VPressure; if (MaxSPressure < SPressure) @@ -1635,7 +1635,7 @@ bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI, if (OpReg.isPhysical()) return false; if (!MRI.getUniqueVRegDef(OpReg) && - !llvm::IsSub0Sub1SingleDef(OpReg, MRI)) { + !llvm::isSub0Sub1SingleDef(OpReg, MRI)) { return false; } } @@ -1794,7 +1794,7 @@ std::vector buildSubExpFromCandidates( continue; MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); - assert((DefMI || llvm::IsSub0Sub1SingleDef(Reg, MRI)) && + assert((DefMI || llvm::isSub0Sub1SingleDef(Reg, MRI)) && "UseMI should be safe to move"); if (DefMI && CandidateDefs.count(DefMI) > 0) continue; @@ -1982,7 +1982,7 @@ std::vector buildSubExpFromCandidatesTopBottom( if (Candidates.count(Reg) == 0 && LocalCandidates.count(Reg) != 0) continue; } - assert((DefMI || llvm::IsSub0Sub1SingleDef(Reg, MRI)) && + assert((DefMI || llvm::isSub0Sub1SingleDef(Reg, MRI)) && "UseMI should be safe to move"); if (DefMI && CandidateDefs.count(DefMI) > 0) continue; @@ -2361,7 +2361,7 @@ void applySubExpCloneNearUser(SubExp &Exp, std::vector &HotBlocks, DenseMap RegMap; auto InsertPtr = MBB->getFirstNonPHI(); // If Exp has scc read/write, make sure MBB not have scc in liveins. - if (IsModifiesScc && llvm::IsSccLiveAt(MBB, InsertPtr)) + if (IsModifiesScc && llvm::isSccLiveAt(MBB, InsertPtr)) continue; MachineFunction *MF = MBB->getParent(); for (auto It = Exp.SUnits.begin(); It != Exp.SUnits.end(); It++) { @@ -2470,7 +2470,7 @@ void applySubExpCloneNearUserInBlock( continue; // Do not overwrite a live scc. - if (IsModifiesScc && llvm::IsSccLiveAt(UserBB, &UseMI)) + if (IsModifiesScc && llvm::isSccLiveAt(UserBB, &UseMI)) continue; UseMIs.emplace_back(&UseMI); @@ -3147,7 +3147,7 @@ bool tryRemat(MachineBasicBlock &MBB, MachineInstr *HotMi, if (SubExp.SOutputSize == 0) continue; } - if (!SubExp.isSafeToMove(MRI, /*IsMoveUp*/ false)) + if (!SubExp.isSafeToMove(MRI)) continue; // Not clone . 
if (SubExp.SUnits.size() > 10) @@ -3410,7 +3410,7 @@ bool canHelpPressureWhenSink(SubExp &SubExp, MachineDominatorTree *DT, bool IsCanClone, bool IsSgprBound) { LLVM_DEBUG(SubExp.dump(MRI, SIRI)); - if (!SubExp.isSafeToMove(MRI, /*IsMoveUp*/ false)) + if (!SubExp.isSafeToMove(MRI)) return false; // Update input size to ignore lives in which already in @@ -3477,7 +3477,7 @@ bool canHelpPressureWhenSink(SubExp &SubExp, bool canHelpPressureWhenHoist(SubExp &SubExp, const MachineRegisterInfo &MRI, const MachineLoopInfo *MLI, bool IsSgprBound) { - if (!SubExp.isSafeToMove(MRI, /*IsMoveUp*/ true)) + if (!SubExp.isSafeToMove(MRI)) return false; if (SubExp.VInputSize < SubExp.VOutputSize) return false; @@ -3722,9 +3722,9 @@ SubExp buildFreeSubExp(SubExp &Exp, FreeExp.OutputLive[Reg]; } - CollectLiveSetPressure(FreeExp.InputLive, MRI, SIRI, FreeExp.VInputSize, + collectLiveSetPressure(FreeExp.InputLive, MRI, SIRI, FreeExp.VInputSize, FreeExp.SInputSize); - CollectLiveSetPressure(FreeExp.OutputLive, MRI, SIRI, FreeExp.VOutputSize, + collectLiveSetPressure(FreeExp.OutputLive, MRI, SIRI, FreeExp.VOutputSize, FreeExp.SOutputSize); return FreeExp; } @@ -3779,7 +3779,7 @@ std::vector buildSubExpCandidates( if (!canHelpPressureWhenSink(Exp, PassThrus, MRI, SIRI, MLI, DT, IsCanClone, IsSgprBound)) { if (AllowPartialUseInSubExp && - Exp.isSafeToMove(MRI, /*IsMoveUp*/ false)) { + Exp.isSafeToMove(MRI)) { SubExp FreeSubExp = buildFreeSubExp(Exp, PassThrus, MRI, SIRI); if (canHelpPressureWhenSink(FreeSubExp, PassThrus, MRI, SIRI, MLI, DT, IsCanClone, IsSgprBound)) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp index 63651ab82fcdb..d207b3aa3d4f3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp @@ -5,7 +5,6 @@ #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/SlotIndexes.h" -// #include "dxc/DXIL/DxilMetadataHelper.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/raw_ostream.h" @@ -30,12 +29,12 @@ class CFGWithPhi { MachineRegisterInfo &MRI = F.getRegInfo(); for (MachineBasicBlock &BB : F) { - auto &phiInsts = blockToPhiInstsMap[&BB]; + auto &PhiInsts = BlockToPhiInstsMap[&BB]; for (MachineInstr &I : BB) { if (!I.isPHI()) break; - phiInsts.insert(&I); - unsigned Reg = I.getOperand(0).getReg(); + PhiInsts.insert(&I); + Register Reg = I.getOperand(0).getReg(); // Add incoming values. for (unsigned i = 1; i < I.getNumOperands(); i += 2) { MachineOperand &MO = I.getOperand(i); @@ -44,11 +43,11 @@ class CFGWithPhi { MachineInstr *DefMI = MRI.getUniqueVRegDef(MO.getReg()); if (!DefMI) continue; - blockToPhiInstsMap[DefMI->getParent()].insert(DefMI); + BlockToPhiInstsMap[DefMI->getParent()].insert(DefMI); } // Add users. 
for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { - blockToPhiInstsMap[UseMI.getParent()].insert(&UseMI); + BlockToPhiInstsMap[UseMI.getParent()].insert(&UseMI); } } } @@ -56,7 +55,7 @@ class CFGWithPhi { void addCustomGraphFeatures(llvm::GraphWriter &) const {} MachineFunction &F; DenseMap> - blockToPhiInstsMap; + BlockToPhiInstsMap; void dump(); }; @@ -64,13 +63,13 @@ void CFGWithPhi::dump() { #ifdef DBG for (MachineBasicBlock &BB : F) { dbgs() << BB.getName() << "\n"; - auto &phiInsts = blockToPhiInstsMap[&BB]; - for (MachineInstr *I : phiInsts) { + auto &PhiInsts = blockToPhiInstsMap[&BB]; + for (MachineInstr *I : PhiInsts) { if (!I->isPHI()) continue; I->dump(); } - for (MachineInstr *I : phiInsts) { + for (MachineInstr *I : PhiInsts) { if (I->isPHI()) continue; I->dump(); @@ -86,14 +85,14 @@ namespace llvm { template <> struct DOTGraphTraits : public DefaultDOTGraphTraits { - DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} + DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} - static std::string getGraphName(const CFGWithPhi *G) { + static std::string getGraphName(const CFGWithPhi *) { return "CFG with Phi graph"; } static std::string getNodeIdentifierLabel(const MachineBasicBlock *Node, - const CFGWithPhi *Graph) { + const CFGWithPhi *) { std::string R; raw_string_ostream OS(R); OS << static_cast(Node); @@ -107,17 +106,17 @@ template <> struct DOTGraphTraits : public DefaultDOTGraphTraits { raw_string_ostream OS(Str); OS << "BB:" << BB->getName(); - auto it = G->blockToPhiInstsMap.find(BB); - if (it != G->blockToPhiInstsMap.end()) { + auto It = G->BlockToPhiInstsMap.find(BB); + if (It != G->BlockToPhiInstsMap.end()) { - auto &phiInsts = it->second; - for (MachineInstr *I : phiInsts) { + auto &PhiInsts = It->second; + for (MachineInstr *I : PhiInsts) { if (!I->isPHI()) continue; I->print(OS); OS << "\n"; } - for (MachineInstr *I : phiInsts) { + for (MachineInstr *I : PhiInsts) { if (I->isPHI()) continue; I->print(OS); @@ -157,7 +156,7 @@ template <> struct DOTGraphTraits : public DefaultDOTGraphTraits { return OutStr; } static std::string getNodeDescription(const MachineBasicBlock *SU, - const CFGWithPhi *G) { + const CFGWithPhi *) { return SU->getName().str(); } @@ -200,25 +199,24 @@ unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask, const llvm::SIRegisterInfo *SIRI) { unsigned Size = SIRI->getRegSizeInBits(*MRI.getRegClass(Reg)); Size >>= 5; - LaneBitmask mask = Mask; - if (mask.any()) { - if (unsigned maskSize = mask.getNumLanes()) { - if (maskSize < Size) - Size = maskSize; + if (Mask.any()) { + if (unsigned MaskSize = Mask.getNumLanes()) { + if (MaskSize < Size) + Size = MaskSize; } } return Size; } -void CollectLiveSetPressure(const LiveSet &liveSet, +void collectLiveSetPressure(const LiveSet &LiveSet, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, unsigned &VPressure, unsigned &SPressure) { VPressure = 0; SPressure = 0; - for (auto liveIt : liveSet) { - unsigned Reg = liveIt.first; - unsigned Size = getRegSize(Reg, liveIt.second, MRI, SIRI); + for (auto LiveIt : LiveSet) { + unsigned Reg = LiveIt.first; + unsigned Size = getRegSize(Reg, LiveIt.second, MRI, SIRI); if (SIRI->isVGPR(MRI, Reg)) { VPressure += Size; } else { @@ -228,58 +226,58 @@ void CollectLiveSetPressure(const LiveSet &liveSet, } bool isExecUpdateForControlFlow(llvm::MachineInstr &MI) { - bool isExecUpdate = false; - unsigned opcode = MI.getOpcode(); - if (opcode == AMDGPU::S_MOV_B64 || opcode == AMDGPU::S_MOV_B32 || - opcode == 
AMDGPU::S_OR_B64_term || opcode == AMDGPU::S_OR_B32_term || - opcode == AMDGPU::S_OR_SAVEEXEC_B64 || - opcode == AMDGPU::S_OR_SAVEEXEC_B32 || opcode == AMDGPU::S_AND_B64 || - opcode == AMDGPU::S_AND_B32 || opcode == AMDGPU::S_ANDN2_B64 || - opcode == AMDGPU::S_ANDN2_B32) { + bool IsExecUpdate = false; + unsigned Opcode = MI.getOpcode(); + if (Opcode == AMDGPU::S_MOV_B64 || Opcode == AMDGPU::S_MOV_B32 || + Opcode == AMDGPU::S_OR_B64_term || Opcode == AMDGPU::S_OR_B32_term || + Opcode == AMDGPU::S_OR_SAVEEXEC_B64 || + Opcode == AMDGPU::S_OR_SAVEEXEC_B32 || Opcode == AMDGPU::S_AND_B64 || + Opcode == AMDGPU::S_AND_B32 || Opcode == AMDGPU::S_ANDN2_B64 || + Opcode == AMDGPU::S_ANDN2_B32) { MachineOperand &Dst = MI.getOperand(0); if (Dst.getReg() == AMDGPU::EXEC || Dst.getReg() == AMDGPU::EXEC_LO) { - isExecUpdate = true; + IsExecUpdate = true; } } - return isExecUpdate; + return IsExecUpdate; } -bool IsSub0Sub1SingleDef(unsigned Reg, const MachineRegisterInfo &MRI) { +bool isSub0Sub1SingleDef(unsigned Reg, const MachineRegisterInfo &MRI) { // Support multi def for pattern of pointer: // undef %808.sub0:sgpr_64 = COPY killed %795:sgpr_32 // %808.sub1:sgpr_64 = S_MOV_B32 0 - bool bHasSub0 = false; - bool bHasSub1 = false; + bool HasSub0 = false; + bool HasSub1 = false; for (MachineOperand &UserDefMO : MRI.def_operands(Reg)) { if (unsigned SubReg = UserDefMO.getSubReg()) { - bool bSingleSubReg = false; + bool IsSingleSubReg = false; switch (SubReg) { default: break; case AMDGPU::sub0: - if (!bHasSub0) { - bHasSub0 = true; - bSingleSubReg = true; + if (!HasSub0) { + HasSub0 = true; + IsSingleSubReg = true; } break; case AMDGPU::sub1: - if (!bHasSub1) { - bHasSub1 = true; - bSingleSubReg = true; + if (!HasSub1) { + HasSub1 = true; + IsSingleSubReg = true; } break; } - if (!bSingleSubReg) { - bHasSub0 = false; + if (!IsSingleSubReg) { + HasSub0 = false; break; } } else { - bHasSub0 = false; + HasSub0 = false; break; } } - return (bHasSub0 && bHasSub1); + return (HasSub0 && HasSub1); } LaneBitmask getRegMask(const MachineOperand &MO, @@ -293,46 +291,46 @@ LaneBitmask getRegMask(const MachineOperand &MO, MO.getSubReg()); } -void mergeLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet) { - for (auto Reg : inputSet) { - unsigned reg = Reg.first; - LaneBitmask mask = Reg.second; - auto targetReg = targetSet.find(reg); - if (targetReg != targetSet.end()) { - LaneBitmask targetMask = targetReg->second; - mask |= targetMask; +void mergeLiveRegSet(LiveSet &TargetSet, const LiveSet &InputSet) { + for (auto It : InputSet) { + Register Reg = It.first; + LaneBitmask Mask = It.second; + auto TargetReg = TargetSet.find(Reg); + if (TargetReg != TargetSet.end()) { + LaneBitmask TargetMask = TargetReg->second; + Mask |= TargetMask; } - targetSet[reg] = mask; + TargetSet[Reg] = Mask; } } -void andLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet) { +void andLiveRegSet(LiveSet &TargetSet, const LiveSet &InputSet) { GCNRPTracker::LiveRegSet AndSet; - for (auto Reg : inputSet) { - unsigned reg = Reg.first; - LaneBitmask mask = Reg.second; - auto targetReg = targetSet.find(reg); - if (targetReg != targetSet.end()) { - LaneBitmask targetMask = targetReg->second; - mask &= targetMask; - AndSet[reg] = mask; + for (auto It : InputSet) { + Register Reg = It.first; + LaneBitmask Mask = It.second; + auto TargetReg = TargetSet.find(Reg); + if (TargetReg != TargetSet.end()) { + LaneBitmask TargetMask = TargetReg->second; + Mask &= TargetMask; + AndSet[Reg] = Mask; } } - targetSet = AndSet; + TargetSet = AndSet; } -void 
andNotLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet) { - for (auto Reg : inputSet) { - unsigned reg = Reg.first; - LaneBitmask mask = Reg.second; - auto targetReg = targetSet.find(reg); - if (targetReg != targetSet.end()) { - LaneBitmask targetMask = targetReg->second; - if ((targetMask | mask) == mask) - targetSet.erase(reg); +void andNotLiveRegSet(LiveSet &TargetSet, const LiveSet &InputSet) { + for (auto It : InputSet) { + unsigned Reg = It.first; + LaneBitmask Mask = It.second; + auto TargetReg = TargetSet.find(Reg); + if (TargetReg != TargetSet.end()) { + LaneBitmask TargetMask = TargetReg->second; + if ((TargetMask | Mask) == Mask) + TargetSet.erase(Reg); else - targetSet[reg] = targetMask & (~mask); + TargetSet[Reg] = TargetMask & (~Mask); } } } @@ -356,56 +354,55 @@ MachineBasicBlock *split(MachineInstr *Inst) { struct Piece { unsigned Reg; - unsigned offset; - unsigned size; - static SmallVector split(std::bitset<32> mask) { + unsigned Offset; + unsigned Size; + static SmallVector split(std::bitset<32> Mask) { - SmallVector pieces; - Piece piece = {0, 0, 0}; + SmallVector Pieces; + Piece Piece = {0, 0, 0}; for (unsigned i = 0; i < 32; i++) { - if (mask.test(i)) { - if (piece.size == 0) - piece.offset = i; + if (Mask.test(i)) { + if (Piece.Size == 0) + Piece.Offset = i; - piece.size++; + Piece.Size++; // Make sure no piece bigger than 8. - if (piece.size == 8) { - pieces.emplace_back(piece); - piece.size = 0; + if (Piece.Size == 8) { + Pieces.emplace_back(Piece); + Piece.Size = 0; } } else { - if (piece.size == 0) { + if (Piece.Size == 0) { continue; } - pieces.emplace_back(piece); - piece.size = 0; + Pieces.emplace_back(Piece); + Piece.Size = 0; } } - return pieces; + return Pieces; } }; -void updateSubReg(MachineOperand &UseMO, const llvm::TargetRegisterClass *NewRC, - unsigned offset, const SIRegisterInfo *SIRI, - const SIInstrInfo *SIII) { - unsigned size = NewRC->getLaneMask().getNumLanes(); - if (size == 1) { +static void updateSubReg(MachineOperand &UseMO, const llvm::TargetRegisterClass *NewRC, + unsigned Offset, const SIRegisterInfo *SIRI) { + unsigned Size = NewRC->getLaneMask().getNumLanes(); + if (Size == 1) { UseMO.setSubReg(0); } else { const uint32_t SubReg = UseMO.getSubReg(); - LaneBitmask Mask = SIRI->getSubRegIndexLaneMask(SubReg); + LaneBitmask LaneMask = SIRI->getSubRegIndexLaneMask(SubReg); - unsigned mask = Mask.getAsInteger() >> offset; + unsigned Mask = LaneMask.getAsInteger() >> Offset; unsigned NewSubReg = SIRI->getMinimalSpanningSubRegIdxSetForLaneMask( - NewRC, LaneBitmask(mask)) + NewRC, LaneBitmask(Mask)) .front(); UseMO.setSubReg(NewSubReg); } } -bool reduceChannel(unsigned offset, MachineInstr &MI, const MCInstrDesc &desc, +bool reduceChannel(unsigned Offset, MachineInstr &MI, const MCInstrDesc &Desc, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, SlotIndexes *SlotIndexes) { MachineOperand &DstMO = MI.getOperand(0); @@ -413,7 +410,7 @@ bool reduceChannel(unsigned offset, MachineInstr &MI, const MCInstrDesc &desc, if (DstMO.getSubReg()) { return false; } - unsigned Reg = DstMO.getReg(); + Register Reg = DstMO.getReg(); SmallVector UseMOs; for (MachineOperand &UseMO : MRI.use_nodbg_operands(Reg)) { @@ -421,9 +418,9 @@ bool reduceChannel(unsigned offset, MachineInstr &MI, const MCInstrDesc &desc, } const llvm::TargetRegisterClass *NewRC = - SIRI->getRegClass(desc.operands().front().RegClass); - unsigned size = NewRC->getLaneMask().getNumLanes(); - if (offset > 0) { + 
SIRI->getRegClass(Desc.operands().front().RegClass); + unsigned Size = NewRC->getLaneMask().getNumLanes(); + if (Offset > 0) { // Update offset operand in MI. MachineOperand *OffsetOp = SIII->getNamedOperand(MI, AMDGPU::OpName::offset); @@ -433,7 +430,7 @@ bool reduceChannel(unsigned offset, MachineInstr &MI, const MCInstrDesc &desc, if (OffsetOp->isImm()) { assert(OffsetOp != nullptr); int64_t Offset = OffsetOp->getImm(); - Offset += offset * LaneSize; + Offset += Offset * LaneSize; if (!SIII->isLegalMUBUFImmOffset(Offset)) { return false; } @@ -444,13 +441,13 @@ bool reduceChannel(unsigned offset, MachineInstr &MI, const MCInstrDesc &desc, } else { OffsetOp = SIII->getNamedOperand(MI, AMDGPU::OpName::soffset); if (OffsetOp) { - unsigned NewOffsetReg = + Register NewOffsetReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); auto OffsetAdd = BuildMI(*MI.getParent()->getParent(), MI.getDebugLoc(), SIII->get(AMDGPU::S_ADD_U32)) .addDef(NewOffsetReg) .add(*OffsetOp) - .addImm(offset * LaneSize); + .addImm(Offset * LaneSize); MachineInstr *OffsetAddMI = OffsetAdd.getInstr(); MachineBasicBlock::iterator InsertPoint = llvm::findOrCreateInsertionPointForSccDef(MI.getParent(), MI, SIRI, @@ -467,16 +464,16 @@ bool reduceChannel(unsigned offset, MachineInstr &MI, const MCInstrDesc &desc, } // Update subReg for users. for (MachineOperand *UseMO : UseMOs) { - updateSubReg(*UseMO, NewRC, offset, SIRI, SIII); + updateSubReg(*UseMO, NewRC, Offset, SIRI); } - } else if (size == 1) { + } else if (Size == 1) { // Clear subReg when size is 1. for (MachineOperand *UseMO : UseMOs) { UseMO->setSubReg(0); } } - MI.setDesc(desc); + MI.setDesc(Desc); // Mutate reg class of Reg. MRI.setRegClass(Reg, NewRC); return true; @@ -485,7 +482,7 @@ bool reduceChannel(unsigned offset, MachineInstr &MI, const MCInstrDesc &desc, bool removeUnusedLanes(llvm::MachineInstr &MI, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, SlotIndexes *SlotIndexes) { - bool bImm = false; + bool IsImm = false; switch (MI.getOpcode()) { default: break; @@ -493,67 +490,70 @@ bool removeUnusedLanes(llvm::MachineInstr &MI, MachineRegisterInfo &MRI, case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX16_IMM: - bImm = true; + IsImm = true; + LLVM_FALLTHROUGH; case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR: case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR: case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR: case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: { - unsigned Reg = MI.getOperand(0).getReg(); + Register Reg = MI.getOperand(0).getReg(); if (!MRI.getUniqueVRegDef(Reg)) return false; - LaneBitmask dstMask = getRegMask(MI.getOperand(0), MRI); + LaneBitmask DstMask = getRegMask(MI.getOperand(0), MRI); LaneBitmask UseMask; for (MachineOperand &MO : MRI.use_operands(Reg)) { UseMask |= llvm::getRegMask(MO, MRI); } - const unsigned fullMask = dstMask.getAsInteger(); - unsigned mask = UseMask.getAsInteger(); - if (mask == fullMask) + const unsigned FullMask = DstMask.getAsInteger(); + unsigned Mask = UseMask.getAsInteger(); + if (Mask == FullMask) return false; // Split mask when there's gap. Then group mask to 2/4/8. - auto pieces = Piece::split(std::bitset<32>(mask)); + auto Pieces = Piece::split(std::bitset<32>(Mask)); // Now only support 1 piece. - if (pieces.size() != 1) + if (Pieces.size() != 1) return false; - auto piece = pieces[0]; - if (piece.size > 8) + auto Piece = Pieces[0]; + if (Piece.Size > 8) return false; - // TODO: enable offset support when bImm is true. 
+ // TODO: enable offset support when IsImm is true. // Now if break different test when mul LaneSize or not mul for the offset. - if (bImm && piece.offset != 0) + if (IsImm && Piece.Offset != 0) return false; - switch (piece.size) { + switch (Piece.Size) { default: return false; case 1: - return reduceChannel(piece.offset, MI, - SIII->get(bImm ? AMDGPU::S_BUFFER_LOAD_DWORD_IMM + return reduceChannel(Piece.Offset, MI, + SIII->get(IsImm ? AMDGPU::S_BUFFER_LOAD_DWORD_IMM : AMDGPU::S_BUFFER_LOAD_DWORD_SGPR), MRI, SIRI, SIII, SlotIndexes); case 2: - return reduceChannel(piece.offset, MI, - SIII->get(bImm ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM + return reduceChannel(Piece.Offset, MI, + SIII->get(IsImm ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR), MRI, SIRI, SIII, SlotIndexes); case 3: - if (fullMask == 0xf) + if (FullMask == 0xf) return false; + LLVM_FALLTHROUGH; case 4: - return reduceChannel(piece.offset, MI, - SIII->get(bImm ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM + return reduceChannel(Piece.Offset, MI, + SIII->get(IsImm ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR), MRI, SIRI, SIII, SlotIndexes); case 5: case 6: case 7: - if (fullMask == 0xff) + if (FullMask == 0xff) return false; + LLVM_FALLTHROUGH; case 8: - return reduceChannel(piece.offset, MI, - SIII->get(bImm ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM + return reduceChannel(Piece.Offset, MI, + SIII->get(IsImm ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR), MRI, SIRI, SIII, SlotIndexes); } @@ -610,15 +610,15 @@ bool reach_block(MachineBasicBlock *FromBB, MachineDominatorTree *DT, // If BB can reach hotMBBs. bool reach_blocks(MachineBasicBlock *BB, MachineDominatorTree *DT, MachinePostDominatorTree *PDT, MachineLoopInfo *LI, - DenseSet &hotMBBs) { - bool bCross = false; - for (MachineBasicBlock *hotBB : hotMBBs) { - if (reach_block(BB, DT, PDT, LI, hotBB)) { - bCross = true; + DenseSet &HotMBBs) { + bool Cross = false; + for (MachineBasicBlock *HotBB : HotMBBs) { + if (reach_block(BB, DT, PDT, LI, HotBB)) { + Cross = true; break; } } - return bCross; + return Cross; } } // namespace llvm @@ -634,7 +634,7 @@ void viewCFGWithPhi(llvm::MachineFunction &F) { } // namespace llvm namespace llvm { -bool GetNonDebugMBBEnd(MachineBasicBlock::reverse_iterator &BBEnd, +bool getNonDebugMBBEnd(MachineBasicBlock::reverse_iterator &BBEnd, MachineBasicBlock &MBB) { // R.End doesn't point to the boundary instruction. // Skip Debug instr. @@ -951,13 +951,13 @@ void write_define(MachineOperand &MO, const SlotIndexes *SlotIndexes, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, raw_ostream &os) { // Split subReg? MO.getSubReg(); - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); unsigned SubReg = MO.getSubReg(); MachineInstr *MI = MO.getParent(); SlotIndex Slot = SlotIndexes->getInstructionIndex(*MI); if (SubReg == 0) { - unsigned size = get_reg_size(Reg, MRI, SIRI); - for (unsigned i = 0; i < size; i++) { + unsigned Size = get_reg_size(Reg, MRI, SIRI); + for (unsigned i = 0; i < Size; i++) { write_define(Slot, Reg, i, MRI, SIRI, os); } } else { @@ -1744,13 +1744,13 @@ void write_contribution_list(llvm::MachineFunction &MF, const char *Filename) { } } // namespace llvm -static bool IsPhysReg(const MachineOperand &Op) { +static bool isPhysReg(const MachineOperand &Op) { return Op.isReg() && Op.getReg().isPhysical(); } // Sometimes split bb uses physical registers defined in BB, have to add them to // live-in or the ir is malformed. 
-void llvm::UpdatePhysRegLiveInForBlock(MachineBasicBlock *NewBB, +void llvm::updatePhysRegLiveInForBlock(MachineBasicBlock *NewBB, const MachineRegisterInfo *MRI) { // Initialize with current set of liveins. For new blocks this will be empty. SmallDenseSet DefSet; @@ -1762,11 +1762,11 @@ void llvm::UpdatePhysRegLiveInForBlock(MachineBasicBlock *NewBB, // Add all undefined physical registers to the live in set. for (MachineOperand &Use : MI.operands()) { // Only process physreg uses. - if (!IsPhysReg(Use) || !Use.isUse()) + if (!isPhysReg(Use) || !Use.isUse()) continue; // Reserved regs do not need to be tracked through live-in sets. - unsigned Reg = Use.getReg(); + Register Reg = Use.getReg(); if (Use.isImplicit() && MRI && MRI->isReserved(Reg)) continue; @@ -1778,14 +1778,14 @@ void llvm::UpdatePhysRegLiveInForBlock(MachineBasicBlock *NewBB, // set. for (MachineOperand &Def : MI.operands()) { // Only process physreg defs. - if (!IsPhysReg(Def) || !Def.isDef()) + if (!isPhysReg(Def) || !Def.isDef()) continue; DefSet.insert(Def.getReg()); } } } -void llvm::BuildPhysRegLiveInForBlock(MachineBasicBlock *NewBB, +void llvm::buildPhysRegLiveInForBlock(MachineBasicBlock *NewBB, SmallDenseSet &LiveOutSet, const MachineRegisterInfo *MRI) { for (auto rit = NewBB->rbegin(); rit != NewBB->rend(); rit++) { @@ -1794,14 +1794,14 @@ void llvm::BuildPhysRegLiveInForBlock(MachineBasicBlock *NewBB, // set. for (MachineOperand &Def : MI.operands()) { // Only process physreg defs. - if (!IsPhysReg(Def) || !Def.isDef()) + if (!isPhysReg(Def) || !Def.isDef()) continue; LiveOutSet.erase(Def.getReg()); } // Add all undefined physical registers to the live in set. for (MachineOperand &Use : MI.operands()) { // Only process physreg uses. - if (!IsPhysReg(Use) || !Use.isUse()) + if (!isPhysReg(Use) || !Use.isUse()) continue; // Reserved regs do not need to be tracked through live-in sets. @@ -1818,7 +1818,7 @@ void llvm::BuildPhysRegLiveInForBlock(MachineBasicBlock *NewBB, } } -MachineReg llvm::CreateVirtualRegForOperand(MachineOpcode Opcode, +MachineReg llvm::createVirtualRegForOperand(MachineOpcode Opcode, unsigned OpNum, MachineFunction &MF) { const TargetSubtargetInfo &ST = MF.getSubtarget(); @@ -1835,14 +1835,14 @@ MachineReg llvm::CreateVirtualRegForOperand(MachineOpcode Opcode, return MRI.createVirtualRegister(RC); } -MachineReg llvm::CreateVirtualDstReg(MachineOpcode Opcode, +MachineReg llvm::createVirtualDstReg(MachineOpcode Opcode, MachineFunction &MF) { - return llvm::CreateVirtualRegForOperand(Opcode, 0, MF); + return llvm::createVirtualRegForOperand(Opcode, 0, MF); } // Return true if the MI is a copy of exec. // If true then sets pDst to the destination register. 
-bool llvm::IsExecCopy(const MachineInstr &MI, MachineReg Exec, +bool llvm::isExecCopy(const MachineInstr &MI, MachineReg Exec, MachineReg *pDst) { enum { DST = 0, SRC = 1 }; bool FoundCopy = false; @@ -1868,10 +1868,10 @@ bool llvm::IsExecCopy(const MachineInstr &MI, MachineReg Exec, return FoundCopy; } -llvm::MachineRegWithSubReg llvm::GetWqmEntryActiveMask(MachineFunction &MF) { +llvm::MachineRegWithSubReg llvm::getWqmEntryActiveMask(MachineFunction &MF) { llvm::MachineRegWithSubReg LiveLaneMask = {AMDGPU::NoRegister, AMDGPU::NoSubRegister}; - if (MachineInstr *MI = GetWqmEntryActiveMaskInst(MF)) { + if (MachineInstr *MI = getWqmEntryActiveMaskInst(MF)) { LiveLaneMask.Reg = MI->getOperand(0).getReg(); LiveLaneMask.SubReg = MI->getOperand(0).getSubReg(); } @@ -1879,7 +1879,7 @@ llvm::MachineRegWithSubReg llvm::GetWqmEntryActiveMask(MachineFunction &MF) { return LiveLaneMask; } -MachineInstr *llvm::GetWqmEntryActiveMaskInst(MachineFunction &MF) { +MachineInstr *llvm::getWqmEntryActiveMaskInst(MachineFunction &MF) { #if 0 // TODO: Get rid of this // Look forward in the entry block for the SET_LIVE_LANE_MASK instruction. // This instruction is added by the SIWholeQuadMode pass. @@ -1897,7 +1897,7 @@ MachineInstr *llvm::GetWqmEntryActiveMaskInst(MachineFunction &MF) { return nullptr; } -bool llvm::IsFetchShaderCall(const MachineInstr *MI) { +bool llvm::isFetchShaderCall(const MachineInstr *MI) { #if 0 // TODO: Get rid of this. return MI->getOpcode() == AMDGPU::AMDGPU_CALL_FETCH_SHADER || @@ -1907,12 +1907,12 @@ bool llvm::IsFetchShaderCall(const MachineInstr *MI) { #endif } -bool llvm::IsSccLiveAt(llvm::MachineBasicBlock *MBB, +bool llvm::isSccLiveAt(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator MI) { const TargetRegisterInfo *TRI = MBB->getParent()->getRegInfo().getTargetRegisterInfo(); - for (auto it = MI; it != MBB->end(); ++it) { - const MachineInstr &CurMI = *it; + for (auto It = MI; It != MBB->end(); ++It) { + const MachineInstr &CurMI = *It; // Hit use of scc, it is live. if (CurMI.readsRegister(AMDGPU::SCC, TRI)) return true; @@ -1939,12 +1939,12 @@ bool llvm::IsSccLiveAt(llvm::MachineBasicBlock *MBB, // scc around BeforeInst. This way BeforeInst can safely be used // as the new insert location. // -MachineBasicBlock::iterator llvm::FindOrCreateInsertionPointForSccDef( +MachineBasicBlock::iterator llvm::findOrCreateInsertionPointForSccDef( MachineBasicBlock *MBB, MachineBasicBlock::iterator MI, const TargetRegisterInfo *TRI, const SIInstrInfo *TII, MachineRegisterInfo *MRI, SccDefInsertPointConstraintFlags Constraints) { // If SCC is dead at MI when we can use MI as the insert point. 
- if (!llvm::IsSccLiveAt(MBB, MI)) { + if (!llvm::isSccLiveAt(MBB, MI)) { return MI; } @@ -1990,7 +1990,7 @@ MachineBasicBlock::iterator llvm::FindOrCreateInsertionPointForSccDef( // MI // S_CMP_LG_U32 %SavedSCC, 0 # Restore SCC // - unsigned int TmpScc = + Register TmpScc = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); DebugLoc DL = MI->getDebugLoc(); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), TmpScc) @@ -2006,39 +2006,39 @@ MachineBasicBlock::iterator llvm::FindOrCreateInsertionPointForSccDef( namespace { bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes, - SmallDenseSet &touchedMBBSet) { - MachineInstr *startMI = Indexes->getInstructionFromIndex(Seg->start); - MachineInstr *endMI = Indexes->getInstructionFromIndex(Seg->end); + SmallDenseSet &TouchedMBBSet) { + MachineInstr *StartMI = Indexes->getInstructionFromIndex(Seg->start); + MachineInstr *EndMI = Indexes->getInstructionFromIndex(Seg->end); // Treat non inst as not local. - if (!startMI || !endMI) + if (!StartMI || !EndMI) return false; // is local when parent MBB the same. - bool bSameMBB = startMI->getParent() == endMI->getParent(); - if (!bSameMBB) + bool IsSameMBB = StartMI->getParent() == EndMI->getParent(); + if (!IsSameMBB) return false; // Collect touched MBB. - MachineBasicBlock *MBB = startMI->getParent(); - touchedMBBSet.insert(MBB); + MachineBasicBlock *MBB = StartMI->getParent(); + TouchedMBBSet.insert(MBB); return true; } bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes, - SmallDenseSet &touchedMBBSet) { + SmallDenseSet &TouchedMBBSet) { for (const LiveRange::Segment &Seg : Range->segments) { - if (!isLocalSegment(&Seg, Indexes, touchedMBBSet)) + if (!isLocalSegment(&Seg, Indexes, TouchedMBBSet)) return false; } return true; } bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes) { - MachineInstr *startMI = Indexes->getInstructionFromIndex(Seg->start); - MachineInstr *endMI = Indexes->getInstructionFromIndex(Seg->end); + MachineInstr *StartMI = Indexes->getInstructionFromIndex(Seg->start); + MachineInstr *EndMI = Indexes->getInstructionFromIndex(Seg->end); // Treat non inst as not local. - if (!startMI || !endMI) + if (!StartMI || !EndMI) return false; // is local when parent MBB the same. - return startMI->getParent() == endMI->getParent(); + return StartMI->getParent() == EndMI->getParent(); } bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes) { @@ -2053,19 +2053,19 @@ bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes) { // In case like float4 v, v.x used and defined in one block, v.y used and define // in another block, one live interval could touch more than one MBB. -// touchedMBBSet is used for scheduling where local live interval could cross +// TouchedMBBSet is used for scheduling where local live interval could cross // multiple regions, need to calculate livereg for each region inside touched // MBB. 
bool llvm::isLocalLiveInterval( const LiveInterval &LI, SlotIndexes *Indexes, - SmallDenseSet &touchedMBBSet) { + SmallDenseSet &TouchedMBBSet) { if (LI.hasSubRanges()) { for (const auto &S : LI.subranges()) { - if (!isLocalLiveRange(&S, Indexes, touchedMBBSet)) + if (!isLocalLiveRange(&S, Indexes, TouchedMBBSet)) return false; } } - return isLocalLiveRange(&LI, Indexes, touchedMBBSet); + return isLocalLiveRange(&LI, Indexes, TouchedMBBSet); } bool llvm::isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes) { @@ -2096,7 +2096,7 @@ void llvm::buildEndLiveMap( // R.End doesn't point to the boundary instruction. // Skip Debug instr. - if (llvm::GetNonDebugMBBEnd(BBEnd, MBB)) { + if (llvm::getNonDebugMBBEnd(BBEnd, MBB)) { auto SI = SlotIndexes->getInstructionIndex(*BBEnd); MBBOutputSlotMap[&MBB] = After ? SI.getDeadSlot() : SI.getBaseIndex(); } @@ -2107,16 +2107,15 @@ void llvm::buildEndLiveMap( if (!LIS->hasInterval(Reg)) continue; - LaneBitmask LiveMask; const auto &LI = LIS->getInterval(Reg); // Skip local live interval to make live input/ouput faster. if (llvm::isLocalLiveInterval(LI, SlotIndexes)) continue; - for (auto outputIt : MBBOutputSlotMap) { - MachineBasicBlock *MBB = outputIt.first; - auto SI = outputIt.second; + for (auto OutputIt : MBBOutputSlotMap) { + MachineBasicBlock *MBB = OutputIt.first; + auto SI = OutputIt.second; auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI); if (LiveMask.any()) @@ -2125,7 +2124,7 @@ void llvm::buildEndLiveMap( } } -unsigned llvm::GetCurrentVGPRCount(llvm::MachineFunction &MF, +unsigned llvm::getCurrentVGPRCount(llvm::MachineFunction &MF, const SIRegisterInfo *SIRI) { auto &MRI = MF.getRegInfo(); for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) { @@ -2136,10 +2135,10 @@ unsigned llvm::GetCurrentVGPRCount(llvm::MachineFunction &MF, return 0; } -unsigned llvm::GetCurrentSGPRCount(llvm::MachineFunction &MF, +unsigned llvm::getCurrentSGPRCount(llvm::MachineFunction &MF, const SIRegisterInfo *SIRI) { const SIMachineFunctionInfo *MFI = MF.getInfo(); - unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); + Register ScratchRSrcReg = MFI->getScratchRSrcReg(); MachineRegisterInfo &MRI = MF.getRegInfo(); unsigned MaxSGPR = 0; for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { @@ -2160,11 +2159,11 @@ unsigned llvm::GetCurrentSGPRCount(llvm::MachineFunction &MF, void llvm::dumpLiveSet(const LiveSet &LiveSet, const SIRegisterInfo *SIRI) { dbgs() << "\n live set: \n"; - for (auto it : LiveSet) { - int Reg = it.first; + for (auto It : LiveSet) { + int Reg = It.first; dbgs() << printReg(Reg, SIRI); - if (it.second.any()) { - dbgs() << " mask:" << it.second.getAsInteger(); + if (It.second.any()) { + dbgs() << " mask:" << It.second.getAsInteger(); } dbgs() << "\n"; } @@ -2197,7 +2196,7 @@ bool llvm::IsLdsSpillSupportedForHwStage(xmd::HwStage Stage) #endif MachineBasicBlock::succ_iterator -llvm::FindSuccessor(llvm::MachineBasicBlock *MBB, +llvm::findSuccessor(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock *Succ) { for (MachineBasicBlock::succ_iterator It = MBB->succ_begin(), End = MBB->succ_end(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp index fec8ac9546a4a..94d78fb676f9a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp @@ -31,13 +31,13 @@ void SubExp::dump(const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) const { dbgs() << "\nSubExp:\n"; dbgs() << "input regs:\n"; - for (auto &input 
: InputLive) { - pressure::print_reg(input.first, MRI, SIRI, llvm::dbgs()); + for (auto &Input : InputLive) { + pressure::print_reg(Input.first, MRI, SIRI, llvm::dbgs()); dbgs() << "\n"; } dbgs() << "output regs:\n"; - for (auto &output : OutputLive) { - pressure::print_reg(output.first, MRI, SIRI, llvm::dbgs()); + for (auto &Output : OutputLive) { + pressure::print_reg(Output.first, MRI, SIRI, llvm::dbgs()); dbgs() << "\n"; } @@ -58,8 +58,7 @@ bool SubExp::modifiesRegister(unsigned Reg, const SIRegisterInfo *SIRI) const { return false; } -void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI) { +void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI) { SMaxSize = std::max(SInputSize, SOutputSize); VMaxSize = std::max(VInputSize, VOutputSize); @@ -76,23 +75,23 @@ void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI, Register Reg = MO.getReg(); if (!Reg.isVirtual()) continue; - LaneBitmask mask = getRegMask(MO, MRI); - auto it = LiveRegs.find(Reg); - if (it != LiveRegs.end()) { - LiveRegs[Reg] = mask | it->second; + LaneBitmask Mask = getRegMask(MO, MRI); + auto It = LiveRegs.find(Reg); + if (It != LiveRegs.end()) { + LiveRegs[Reg] = Mask | It->second; } else { - LiveRegs[Reg] = mask; + LiveRegs[Reg] = Mask; } } } - for (auto it : LiveRegs) { - LaneBitmask emptyMask; - CurPressure.inc(it.first, emptyMask, it.second, MRI); + for (auto It : LiveRegs) { + LaneBitmask EmptyMask; + CurPressure.inc(It.first, EmptyMask, It.second, MRI); } - for (auto it = SUnits.rbegin(); it != SUnits.rend(); it++) { - MachineInstr *MI = *it; + for (auto It = SUnits.rbegin(); It != SUnits.rend(); It++) { + MachineInstr *MI = *It; auto *ST = &MI->getMF() ->getSubtarget(); // TODO: Better way to get this. @@ -108,9 +107,9 @@ void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI, LaneBitmask LiveMask = getRegMask(MO, MRI); LaneBitmask PrevMask; - auto liveIt = LiveRegs.find(Reg); - if (liveIt != LiveRegs.end()) { - PrevMask = liveIt->second; + auto LiveIt = LiveRegs.find(Reg); + if (LiveIt != LiveRegs.end()) { + PrevMask = LiveIt->second; } if (MO.isDef()) { @@ -123,16 +122,16 @@ void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI, LiveRegs[Reg] = LiveMask; } - unsigned sSize = CurPressure.getSGPRNum(); - unsigned vSize = CurPressure.getVGPRNum(ST->hasGFX90AInsts()); - if (sSize > SMaxSize) - SMaxSize = sSize; - if (vSize > VMaxSize) - VMaxSize = vSize; + unsigned SSize = CurPressure.getSGPRNum(); + unsigned VSize = CurPressure.getVGPRNum(ST->hasGFX90AInsts()); + if (SSize > SMaxSize) + SMaxSize = SSize; + if (VSize > VMaxSize) + VMaxSize = VSize; } } -bool SubExp::isSafeToMove(const MachineRegisterInfo &MRI, bool IsMoveUp) const { +bool SubExp::isSafeToMove(const MachineRegisterInfo &MRI) const { if (IsMultiDefOutput) return false; if (IsHasTerminatorInst) @@ -142,7 +141,7 @@ bool SubExp::isSafeToMove(const MachineRegisterInfo &MRI, bool IsMoveUp) const { // Input should be single def. 
for (unsigned Reg : TopRegs) { - if (!MRI.hasOneDef(Reg) && !llvm::IsSub0Sub1SingleDef(Reg, MRI)) + if (!MRI.hasOneDef(Reg) && !llvm::isSub0Sub1SingleDef(Reg, MRI)) return false; } return true; @@ -154,11 +153,11 @@ ExpDag::ExpDag(const llvm::MachineRegisterInfo &MRI, : MRI(MRI), SIRI(SIRI), SIII(SIII), IsJoinInputToSubExp(IsJoinInput) {} template -void ExpDag::initNodes(const LiveSet &InputLiveReg, T &insts) { - unsigned NodeSize = InputLiveReg.size() + insts.size(); +void ExpDag::initNodes(const LiveSet &InputLiveReg, T &Insts) { + unsigned NodeSize = InputLiveReg.size() + Insts.size(); SUnits.reserve(NodeSize); - for (MachineInstr *MI : insts) { + for (MachineInstr *MI : Insts) { if (MI->isDebugInstr()) continue; SUnits.emplace_back(MI, SUnits.size()); @@ -167,8 +166,8 @@ void ExpDag::initNodes(const LiveSet &InputLiveReg, T &insts) { MISUnitMap[MI] = SU; } - for (auto it : InputLiveReg) { - unsigned Reg = it.first; + for (auto It : InputLiveReg) { + unsigned Reg = It.first; SUnits.emplace_back(); SUnit *SU = &SUnits.back(); SU->NodeNum = SUnits.size() - 1; @@ -187,7 +186,7 @@ template void ExpDag::build(const LiveSet &InputLiveReg, const LiveSet &OutputLiveReg, T &Insts) { initNodes(InputLiveReg, Insts); - addDataDep(SIRI); + addDataDep(); addCtrlDep(); buildSubExp(InputLiveReg, OutputLiveReg, SIRI, SIII); } @@ -203,10 +202,10 @@ template void ExpDag::build>( void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) { IntEqClasses SubtreeClasses(SUnits.size()); - std::vector passThruInputs; + std::vector PassThruInputs; for (SUnit &SU : SUnits) { if (SU.NumPredsLeft == 0 && SU.NumSuccsLeft == 0) { - passThruInputs.emplace_back(SU.NodeNum); + PassThruInputs.emplace_back(SU.NodeNum); continue; } if (!IsJoinInputToSubExp && !SU.isInstr()) @@ -227,9 +226,9 @@ void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, SubtreeClasses.compress(); unsigned NumSubExps = SubtreeClasses.getNumClasses(); - // Not count passThruInputs for subExps since they're exp with only 1 SU. + // Not count PassThruInputs for subExps since they're exp with only 1 SU. // SubExpIndexMap is used to pack SubIdx within updated NumSubExps. - NumSubExps -= passThruInputs.size(); + NumSubExps -= PassThruInputs.size(); SubExps.resize(NumSubExps); DenseMap SubExpIndexMap; @@ -242,18 +241,18 @@ void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, unsigned OriginSubIdx = SubIdx; // Pack subidx. if (SubExpIndexMap.count(SubIdx) == 0) { - unsigned count = SubExpIndexMap.size(); - SubExpIndexMap.insert(std::make_pair(SubIdx, count)); + unsigned Count = SubExpIndexMap.size(); + SubExpIndexMap.insert(std::make_pair(SubIdx, Count)); } SubIdx = SubExpIndexMap[SubIdx]; // Use NodeQueueId as SubIdx. We don't do schedule on ExpDag. SU.NodeQueueId = SubIdx; SubExp &Exp = SubExps[SubIdx]; - auto it = SUnitInputMap.find(&SU); - if (it != SUnitInputMap.end()) { + auto It = SUnitInputMap.find(&SU); + if (It != SUnitInputMap.end()) { // Input. 
- unsigned Reg = it->second; + Register Reg = It->second; Exp.TopRegs.insert(Reg); } else { MachineInstr *MI = SU.getInstr(); @@ -264,7 +263,7 @@ void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, continue; if (!MO.isUse()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (MRI.getLiveInPhysReg(Reg) || MRI.getLiveInVirtReg(Reg)) { Exp.IsUseIncomingReg = true; } @@ -301,13 +300,13 @@ void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, IsUsedInOtherBlk = true; break; } - auto suIt = MISUnitMap.find(&UserMI); + auto SuIt = MISUnitMap.find(&UserMI); // When UserMI is not in dag, treat it as other block. - if (suIt == MISUnitMap.end()) { + if (SuIt == MISUnitMap.end()) { IsUsedInOtherBlk = true; break; } - SUnit *UseSU = suIt->second; + SUnit *UseSU = SuIt->second; // UserMI should always be in same subExp. unsigned UseSubIdx = SubtreeClasses[UseSU->NodeNum]; if (UseSubIdx != OriginSubIdx) { @@ -333,34 +332,34 @@ void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, // Only reg will miss live mask. for (SubExp &Exp : SubExps) { for (unsigned Reg : Exp.TopRegs) { - auto it = StartLiveReg.find(Reg); - assert(it != StartLiveReg.end() && + auto It = StartLiveReg.find(Reg); + assert(It != StartLiveReg.end() && "cannot find input reg in block start live"); - Exp.InputLive[Reg] |= it->second; + Exp.InputLive[Reg] |= It->second; } for (unsigned Reg : Exp.BottomRegs) { - auto it = EndLiveReg.find(Reg); - if (it == EndLiveReg.end()) { + auto It = EndLiveReg.find(Reg); + if (It == EndLiveReg.end()) { //"cannot find output reg in block end live"); // Bottom reg is killed inside current block, did not get out of the // block. // Or the bottom reg is not treat as output in this dag, not save to - // outputLive which will affect profit count. + // OutputLive which will affect profit count. continue; } - Exp.OutputLive[Reg] |= it->second; + Exp.OutputLive[Reg] |= It->second; } - CollectLiveSetPressure(Exp.InputLive, MRI, SIRI, Exp.VInputSize, + collectLiveSetPressure(Exp.InputLive, MRI, SIRI, Exp.VInputSize, Exp.SInputSize); - CollectLiveSetPressure(Exp.OutputLive, MRI, SIRI, Exp.VOutputSize, + collectLiveSetPressure(Exp.OutputLive, MRI, SIRI, Exp.VOutputSize, Exp.SOutputSize); } } -void ExpDag::addDataDep(const SIRegisterInfo *SIRI) { - DenseMap curDefMI; +void ExpDag::addDataDep() { + DenseMap CurDefMI; for (SUnit &SU : SUnits) { if (!SU.isInstr()) @@ -377,11 +376,11 @@ void ExpDag::addDataDep(const SIRegisterInfo *SIRI) { Register Reg = MO.getReg(); SUnit *DefSU = nullptr; - auto curDefIt = curDefMI.find(Reg); + auto CurDefIt = CurDefMI.find(Reg); // Check def inst first. - if (curDefIt != curDefMI.end()) { - MachineInstr *curDef = curDefIt->second; - DefSU = MISUnitMap[curDef]; + if (CurDefIt != CurDefMI.end()) { + MachineInstr *CurDef = CurDefIt->second; + DefSU = MISUnitMap[CurDef]; } else { // physical reg is not in live reg. if (!Reg.isVirtual()) @@ -404,7 +403,7 @@ void ExpDag::addDataDep(const SIRegisterInfo *SIRI) { continue; if (!MO.isDef()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); // For case like: // undef %808.sub0:sgpr_64 = COPY killed %795:sgpr_32 @@ -412,17 +411,17 @@ void ExpDag::addDataDep(const SIRegisterInfo *SIRI) { // When partially write, link MI to previous def. if (MO.getSubReg() != 0) { SUnit *DefSU = nullptr; - auto curDefIt = curDefMI.find(Reg); + auto CurDefIt = CurDefMI.find(Reg); // Check def inst first. 
- if (curDefIt != curDefMI.end()) { - MachineInstr *CurDef = curDefIt->second; + if (CurDefIt != CurDefMI.end()) { + MachineInstr *CurDef = CurDefIt->second; DefSU = MISUnitMap[CurDef]; // Add link between different defs. SU.addPred(SDep(DefSU, SDep::Data, Reg)); } } - curDefMI[Reg] = MI; + CurDefMI[Reg] = MI; } } } @@ -521,7 +520,7 @@ void BlockExpDag::buildAvail(const LiveSet &PassThruSet, MachineInstr *MI = SU->getInstr(); // Calc pressure based on pred nodes. - GCNRPTracker::LiveRegSet dagLive; + GCNRPTracker::LiveRegSet DagLive; for (auto &Pred : SU->Preds) { SUnit *PredSU = Pred.getSUnit(); GCNRPTracker::LiveRegSet PredLive = DagAvailRegMap[PredSU]; @@ -533,9 +532,9 @@ void BlockExpDag::buildAvail(const LiveSet &PassThruSet, // Update PredLive based on MI. RP.advance(); } - llvm::mergeLiveRegSet(dagLive, RP.getLiveRegs()); + llvm::mergeLiveRegSet(DagLive, RP.getLiveRegs()); } - DagAvailRegMap[SU] = dagLive; + DagAvailRegMap[SU] = DagLive; // Add succ to work list. for (auto &Succ : SU->Succs) { @@ -561,23 +560,23 @@ void BlockExpDag::buildPressure(const LiveSet &StartLiveReg, if (MBB->empty()) return; DenseMap DagAvailRegMap; - GCNRPTracker::LiveRegSet passThruSet; - for (auto Reg : StartLiveReg) { - unsigned reg = Reg.first; - auto EndReg = EndLiveReg.find(reg); + GCNRPTracker::LiveRegSet PassThruSet; + for (auto It : StartLiveReg) { + Register Reg = It.first; + auto EndReg = EndLiveReg.find(Reg); if (EndReg == EndLiveReg.end()) continue; - LaneBitmask mask = Reg.second; - LaneBitmask endMask = EndReg->second; - mask &= endMask; - if (mask.getAsInteger() == 0) + LaneBitmask Mask = It.second; + LaneBitmask EndMask = EndReg->second; + Mask &= EndMask; + if (Mask.getAsInteger() == 0) continue; - passThruSet[reg] = mask; + PassThruSet[Reg] = Mask; } // Build avial for each nodes. - buildAvail(passThruSet, DagAvailRegMap); + buildAvail(PassThruSet, DagAvailRegMap); // Calc avaialbe for each node, live is avail & sum(input of success). // If a reg is avaiable from the node, then success node can use it from this @@ -594,10 +593,10 @@ void BlockExpDag::buildPressure(const LiveSet &StartLiveReg, // Using pass thru as base because output of current SU should not // affect other output SUs. GCNUpwardRPTracker RP(*LIS); - RP.reset(BeginMI, &passThruSet, /*After*/ true); + RP.reset(BeginMI, &PassThruSet, /*After*/ true); MachineInstr *MI = SU.getInstr(); if (MI) { - RP.reset(*MI, &passThruSet, /*After*/ true); + RP.reset(*MI, &PassThruSet, /*After*/ true); RP.recede(*MI); } DagPressureMap[&SU] = RP.getLiveRegs(); @@ -611,7 +610,6 @@ void BlockExpDag::buildPressure(const LiveSet &StartLiveReg, } while (!WorkList.empty()) { - bool IsUpdated = false; SmallVector ReadyNodes; for (SUnit *SU : WorkList) { if (SU->NumSuccsLeft > 0) @@ -619,7 +617,6 @@ void BlockExpDag::buildPressure(const LiveSet &StartLiveReg, ReadyNodes.emplace_back(SU); // Ready, move it to Processed. Processed.insert(SU); - IsUpdated = true; // Only update 1 node once. // Order of schedle here should not affect pressure. break; @@ -631,7 +628,7 @@ void BlockExpDag::buildPressure(const LiveSet &StartLiveReg, MachineInstr *MI = SU->getInstr(); // Calc pressure based on succ nodes. - GCNRPTracker::LiveRegSet dagLive; + GCNRPTracker::LiveRegSet DagLive; for (auto &Succ : SU->Succs) { SUnit *SuccSU = Succ.getSUnit(); GCNRPTracker::LiveRegSet SuccLive = DagPressureMap[SuccSU]; @@ -643,12 +640,12 @@ void BlockExpDag::buildPressure(const LiveSet &StartLiveReg, // Update SuccLive based on MI. 
RP.recede(*MI); } - llvm::mergeLiveRegSet(dagLive, RP.getLiveRegs()); + llvm::mergeLiveRegSet(DagLive, RP.getLiveRegs()); } // Remove live which not avail in SU. - GCNRPTracker::LiveRegSet availLive = DagAvailRegMap[SU]; - llvm::andLiveRegSet(dagLive, availLive); - DagPressureMap[SU] = dagLive; + GCNRPTracker::LiveRegSet AvailLive = DagAvailRegMap[SU]; + llvm::andLiveRegSet(DagLive, AvailLive); + DagPressureMap[SU] = DagLive; // Add pred to work list. for (auto &Pred : SU->Preds) { @@ -669,16 +666,16 @@ void BlockExpDag::buildPressure(const LiveSet &StartLiveReg, // dump functions. std::string ExpDag::getGraphNodeLabel(const SUnit *SU) const { - std::string s; - raw_string_ostream oss(s); - auto it = SUnitInputMap.find(SU); - if (it != SUnitInputMap.end()) { - oss << "second) << ">"; + std::string S; + raw_string_ostream OSS(S); + auto It = SUnitInputMap.find(SU); + if (It != SUnitInputMap.end()) { + OSS << "second) << ">"; } else { - SU->getInstr()->print(oss, /*SkipOpers=*/true); + SU->getInstr()->print(OSS, /*SkipOpers=*/true); } - return oss.str(); + return OSS.str(); } /// Return the label. @@ -688,7 +685,6 @@ std::string ExpDag::getDAGName() const { return "dag.exp"; } /// rendered using 'dot'. /// void ExpDag::viewGraph(const Twine &Name, const Twine &Title) const { -#if 0 // TODO: Re-enable this // This code is only for debugging! #ifndef NDEBUG ViewGraph(const_cast(this), Name, false, Title); @@ -696,7 +692,6 @@ void ExpDag::viewGraph(const Twine &Name, const Twine &Title) const { errs() << "BlockExpDag::viewGraph is only available in debug builds on " << "systems with Graphviz or gv!\n"; #endif // NDEBUG -#endif } void ExpDag::dump() { @@ -713,9 +708,9 @@ static DenseSet ViewNodes; template <> struct DOTGraphTraits : public DefaultDOTGraphTraits { - DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} + DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} - static std::string getGraphName(const llvm::ExpDag *G) { + static std::string getGraphName(const llvm::ExpDag *) { return "ExpDag graph"; } @@ -729,7 +724,7 @@ struct DOTGraphTraits : public DefaultDOTGraphTraits { } static std::string getNodeIdentifierLabel(const SUnit *Node, - const llvm::ExpDag *Graph) { + const llvm::ExpDag *) { std::string R; raw_string_ostream OS(R); OS << static_cast(Node); @@ -738,8 +733,8 @@ struct DOTGraphTraits : public DefaultDOTGraphTraits { /// If you want to override the dot attributes printed for a particular /// edge, override this method. 
- static std::string getEdgeAttributes(const SUnit *Node, SUnitIterator EI, - const llvm::ExpDag *Graph) { + static std::string getEdgeAttributes(const SUnit *, SUnitIterator EI, + const llvm::ExpDag *) { if (EI.isArtificialDep()) return "color=cyan,style=dashed"; if (EI.isCtrlDep()) @@ -747,7 +742,7 @@ struct DOTGraphTraits : public DefaultDOTGraphTraits { return ""; } - static std::string getNodeLabel(const SUnit *SU, const llvm::ExpDag *Graph) { + static std::string getNodeLabel(const SUnit *SU, const llvm::ExpDag *) { std::string Str; raw_string_ostream SS(Str); SS << "SU:" << SU->NodeNum; @@ -758,7 +753,7 @@ struct DOTGraphTraits : public DefaultDOTGraphTraits { return G->getGraphNodeLabel(SU); } static std::string getNodeAttributes(const SUnit *N, - const llvm::ExpDag *Graph) { + const llvm::ExpDag *) { std::string Str("shape=Mrecord"); Str += ",style=filled,fillcolor=\"#"; @@ -798,42 +793,42 @@ void getRegBound(llvm::MachineBasicBlock *MBB, MaxSGPR = AMDGPU::SGPR104 - AMDGPU::SGPR0; const auto &EndSlot = LIS->getMBBEndIdx(MBB); - const GCNRPTracker::LiveRegSet outputLive = + const GCNRPTracker::LiveRegSet OutputLive = llvm::getLiveRegs(EndSlot, *LIS, MRI); auto *ST = &MBB->getParent() ->getSubtarget(); // TODO: Better way to get this. if (MBB->empty()) { - GCNRegPressure MaxPressure = getRegPressure(MRI, outputLive); + GCNRegPressure MaxPressure = getRegPressure(MRI, OutputLive); MaxSGPR = MaxPressure.getSGPRNum(); MaxVGPR = MaxPressure.getVGPRNum(ST->hasGFX90AInsts()); return; } - BlockExpDag dag(MBB, LIS, MRI, SIRI, SIII); - dag.build(); + BlockExpDag Dag(MBB, LIS, MRI, SIRI, SIII); + Dag.build(); - std::vector &SUnits = dag.SUnits; + std::vector &SUnits = Dag.SUnits; // Remove input nodes. for (SUnit &SU : SUnits) { if (!SU.isInstr()) continue; - std::vector inputDeps; + std::vector InputDeps; for (SDep &Dep : SU.Preds) { SUnit *Pred = Dep.getSUnit(); if (Pred->isInstr()) continue; - inputDeps.emplace_back(Dep); + InputDeps.emplace_back(Dep); } - for (SDep &Dep : inputDeps) { + for (SDep &Dep : InputDeps) { SU.removePred(Dep); } } - unsigned inputSize = dag.InputSUnitMap.size(); - unsigned instNodeSize = SUnits.size() - inputSize; - SUnits.erase(SUnits.begin() + instNodeSize, SUnits.end()); + const unsigned InputSize = Dag.InputSUnitMap.size(); + const unsigned InstNodeSize = SUnits.size() - InputSize; + SUnits.erase(SUnits.begin() + InstNodeSize, SUnits.end()); std::vector BotRoots; for (SUnit &SU : SUnits) { @@ -844,9 +839,9 @@ void getRegBound(llvm::MachineBasicBlock *MBB, auto SchedResult = hrbSched(SUnits, BotRoots, MRI, SIRI); GCNUpwardRPTracker RPTracker(*LIS); - RPTracker.reset(MBB->front(), &outputLive, /*After*/ true); - for (auto it = SchedResult.rbegin(); it != SchedResult.rend(); it++) { - const SUnit *SU = *it; + RPTracker.reset(MBB->front(), &OutputLive, /*After*/ true); + for (auto It = SchedResult.rbegin(); It != SchedResult.rend(); It++) { + const SUnit *SU = *It; if (!SU->isInstr()) continue; MachineInstr *MI = SU->getInstr(); @@ -863,32 +858,32 @@ void getRegBound(llvm::MachineBasicBlock *MBB, namespace { std::vector buildWorkList(std::vector &SUnits) { - std::vector resultList; - resultList.reserve(SUnits.size()); + std::vector ResultList; + ResultList.reserve(SUnits.size()); for (SUnit &SU : SUnits) { - resultList.emplace_back(&SU); + ResultList.emplace_back(&SU); } - return resultList; + return ResultList; } -void sortByHeight(std::vector &workList) { - std::sort(workList.begin(), workList.end(), - [](const SUnit *a, const SUnit *b) { +void 
sortByHeight(std::vector &WorkList) { + std::sort(WorkList.begin(), WorkList.end(), + [](const SUnit *A, const SUnit *B) { // Lowest height first. - if (a->getHeight() < b->getHeight()) + if (A->getHeight() < B->getHeight()) return true; // If height the same, NodeNum big first. - if (a->getHeight() == b->getHeight()) - return a->NodeNum > b->NodeNum; + if (A->getHeight() == B->getHeight()) + return A->NodeNum > B->NodeNum; return false; }); } -void sortByInChain(std::vector &workList, DenseSet &Chained) { +void sortByInChain(std::vector &WorkList, DenseSet &Chained) { // In chain nodes at end. - std::sort(workList.begin(), workList.end(), - [&Chained](const SUnit *a, const SUnit *b) { - return Chained.count(a) < Chained.count(b); + std::sort(WorkList.begin(), WorkList.end(), + [&Chained](const SUnit *A, const SUnit *B) { + return Chained.count(A) < Chained.count(B); }); } @@ -905,7 +900,7 @@ const TargetRegisterClass *getRegClass(SUnit *SU, MachineOperand *MO = MI->defs().begin(); if (!MO->isReg()) return nullptr; - unsigned Reg = MO->getReg(); + Register Reg = MO->getReg(); return SIRI->getRegClassForReg(MRI, Reg); } @@ -926,12 +921,12 @@ unsigned getSGPRSize(const TargetRegisterClass *RC, return RC->getLaneMask().getNumLanes(); } -void collectSameHeightBackNodes(SUnit *SU, SmallDenseSet &backNodes, +void collectSameHeightBackNodes(SUnit *SU, SmallDenseSet &BackNodes, unsigned NodeNum, - SmallDenseSet &visitedNodes) { - if (visitedNodes.count(SU)) + SmallDenseSet &VisitedNodes) { + if (VisitedNodes.count(SU)) return; - visitedNodes.insert(SU); + VisitedNodes.insert(SU); for (SDep &Dep : SU->Succs) { if (Dep.isWeak()) @@ -943,8 +938,8 @@ void collectSameHeightBackNodes(SUnit *SU, SmallDenseSet &backNodes, if (Succ->NodeNum >= NodeNum) continue;*/ - backNodes.insert(Succ); - collectSameHeightBackNodes(Succ, backNodes, NodeNum, visitedNodes); + BackNodes.insert(Succ); + collectSameHeightBackNodes(Succ, BackNodes, NodeNum, VisitedNodes); } } @@ -963,60 +958,60 @@ SUnit *HRB::Lineage::getTail() const { return Nodes.back(); } void HRB::buildLinear(std::vector &SUnits) { // Working list from TopRoots. - std::vector workList = buildWorkList(SUnits); + std::vector WorkList = buildWorkList(SUnits); IntEqClasses EqClasses(SUnits.size()); - while (!workList.empty()) { - sortByHeight(workList); + while (!WorkList.empty()) { + sortByHeight(WorkList); // Highest SU. - SUnit *SU = workList.back(); - workList.pop_back(); + SUnit *SU = WorkList.back(); + WorkList.pop_back(); if (!SU->isInstr()) continue; if (ChainedNodes.count(SU) > 0) continue; IsRecomputeHeight = false; - Lineage lineage = buildChain(SU, SUnits); + Lineage Lineage = buildChain(SU, SUnits); // Remove chained nodes from worklist. - sortByInChain(workList, ChainedNodes); - while (!workList.empty()) { - SUnit *back = workList.back(); - if (ChainedNodes.count(back)) - workList.pop_back(); + sortByInChain(WorkList, ChainedNodes); + while (!WorkList.empty()) { + SUnit *Back = WorkList.back(); + if (ChainedNodes.count(Back)) + WorkList.pop_back(); else break; } - Lineages.emplace_back(lineage); + Lineages.emplace_back(Lineage); if (IsRecomputeHeight) { // Update height from tail. 
- SUnit *tail = lineage.Nodes.back(); - tail->setDepthDirty(); - tail->getHeight(); + SUnit *Tail = Lineage.Nodes.back(); + Tail->setDepthDirty(); + Tail->getHeight(); } } - DenseSet tailSet; + DenseSet TailSet; for (Lineage &L : Lineages) { if (L.Nodes.size() < 2) continue; - auto it = L.Nodes.rbegin(); - it++; - SUnit *tail = L.Nodes.back(); - // If already as tail for other lineage, start from next. - if (tailSet.count(tail) > 0) { - tail = *it; - it++; + auto It = L.Nodes.rbegin(); + It++; + SUnit *Tail = L.Nodes.back(); + // If already as tail for other Lineage, start from next. + if (TailSet.count(Tail) > 0) { + Tail = *It; + It++; } else { - tailSet.insert(tail); + TailSet.insert(Tail); } - for (; it != L.Nodes.rend(); it++) { - SUnit *SU = *it; - if (tail->NodeNum == -1) + for (; It != L.Nodes.rend(); It++) { + SUnit *SU = *It; + if (Tail->NodeNum == (unsigned)-1) continue; - EqClasses.join(SU->NodeNum, tail->NodeNum); + EqClasses.join(SU->NodeNum, Tail->NodeNum); } } @@ -1024,7 +1019,7 @@ void HRB::buildLinear(std::vector &SUnits) { // TODO: assign sub class to node. for (Lineage &L : Lineages) { for (SUnit *SU : L.Nodes) { - if (SU->NodeNum == -1) + if (SU->NodeNum == (unsigned)-1) continue; unsigned SubIdx = EqClasses[SU->NodeNum]; //// Pack subidx. @@ -1040,7 +1035,7 @@ void HRB::buildLinear(std::vector &SUnits) { dbgs() << "Chained Nodes:"; for (SUnit *SU : ChainedNodes) { dbgs() << " " << SU->NodeNum << "\n"; - } for (int i = 0; i < Lineages.size(); i++) { + } for (unsigned i = 0; i < Lineages.size(); i++) { dbgs() << "Lineage" << i << ":"; Lineage &L = Lineages[i]; for (SUnit *SU : L.Nodes) { @@ -1078,7 +1073,7 @@ SUnit *HRB::findHeir(SUnit *SU, std::vector &SUnits) { } // Make sure choose lowest dependence between SameHeightCandidate. if (SameHeightCandidate.size() > 1) { - for (int i = 1; i < SameHeightCandidate.size(); i++) { + for (size_t i = 1; i < SameHeightCandidate.size(); i++) { SUnit *SU = SameHeightCandidate[i]; // If Heir is pred of SU, use SU. if (canReach(SU, Heir)) @@ -1116,8 +1111,8 @@ SUnit *HRB::findHeir(SUnit *SU, std::vector &SUnits) { } HRB::Lineage HRB::buildChain(SUnit *Node, std::vector &SUnits) { - HRB::Lineage chain; - chain.addNode(Node); + HRB::Lineage Chain; + Chain.addNode(Node); ChainedNodes.insert(Node); LLVM_DEBUG(dbgs() << "start chain " << Node->NodeNum << "(" << Node->getHeight() << ")\n"); @@ -1125,7 +1120,7 @@ HRB::Lineage HRB::buildChain(SUnit *Node, std::vector &SUnits) { SUnit *Heir = findHeir(Node, SUnits); if (!Heir) break; - chain.addNode(Heir); + Chain.addNode(Heir); LLVM_DEBUG(dbgs() << "add node to chain " << Heir->NodeNum << "\n"); if (ChainedNodes.count(Heir) > 0) @@ -1137,38 +1132,38 @@ HRB::Lineage HRB::buildChain(SUnit *Node, std::vector &SUnits) { // Find biggest vgpr RC for the chain. // TODO: Build conflict and allocate on each edge of the chain. const TargetRegisterClass *RC = nullptr; - unsigned maxRCSize = 0; - for (SUnit *SU : chain.Nodes) { + unsigned MaxRCSize = 0; + for (SUnit *SU : Chain.Nodes) { const TargetRegisterClass *SuRC = getRegClass(SU, MRI, SIRI); unsigned RCSize = getVGPRSize(SuRC, SIRI); - if (RCSize > maxRCSize) { - maxRCSize = RCSize; + if (RCSize > MaxRCSize) { + MaxRCSize = RCSize; RC = SuRC; } } if (!RC) { // TODO: Find biggest sgpr RC. 
- unsigned maxRCSize = 0; - for (SUnit *SU : chain.Nodes) { + unsigned MaxRCSize = 0; + for (SUnit *SU : Chain.Nodes) { const TargetRegisterClass *SuRC = getRegClass(SU, MRI, SIRI); unsigned RCSize = getSGPRSize(SuRC, SIRI); - if (RCSize > maxRCSize) { - maxRCSize = RCSize; + if (RCSize > MaxRCSize) { + MaxRCSize = RCSize; RC = SuRC; } } } - chain.RC = RC; - return chain; + Chain.RC = RC; + return Chain; } void HRB::buildConflict() { for (unsigned i = 0; i < Lineages.size(); i++) { - Lineage &a = Lineages[i]; + Lineage &A = Lineages[i]; for (unsigned j = i + 1; j < Lineages.size(); j++) { - Lineage &b = Lineages[j]; - if (isConflict(a, b)) { + Lineage &B = Lineages[j]; + if (isConflict(A, B)) { Color.Conflicts[i].insert(j); Color.Conflicts[j].insert(i); LLVM_DEBUG(dbgs() << i << " conflict" << j << "\n"); @@ -1179,24 +1174,24 @@ void HRB::buildConflict() { } } -bool HRB::canReach(llvm::SUnit *a, llvm::SUnit *b) { - auto it = ReachMap.find(a); +bool HRB::canReach(llvm::SUnit *A, llvm::SUnit *B) { + auto It = ReachMap.find(A); // If no reach info, treat as reach. - if (it == ReachMap.end()) + if (It == ReachMap.end()) return true; - DenseSet &CurReach = it->second; - return CurReach.find(b) != CurReach.end(); + DenseSet &CurReach = It->second; + return CurReach.find(B) != CurReach.end(); } -void HRB::updateReachForEdge(llvm::SUnit *a, llvm::SUnit *b, +void HRB::updateReachForEdge(llvm::SUnit *A, llvm::SUnit *B, std::vector &SUnits) { - DenseSet &ReachA = ReachMap[a]; - ReachA.insert(b); - DenseSet &ReachB = ReachMap[b]; + DenseSet &ReachA = ReachMap[A]; + ReachA.insert(B); + DenseSet &ReachB = ReachMap[B]; ReachA.insert(ReachB.begin(), ReachB.end()); for (SUnit &SU : SUnits) { - if (!canReach(&SU, a)) + if (!canReach(&SU, A)) continue; DenseSet &CurReach = ReachMap[&SU]; @@ -1252,91 +1247,91 @@ void HRB::buildReachRelation(ArrayRef BotRoots) { }); } -bool HRB::isConflict(const Lineage &a, const Lineage &b) { +bool HRB::isConflict(const Lineage &A, const Lineage &B) { // Make conflict between sgpr and vgpr to help group lineages when share // colors. Keep the conflict will group lineages in avoid mix use color in // different sub exp. - SUnit *head0 = a.getHead(); - SUnit *tail0 = a.getTail(); - SUnit *head1 = b.getHead(); - SUnit *tail1 = b.getTail(); - DenseSet &Reach0 = ReachMap[head0]; - DenseSet &Reach1 = ReachMap[head1]; - bool r01 = Reach0.count(tail1) != 0; - bool r10 = Reach1.count(tail0) != 0; - return r01 && r10; + SUnit *Head0 = A.getHead(); + SUnit *Tail0 = A.getTail(); + SUnit *Head1 = B.getHead(); + SUnit *Tail1 = B.getTail(); + DenseSet &Reach0 = ReachMap[Head0]; + DenseSet &Reach1 = ReachMap[Head1]; + bool R01 = Reach0.count(Tail1) != 0; + bool R10 = Reach1.count(Tail0) != 0; + return R01 && R10; } -bool HRB::canFuse(const Lineage &a, const Lineage &b) { - if (a.RC != b.RC) { +bool HRB::canFuse(const Lineage &A, const Lineage &B) { + if (A.RC != B.RC) { // no RC will not conflict with other nodes. - if (!a.RC) + if (!A.RC) return false; - if (!b.RC) + if (!B.RC) return false; // SGRP and VGPR not conflict. - if (SIRI->isSGPRClass(a.RC) != SIRI->isSGPRClass(b.RC)) + if (SIRI->isSGPRClass(A.RC) != SIRI->isSGPRClass(B.RC)) return false; } // Can Fuse if a.head reach b.tail but b.head not reach a.tail and vice versa. 
- SUnit *head0 = a.getHead(); - SUnit *tail0 = a.getTail(); - SUnit *head1 = b.getHead(); - SUnit *tail1 = b.getTail(); - DenseSet &Reach0 = ReachMap[head0]; - DenseSet &Reach1 = ReachMap[head1]; - bool r01 = Reach0.count(tail1) != 0; - bool r10 = Reach1.count(tail0) != 0; - return r01 != r10; + SUnit *Head0 = A.getHead(); + SUnit *Tail0 = A.getTail(); + SUnit *Head1 = B.getHead(); + SUnit *Tail1 = B.getTail(); + DenseSet &Reach0 = ReachMap[Head0]; + DenseSet &Reach1 = ReachMap[Head1]; + bool R01 = Reach0.count(Tail1) != 0; + bool R10 = Reach1.count(Tail0) != 0; + return R01 != R10; } -bool HRB::tryFuse(Lineage &a, Lineage &b, std::vector &SUnits) { +bool HRB::tryFuse(Lineage &A, Lineage &B, std::vector &SUnits) { // Can Fuse if a.head reach b.tail but b.head not reach a.tail and vice versa. - SUnit *head0 = a.getHead(); - SUnit *tail0 = a.getTail(); - SUnit *head1 = b.getHead(); - SUnit *tail1 = b.getTail(); - DenseSet &Reach0 = ReachMap[head0]; - DenseSet &Reach1 = ReachMap[head1]; - bool r01 = Reach0.count(tail1) != 0; - bool r10 = Reach1.count(tail0) != 0; - if (r01 == r10) + SUnit *Head0 = A.getHead(); + SUnit *Tail0 = A.getTail(); + SUnit *Head1 = B.getHead(); + SUnit *Tail1 = B.getTail(); + DenseSet &Reach0 = ReachMap[Head0]; + DenseSet &Reach1 = ReachMap[Head1]; + bool R01 = Reach0.count(Tail1) != 0; + bool R10 = Reach1.count(Tail0) != 0; + if (R01 == R10) return false; - Lineage *newHead = &a; - Lineage *newTail = &b; - if (r01) { + Lineage *NewHead = &A; + Lineage *NewTail = &B; + if (R01) { // a reach b, b cannot reach a. // link a.tail->b.head. - newHead = &a; - newTail = &b; + NewHead = &A; + NewTail = &B; } else { // b reach a, a cannot reach b. // link b.tail->a.head. - newHead = &b; - newTail = &a; + NewHead = &B; + NewTail = &A; } // Merge reg class. - const TargetRegisterClass *RC0 = newHead->RC; - const TargetRegisterClass *RC1 = newTail->RC; + const TargetRegisterClass *RC0 = NewHead->RC; + const TargetRegisterClass *RC1 = NewTail->RC; unsigned RC0Size = getVGPRSize(RC0, SIRI); unsigned RC1Size = getVGPRSize(RC1, SIRI); if (RC1Size > RC0Size) - newHead->RC = RC1; + NewHead->RC = RC1; // Merge chain. - SUnit *fuseTail = newHead->getTail(); - SUnit *fuseHead = newTail->getHead(); - assert(ReachMap[fuseHead].count(fuseTail) == 0 && ""); - fuseHead->addPred(SDep(fuseTail, SDep::Artificial)); - LLVM_DEBUG(dbgs() << "fuse " << fuseTail->NodeNum << "->" << fuseHead->NodeNum + SUnit *FuseTail = NewHead->getTail(); + SUnit *FuseHead = NewTail->getHead(); + assert(ReachMap[FuseHead].count(FuseTail) == 0 && ""); + FuseHead->addPred(SDep(FuseTail, SDep::Artificial)); + LLVM_DEBUG(dbgs() << "fuse " << FuseTail->NodeNum << "->" << FuseHead->NodeNum << "\n"); // Update reach map. - updateReachForEdge(fuseTail, fuseHead, SUnits); + updateReachForEdge(FuseTail, FuseHead, SUnits); // Merge Nodes. - newHead->Nodes.append(newTail->Nodes.begin(), newTail->Nodes.end()); + NewHead->Nodes.append(NewTail->Nodes.begin(), NewTail->Nodes.end()); // Clear newTail. 
- newTail->Nodes.clear(); - newTail->RC = nullptr; + NewTail->Nodes.clear(); + NewTail->RC = nullptr; return true; } @@ -1346,27 +1341,27 @@ void HRB::fusionLineages(std::vector &SUnits) { bool IsUpdated = true; while (IsUpdated) { IsUpdated = false; - int size = Lineages.size(); - for (int i = 0; i < size; i++) { - Lineage &a = Lineages[i]; - if (a.length() == 0) + int Size = Lineages.size(); + for (int i = 0; i < Size; i++) { + Lineage &A = Lineages[i]; + if (A.length() == 0) continue; - for (int j = i + 1; j < size; j++) { - Lineage &b = Lineages[j]; - if (b.length() == 0) + for (int j = i + 1; j < Size; j++) { + Lineage &B = Lineages[j]; + if (B.length() == 0) continue; - if (tryFuse(a, b, SUnits)) { + if (tryFuse(A, B, SUnits)) { IsUpdated = true; - if (a.length() == 0) + if (A.length() == 0) break; } } } // Remove empty lineages. std::sort(Lineages.begin(), Lineages.end(), - [](const Lineage &a, const Lineage &b) { - return a.length() > b.length(); + [](const Lineage &A, const Lineage &B) { + return A.length() > B.length(); }); while (Lineages.back().length() == 0) { Lineages.pop_back(); @@ -1379,63 +1374,63 @@ void HRB::fusionLineages(std::vector &SUnits) { } } -unsigned HRB::colorLineages(std::vector &lineages, +unsigned HRB::colorLineages(std::vector &InLineages, DenseMap &AllocMap, const unsigned Limit) { // allocate long Lineage first. How about size of RC? - std::sort(lineages.begin(), lineages.end(), + std::sort(InLineages.begin(), InLineages.end(), [](const Lineage *a, const Lineage *b) { // Make sure root allocate first. return a->length() > b->length(); }); - unsigned maxColor = 0; + unsigned MaxColor = 0; const unsigned VGPR_LIMIT = 256 * 4; - for (Lineage *L : lineages) { + for (Lineage *L : InLineages) { unsigned ID = L->ID; auto &Conflict = Color.Conflicts[ID]; - std::bitset colors; + std::bitset Colors; for (unsigned j : Conflict) { - Lineage *C = &Lineages[j]; - if (AllocMap.count(C) == 0) + Lineage *LineageC = &Lineages[j]; + if (AllocMap.count(LineageC) == 0) continue; - unsigned c = AllocMap[C]; - unsigned s = C->getSize(); - for (unsigned i = 0; i < s; i++) { - unsigned pos = c + i; - colors.set(pos); + unsigned C = AllocMap[LineageC]; + unsigned S = LineageC->getSize(); + for (unsigned i = 0; i < S; i++) { + unsigned Pos = C + i; + Colors.set(Pos); } } - unsigned color = Limit; - unsigned size = L->getSize(); - for (unsigned i = 0; i < Limit - size;) { - unsigned oldI = i; - for (unsigned j = 0; j < size; j++) { - unsigned pos = i + size - 1 - j; - if (colors.test(pos)) { - i = pos + 1; + unsigned Color = Limit; + unsigned Size = L->getSize(); + for (unsigned i = 0; i < Limit - Size;) { + unsigned OldI = i; + for (unsigned j = 0; j < Size; j++) { + unsigned Pos = i + Size - 1 - j; + if (Colors.test(Pos)) { + i = Pos + 1; break; } } - if (i != oldI) + if (i != OldI) continue; - color = i; + Color = i; break; } - AllocMap[L] = color; - color += size; - if (color > maxColor) - maxColor = color; + AllocMap[L] = Color; + Color += Size; + if (Color > MaxColor) + MaxColor = Color; } - return maxColor; + return MaxColor; } -void HRB::ColorResult::colorSU(SUnit *SU, unsigned color) { - ColorMap[SU] = color; +void HRB::ColorResult::colorSU(SUnit *SU, unsigned Color) { + ColorMap[SU] = Color; } unsigned HRB::ColorResult::getLineage(SUnit *SU) const { @@ -1454,53 +1449,53 @@ bool HRB::ColorResult::isTail(SUnit *SU) const { return TailSet.count(SU); } const SUnit *HRB::ColorResult::getTail(SUnit *SU) const { if (!isHead(SU)) return nullptr; - auto it = HeadTailMap.find(SU); - 
return it->second; + auto It = HeadTailMap.find(SU); + return It->second; } unsigned HRB::ColorResult::getColor(const llvm::SUnit *SU) const { - auto it = ColorMap.find(SU); - return it->second; + auto It = ColorMap.find(SU); + return It->second; } unsigned HRB::ColorResult::getSize(const llvm::SUnit *SU) const { - auto it = SizeMap.find(SU); - return it->second; + auto It = SizeMap.find(SU); + return It->second; } HRB::ColorResult &HRB::coloring() { // Collect VGPR lineages. - std::vector vgprLineages; + std::vector VgprLineages; for (Lineage &L : Lineages) { - auto RC = L.RC; + const auto *RC = L.RC; if (!RC) continue; if (SIRI->isSGPRClass(RC)) continue; - vgprLineages.emplace_back(&L); + VgprLineages.emplace_back(&L); } const unsigned VGPR_LIMIT = 256 * 4; DenseMap VAllocMap; - const unsigned maxVGPR = colorLineages(vgprLineages, VAllocMap, VGPR_LIMIT); + const unsigned MaxVGPR = colorLineages(VgprLineages, VAllocMap, VGPR_LIMIT); // Collect SGPR lineages. - std::vector sgprLineages; + std::vector SgprLineages; for (Lineage &L : Lineages) { - auto RC = L.RC; + const auto *RC = L.RC; if (!RC) continue; if (!SIRI->isSGPRClass(RC)) continue; - sgprLineages.emplace_back(&L); + SgprLineages.emplace_back(&L); } const unsigned SGPR_LIMIT = 104; DenseMap SAllocMap; - const unsigned maxSGPR = colorLineages(sgprLineages, SAllocMap, SGPR_LIMIT); + const unsigned MaxSGPR = colorLineages(SgprLineages, SAllocMap, SGPR_LIMIT); // +1 for each type of lineages(SGPR, VGPR, no reg). - const unsigned maxReg = maxSGPR + 1 + maxVGPR + 1 + 1; - const unsigned sgprBase = maxVGPR + 1; + const unsigned MaxReg = MaxSGPR + 1 + MaxVGPR + 1 + 1; + const unsigned SgprBase = MaxVGPR + 1; for (Lineage &L : Lineages) { // Collect HeadSet. @@ -1508,41 +1503,41 @@ HRB::ColorResult &HRB::coloring() { Color.TailSet.insert(L.getTail()); Color.HeadTailMap[L.getHead()] = L.getTail(); // Save color. - auto RC = L.RC; + const auto *RC = L.RC; // All no reg lineage goes to maxReg. 
- unsigned color = maxReg; + unsigned RegColor = MaxReg; if (!RC) { } else if (SIRI->isSGPRClass(RC)) { - color = SAllocMap[&L] + sgprBase; + RegColor = SAllocMap[&L] + SgprBase; } else { - color = VAllocMap[&L]; + RegColor = VAllocMap[&L]; } - unsigned size = L.getSize(); + unsigned Size = L.getSize(); for (SUnit *SU : L.Nodes) { - Color.colorSU(SU, color); - Color.SizeMap[SU] = size; + Color.colorSU(SU, RegColor); + Color.SizeMap[SU] = Size; Color.LineageMap[SU] = L.ID; } } - Color.maxReg = maxReg; - Color.maxSGPR = maxSGPR; - Color.maxVGPR = maxVGPR; + Color.MaxReg = MaxReg; + Color.MaxSGPR = MaxSGPR; + Color.MaxVGPR = MaxVGPR; for (unsigned i = 0; i < Lineages.size(); i++) { - Lineage &a = Lineages[i]; - SUnit *headA = a.getHead(); - unsigned colorA = Color.getColor(headA); - unsigned sizeA = Color.getSize(headA); + Lineage &A = Lineages[i]; + SUnit *HeadA = A.getHead(); + unsigned ColorA = Color.getColor(HeadA); + unsigned SizeA = Color.getSize(HeadA); for (unsigned j = i + 1; j < Lineages.size(); j++) { - Lineage &b = Lineages[j]; + Lineage &B = Lineages[j]; - SUnit *headB = b.getHead(); - unsigned colorB = Color.getColor(headB); - unsigned sizeB = Color.getSize(headB); + SUnit *HeadB = B.getHead(); + unsigned ColorB = Color.getColor(HeadB); + unsigned SizeB = Color.getSize(HeadB); - if (colorB >= (colorA + sizeA)) + if (ColorB >= (ColorA + SizeA)) continue; - if (colorA >= (colorB + sizeB)) + if (ColorA >= (ColorB + SizeB)) continue; Color.ShareColorLineages.insert(i); Color.ShareColorLineages.insert(j); @@ -1553,7 +1548,7 @@ HRB::ColorResult &HRB::coloring() { } void HRB::dump() { - for (int i = 0; i < Lineages.size(); i++) { + for (unsigned i = 0; i < Lineages.size(); i++) { dbgs() << "Lineage" << i << ":"; Lineage &L = Lineages[i]; for (SUnit *SU : L.Nodes) { @@ -1566,7 +1561,7 @@ void HRB::dump() { } if (!ReachMap.empty()) { dbgs() << "conflict:"; - for (int j = 0; j < Lineages.size(); j++) { + for (unsigned j = 0; j < Lineages.size(); j++) { if (i == j) continue; if (isConflict(L, Lineages[j])) { @@ -1581,9 +1576,9 @@ void HRB::dump() { void HRB::dumpReachMap() { if (!ReachMap.empty()) { dbgs() << "reachMap:"; - for (auto it : ReachMap) { - SUnit *SU = it.first; - auto &Reach = it.second; + for (auto It : ReachMap) { + SUnit *SU = It.first; + auto &Reach = It.second; if (SU->isInstr()) { MachineInstr *MI = SU->getInstr(); MI->print(dbgs()); @@ -1604,24 +1599,24 @@ std::vector hrbSched(std::vector &SUnits, std::vector &BRoots, const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI) { - HRB hrb(MRI, SIRI); + HRB Hrb(MRI, SIRI); // build reach info to avoid dead loop when build linear. - hrb.buildReachRelation(BRoots); - hrb.buildLinear(SUnits); + Hrb.buildReachRelation(BRoots); + Hrb.buildLinear(SUnits); - std::sort(BRoots.begin(), BRoots.end(), [](const SUnit *a, const SUnit *b) { - return a->NumSuccsLeft < b->NumSuccsLeft; + std::sort(BRoots.begin(), BRoots.end(), [](const SUnit *A, const SUnit *B) { + return A->NumSuccsLeft < B->NumSuccsLeft; }); while (!BRoots.empty() && BRoots.back()->NumSuccsLeft > 0) { BRoots.pop_back(); } - hrb.buildReachRelation(BRoots); - hrb.fusionLineages(SUnits); - hrb.buildConflict(); - const HRB::ColorResult &Color = hrb.coloring(); + Hrb.buildReachRelation(BRoots); + Hrb.fusionLineages(SUnits); + Hrb.buildConflict(); + const HRB::ColorResult &ColorRes = Hrb.coloring(); - LLVM_DEBUG(hrb.dump()); + LLVM_DEBUG(Hrb.dump()); // All lineage head which don't has Pred is TopRoots. // Put top roots in worklist. 
@@ -1638,30 +1633,30 @@ std::vector hrbSched(std::vector &SUnits, // When there're more than one sub exp in the DAG, make sure not mix different // sub exp or it will dead loop for color goes different subexp. - std::bitset<512 * 2> colors; - auto isColorAvail = [&colors](unsigned color, unsigned size) -> bool { - for (unsigned i = 0; i < size; i++) { - unsigned pos = color + i; - if (colors.test(pos)) + std::bitset<512 * 2> Colors; + auto IsColorAvail = [&Colors](unsigned Color, unsigned Size) -> bool { + for (unsigned i = 0; i < Size; i++) { + unsigned Pos = Color + i; + if (Colors.test(Pos)) return false; } return true; }; - auto allocColor = [&colors](unsigned color, unsigned size) { - for (unsigned i = 0; i < size; i++) { - unsigned pos = color + i; - assert(!colors.test(pos) && "color already allocated"); - LLVM_DEBUG(dbgs() << pos << "is allocated\n"); - colors.set(pos); + auto AllocColor = [&Colors](unsigned Color, unsigned Size) { + for (unsigned i = 0; i < Size; i++) { + unsigned Pos = Color + i; + assert(!Colors.test(Pos) && "color already allocated"); + LLVM_DEBUG(dbgs() << Pos << "is allocated\n"); + Colors.set(Pos); } }; - auto freeColor = [&colors](unsigned color, unsigned size) { - for (unsigned i = 0; i < size; i++) { - unsigned pos = color + i; - assert(colors.test(pos) && "color has not been allocated"); - LLVM_DEBUG(dbgs() << pos << "is free\n"); - colors.reset(pos); + auto FreeColor = [&Colors](unsigned Color, unsigned Size) { + for (unsigned i = 0; i < Size; i++) { + unsigned Pos = Color + i; + assert(Colors.test(Pos) && "color has not been allocated"); + LLVM_DEBUG(dbgs() << Pos << "is free\n"); + Colors.reset(Pos); } }; @@ -1680,25 +1675,25 @@ std::vector hrbSched(std::vector &SUnits, // ShareColorLineages will mark lineages which share color with other // lineages. When sched, choose new lineages which has more conflict with // ShareColorLineages. - const DenseSet &ShareColorLineages = Color.ShareColorLineages; + const DenseSet &ShareColorLineages = ColorRes.ShareColorLineages; std::vector Schedule; DenseSet UnfinishedLineages; while (!ReadyList.empty()) { // Make sure node conflict with predLineage first. std::sort(ReadyList.begin(), ReadyList.end(), - [&UnfinishedLineages, &Color](const SUnit *a, const SUnit *b) { - unsigned confA = 0; + [&UnfinishedLineages, &ColorRes](const SUnit *A, const SUnit *B) { + unsigned ConfA = 0; for (unsigned L : UnfinishedLineages) { - if (Color.isConflict(a, L)) - confA++; + if (ColorRes.isConflict(A, L)) + ConfA++; } - unsigned confB = 0; + unsigned ConfB = 0; for (unsigned L : UnfinishedLineages) { - if (Color.isConflict(b, L)) - confB++; + if (ColorRes.isConflict(B, L)) + ConfB++; } - return confA > confB; + return ConfA > ConfB; }); LLVM_DEBUG(dbgs() << "ReadyList:\n"; for (SUnit *SU @@ -1706,33 +1701,33 @@ std::vector hrbSched(std::vector &SUnits, dbgs() << " " << SU->NodeNum; } dbgs() << "\n";); SUnit *Candidate = nullptr; - for (auto it = ReadyList.begin(); it != ReadyList.end(); it++) { - SUnit *SU = *it; - unsigned color = Color.getColor(SU); - unsigned size = Color.getSize(SU); + for (auto It = ReadyList.begin(); It != ReadyList.end(); It++) { + SUnit *SU = *It; + unsigned Color = ColorRes.getColor(SU); + unsigned Size = ColorRes.getSize(SU); // If SU is not head or color is available, SU is the candidate. - if (Color.isHead(SU)) { - if (!isColorAvail(color, size)) + if (ColorRes.isHead(SU)) { + if (!IsColorAvail(Color, Size)) continue; // alloc color. 
- allocColor(color, size); + AllocColor(Color, Size); // save tail color. - const SUnit *Tail = Color.getTail(SU); - unsigned ID = Color.getLineage(SU); - SmallVector, 2> &tailColors = + const SUnit *Tail = ColorRes.getTail(SU); + unsigned ID = ColorRes.getLineage(SU); + SmallVector, 2> &TailColors = TailMap[Tail]; - tailColors.emplace_back(std::make_tuple(color, size, ID)); + TailColors.emplace_back(std::make_tuple(Color, Size, ID)); if (ShareColorLineages.count(ID)) UnfinishedLineages.insert(ID); } // free color for working lineage which end with SU. - if (Color.isTail(SU)) { - auto &tailColors = TailMap[SU]; - for (auto &tailTuple : tailColors) { - unsigned lineageColor, lineageSize, ID; - std::tie(lineageColor, lineageSize, ID) = tailTuple; - freeColor(lineageColor, lineageSize); + if (ColorRes.isTail(SU)) { + auto &TailColors = TailMap[SU]; + for (auto &TailTuple : TailColors) { + unsigned LineageColor, LineageSize, ID; + std::tie(LineageColor, LineageSize, ID) = TailTuple; + FreeColor(LineageColor, LineageSize); if (ShareColorLineages.count(ID)) UnfinishedLineages.insert(ID); } @@ -1742,21 +1737,21 @@ std::vector hrbSched(std::vector &SUnits, Candidate = SU; // Remove Candidate from ReadyList. - ReadyList.erase(it); + ReadyList.erase(It); break; } if (!Candidate) { // In case failed to find candidate, start a lineage if there is one. - for (auto it = ReadyList.begin(); it != ReadyList.end(); it++) { - SUnit *SU = *it; + for (auto It = ReadyList.begin(); It != ReadyList.end(); It++) { + SUnit *SU = *It; - if (!Color.isHead(SU)) { + if (!ColorRes.isHead(SU)) { continue; } Candidate = SU; // Remove Candidate from ReadyList. - ReadyList.erase(it); + ReadyList.erase(It); break; } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h index c447750e17f1d..c19190c6afe24 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h @@ -49,9 +49,8 @@ struct SubExp { unsigned VMaxSize; LiveSet InputLive; LiveSet OutputLive; - bool isSafeToMove(const llvm::MachineRegisterInfo &MRI, bool IsMoveUp) const; - void calcMaxPressure(const llvm::MachineRegisterInfo &MRI, - const llvm::SIRegisterInfo *SIRI); + bool isSafeToMove(const llvm::MachineRegisterInfo &MRI) const; + void calcMaxPressure(const llvm::MachineRegisterInfo &MRI); void dump(const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI) const; bool modifiesRegister(unsigned Reg, const llvm::SIRegisterInfo *SIRI) const; @@ -83,8 +82,8 @@ struct ExpDag { void addCustomGraphFeatures(llvm::GraphWriter &) const {} private: - template void initNodes(const LiveSet &InputLiveReg, T &insts); - void addDataDep(const llvm::SIRegisterInfo *SIRI); + template void initNodes(const LiveSet &InputLiveReg, T &Insts); + void addDataDep(); void addCtrlDep(); void buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg, const llvm::SIRegisterInfo *SIRI, @@ -140,10 +139,10 @@ class HRB { llvm::DenseSet HeadSet; llvm::DenseSet TailSet; llvm::DenseMap HeadTailMap; - unsigned maxReg = 0; - unsigned maxVGPR = 0; - unsigned maxSGPR = 0; - void colorSU(llvm::SUnit *SU, unsigned color); + unsigned MaxReg = 0; + unsigned MaxVGPR = 0; + unsigned MaxSGPR = 0; + void colorSU(llvm::SUnit *SU, unsigned Color); unsigned getLineage(llvm::SUnit *SU) const; bool isConflict(const llvm::SUnit *SU0, unsigned Lineage) const; bool isHead(llvm::SUnit *SU) const; @@ -161,8 +160,8 @@ class HRB { llvm::DenseMap> &getReachMap() { return ReachMap; } - bool canReach(llvm::SUnit *a, 
llvm::SUnit *b); - void updateReachForEdge(llvm::SUnit *a, llvm::SUnit *b, + bool canReach(llvm::SUnit *a, llvm::SUnit *B); + void updateReachForEdge(llvm::SUnit *A, llvm::SUnit *B, std::vector &SUnits); void fusionLineages(std::vector &SUnits); ColorResult &coloring(); @@ -172,10 +171,10 @@ class HRB { private: Lineage buildChain(llvm::SUnit *Node, std::vector &SUnits); llvm::SUnit *findHeir(llvm::SUnit *SU, std::vector &SUnits); - bool isConflict(const Lineage &a, const Lineage &b); - bool canFuse(const Lineage &a, const Lineage &b); - bool tryFuse(Lineage &a, Lineage &b, std::vector &SUnits); - unsigned colorLineages(std::vector &lineages, + bool isConflict(const Lineage &A, const Lineage &B); + bool canFuse(const Lineage &A, const Lineage &B); + bool tryFuse(Lineage &A, Lineage &B, std::vector &SUnits); + unsigned colorLineages(std::vector &Lineages, llvm::DenseMap &AllocMap, const unsigned Limit); From 0600e2fd23c5a6b7992c6cfcc1944e255fbae7d9 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Mon, 17 Mar 2025 16:57:56 -0700 Subject: [PATCH 16/25] Possibly the last batch of cleanup --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 1 - llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp | 928 +++++++++--------- llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h | 14 +- llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp | 24 +- llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h | 99 -- 5 files changed, 440 insertions(+), 626 deletions(-) delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index 853a212ac5bf3..a6ce3426a7b93 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -17,7 +17,6 @@ #include "AMDGPUOccupancyAndLatencyHelper.h" #include "AMDGPUSubExpDag.h" #include "AMDGPUSubtarget.h" -#include "AMDGPUVMemDegreeDAG.h" #include "GCNRegPressure.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp index d207b3aa3d4f3..990718cd7525f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp @@ -60,10 +60,10 @@ class CFGWithPhi { }; void CFGWithPhi::dump() { -#ifdef DBG +#ifndef NDEBUG for (MachineBasicBlock &BB : F) { dbgs() << BB.getName() << "\n"; - auto &PhiInsts = blockToPhiInstsMap[&BB]; + auto &PhiInsts = BlockToPhiInstsMap[&BB]; for (MachineInstr *I : PhiInsts) { if (!I->isPHI()) continue; @@ -644,31 +644,31 @@ bool getNonDebugMBBEnd(MachineBasicBlock::reverse_iterator &BBEnd, } } // namespace llvm -// Helper functions to write jason. +// Helper functions to Write jason. 
namespace { -void json_name(StringRef Val, raw_ostream &os) { os << "\"" << Val << "\":"; } +void json_name(StringRef Val, raw_ostream &OS) { OS << "\"" << Val << "\":"; } template -void json_pair(StringRef Val, write_fn &fn, raw_ostream &os) { - json_name(Val, os); - os << "\""; - fn(); - os << "\""; +void json_pair(StringRef Val, write_fn &Fn, raw_ostream &OS) { + json_name(Val, OS); + OS << "\""; + Fn(); + OS << "\""; } template -void json_obj_pair(StringRef Val, write_fn &fn, raw_ostream &os) { - json_name(Val, os); +void json_obj_pair(StringRef Val, write_fn &Fn, raw_ostream &OS) { + json_name(Val, OS); - fn(); + Fn(); } template -void json_array(StringRef Val, write_fn &fn, raw_ostream &os) { - json_name(Val, os); - os << "["; - fn(); - os << "]"; +void json_array(StringRef Val, write_fn &Fn, raw_ostream &OS) { + json_name(Val, OS); + OS << "["; + Fn(); + OS << "]"; } } // namespace @@ -676,71 +676,71 @@ namespace llvm { namespace pressure { void write_inst(MachineInstr &MI, const SlotIndexes *SlotIndexes, - const SIInstrInfo *SIII, raw_ostream &os) { - os << "{"; + const SIInstrInfo *SIII, raw_ostream &OS) { + OS << "{"; SlotIndex Slot = SlotIndexes->getInstructionIndex(MI); - auto writeSlot = [&Slot, &os]() { Slot.print(os); }; + auto WriteSlot = [&Slot, &OS]() { Slot.print(OS); }; - json_pair("slot_index", writeSlot, os); + json_pair("slot_index", WriteSlot, OS); - os << ","; + OS << ","; - auto writeOpcode = [&MI, &SIII, &os]() { - os << SIII->getName(MI.getOpcode()); + auto WriteOpcode = [&MI, &SIII, &OS]() { + OS << SIII->getName(MI.getOpcode()); }; - json_pair("opcode", writeOpcode, os); + json_pair("opcode", WriteOpcode, OS); - os << ","; + OS << ","; - auto writeAsm = [&MI, &SIII, &os]() { - MI.print(os, /*IsStandalone*/ true, /*SkipOpers*/ false, + auto WriteAsm = [&MI, &SIII, &OS]() { + MI.print(OS, /*IsStandalone*/ true, /*SkipOpers*/ false, /*SkipDebugLoc*/ true, /*AddNewLine*/ false, SIII); }; - json_pair("asm", writeAsm, os); + json_pair("asm", WriteAsm, OS); - os << "}"; + OS << "}"; } void print_reg(Register Reg, const MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI, raw_ostream &os) { + const SIRegisterInfo *SIRI, raw_ostream &OS) { if (Reg.isVirtual()) { StringRef Name = MRI.getVRegName(Reg); if (Name != "") { - os << '%' << Name; + OS << '%' << Name; } else { - os << '%' << Register::virtReg2Index(Reg); + OS << '%' << Register::virtReg2Index(Reg); } } else if (Reg < SIRI->getNumRegs()) { - os << '$'; - printLowerCase(SIRI->getName(Reg), os); + OS << '$'; + printLowerCase(SIRI->getName(Reg), OS); } else { llvm_unreachable("invalid reg"); } } void write_reg(unsigned Reg, unsigned SubReg, const MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI, raw_ostream &os) { - os << "{"; + const SIRegisterInfo *SIRI, raw_ostream &OS) { + OS << "{"; - auto writeReg = [&MRI, &SIRI, &Reg, &os]() { print_reg(Reg, MRI, SIRI, os); }; - json_pair("reg", writeReg, os); + auto WriteReg = [&MRI, &SIRI, &Reg, &OS]() { print_reg(Reg, MRI, SIRI, OS); }; + json_pair("reg", WriteReg, OS); - os << ","; + OS << ","; - auto writeSubReg = [&SubReg, &os]() { os << SubReg; }; + auto WriteSubReg = [&SubReg, &OS]() { OS << SubReg; }; - json_pair("sub_reg", writeSubReg, os); + json_pair("sub_reg", WriteSubReg, OS); - os << ","; - auto writeIsSgpr = [&Reg, &MRI, &SIRI, &os]() { + OS << ","; + auto WriteIsSgpr = [&Reg, &MRI, &SIRI, &OS]() { if (SIRI->isSGPRReg(MRI, Reg)) - os << "true"; + OS << "true"; else - os << "false"; + OS << "false"; }; - json_obj_pair("is_sgpr", writeIsSgpr, os); - os 
<< "}"; + json_obj_pair("is_sgpr", WriteIsSgpr, OS); + OS << "}"; } unsigned get_reg_size(unsigned Reg, const MachineRegisterInfo &MRI, @@ -749,7 +749,7 @@ unsigned get_reg_size(unsigned Reg, const MachineRegisterInfo &MRI, } void write_live(unsigned Reg, LaneBitmask Mask, const MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI, raw_ostream &os) { + const SIRegisterInfo *SIRI, raw_ostream &OS) { if (Mask.none()) { unsigned size = get_reg_size(Reg, MRI, SIRI); Mask = LaneBitmask((1 << size) - 1); @@ -757,199 +757,199 @@ void write_live(unsigned Reg, LaneBitmask Mask, const MachineRegisterInfo &MRI, unsigned mask = Mask.getAsInteger(); for (unsigned i = 0; i <= Mask.getHighestLane(); i++) { if (mask & (1 << i)) { - write_reg(Reg, i, MRI, SIRI, os); - os << ",\n"; + write_reg(Reg, i, MRI, SIRI, OS); + OS << ",\n"; } } } void write_dag_input_node(unsigned ID, unsigned reg, unsigned mask, const MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI, raw_ostream &os) { - os << "{"; - auto writeID = [&ID, &os]() { os << ID; }; + const SIRegisterInfo *SIRI, raw_ostream &OS) { + OS << "{"; + auto WriteID = [&ID, &OS]() { OS << ID; }; - json_pair("ID", writeID, os); + json_pair("ID", WriteID, OS); - os << ","; + OS << ","; - auto writeReg = [®, &MRI, &SIRI, &os]() { print_reg(reg, MRI, SIRI, os); }; + auto WriteReg = [®, &MRI, &SIRI, &OS]() { print_reg(reg, MRI, SIRI, OS); }; - json_pair("reg", writeReg, os); + json_pair("reg", WriteReg, OS); - os << ","; + OS << ","; - auto writeMask = [&mask, &os]() { os << mask; }; + auto WriteMask = [&mask, &OS]() { OS << mask; }; - json_pair("mask", writeMask, os); + json_pair("mask", WriteMask, OS); - os << "},\n"; + OS << "},\n"; } void write_dag_inst_node(unsigned ID, SlotIndex Slot, GCNRPTracker::LiveRegSet LiveReg, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, SUnit *SU, - raw_ostream &os) { - os << "{"; - auto writeID = [&ID, &os]() { os << ID; }; + raw_ostream &OS) { + OS << "{"; + auto WriteID = [&ID, &OS]() { OS << ID; }; - json_pair("ID", writeID, os); + json_pair("ID", WriteID, OS); - os << ","; + OS << ","; - auto writeSlot = [&Slot, &os]() { Slot.print(os); }; + auto WriteSlot = [&Slot, &OS]() { Slot.print(OS); }; - json_pair("slot_index", writeSlot, os); + json_pair("slot_index", WriteSlot, OS); - os << ","; + OS << ","; - auto writeRegs = [&LiveReg, &MRI, &SIRI, &os]() { - for (auto it : LiveReg) { - unsigned Reg = it.first; - LaneBitmask Mask = it.second; - write_live(Reg, Mask, MRI, SIRI, os); + auto WriteRegs = [&LiveReg, &MRI, &SIRI, &OS]() { + for (auto It : LiveReg) { + unsigned Reg = It.first; + LaneBitmask Mask = It.second; + write_live(Reg, Mask, MRI, SIRI, OS); } }; - json_array("regs", writeRegs, os); + json_array("regs", WriteRegs, OS); - os << ","; + OS << ","; - auto writePreds = [&SU, &os]() { + auto WritePreds = [&SU, &OS]() { for (auto &Pred : SU->Preds) { - os << Pred.getSUnit()->NodeNum << ","; + OS << Pred.getSUnit()->NodeNum << ","; } }; - json_array("preds", writePreds, os); + json_array("preds", WritePreds, OS); - os << "},\n"; + OS << "},\n"; } void write_block(MachineBasicBlock &Blk, LiveIntervals *LIS, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - const SIInstrInfo *SIII, raw_ostream &os) { - os << "{\n"; - auto writeName = [&Blk, &os]() { os << Blk.getName(); }; - json_pair("name", writeName, os); + const SIInstrInfo *SIII, raw_ostream &OS) { + OS << "{\n"; + auto WriteName = [&Blk, &OS]() { OS << Blk.getName(); }; + json_pair("name", WriteName, OS); - os << ","; + OS << ","; - auto 
writeIndex = [&Blk, &os]() { os << Blk.getNumber(); }; - json_pair("id", writeIndex, os); + auto WriteIndex = [&Blk, &OS]() { OS << Blk.getNumber(); }; + json_pair("id", WriteIndex, OS); - os << ","; + OS << ","; const SlotIndexes *SlotIndexes = LIS->getSlotIndexes(); SlotIndex BeginSlot = SlotIndexes->getMBBStartIdx(&Blk); - auto writeSlot = [&BeginSlot, &os]() { BeginSlot.print(os); }; - json_pair("begin_slot", writeSlot, os); + auto WriteSlot = [&BeginSlot, &OS]() { BeginSlot.print(OS); }; + json_pair("begin_slot", WriteSlot, OS); - os << ","; + OS << ","; SlotIndex EndSlot = SlotIndexes->getMBBEndIdx(&Blk); - auto writeEndSlot = [&EndSlot, &os]() { EndSlot.print(os); }; - json_pair("end_slot", writeEndSlot, os); + auto WriteEndSlot = [&EndSlot, &OS]() { EndSlot.print(OS); }; + json_pair("end_slot", WriteEndSlot, OS); - os << ","; + OS << ","; - auto writeInsts = [&Blk, &SlotIndexes, &SIII, &os]() { + auto WriteInsts = [&Blk, &SlotIndexes, &SIII, &OS]() { for (MachineInstr &MI : Blk) { if (MI.isDebugInstr()) continue; - write_inst(MI, SlotIndexes, SIII, os); - os << ",\n"; + write_inst(MI, SlotIndexes, SIII, OS); + OS << ",\n"; } }; - json_array("instructions", writeInsts, os); + json_array("instructions", WriteInsts, OS); - os << ","; + OS << ","; - BlockExpDag dag(&Blk, LIS, MRI, SIRI, SIII); - dag.buildWithPressure(); + BlockExpDag Dag(&Blk, LIS, MRI, SIRI, SIII); + Dag.buildWithPressure(); - const auto StartLiveReg = llvm::getLiveRegs(BeginSlot, *dag.LIS, dag.MRI); - auto writeInputs = [&StartLiveReg, &dag, &os]() { - for (auto it : StartLiveReg) { - unsigned Reg = it.first; - LaneBitmask mask = it.second; - SUnit *SU = dag.InputSUnitMap[Reg]; + const auto StartLiveReg = llvm::getLiveRegs(BeginSlot, *Dag.LIS, Dag.MRI); + auto WriteInputs = [&StartLiveReg, &Dag, &OS]() { + for (auto It : StartLiveReg) { + unsigned Reg = It.first; + LaneBitmask Mask = It.second; + SUnit *SU = Dag.InputSUnitMap[Reg]; // Write Reg and mask to the nodes. - write_dag_input_node(SU->NodeNum, Reg, mask.getAsInteger(), dag.MRI, - dag.SIRI, os); + write_dag_input_node(SU->NodeNum, Reg, Mask.getAsInteger(), Dag.MRI, + Dag.SIRI, OS); } }; - json_array("input_nodes", writeInputs, os); + json_array("input_nodes", WriteInputs, OS); - os << ","; + OS << ","; - auto writeNodes = [&SlotIndexes, &dag, &os]() { - for (auto it : dag.MISUnitMap) { - MachineInstr *MI = it.first; - SUnit *SU = it.second; + auto WriteNodes = [&SlotIndexes, &Dag, &OS]() { + for (auto It : Dag.MISUnitMap) { + MachineInstr *MI = It.first; + SUnit *SU = It.second; // Use SlotIndex of MI. SlotIndex SlotIndex; if (!MI->isDebugInstr()) SlotIndex = SlotIndexes->getInstructionIndex(*MI); - GCNRPTracker::LiveRegSet LiveReg = dag.DagPressureMap[SU]; + GCNRPTracker::LiveRegSet LiveReg = Dag.DagPressureMap[SU]; // Write slot, live to the nodes. 
- write_dag_inst_node(SU->NodeNum, SlotIndex, LiveReg, dag.MRI, dag.SIRI, - SU, os); + write_dag_inst_node(SU->NodeNum, SlotIndex, LiveReg, Dag.MRI, Dag.SIRI, + SU, OS); } }; - json_array("inst_nodes", writeNodes, os); + json_array("inst_nodes", WriteNodes, OS); - os << ","; + OS << ","; - auto writePreds = [&Blk, &os]() { + auto WritePreds = [&Blk, &OS]() { for (MachineBasicBlock *Pred : Blk.predecessors()) { - os << Pred->getNumber() << ","; + OS << Pred->getNumber() << ","; } }; - json_array("preds", writePreds, os); + json_array("preds", WritePreds, OS); - os << ","; + OS << ","; - auto writeSuccs = [&Blk, &os]() { + auto WriteSuccs = [&Blk, &OS]() { for (MachineBasicBlock *Succ : Blk.successors()) { - os << Succ->getNumber() << ","; + OS << Succ->getNumber() << ","; } }; - json_array("succs", writeSuccs, os); + json_array("succs", WriteSuccs, OS); - os << "}"; + OS << "}"; } void write_define(SlotIndex &Slot, unsigned Reg, unsigned SubReg, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - raw_ostream &os) { - os << "{"; - auto writeSlot = [&Slot, &os]() { Slot.print(os); }; + raw_ostream &OS) { + OS << "{"; + auto WriteSlot = [&Slot, &OS]() { Slot.print(OS); }; - json_pair("slot_index", writeSlot, os); + json_pair("slot_index", WriteSlot, OS); - os << ","; + OS << ","; - auto writeReg = [&MRI, &SIRI, &Reg, &SubReg, &os]() { - write_reg(Reg, SubReg, MRI, SIRI, os); + auto WriteReg = [&MRI, &SIRI, &Reg, &SubReg, &OS]() { + write_reg(Reg, SubReg, MRI, SIRI, OS); }; - json_obj_pair("reg", writeReg, os); + json_obj_pair("reg", WriteReg, OS); - os << "}\n"; + OS << "}\n"; - os << ","; + OS << ","; } void write_define(MachineOperand &MO, const SlotIndexes *SlotIndexes, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - raw_ostream &os) { + raw_ostream &OS) { // Split subReg? 
MO.getSubReg(); Register Reg = MO.getReg(); unsigned SubReg = MO.getSubReg(); @@ -958,104 +958,104 @@ void write_define(MachineOperand &MO, const SlotIndexes *SlotIndexes, if (SubReg == 0) { unsigned Size = get_reg_size(Reg, MRI, SIRI); for (unsigned i = 0; i < Size; i++) { - write_define(Slot, Reg, i, MRI, SIRI, os); + write_define(Slot, Reg, i, MRI, SIRI, OS); } } else { switch (SubReg) { default: assert(0 && "SubReg not supported yet."); - write_define(Slot, Reg, SubReg, MRI, SIRI, os); + write_define(Slot, Reg, SubReg, MRI, SIRI, OS); break; case AMDGPU::sub0: - write_define(Slot, Reg, 0, MRI, SIRI, os); + write_define(Slot, Reg, 0, MRI, SIRI, OS); break; case AMDGPU::sub1: - write_define(Slot, Reg, 1, MRI, SIRI, os); + write_define(Slot, Reg, 1, MRI, SIRI, OS); break; case AMDGPU::sub2: - write_define(Slot, Reg, 2, MRI, SIRI, os); + write_define(Slot, Reg, 2, MRI, SIRI, OS); break; case AMDGPU::sub3: - write_define(Slot, Reg, 3, MRI, SIRI, os); + write_define(Slot, Reg, 3, MRI, SIRI, OS); break; case AMDGPU::sub4: - write_define(Slot, Reg, 4, MRI, SIRI, os); + write_define(Slot, Reg, 4, MRI, SIRI, OS); break; case AMDGPU::sub5: - write_define(Slot, Reg, 5, MRI, SIRI, os); + write_define(Slot, Reg, 5, MRI, SIRI, OS); break; case AMDGPU::sub6: - write_define(Slot, Reg, 6, MRI, SIRI, os); + write_define(Slot, Reg, 6, MRI, SIRI, OS); break; case AMDGPU::sub7: - write_define(Slot, Reg, 7, MRI, SIRI, os); + write_define(Slot, Reg, 7, MRI, SIRI, OS); break; case AMDGPU::sub8: - write_define(Slot, Reg, 8, MRI, SIRI, os); + write_define(Slot, Reg, 8, MRI, SIRI, OS); break; case AMDGPU::sub9: - write_define(Slot, Reg, 9, MRI, SIRI, os); + write_define(Slot, Reg, 9, MRI, SIRI, OS); break; case AMDGPU::sub10: - write_define(Slot, Reg, 10, MRI, SIRI, os); + write_define(Slot, Reg, 10, MRI, SIRI, OS); break; case AMDGPU::sub11: - write_define(Slot, Reg, 11, MRI, SIRI, os); + write_define(Slot, Reg, 11, MRI, SIRI, OS); break; case AMDGPU::sub12: - write_define(Slot, Reg, 12, MRI, SIRI, os); + write_define(Slot, Reg, 12, MRI, SIRI, OS); break; case AMDGPU::sub13: - write_define(Slot, Reg, 13, MRI, SIRI, os); + write_define(Slot, Reg, 13, MRI, SIRI, OS); break; case AMDGPU::sub14: - write_define(Slot, Reg, 14, MRI, SIRI, os); + write_define(Slot, Reg, 14, MRI, SIRI, OS); break; case AMDGPU::sub15: - write_define(Slot, Reg, 15, MRI, SIRI, os); + write_define(Slot, Reg, 15, MRI, SIRI, OS); break; case AMDGPU::sub0_sub1: - write_define(Slot, Reg, 0, MRI, SIRI, os); - write_define(Slot, Reg, 1, MRI, SIRI, os); + write_define(Slot, Reg, 0, MRI, SIRI, OS); + write_define(Slot, Reg, 1, MRI, SIRI, OS); break; case AMDGPU::sub2_sub3: - write_define(Slot, Reg, 2, MRI, SIRI, os); - write_define(Slot, Reg, 3, MRI, SIRI, os); + write_define(Slot, Reg, 2, MRI, SIRI, OS); + write_define(Slot, Reg, 3, MRI, SIRI, OS); break; case AMDGPU::sub4_sub5: - write_define(Slot, Reg, 4, MRI, SIRI, os); - write_define(Slot, Reg, 5, MRI, SIRI, os); + write_define(Slot, Reg, 4, MRI, SIRI, OS); + write_define(Slot, Reg, 5, MRI, SIRI, OS); break; case AMDGPU::sub1_sub2: - write_define(Slot, Reg, 1, MRI, SIRI, os); - write_define(Slot, Reg, 2, MRI, SIRI, os); + write_define(Slot, Reg, 1, MRI, SIRI, OS); + write_define(Slot, Reg, 2, MRI, SIRI, OS); break; case AMDGPU::sub0_sub1_sub2: - write_define(Slot, Reg, 0, MRI, SIRI, os); - write_define(Slot, Reg, 1, MRI, SIRI, os); - write_define(Slot, Reg, 2, MRI, SIRI, os); + write_define(Slot, Reg, 0, MRI, SIRI, OS); + write_define(Slot, Reg, 1, MRI, SIRI, OS); + write_define(Slot, Reg, 2, 
MRI, SIRI, OS); break; case AMDGPU::sub0_sub1_sub2_sub3: - write_define(Slot, Reg, 0, MRI, SIRI, os); - write_define(Slot, Reg, 1, MRI, SIRI, os); - write_define(Slot, Reg, 2, MRI, SIRI, os); - write_define(Slot, Reg, 3, MRI, SIRI, os); + write_define(Slot, Reg, 0, MRI, SIRI, OS); + write_define(Slot, Reg, 1, MRI, SIRI, OS); + write_define(Slot, Reg, 2, MRI, SIRI, OS); + write_define(Slot, Reg, 3, MRI, SIRI, OS); break; case AMDGPU::sub2_sub3_sub4_sub5: - write_define(Slot, Reg, 2, MRI, SIRI, os); - write_define(Slot, Reg, 3, MRI, SIRI, os); - write_define(Slot, Reg, 4, MRI, SIRI, os); - write_define(Slot, Reg, 5, MRI, SIRI, os); + write_define(Slot, Reg, 2, MRI, SIRI, OS); + write_define(Slot, Reg, 3, MRI, SIRI, OS); + write_define(Slot, Reg, 4, MRI, SIRI, OS); + write_define(Slot, Reg, 5, MRI, SIRI, OS); break; case AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7: - write_define(Slot, Reg, 0, MRI, SIRI, os); - write_define(Slot, Reg, 1, MRI, SIRI, os); - write_define(Slot, Reg, 2, MRI, SIRI, os); - write_define(Slot, Reg, 3, MRI, SIRI, os); - write_define(Slot, Reg, 4, MRI, SIRI, os); - write_define(Slot, Reg, 5, MRI, SIRI, os); - write_define(Slot, Reg, 6, MRI, SIRI, os); - write_define(Slot, Reg, 7, MRI, SIRI, os); + write_define(Slot, Reg, 0, MRI, SIRI, OS); + write_define(Slot, Reg, 1, MRI, SIRI, OS); + write_define(Slot, Reg, 2, MRI, SIRI, OS); + write_define(Slot, Reg, 3, MRI, SIRI, OS); + write_define(Slot, Reg, 4, MRI, SIRI, OS); + write_define(Slot, Reg, 5, MRI, SIRI, OS); + write_define(Slot, Reg, 6, MRI, SIRI, OS); + write_define(Slot, Reg, 7, MRI, SIRI, OS); break; } } @@ -1063,13 +1063,13 @@ void write_define(MachineOperand &MO, const SlotIndexes *SlotIndexes, void write_defines(MachineFunction &MF, const SlotIndexes *SlotIndexes, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - raw_ostream &os) { + raw_ostream &OS) { for (unsigned i = 0; i < MRI.getNumVirtRegs(); i++) { auto Reg = Register::index2VirtReg(i); for (MachineOperand &MO : MRI.def_operands(Reg)) { - write_define(MO, SlotIndexes, MRI, SIRI, os); + write_define(MO, SlotIndexes, MRI, SIRI, OS); } } } @@ -1077,288 +1077,288 @@ void write_defines(MachineFunction &MF, const SlotIndexes *SlotIndexes, void write_uses(MachineFunction &MF, const SlotIndexes *SlotIndexes, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - raw_ostream &os) { + raw_ostream &OS) { for (unsigned i = 0; i < MRI.getNumVirtRegs(); i++) { auto Reg = Register::index2VirtReg(i); for (MachineOperand &MO : MRI.use_nodbg_operands(Reg)) { // TODO: create write_use if use has more info. 
- write_define(MO, SlotIndexes, MRI, SIRI, os); + write_define(MO, SlotIndexes, MRI, SIRI, OS); } } } void write_liveness(SlotIndex Slot, GCNRPTracker::LiveRegSet &LiveSet, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - raw_ostream &os) { - os << "{"; - auto writeSlot = [&Slot, &os]() { Slot.print(os); }; + raw_ostream &OS) { + OS << "{"; + auto WriteSlot = [&Slot, &OS]() { Slot.print(OS); }; - json_pair("slot_index", writeSlot, os); + json_pair("slot_index", WriteSlot, OS); - os << ","; + OS << ","; - auto writeRegs = [&LiveSet, &MRI, &SIRI, &os]() { + auto WriteRegs = [&LiveSet, &MRI, &SIRI, &OS]() { for (auto it : LiveSet) { unsigned Reg = it.first; LaneBitmask Mask = it.second; - write_live(Reg, Mask, MRI, SIRI, os); + write_live(Reg, Mask, MRI, SIRI, OS); } }; - json_array("regs", writeRegs, os); - os << "\n},\n"; + json_array("regs", WriteRegs, OS); + OS << "\n},\n"; } -void write_segment(const LiveInterval::Segment &S, raw_ostream &os) { - os << "{"; - auto writeBegin = [&S, &os]() { S.start.print(os); }; +void write_segment(const LiveInterval::Segment &S, raw_ostream &OS) { + OS << "{"; + auto WriteBegin = [&S, &OS]() { S.start.print(OS); }; - json_pair("begin", writeBegin, os); + json_pair("begin", WriteBegin, OS); - os << ","; + OS << ","; - auto writeEnd = [&S, &os]() { S.end.print(os); }; + auto WriteEnd = [&S, &OS]() { S.end.print(OS); }; - json_pair("end", writeEnd, os); + json_pair("end", WriteEnd, OS); - os << ","; + OS << ","; - auto writeValNum = [&S, &os]() { + auto WriteValNum = [&S, &OS]() { if (S.valno) - os << S.valno->id; + OS << S.valno->id; else - os << 0xFFFFFFFF; + OS << 0xFFFFFFFF; }; - json_pair("val_num", writeValNum, os); + json_pair("val_num", WriteValNum, OS); - os << "},\n"; + OS << "},\n"; } -void write_subrange(const LiveInterval::SubRange &SR, raw_ostream &os) { - os << "{\n"; - auto writeMask = [&SR, &os]() { os << SR.LaneMask.getAsInteger(); }; +void write_subrange(const LiveInterval::SubRange &SR, raw_ostream &OS) { + OS << "{\n"; + auto WriteMask = [&SR, &OS]() { OS << SR.LaneMask.getAsInteger(); }; - json_pair("mask", writeMask, os); + json_pair("mask", WriteMask, OS); - os << ","; + OS << ","; // Segments. 
- auto writeSegments = [&SR, &os]() { + auto WriteSegments = [&SR, &OS]() { for (auto &S : SR.segments) { - write_segment(S, os); + write_segment(S, OS); } }; - json_array("segments", writeSegments, os); + json_array("segments", WriteSegments, OS); - os << "\n},\n"; + OS << "\n},\n"; } void write_live_interval(LiveInterval &LI, const MachineRegisterInfo &MRI, - const SIRegisterInfo *SIRI, raw_ostream &os) { - os << "{\n"; + const SIRegisterInfo *SIRI, raw_ostream &OS) { + OS << "{\n"; - auto writeReg = [&LI, &MRI, &SIRI, &os]() { - write_reg(LI.reg(), 0, MRI, SIRI, os); + auto WriteReg = [&LI, &MRI, &SIRI, &OS]() { + write_reg(LI.reg(), 0, MRI, SIRI, OS); }; - json_obj_pair("reg", writeReg, os); + json_obj_pair("reg", WriteReg, OS); - os << ","; + OS << ","; - auto writeSegments = [&LI, &os]() { + auto WriteSegments = [&LI, &OS]() { for (auto &S : LI.segments) { - write_segment(S, os); + write_segment(S, OS); } }; - json_array("segments", writeSegments, os); + json_array("segments", WriteSegments, OS); - os << ","; + OS << ","; - auto writeSubRanges = [&LI, &os]() { + auto WriteSubRanges = [&LI, &OS]() { for (auto &SR : LI.subranges()) { - write_subrange(SR, os); + write_subrange(SR, OS); } }; - json_array("subranges", writeSubRanges, os); + json_array("subranges", WriteSubRanges, OS); - os << "},\n"; + OS << "},\n"; } std::string get_legal_str(const MDString *MDStr) { - std::string str; - raw_string_ostream Stream(str); + std::string Str; + raw_string_ostream Stream(Str); MDStr->print(Stream); Stream.flush(); // Remove !. - str = str.substr(1); + Str = Str.substr(1); // Remove "" - str = str.substr(1); - str.pop_back(); - std::replace(str.begin(), str.end(), '\\', '#'); - return str; + Str = Str.substr(1); + Str.pop_back(); + std::replace(Str.begin(), Str.end(), '\\', '#'); + return Str; } -void write_file(const MDNode *FileNode, raw_ostream &os) { +void write_file(const MDNode *FileNode, raw_ostream &OS) { const MDString *FileName = cast(FileNode->getOperand(0).get()); - StringRef fileNameStr = FileName->getString(); - if (fileNameStr.find("__AMDGPU_GPUMAP_") == 0) + StringRef FileNameStr = FileName->getString(); + if (FileNameStr.find("__AMDGPU_GPUMAP_") == 0) return; - if (fileNameStr.find("__AMDGPU_DWARF_") == 0) + if (FileNameStr.find("__AMDGPU_DWARF_") == 0) return; - os << "{"; + OS << "{"; - std::string str0 = get_legal_str(FileName); - auto writeName = [&str0, &os]() { os << str0; }; - json_pair("filename", writeName, os); + std::string Str0 = get_legal_str(FileName); + auto WriteName = [&Str0, &OS]() { OS << Str0; }; + json_pair("filename", WriteName, OS); - os << ",\n"; + OS << ",\n"; const MDString *Content = cast(FileNode->getOperand(1).get()); std::string str = get_legal_str(Content); - auto writeContent = [&str, &os]() { os << str; }; - json_pair("content", writeContent, os); - os << "\n},\n"; + auto WriteContent = [&str, &OS]() { OS << str; }; + json_pair("content", WriteContent, OS); + OS << "\n},\n"; } -void write_DIFile(const DIFile *File, raw_ostream &os) { +void write_DIFile(const DIFile *File, raw_ostream &OS) { if (File) { - std::string name = get_legal_str(File->getRawFilename()); - std::string dir = ""; + std::string Name = get_legal_str(File->getRawFilename()); + std::string Dir = ""; if (MDString *MDDir = File->getRawDirectory()) - dir = get_legal_str(MDDir); - os << dir << name; + Dir = get_legal_str(MDDir); + OS << Dir << Name; } else { - os << "ArtificialFile"; + OS << "ArtificialFile"; } } -void write_line_mapping(SlotIndex Slot, DebugLoc DL, raw_ostream 
&os) { - os << "{"; +void write_line_mapping(SlotIndex Slot, DebugLoc DL, raw_ostream &OS) { + OS << "{"; - auto writeSlot = [&Slot, &os]() { Slot.print(os); }; + auto WriteSlot = [&Slot, &OS]() { Slot.print(OS); }; - json_pair("slot_index", writeSlot, os); + json_pair("slot_index", WriteSlot, OS); - os << ",\n"; + OS << ",\n"; MDNode *Scope = DL.getScope(); - unsigned line = DL.getLine(); - unsigned col = DL.getCol(); + unsigned Line = DL.getLine(); + unsigned Col = DL.getCol(); - auto writeLine = [&line, &os]() { os << line; }; - json_pair("line", writeLine, os); + auto WriteLine = [&Line, &OS]() { OS << Line; }; + json_pair("line", WriteLine, OS); - os << ",\n"; + OS << ",\n"; - auto writeCol = [&col, &os]() { os << col; }; - json_pair("col", writeCol, os); + auto WriteCol = [&Col, &OS]() { OS << Col; }; + json_pair("col", WriteCol, OS); - os << ",\n"; + OS << ",\n"; - auto writeFile = [&Scope, &os]() { + auto WriteFile = [&Scope, &OS]() { const DIFile *File = cast(Scope)->getFile(); - write_DIFile(File, os); + write_DIFile(File, OS); }; - json_pair("file", writeFile, os); + json_pair("file", WriteFile, OS); - if (DILocation *inlineDL = DL.getInlinedAt()) { - os << ",\n"; - unsigned inlineLine = inlineDL->getLine(); - auto writeLine = [&inlineLine, &os]() { os << inlineLine; }; - json_pair("inline_line", writeLine, os); + if (DILocation *InlineDL = DL.getInlinedAt()) { + OS << ",\n"; + unsigned InlineLine = InlineDL->getLine(); + auto WriteLine = [&InlineLine, &OS]() { OS << InlineLine; }; + json_pair("inline_line", WriteLine, OS); - os << ",\n"; + OS << ",\n"; - unsigned inlineCol = inlineDL->getColumn(); - auto writeCol = [&inlineCol, &os]() { os << inlineCol; }; - json_pair("inline_col", writeCol, os); + unsigned InlineCol = InlineDL->getColumn(); + auto WriteCol = [&InlineCol, &OS]() { OS << InlineCol; }; + json_pair("inline_col", WriteCol, OS); - os << ",\n"; + OS << ",\n"; const MDNode *InlineScope = DL.getInlinedAtScope(); - auto writeFile = [&InlineScope, &os]() { + auto WriteFile = [&InlineScope, &OS]() { const DIFile *File = cast(InlineScope)->getFile(); - write_DIFile(File, os); + write_DIFile(File, OS); }; - json_pair("inline_file", writeFile, os); + json_pair("inline_file", WriteFile, OS); } - os << "\n},\n"; + OS << "\n},\n"; } void write_dbg_val(unsigned Reg, const DIVariable *V, const DIExpression *Exp, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, - raw_ostream &os) { - os << "{"; + raw_ostream &OS) { + OS << "{"; - auto writeReg = [&MRI, &SIRI, &Reg, &os]() { + auto WriteReg = [&MRI, &SIRI, &Reg, &OS]() { const unsigned SubReg = 0; - write_reg(Reg, SubReg, MRI, SIRI, os); + write_reg(Reg, SubReg, MRI, SIRI, OS); }; - json_obj_pair("reg", writeReg, os); + json_obj_pair("reg", WriteReg, OS); - os << ",\n"; + OS << ",\n"; if (V) { - auto writeName = [&V, &os]() { os << V->getName(); }; - json_pair("debug_val_name", writeName, os); - os << ",\n"; + auto WriteName = [&V, &OS]() { OS << V->getName(); }; + json_pair("debug_val_name", WriteName, OS); + OS << ",\n"; - auto writeFile = [&V, &os]() { + auto WriteFile = [&V, &OS]() { const DIFile *File = V->getFile(); - write_DIFile(File, os); + write_DIFile(File, OS); }; - json_pair("debug_val_file", writeFile, os); - os << ",\n"; + json_pair("debug_val_file", WriteFile, OS); + OS << ",\n"; - auto writeLine = [&V, &os]() { os << V->getLine(); }; - json_pair("debug_val_line", writeLine, os); + auto WriteLine = [&V, &OS]() { OS << V->getLine(); }; + json_pair("debug_val_line", WriteLine, OS); } if (Exp->isValid() && 
Exp->getNumElements()) { - os << ",\n"; - auto writeV = [&Exp, &os]() { - os << '['; + OS << ",\n"; + auto WriteV = [&Exp, &OS]() { + OS << '['; bool NeedSep = false; for (auto Op : Exp->expr_ops()) { if (NeedSep) - os << ", "; + OS << ", "; else NeedSep = true; - os << dwarf::OperationEncodingString(Op.getOp()); + OS << dwarf::OperationEncodingString(Op.getOp()); for (unsigned I = 0; I < Op.getNumArgs(); ++I) - os << ' ' << Op.getArg(I); + OS << ' ' << Op.getArg(I); } - os << "] "; + OS << "] "; }; - json_pair("debug_exp", writeV, os); + json_pair("debug_exp", WriteV, OS); } - os << "\n},\n"; + OS << "\n},\n"; } void write_dbg_info(MachineFunction &MF, LiveIntervals *LIS, const MachineRegisterInfo &MRI, const SIInstrInfo *SIII, const SIRegisterInfo *SIRI, const SlotIndexes *SlotIndexes, - const NamedMDNode *SourceMD, raw_ostream &os) { - os << ",\n"; + const NamedMDNode *SourceMD, raw_ostream &OS) { + OS << ",\n"; - auto writeFiles = [&SourceMD, &os]() { + auto WriteFiles = [&SourceMD, &OS]() { for (const MDNode *FileNode : SourceMD->operands()) { - write_file(FileNode, os); + write_file(FileNode, OS); } }; - json_array("files", writeFiles, os); + json_array("files", WriteFiles, OS); - os << ",\n"; + OS << ",\n"; - auto writeLineMapping = [&MF, &SlotIndexes, &os]() { + auto WriteLineMapping = [&MF, &SlotIndexes, &OS]() { for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { if (MI.isDebugInstr()) { @@ -1368,16 +1368,16 @@ void write_dbg_info(MachineFunction &MF, LiveIntervals *LIS, if (!DL) continue; SlotIndex Slot = SlotIndexes->getInstructionIndex(MI); - write_line_mapping(Slot, DL, os); + write_line_mapping(Slot, DL, OS); } } }; - json_array("line_mapping", writeLineMapping, os); + json_array("line_mapping", WriteLineMapping, OS); - os << ",\n"; + OS << ",\n"; - auto writeDebugVals = [&MF, &MRI, &SIRI, &os]() { + auto WriteDebugVals = [&MF, &MRI, &SIRI, &OS]() { for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { if (!MI.isDebugValue()) @@ -1392,91 +1392,89 @@ void write_dbg_info(MachineFunction &MF, LiveIntervals *LIS, const DIVariable *V = MI.getDebugVariable(); const DIExpression *Exp = MI.getDebugExpression(); - write_dbg_val(Reg.getReg(), V, Exp, MRI, SIRI, os); + write_dbg_val(Reg.getReg(), V, Exp, MRI, SIRI, OS); } } }; - json_array("debug_vals", writeDebugVals, os); + json_array("debug_vals", WriteDebugVals, OS); } void write_function(MachineFunction &MF, LiveIntervals *LIS, const MachineRegisterInfo &MRI, const SIInstrInfo *SIII, - const SIRegisterInfo *SIRI, raw_ostream &os) { + const SIRegisterInfo *SIRI, raw_ostream &OS) { const SlotIndexes *SlotIndexes = LIS->getSlotIndexes(); - os << "{\n"; - auto writeName = [&MF, &os]() { os << MF.getName(); }; - json_pair("name", writeName, os); + OS << "{\n"; + auto WriteName = [&MF, &OS]() { OS << MF.getName(); }; + json_pair("name", WriteName, OS); - os << ",\n"; + OS << ",\n"; - auto writeBlocks = [&MF, &SlotIndexes, &LIS, &MRI, &SIRI, &SIII, &os]() { + auto WriteBlocks = [&MF, &LIS, &MRI, &SIRI, &SIII, &OS]() { for (MachineBasicBlock &MBB : MF) { - write_block(MBB, LIS, MRI, SIRI, SIII, os); - os << ",\n"; + write_block(MBB, LIS, MRI, SIRI, SIII, OS); + OS << ",\n"; } }; - json_array("blocks", writeBlocks, os); + json_array("blocks", WriteBlocks, OS); - os << ",\n"; + OS << ",\n"; - auto writeDefines = [&MF, &SlotIndexes, &MRI, &SIRI, &os]() { - write_defines(MF, SlotIndexes, MRI, SIRI, os); + auto WriteDefines = [&MF, &SlotIndexes, &MRI, &SIRI, &OS]() { + write_defines(MF, SlotIndexes, MRI, SIRI, OS); 
}; - json_array("defines", writeDefines, os); + json_array("defines", WriteDefines, OS); - os << ",\n"; + OS << ",\n"; - auto writeUses = [&MF, &SlotIndexes, &MRI, &SIRI, &os]() { - write_uses(MF, SlotIndexes, MRI, SIRI, os); + auto WriteUses = [&MF, &SlotIndexes, &MRI, &SIRI, &OS]() { + write_uses(MF, SlotIndexes, MRI, SIRI, OS); }; - json_array("uses", writeUses, os); + json_array("uses", WriteUses, OS); - os << ",\n"; + OS << ",\n"; - auto writeLiveness = [&MF, &LIS, &MRI, &SIRI, &os]() { + auto WriteLiveness = [&MF, &LIS, &MRI, &SIRI, &OS]() { for (MachineBasicBlock &MBB : MF) for (MachineInstr &MI : MBB) { if (MI.isDebugInstr()) continue; const SlotIndex &SI = LIS->getInstructionIndex(MI).getBaseIndex(); GCNRPTracker::LiveRegSet LISLR = llvm::getLiveRegs(SI, *LIS, MRI); - write_liveness(SI, LISLR, MRI, SIRI, os); + write_liveness(SI, LISLR, MRI, SIRI, OS); } }; - json_array("liveness", writeLiveness, os); + json_array("liveness", WriteLiveness, OS); - os << ",\n"; + OS << ",\n"; - auto writeLiveIntervals = [&MRI, &SIRI, &LIS, &os]() { + auto WriteLiveIntervals = [&MRI, &SIRI, &LIS, &OS]() { for (unsigned i = 0; i < MRI.getNumVirtRegs(); i++) { auto Reg = Register::index2VirtReg(i); if (!LIS->hasInterval(Reg)) continue; auto &LI = LIS->getInterval(Reg); - write_live_interval(LI, MRI, SIRI, os); + write_live_interval(LI, MRI, SIRI, OS); } }; - json_array("live_intervals", writeLiveIntervals, os); + json_array("live_intervals", WriteLiveIntervals, OS); -#if 0 // TODO: Do we need this? // Check debug info. const Function &F = MF.getFunction(); const Module *M = F.getParent(); const NamedMDNode *SourceMD = - M->getNamedMetadata(hlsl::DxilMDHelper::kDxilSourceContentsMDName); + M->getNamedMetadata("dx.source.contents"); if (SourceMD) { - write_dbg_info(MF, LIS, MRI, SIII, SIRI, SlotIndexes, SourceMD, os); + write_dbg_info(MF, LIS, MRI, SIII, SIRI, SlotIndexes, SourceMD, OS); } -#endif - os << "\n}"; + OS << "\n}"; } void write_pressure(MachineFunction &MF, LiveIntervals *LIS, @@ -1500,13 +1498,13 @@ void write_pressure(MachineFunction &MF, LiveIntervals *LIS, O.close(); } -void write_pressure(MachineFunction &MF, LiveIntervals *LIS, raw_ostream &os) { +void write_pressure(MachineFunction &MF, LiveIntervals *LIS, raw_ostream &OS) { const GCNSubtarget *ST = &MF.getSubtarget(); const auto *SIII = ST->getInstrInfo(); const auto *SIRI = ST->getRegisterInfo(); auto &MRI = MF.getRegInfo(); - write_function(MF, LIS, MRI, SIII, SIRI, os); - os.flush(); + write_function(MF, LIS, MRI, SIII, SIRI, OS); + OS.flush(); } } // namespace pressure @@ -1524,16 +1522,15 @@ class ContributionList { DenseMap> MIContributorMap; // Set of inst which been contributed by the key MachineInstr. 
DenseMap> MIContributedToMap; - void writeInst(MachineInstr &MI, const SIInstrInfo *SIII, raw_ostream &os); + void writeInst(MachineInstr &MI, const SIInstrInfo *SIII, raw_ostream &OS); void writeBlock(MachineBasicBlock &MBB, const SIInstrInfo *SIII, - raw_ostream &os); - void write(raw_ostream &os); + raw_ostream &OS); + void write(raw_ostream &OS); }; void buildMIContribution(MachineInstr &MI, DenseSet &ContributorSet, - DenseSet &ContributedSet, - const SIRegisterInfo &SIRI, MachineRegisterInfo &MRI) { + DenseSet &ContributedSet, MachineRegisterInfo &MRI) { for (MachineOperand &UseMO : MI.uses()) { if (!UseMO.isReg()) continue; @@ -1565,134 +1562,132 @@ void buildMIContribution(MachineInstr &MI, } bool ContributionList::propagateContribution() { - bool bUpdated = false; + bool IsUpdated = false; ReversePostOrderTraversal RPOT(&MF); for (auto *MBB : RPOT) { for (auto &MI : *MBB) { - auto &contributors = MIContributorMap[&MI]; - unsigned size = contributors.size(); - DenseSet parentContributors; - for (auto *CMI : contributors) { - auto &pContributors = MIContributorMap[CMI]; - parentContributors.insert(pContributors.begin(), pContributors.end()); + auto &Contributors = MIContributorMap[&MI]; + unsigned Size = Contributors.size(); + DenseSet ParentContributors; + for (auto *CMI : Contributors) { + auto &Contributors = MIContributorMap[CMI]; + ParentContributors.insert(Contributors.begin(), Contributors.end()); } - contributors.insert(parentContributors.begin(), parentContributors.end()); - bUpdated |= size < contributors.size(); + Contributors.insert(ParentContributors.begin(), ParentContributors.end()); + IsUpdated |= Size < Contributors.size(); } } - return bUpdated; + return IsUpdated; } void ContributionList::build() { // Build contribution. auto &MRI = MF.getRegInfo(); - const GCNSubtarget *ST = &MF.getSubtarget(); - const auto *SIRI = ST->getRegisterInfo(); for (auto &MBB : MF) { for (auto &MI : MBB) { - auto &contributors = MIContributorMap[&MI]; - auto &contributed = MIContributedToMap[&MI]; - buildMIContribution(MI, contributors, contributed, *SIRI, MRI); + auto &Contributors = MIContributorMap[&MI]; + auto &Contributed = MIContributedToMap[&MI]; + buildMIContribution(MI, Contributors, Contributed, MRI); } } // propagate contribution. 
- bool bUpdated = true; - while (bUpdated) { - bUpdated = propagateContribution(); + bool IsUpdated = true; + while (IsUpdated) { + IsUpdated = propagateContribution(); } } void ContributionList::writeInst(MachineInstr &MI, const SIInstrInfo *SIII, - raw_ostream &os) { - os << "\n{\n"; + raw_ostream &OS) { + OS << "\n{\n"; unsigned ID = MIIndexMap[&MI]; - auto writeSlot = [&ID, &os]() { os << ID; }; + auto WriteSlot = [&ID, &OS]() { OS << ID; }; - json_pair("ID", writeSlot, os); + json_pair("ID", WriteSlot, OS); - os << ","; + OS << ","; - auto writeAsm = [&MI, &SIII, &os]() { - MI.print(os, /*IsStandalone*/ true, /*SkipOpers*/ false, + auto WriteAsm = [&MI, &SIII, &OS]() { + MI.print(OS, /*IsStandalone*/ true, /*SkipOpers*/ false, /*SkipDebugLoc*/ true, /*AddNewLine*/ false, SIII); }; - json_pair("asm", writeAsm, os); + json_pair("asm", WriteAsm, OS); - os << ",\n"; + OS << ",\n"; - auto &contributors = MIContributorMap[&MI]; - auto writeContributor = [&contributors, this, &os]() { - for (auto *MI : contributors) { + auto &Contributors = MIContributorMap[&MI]; + auto WriteContributor = [&Contributors, this, &OS]() { + for (auto *MI : Contributors) { unsigned ID = MIIndexMap[MI]; - os << ID << ","; + OS << ID << ","; } }; - json_array("contributors", writeContributor, os); - os << ",\n"; + json_array("contributors", WriteContributor, OS); + OS << ",\n"; - auto &contributeds = MIContributedToMap[&MI]; - auto writeContributed = [&contributeds, this, &os]() { - for (auto *MI : contributeds) { + auto &Contributeds = MIContributedToMap[&MI]; + auto WriteContributed = [&Contributeds, this, &OS]() { + for (auto *MI : Contributeds) { unsigned ID = MIIndexMap[MI]; - os << ID << ","; + OS << ID << ","; } }; - json_array("contributed", writeContributed, os); - os << "\n}\n"; + json_array("contributed", WriteContributed, OS); + OS << "\n}\n"; } void ContributionList::writeBlock(MachineBasicBlock &MBB, - const SIInstrInfo *SIII, raw_ostream &os) { - os << "{\n"; - auto writeName = [&MBB, &os]() { os << MBB.getName(); }; - json_pair("name", writeName, os); + const SIInstrInfo *SIII, raw_ostream &OS) { + OS << "{\n"; + auto WriteName = [&MBB, &OS]() { OS << MBB.getName(); }; + json_pair("name", WriteName, OS); - os << ","; + OS << ","; - auto writeIndex = [&MBB, &os]() { os << MBB.getNumber(); }; - json_pair("id", writeIndex, os); + auto WriteIndex = [&MBB, &OS]() { OS << MBB.getNumber(); }; + json_pair("id", WriteIndex, OS); - os << ",\n"; + OS << ",\n"; - auto writeInsts = [this, &MBB, &SIII, &os]() { + auto WriteInsts = [this, &MBB, &SIII, &OS]() { for (MachineInstr &MI : MBB) { if (MI.isDebugInstr()) continue; - writeInst(MI, SIII, os); - os << ",\n"; + writeInst(MI, SIII, OS); + OS << ",\n"; } }; - json_array("instructions", writeInsts, os); + json_array("instructions", WriteInsts, OS); - os << ",\n"; + OS << ",\n"; - auto writePreds = [&MBB, &os]() { + auto WritePreds = [&MBB, &OS]() { for (MachineBasicBlock *Pred : MBB.predecessors()) { - os << Pred->getNumber() << ","; + OS << Pred->getNumber() << ","; } }; - json_array("preds", writePreds, os); + json_array("preds", WritePreds, OS); - os << ","; + OS << ","; - auto writeSuccs = [&MBB, &os]() { + auto WriteSuccs = [&MBB, &OS]() { for (MachineBasicBlock *Succ : MBB.successors()) { - os << Succ->getNumber() << ","; + OS << Succ->getNumber() << ","; } }; - json_array("succs", writeSuccs, os); + json_array("succs", WriteSuccs, OS); - os << "}"; + OS << "}"; } -void ContributionList::write(raw_ostream &os) { +void ContributionList::write(raw_ostream 
&OS) { unsigned ID = 0; - // Build ID for write. + // Build ID for Write. ReversePostOrderTraversal RPOT(&MF); for (auto *MBB : RPOT) { for (auto &MI : *MBB) { @@ -1703,22 +1698,22 @@ void ContributionList::write(raw_ostream &os) { const GCNSubtarget *ST = &MF.getSubtarget(); const auto *SIII = ST->getInstrInfo(); - os << "{\n"; - auto writeName = [this, &os]() { os << MF.getName(); }; - json_pair("name", writeName, os); + OS << "{\n"; + auto WriteName = [this, &OS]() { OS << MF.getName(); }; + json_pair("name", WriteName, OS); - os << ",\n"; + OS << ",\n"; - auto writeBlocks = [this, &SIII, &RPOT, &os]() { + auto WriteBlocks = [this, &SIII, &RPOT, &OS]() { for (auto *MBB : RPOT) { - writeBlock(*MBB, SIII, os); - os << ",\n"; + writeBlock(*MBB, SIII, OS); + OS << ",\n"; } }; - json_array("blocks", writeBlocks, os); + json_array("blocks", WriteBlocks, OS); - os << "\n}"; + OS << "\n}"; } } // namespace @@ -1788,8 +1783,8 @@ void llvm::updatePhysRegLiveInForBlock(MachineBasicBlock *NewBB, void llvm::buildPhysRegLiveInForBlock(MachineBasicBlock *NewBB, SmallDenseSet &LiveOutSet, const MachineRegisterInfo *MRI) { - for (auto rit = NewBB->rbegin(); rit != NewBB->rend(); rit++) { - auto &MI = *rit; + for (auto RIt = NewBB->rbegin(); RIt != NewBB->rend(); RIt++) { + auto &MI = *RIt; // Add all physical register defs (exlicit+implicit) to the def register // set. for (MachineOperand &Def : MI.operands()) { @@ -1805,7 +1800,7 @@ void llvm::buildPhysRegLiveInForBlock(MachineBasicBlock *NewBB, continue; // Reserved regs do not need to be tracked through live-in sets. - unsigned Reg = Use.getReg(); + Register Reg = Use.getReg(); if (Use.isImplicit() && MRI && MRI->isReserved(Reg)) continue; @@ -1843,7 +1838,7 @@ MachineReg llvm::createVirtualDstReg(MachineOpcode Opcode, // Return true if the MI is a copy of exec. // If true then sets pDst to the destination register. bool llvm::isExecCopy(const MachineInstr &MI, MachineReg Exec, - MachineReg *pDst) { + MachineReg *OutDst) { enum { DST = 0, SRC = 1 }; bool FoundCopy = false; if (MI.getOpcode() == AMDGPU::COPY || MI.getOpcode() == AMDGPU::S_MOV_B32 || @@ -1853,60 +1848,13 @@ bool llvm::isExecCopy(const MachineInstr &MI, MachineReg Exec, FoundCopy = true; } } -#if 0 // TODO: Delete this. - else if (MI.getOpcode() == AMDGPU::AMDGPU_GET_ENTRY_ACTIVE_MASK_PSEUDO || - MI.getOpcode() == AMDGPU::AMDGPU_GET_ENTRY_ACTIVE_MASK_PSEUDO_32) - { - FoundCopy = true; - } -#endif - if (FoundCopy) { - *pDst = MI.getOperand(DST).getReg(); + *OutDst = MI.getOperand(DST).getReg(); } return FoundCopy; } -llvm::MachineRegWithSubReg llvm::getWqmEntryActiveMask(MachineFunction &MF) { - llvm::MachineRegWithSubReg LiveLaneMask = {AMDGPU::NoRegister, - AMDGPU::NoSubRegister}; - if (MachineInstr *MI = getWqmEntryActiveMaskInst(MF)) { - LiveLaneMask.Reg = MI->getOperand(0).getReg(); - LiveLaneMask.SubReg = MI->getOperand(0).getSubReg(); - } - - return LiveLaneMask; -} - -MachineInstr *llvm::getWqmEntryActiveMaskInst(MachineFunction &MF) { -#if 0 // TODO: Get rid of this - // Look forward in the entry block for the SET_LIVE_LANE_MASK instruction. - // This instruction is added by the SIWholeQuadMode pass. - MachineBasicBlock &MBB = MF.front(); - for (MachineInstr &MI : MBB) - { - if (MI.getOpcode() == AMDGPU::AMDGPU_SET_LIVE_LANE_MASK || - MI.getOpcode() == AMDGPU::AMDGPU_SET_LIVE_LANE_MASK_32) - { - return &MI; - } - } -#endif - - return nullptr; -} - -bool llvm::isFetchShaderCall(const MachineInstr *MI) { -#if 0 // TODO: Get rid of this. 
- return - MI->getOpcode() == AMDGPU::AMDGPU_CALL_FETCH_SHADER || - MI->getAMDGPUFlag(MachineInstr::AMDGPUMIFlag::FetchShaderCall); -#else - return false; -#endif -} - bool llvm::isSccLiveAt(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator MI) { const TargetRegisterInfo *TRI = @@ -1967,7 +1915,7 @@ MachineBasicBlock::iterator llvm::findOrCreateInsertionPointForSccDef( // If the instruction modifies exec then we cannot use it as // an insertion point (if that is a constraint from the caller). // The check for EXEC works for both wave64 and wave32 because - // it will also catch writes to the subregisters (e.g. exec_lo). + // it will also catch Writes to the subregisters (e.g. exec_lo). if (CheckForExecWrite && It->modifiesRegister(AMDGPU::EXEC, TRI)) { break; } @@ -1979,8 +1927,8 @@ MachineBasicBlock::iterator llvm::findOrCreateInsertionPointForSccDef( } // If no safe location can be found in the block we can save and restore - // SCC around MI. There is no way to directly read or write SCC so we use - // s_cselect to read the current value of SCC and s_cmp to write the saved + // SCC around MI. There is no way to directly read or Write SCC so we use + // s_cselect to read the current value of SCC and s_cmp to Write the saved // value back to SCC. // // The generated code will look like this; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h index 04b4b74fbd726..e4b8a28dda6e6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h @@ -90,19 +90,7 @@ MachineReg createVirtualRegForOperand(MachineOpcode Opcode, unsigned Operand, MachineReg createVirtualDstReg(MachineOpcode Opcode, llvm::MachineFunction &MF); bool isExecCopy(const llvm::MachineInstr &MI, MachineReg Exec, - MachineReg *pDst); -struct MachineRegWithSubReg { - MachineReg Reg = /*NoRegister*/ 0; - unsigned SubReg = /*NoSubRegister*/ 0; -}; -MachineRegWithSubReg getWqmEntryActiveMask(llvm::MachineFunction &MF); -llvm::MachineInstr *getWqmEntryActiveMaskInst(llvm::MachineFunction &MF); - -// Return true if this machine instruction represents a call to the fetch -// shader. We curently have two mechanisims for calling fetch shader: -// 1. The AMDGPU_CALL_FETCH_SHADER pseudo-instruction -// 2. A CALL instruction with the `FetchShaderCall` flag set to true. 
-bool isFetchShaderCall(const llvm::MachineInstr *MI); + MachineReg *OutDst); bool isSccLiveAt(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator MI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp index 94d78fb676f9a..95066743b59bd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp @@ -716,7 +716,7 @@ struct DOTGraphTraits : public DefaultDOTGraphTraits { static bool renderGraphFromBottomUp() { return true; } - static bool isNodeHidden(const SUnit *Node) { + static bool isNodeHidden(const SUnit *Node, const llvm::ExpDag *) { if (ViewNodes.empty()) return false; @@ -921,28 +921,6 @@ unsigned getSGPRSize(const TargetRegisterClass *RC, return RC->getLaneMask().getNumLanes(); } -void collectSameHeightBackNodes(SUnit *SU, SmallDenseSet &BackNodes, - unsigned NodeNum, - SmallDenseSet &VisitedNodes) { - if (VisitedNodes.count(SU)) - return; - VisitedNodes.insert(SU); - - for (SDep &Dep : SU->Succs) { - if (Dep.isWeak()) - continue; - if (Dep.getLatency() > 0) - continue; - - SUnit *Succ = Dep.getSUnit(); /* - if (Succ->NodeNum >= NodeNum) - continue;*/ - - BackNodes.insert(Succ); - collectSameHeightBackNodes(Succ, BackNodes, NodeNum, VisitedNodes); - } -} - } // namespace namespace llvm { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h deleted file mode 100644 index c49590a7d8f7f..0000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h +++ /dev/null @@ -1,99 +0,0 @@ -//===-- AMDGPUVMemDegreeDAG.h - Build degree about VMem on DAG ------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Build degree about VMem to help balance latency and pressure inside a -/// block. -// -//===----------------------------------------------------------------------===// -#pragma once - -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/ScheduleDAG.h" // For SUnit. -#include - -namespace llvm { -class MachineBasicBlock; -class SUnit; -class SIInstrInfo; -class MachineInstr; - -class SimpleDAG { -public: - SimpleDAG(llvm::MachineBasicBlock &MBB, const llvm::SIInstrInfo *TII) - : SIII(TII), MBB(MBB) {} - std::vector SUnits; - // InstrInfo. - const llvm::SIInstrInfo *SIII; - llvm::DenseMap MISUnitMap; - llvm::DenseMap SUnitMIMap; - llvm::MachineBasicBlock &MBB; - void build(); - -private: - void initNodes(); - void addDependence(); - void addCtrlDep(); -}; - -// Collect height/depth for high latency mem ld, which only update height/depth -// when cross high latency mem ld. Call the height/depth as VMem degree here. -// The rule is sample and its user should has different degree. -// For example -// a = sample // a has depth 0, height 3 -// b = sample a // b has depth 1, height 2 -// c = sample c // c has depth 2, height 1 -// user of c // user of c has depth 2, height 0 -// -// For the purpose of in block reorder/remat, nothing will move/clone cross the -// block. So do this after cross blk remat? In the middle of cross block remat -// to help reach target when only move things cross blk cannot reach the target. -// Reorder at the beginning? No pressure at that time? After get pressure, might -// need to update max pressure. 
- -class VMemDegreeDAG { -public: - VMemDegreeDAG(std::vector &Units, const llvm::SIInstrInfo *TII) - : SUnits(Units), SIII(TII) {} - std::vector &SUnits; - // InstrInfo. - const llvm::SIInstrInfo *SIII; - void build(); - - bool isHighLatency(const llvm::SUnit *SU) const; - bool isHighLatency(const llvm::MachineInstr *MI) const; - // height/depth based on Long latency inst. - std::vector VMemDataHeight; - std::vector VMemDataDepth; - // Full height/depth count non-data dependent too. - std::vector VMemFullHeight; - std::vector VMemFullDepth; - llvm::SmallVector VMemSUs; - llvm::SmallVector, 16> GroupedVMemSUs; - llvm::SmallVector, 16> - GroupedVMemSUsByDepth; - - void dump(); - -private: - static constexpr unsigned kNoReg = -1; - - std::pair - buildVMemDepthHeight(std::vector &VMemHeight, - std::vector &VMemDepth, bool bDataOnly); - // Compute vmem height/depth. - void buildVMemDepthHeight(); - void buildVMemDataDepthHeight(); - void groupVmemSUnits(); -}; - -// Split block based on vmem depth. -void buildVMemDepth(llvm::MachineBasicBlock &MBB, llvm::VMemDegreeDAG &dag); - -} // namespace llvm From 84d8dd8df6bb481d1974201cf3c25bb4a5db8d37 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Mon, 17 Mar 2025 17:09:04 -0700 Subject: [PATCH 17/25] Additional cleanup + format --- .../include/llvm/CodeGen/TargetRegisterInfo.h | 2 +- llvm/lib/CodeGen/TargetRegisterInfo.cpp | 34 +++++----- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 14 ++--- llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp | 63 ++++++++++++------- llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h | 32 ++++++---- llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp | 19 +++++- 6 files changed, 100 insertions(+), 64 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index 974cd8a5f36b4..7b61d21e6e20e 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -436,7 +436,7 @@ class TargetRegisterInfo : public MCRegisterInfo { /// \returns an empty set if there is no set of covering sub registers. std::vector getMinimalSpanningSubRegIdxSetForLaneMask(const TargetRegisterClass *RC, - LaneBitmask mask) const; + LaneBitmask Mask) const; /// The lane masks returned by getSubRegIndexLaneMask() above can only be /// used to determine if sub-registers overlap - they can't be used to diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp index d458648fd8bd8..e8f0c526fcd33 100644 --- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -730,14 +730,14 @@ void TargetRegisterInfo::dumpReg(Register Reg, unsigned SubRegIndex, std::vector TargetRegisterInfo::getMinimalSpanningSubRegIdxSetForLaneMask( - const TargetRegisterClass *RC, LaneBitmask mask) const { + const TargetRegisterClass *RC, LaneBitmask Mask) const { // TODO: this could replace the code it was copied from in SplitKit.cpp // First pass: Try to find a perfectly matching subregister index. // If none exists find the one covering the most lanemask bits. SmallVector PossibleIndexes; unsigned BestIdx = 0; - const LaneBitmask avoid = ~mask; + const LaneBitmask Avoid = ~Mask; { unsigned BestCover = 0; for (unsigned Idx = 1, E = getNumSubRegIndices(); Idx < E; ++Idx) { @@ -746,13 +746,13 @@ TargetRegisterInfo::getMinimalSpanningSubRegIdxSetForLaneMask( continue; LaneBitmask SubRegMask = getSubRegIndexLaneMask(Idx); // Early exit if we found a perfect match. 
- if (SubRegMask == mask) { + if (SubRegMask == Mask) { BestIdx = Idx; break; } // The index must not cover any lanes outside - if ((SubRegMask & avoid).any()) + if ((SubRegMask & Avoid).any()) continue; unsigned PopCount = SubRegMask.getNumLanes(); @@ -767,36 +767,36 @@ TargetRegisterInfo::getMinimalSpanningSubRegIdxSetForLaneMask( // Abort if we cannot possibly implement the COPY with the given indexes. if (BestIdx == 0) { LLVM_DEBUG(dbgs() << "Unable to find minimal spanning sub register(s) for " - << getRegClassName(RC) << " mask " << PrintLaneMask(mask) + << getRegClassName(RC) << " mask " << PrintLaneMask(Mask) << '\n'); assert(false && "Impossible to span reg class"); return std::vector(); } - std::vector result; - result.push_back(BestIdx); + std::vector Result; + Result.push_back(BestIdx); // Greedy heuristic: Keep iterating keeping the best covering subreg index // each time. - mask &= ~(getSubRegIndexLaneMask(BestIdx)); - while (mask.any()) { + Mask &= ~(getSubRegIndexLaneMask(BestIdx)); + while (Mask.any()) { BestIdx = 0; int BestCover = std::numeric_limits::min(); for (unsigned Idx : PossibleIndexes) { LaneBitmask SubRegMask = getSubRegIndexLaneMask(Idx); // Early exit if we found a perfect match. - if (SubRegMask == mask) { + if (SubRegMask == Mask) { BestIdx = Idx; break; } // Guaranteed above - assert((SubRegMask & avoid).none()); + assert((SubRegMask & Avoid).none()); // Try to cover as much of the remaining lanes as possible but as few of // the already covered lanes as possible. - int Cover = (SubRegMask & mask).getNumLanes() - - (SubRegMask & ~mask).getNumLanes(); + int Cover = (SubRegMask & Mask).getNumLanes() - + (SubRegMask & ~Mask).getNumLanes(); if (Cover > BestCover) { BestCover = Cover; BestIdx = Idx; @@ -805,16 +805,16 @@ TargetRegisterInfo::getMinimalSpanningSubRegIdxSetForLaneMask( if (BestIdx == 0) { LLVM_DEBUG(dbgs() << "Unable to find minimal spanning sub register(s) for " - << getRegClassName(RC) << " mask " << PrintLaneMask(mask) + << getRegClassName(RC) << " mask " << PrintLaneMask(Mask) << '\n'); assert(false && "Impossible to span reg class"); return std::vector(); } - result.push_back(BestIdx); - mask &= ~getSubRegIndexLaneMask(BestIdx); + Result.push_back(BestIdx); + Mask &= ~getSubRegIndexLaneMask(BestIdx); } - return result; + return Result; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index a6ce3426a7b93..012ab0c91b257 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -3287,7 +3287,8 @@ void sortSubExpCandidates(std::vector &SubExpCandidates) { MapVector SortMap; for (auto It : InputMap) { unsigned Reg = It.first; - MapVector>::iterator OutIt = OutputMap.find(Reg); + MapVector>::iterator OutIt = + OutputMap.find(Reg); if (OutIt == OutputMap.end()) continue; auto &InExps = It.second; @@ -3622,8 +3623,7 @@ collectPassThrus(MachineBasicBlock *MBB, return PassThrus; } // Try to build a free subExp which all input is passThrus. -SubExp buildFreeSubExp(SubExp &Exp, - GCNRPTracker::LiveRegSet &PassThrus, +SubExp buildFreeSubExp(SubExp &Exp, GCNRPTracker::LiveRegSet &PassThrus, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) { SubExp FreeExp; // Try to split the subExp to find a help case. 
@@ -3777,8 +3777,7 @@ std::vector buildSubExpCandidates( } if (!canHelpPressureWhenSink(Exp, PassThrus, MRI, SIRI, MLI, DT, IsCanClone, IsSgprBound)) { - if (AllowPartialUseInSubExp && - Exp.isSafeToMove(MRI)) { + if (AllowPartialUseInSubExp && Exp.isSafeToMove(MRI)) { SubExp FreeSubExp = buildFreeSubExp(Exp, PassThrus, MRI, SIRI); if (canHelpPressureWhenSink(FreeSubExp, PassThrus, MRI, SIRI, MLI, DT, IsCanClone, IsSgprBound)) { @@ -4249,9 +4248,8 @@ bool perBlockPassthruRemat(Remat *Remat, std::vector &HotBlocks, continue; // Collect pass thru regs. - GCNRPTracker::LiveRegSet PassThrus = - collectPassThrus(MBB, InputLive, OutputLive, - LiveRegCandidates, MRI, IsCanClone); + GCNRPTracker::LiveRegSet PassThrus = collectPassThrus( + MBB, InputLive, OutputLive, LiveRegCandidates, MRI, IsCanClone); // Group pass thru regs by def MBB. SmallVector> diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp index 990718cd7525f..01336b84c6786 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp @@ -1,3 +1,17 @@ +//===------- AMDGPUMIRUtils.cpp - Helpers for MIR passes ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Helper functions for MIR passes. +// +//===----------------------------------------------------------------------===// + #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" @@ -383,8 +397,9 @@ struct Piece { } }; -static void updateSubReg(MachineOperand &UseMO, const llvm::TargetRegisterClass *NewRC, - unsigned Offset, const SIRegisterInfo *SIRI) { +static void updateSubReg(MachineOperand &UseMO, + const llvm::TargetRegisterClass *NewRC, + unsigned Offset, const SIRegisterInfo *SIRI) { unsigned Size = NewRC->getLaneMask().getNumLanes(); if (Size == 1) { UseMO.setSubReg(0); @@ -529,12 +544,13 @@ bool removeUnusedLanes(llvm::MachineInstr &MI, MachineRegisterInfo &MRI, case 1: return reduceChannel(Piece.Offset, MI, SIII->get(IsImm ? AMDGPU::S_BUFFER_LOAD_DWORD_IMM - : AMDGPU::S_BUFFER_LOAD_DWORD_SGPR), + : AMDGPU::S_BUFFER_LOAD_DWORD_SGPR), MRI, SIRI, SIII, SlotIndexes); case 2: return reduceChannel(Piece.Offset, MI, - SIII->get(IsImm ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM - : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR), + SIII->get(IsImm + ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM + : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR), MRI, SIRI, SIII, SlotIndexes); case 3: if (FullMask == 0xf) @@ -542,8 +558,9 @@ bool removeUnusedLanes(llvm::MachineInstr &MI, MachineRegisterInfo &MRI, LLVM_FALLTHROUGH; case 4: return reduceChannel(Piece.Offset, MI, - SIII->get(IsImm ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM - : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR), + SIII->get(IsImm + ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM + : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR), MRI, SIRI, SIII, SlotIndexes); case 5: case 6: @@ -553,8 +570,9 @@ bool removeUnusedLanes(llvm::MachineInstr &MI, MachineRegisterInfo &MRI, LLVM_FALLTHROUGH; case 8: return reduceChannel(Piece.Offset, MI, - SIII->get(IsImm ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM - : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR), + SIII->get(IsImm + ? 
AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM + : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR), MRI, SIRI, SIII, SlotIndexes); } @@ -751,19 +769,19 @@ unsigned get_reg_size(unsigned Reg, const MachineRegisterInfo &MRI, void write_live(unsigned Reg, LaneBitmask Mask, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, raw_ostream &OS) { if (Mask.none()) { - unsigned size = get_reg_size(Reg, MRI, SIRI); - Mask = LaneBitmask((1 << size) - 1); + unsigned Size = get_reg_size(Reg, MRI, SIRI); + Mask = LaneBitmask((1 << Size) - 1); } - unsigned mask = Mask.getAsInteger(); + unsigned IntMask = Mask.getAsInteger(); for (unsigned i = 0; i <= Mask.getHighestLane(); i++) { - if (mask & (1 << i)) { + if (IntMask & (1 << i)) { write_reg(Reg, i, MRI, SIRI, OS); OS << ",\n"; } } } -void write_dag_input_node(unsigned ID, unsigned reg, unsigned mask, +void write_dag_input_node(unsigned ID, unsigned Reg, unsigned Mask, const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, raw_ostream &OS) { OS << "{"; @@ -773,13 +791,13 @@ void write_dag_input_node(unsigned ID, unsigned reg, unsigned mask, OS << ","; - auto WriteReg = [®, &MRI, &SIRI, &OS]() { print_reg(reg, MRI, SIRI, OS); }; + auto WriteReg = [&Reg, &MRI, &SIRI, &OS]() { print_reg(Reg, MRI, SIRI, OS); }; json_pair("reg", WriteReg, OS); OS << ","; - auto WriteMask = [&mask, &OS]() { OS << mask; }; + auto WriteMask = [&Mask, &OS]() { OS << Mask; }; json_pair("mask", WriteMask, OS); @@ -1220,8 +1238,8 @@ void write_file(const MDNode *FileNode, raw_ostream &OS) { OS << ",\n"; const MDString *Content = cast(FileNode->getOperand(1).get()); - std::string str = get_legal_str(Content); - auto WriteContent = [&str, &OS]() { OS << str; }; + std::string Str = get_legal_str(Content); + auto WriteContent = [&Str, &OS]() { OS << Str; }; json_pair("content", WriteContent, OS); OS << "\n},\n"; } @@ -1468,8 +1486,7 @@ void write_function(MachineFunction &MF, LiveIntervals *LIS, // Check debug info. const Function &F = MF.getFunction(); const Module *M = F.getParent(); - const NamedMDNode *SourceMD = - M->getNamedMetadata("dx.source.contents"); + const NamedMDNode *SourceMD = M->getNamedMetadata("dx.source.contents"); if (SourceMD) { write_dbg_info(MF, LIS, MRI, SIII, SIRI, SlotIndexes, SourceMD, OS); } @@ -1530,7 +1547,8 @@ class ContributionList { void buildMIContribution(MachineInstr &MI, DenseSet &ContributorSet, - DenseSet &ContributedSet, MachineRegisterInfo &MRI) { + DenseSet &ContributedSet, + MachineRegisterInfo &MRI) { for (MachineOperand &UseMO : MI.uses()) { if (!UseMO.isReg()) continue; @@ -1938,8 +1956,7 @@ MachineBasicBlock::iterator llvm::findOrCreateInsertionPointForSccDef( // MI // S_CMP_LG_U32 %SavedSCC, 0 # Restore SCC // - Register TmpScc = - MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register TmpScc = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); DebugLoc DL = MI->getDebugLoc(); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), TmpScc) .addImm(-1) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h index e4b8a28dda6e6..52fa19a82b773 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h @@ -1,8 +1,20 @@ +//===------- AMDGPUMIRUtils.h - Helpers for MIR passes --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Helper functions for MIR passes. +// +//===----------------------------------------------------------------------===// + #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMIRUTILS_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUMIRUTILS_H -#pragma once - #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -51,9 +63,9 @@ bool isSub0Sub1SingleDef(unsigned Reg, const llvm::MachineRegisterInfo &MRI); llvm::LaneBitmask getRegMask(const llvm::MachineOperand &MO, const llvm::MachineRegisterInfo &MRI); -void andLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet); -void andNotLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet); -void mergeLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet); +void andLiveRegSet(LiveSet &TargetSet, const LiveSet &InputSet); +void andNotLiveRegSet(LiveSet &TargetSet, const LiveSet &InputSet); +void mergeLiveRegSet(LiveSet &TargetSet, const LiveSet &InputSet); llvm::MachineBasicBlock *split(llvm::MachineInstr *I); // For inst like S_BUFFER_LOAD_DWORDX16, change to S_BUFFER_LOAD_DWORDX4 if only @@ -71,9 +83,6 @@ bool reach_block(llvm::MachineBasicBlock *FromBB, void viewCFGWithPhi(llvm::MachineFunction &MF); void write_contribution_list(llvm::MachineFunction &MF, const char *Filename); -llvm::MachineBasicBlock *createNullExportBlock(llvm::MachineFunction &MF, - const llvm::SIInstrInfo *TII); - bool getNonDebugMBBEnd(llvm::MachineBasicBlock::reverse_iterator &BBEnd, llvm::MachineBasicBlock &MBB); @@ -128,7 +137,7 @@ llvm::MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef( // local. bool isLocalLiveInterval( const llvm::LiveInterval &LI, llvm::SlotIndexes *Indexes, - llvm::SmallDenseSet &touchedMBBSet); + llvm::SmallDenseSet &TouchedMBBSet); bool isLocalLiveInterval(const llvm::LiveInterval &LI, llvm::SlotIndexes *Indexes); @@ -149,13 +158,12 @@ bool isFastMathInst(llvm::MachineInstr &MI); namespace pressure { void print_reg(llvm::Register Reg, const llvm::MachineRegisterInfo &MRI, - const llvm::SIRegisterInfo *SIRI, llvm::raw_ostream &os); + const llvm::SIRegisterInfo *SIRI, llvm::raw_ostream &OS); void write_pressure(llvm::MachineFunction &MF, llvm::LiveIntervals *LIS, const char *Filename); void write_pressure(llvm::MachineFunction &MF, llvm::LiveIntervals *LIS, - llvm::raw_ostream &os); + llvm::raw_ostream &OS); } // namespace pressure -// bool IsLdsSpillSupportedForHwStage(xmd::HwStage Stage); // Look for the successor `Succ` of the given `MBB`. // Returns MBB->succ_end() if `Succ` is not a successor of MBB. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp index 95066743b59bd..19a63b7900645 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp @@ -1,9 +1,23 @@ +//===----------- AMDGPUSubExpDag.cpp - AMDGPU Sub Expression DAG ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU Sub Expression DAG. Helper for building a dag based on sub +/// expressions. 
+// +//===----------------------------------------------------------------------===// + #include "SIInstrInfo.h" #include "SIRegisterInfo.h" #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/SlotIndexes.h" -// #include "dxc/DXIL/DxilMetadataHelper.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/raw_ostream.h" @@ -752,8 +766,7 @@ struct DOTGraphTraits : public DefaultDOTGraphTraits { const llvm::ExpDag *G) { return G->getGraphNodeLabel(SU); } - static std::string getNodeAttributes(const SUnit *N, - const llvm::ExpDag *) { + static std::string getNodeAttributes(const SUnit *N, const llvm::ExpDag *) { std::string Str("shape=Mrecord"); Str += ",style=filled,fillcolor=\"#"; From 303a4015bed7f6cbe670f0a9c7ae98117fbfff8b Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Mon, 17 Mar 2025 19:59:51 -0700 Subject: [PATCH 18/25] Added cmath --- llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h index c04afe61c9809..7c9a4e5fc297f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h @@ -15,6 +15,8 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/MC/MCInstrItineraries.h" +#include + namespace llvm { class MachineFunction; From 971e5568d97d056023c5af6189b6ed54e6a36555 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Mon, 17 Mar 2025 20:19:27 -0700 Subject: [PATCH 19/25] Wrong place for std header --- llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp | 2 ++ llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp index a8eef88ac2af8..e313c1f264a92 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp @@ -20,6 +20,8 @@ #include "llvm/CodeGen/MachineLoopInfo.h" +#include + namespace llvm { // Other info which can help compare schedule result. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h index 7c9a4e5fc297f..c04afe61c9809 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h @@ -15,8 +15,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/MC/MCInstrItineraries.h" -#include - namespace llvm { class MachineFunction; From be03462ab1a3660d77c9c8ba113158176b59303b Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Mon, 17 Mar 2025 20:35:19 -0700 Subject: [PATCH 20/25] Made getMinimalSpanningSubRegIdxSetForLaneMask local --- .../include/llvm/CodeGen/TargetRegisterInfo.h | 8 -- llvm/lib/CodeGen/TargetRegisterInfo.cpp | 91 ------------------ llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp | 96 ++++++++++++++++++- 3 files changed, 94 insertions(+), 101 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index 7b61d21e6e20e..e4fad8f9ec869 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -430,14 +430,6 @@ class TargetRegisterInfo : public MCRegisterInfo { LaneBitmask LaneMask, SmallVectorImpl &Indexes) const; - /// Return the set of sub register indexes that minimally cover the given - /// lane mask for the given register class. - /// - /// \returns an empty set if there is no set of covering sub registers. - std::vector - getMinimalSpanningSubRegIdxSetForLaneMask(const TargetRegisterClass *RC, - LaneBitmask Mask) const; - /// The lane masks returned by getSubRegIndexLaneMask() above can only be /// used to determine if sub-registers overlap - they can't be used to /// determine if a set of sub-registers completely cover another diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp index e8f0c526fcd33..701a9f8d72a65 100644 --- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -727,94 +727,3 @@ void TargetRegisterInfo::dumpReg(Register Reg, unsigned SubRegIndex, dbgs() << printReg(Reg, TRI, SubRegIndex) << "\n"; } #endif - -std::vector -TargetRegisterInfo::getMinimalSpanningSubRegIdxSetForLaneMask( - const TargetRegisterClass *RC, LaneBitmask Mask) const { - // TODO: this could replace the code it was copied from in SplitKit.cpp - - // First pass: Try to find a perfectly matching subregister index. - // If none exists find the one covering the most lanemask bits. - SmallVector PossibleIndexes; - unsigned BestIdx = 0; - const LaneBitmask Avoid = ~Mask; - { - unsigned BestCover = 0; - for (unsigned Idx = 1, E = getNumSubRegIndices(); Idx < E; ++Idx) { - // Is this index even compatible with the given class? - if (getSubClassWithSubReg(RC, Idx) != RC) - continue; - LaneBitmask SubRegMask = getSubRegIndexLaneMask(Idx); - // Early exit if we found a perfect match. - if (SubRegMask == Mask) { - BestIdx = Idx; - break; - } - - // The index must not cover any lanes outside - if ((SubRegMask & Avoid).any()) - continue; - - unsigned PopCount = SubRegMask.getNumLanes(); - PossibleIndexes.push_back(Idx); - if (PopCount > BestCover) { - BestCover = PopCount; - BestIdx = Idx; - } - } - } - - // Abort if we cannot possibly implement the COPY with the given indexes. 
- if (BestIdx == 0) { - LLVM_DEBUG(dbgs() << "Unable to find minimal spanning sub register(s) for " - << getRegClassName(RC) << " mask " << PrintLaneMask(Mask) - << '\n'); - assert(false && "Impossible to span reg class"); - return std::vector(); - } - - std::vector Result; - Result.push_back(BestIdx); - - // Greedy heuristic: Keep iterating keeping the best covering subreg index - // each time. - Mask &= ~(getSubRegIndexLaneMask(BestIdx)); - while (Mask.any()) { - BestIdx = 0; - int BestCover = std::numeric_limits::min(); - for (unsigned Idx : PossibleIndexes) { - LaneBitmask SubRegMask = getSubRegIndexLaneMask(Idx); - // Early exit if we found a perfect match. - if (SubRegMask == Mask) { - BestIdx = Idx; - break; - } - - // Guaranteed above - assert((SubRegMask & Avoid).none()); - - // Try to cover as much of the remaining lanes as possible but as few of - // the already covered lanes as possible. - int Cover = (SubRegMask & Mask).getNumLanes() - - (SubRegMask & ~Mask).getNumLanes(); - if (Cover > BestCover) { - BestCover = Cover; - BestIdx = Idx; - } - } - - if (BestIdx == 0) { - LLVM_DEBUG(dbgs() << "Unable to find minimal spanning sub register(s) for " - << getRegClassName(RC) << " mask " << PrintLaneMask(Mask) - << '\n'); - assert(false && "Impossible to span reg class"); - return std::vector(); - } - - Result.push_back(BestIdx); - Mask &= ~getSubRegIndexLaneMask(BestIdx); - } - - return Result; -} - diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp index 01336b84c6786..73904c308b1f6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp @@ -397,6 +397,98 @@ struct Piece { } }; +static std::vector +getMinimalSpanningSubRegIdxSetForLaneMask(const TargetRegisterInfo *TRI, + const TargetRegisterClass *RC, + LaneBitmask Mask) { + // TODO: this could replace the code it was copied from in SplitKit.cpp + + // First pass: Try to find a perfectly matching subregister index. + // If none exists find the one covering the most lanemask bits. + SmallVector PossibleIndexes; + unsigned BestIdx = 0; + const LaneBitmask Avoid = ~Mask; + { + unsigned BestCover = 0; + for (unsigned Idx = 1, E = TRI->getNumSubRegIndices(); Idx < E; ++Idx) { + // Is this index even compatible with the given class? + if (TRI->getSubClassWithSubReg(RC, Idx) != RC) + continue; + LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx); + // Early exit if we found a perfect match. + if (SubRegMask == Mask) { + BestIdx = Idx; + break; + } + + // The index must not cover any lanes outside + if ((SubRegMask & Avoid).any()) + continue; + + unsigned PopCount = SubRegMask.getNumLanes(); + PossibleIndexes.push_back(Idx); + if (PopCount > BestCover) { + BestCover = PopCount; + BestIdx = Idx; + } + } + } + + // Abort if we cannot possibly implement the COPY with the given indexes. + if (BestIdx == 0) { + LLVM_DEBUG(dbgs() << "Unable to find minimal spanning sub register(s) for " + << TRI->getRegClassName(RC) << " mask " + << PrintLaneMask(Mask) << '\n'); + assert(false && "Impossible to span reg class"); + return std::vector(); + } + + std::vector Result; + Result.push_back(BestIdx); + + // Greedy heuristic: Keep iterating keeping the best covering subreg index + // each time. 
+ Mask &= ~(TRI->getSubRegIndexLaneMask(BestIdx)); + while (Mask.any()) { + BestIdx = 0; + int BestCover = std::numeric_limits::min(); + for (unsigned Idx : PossibleIndexes) { + LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx); + // Early exit if we found a perfect match. + if (SubRegMask == Mask) { + BestIdx = Idx; + break; + } + + // Guaranteed above + assert((SubRegMask & Avoid).none()); + + // Try to cover as much of the remaining lanes as possible but as few of + // the already covered lanes as possible. + int Cover = (SubRegMask & Mask).getNumLanes() - + (SubRegMask & ~Mask).getNumLanes(); + if (Cover > BestCover) { + BestCover = Cover; + BestIdx = Idx; + } + } + + if (BestIdx == 0) { + LLVM_DEBUG( + dbgs() << "Unable to find minimal spanning sub register(s) for " + << TRI->getRegClassName(RC) << " mask " << PrintLaneMask(Mask) + << '\n'); + assert(false && "Impossible to span reg class"); + return std::vector(); + } + + Result.push_back(BestIdx); + Mask &= ~TRI->getSubRegIndexLaneMask(BestIdx); + } + + return Result; +} + static void updateSubReg(MachineOperand &UseMO, const llvm::TargetRegisterClass *NewRC, unsigned Offset, const SIRegisterInfo *SIRI) { @@ -409,8 +501,8 @@ static void updateSubReg(MachineOperand &UseMO, unsigned Mask = LaneMask.getAsInteger() >> Offset; - unsigned NewSubReg = SIRI->getMinimalSpanningSubRegIdxSetForLaneMask( - NewRC, LaneBitmask(Mask)) + unsigned NewSubReg = getMinimalSpanningSubRegIdxSetForLaneMask( + SIRI, NewRC, LaneBitmask(Mask)) .front(); UseMO.setSubReg(NewSubReg); From 436058b1b06d01ecbdac0cfc2967e8ed2a451e22 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Mon, 17 Mar 2025 22:04:37 -0700 Subject: [PATCH 21/25] Fixed build break after rebase --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 27 +++++++++---------- llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h | 2 +- 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index 012ab0c91b257..3e691239ab2f1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -2023,7 +2023,7 @@ void printVreg(Register Reg, const MachineRegisterInfo &MRI) { if (Name != "") { dbgs() << '%' << Name; } else { - dbgs() << '%' << Register::virtReg2Index(Reg); + dbgs() << '%' << Reg.virtRegIndex(); } } } @@ -3851,7 +3851,7 @@ calculateSaving(HotBlock &HotBb, std::vector &SubExpCandidates, } for (auto OutIt : Exp.OutputLive) { - unsigned Reg = OutIt.first; + Register Reg = OutIt.first; LaneBitmask OutMask = OutIt.second; LaneBitmask MBBBeginMask; if (CrossLive.find(Reg) != CrossLive.end()) @@ -3863,10 +3863,9 @@ calculateSaving(HotBlock &HotBb, std::vector &SubExpCandidates, : (OutMask & MBBBeginMask); if (MBBBeginMask.any()) { unsigned Size = getRegSize(Reg, ProfitMask, MRI, SIRI); - LLVM_DEBUG(std::string movStr = + LLVM_DEBUG(std::string MovStr = Exp.IsHoist ? "output hoist:" : "output sink:"; - dbgs() - << movStr << Register::virtReg2Index(Reg) << " " << Size); + dbgs() << MovStr << Reg.virtRegIndex() << " " << Size); // Exp out live at block input. // It will descrease live for MBB when sink and increase when hoist. 
if (SIRI->isVGPR(MRI, Reg)) { @@ -3886,7 +3885,7 @@ calculateSaving(HotBlock &HotBb, std::vector &SubExpCandidates, } for (auto InIt : Exp.InputLive) { - unsigned Reg = InIt.first; + Register Reg = InIt.first; LaneBitmask InMask = InIt.second; LaneBitmask MBBBeginMask; if (CrossLive.find(Reg) != CrossLive.end()) @@ -3903,9 +3902,9 @@ calculateSaving(HotBlock &HotBb, std::vector &SubExpCandidates, // It will increase live for MBB. unsigned Size = getRegSize(Reg, ProfitMask, MRI, SIRI); - LLVM_DEBUG( - std::string movStr = Exp.IsHoist ? "input hoist:" : "input sink:"; - dbgs() << movStr << Register::virtReg2Index(Reg) << " " << Size); + LLVM_DEBUG(std::string MovStr = + Exp.IsHoist ? "input hoist:" : "input sink:"; + dbgs() << MovStr << Reg.virtRegIndex() << " " << Size); if (SIRI->isVGPR(MRI, Reg)) { LLVM_DEBUG(dbgs() << "v\n"); if (Exp.IsHoist) @@ -3928,7 +3927,7 @@ calculateSaving(HotBlock &HotBb, std::vector &SubExpCandidates, // MBB. So cannot count that output live reg as profit. // Hoist into loop is not supported now. for (auto OutIt : Exp.OutputLive) { - unsigned Reg = OutIt.first; + Register Reg = OutIt.first; bool IsDomUser = false; for (MachineInstr &MI : MRI.use_nodbg_instructions(Reg)) { MachineBasicBlock *UserMBB = MI.getParent(); @@ -3947,8 +3946,7 @@ calculateSaving(HotBlock &HotBb, std::vector &SubExpCandidates, LaneBitmask ProfitMask = OutMask & MBBBeginMask; if (MBBBeginMask.any()) { unsigned Size = getRegSize(Reg, ProfitMask, MRI, SIRI); - LLVM_DEBUG(dbgs() - << "move:" << Register::virtReg2Index(Reg) << " " << Size); + LLVM_DEBUG(dbgs() << "move:" << Reg.virtRegIndex() << " " << Size); // Exp out live at block input. // It will descrease live for MBB. if (SIRI->isVGPR(MRI, Reg)) { @@ -3962,7 +3960,7 @@ calculateSaving(HotBlock &HotBb, std::vector &SubExpCandidates, } for (auto InIt : Exp.InputLive) { - unsigned Reg = InIt.first; + Register Reg = InIt.first; LaneBitmask InMask = InIt.second; LaneBitmask MBBBeginMask; if (InputLive.find(Reg) != InputLive.end()) @@ -3976,8 +3974,7 @@ calculateSaving(HotBlock &HotBb, std::vector &SubExpCandidates, // It will increase live for MBB. 
unsigned Size = getRegSize(Reg, ProfitMask, MRI, SIRI); - LLVM_DEBUG(dbgs() - << "add:" << Register::virtReg2Index(Reg) << " " << Size); + LLVM_DEBUG(dbgs() << "add:" << Reg.virtRegIndex() << " " << Size); if (SIRI->isVGPR(MRI, Reg)) { LLVM_DEBUG(dbgs() << "v\n"); VgprDiff += Size; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp index 73904c308b1f6..3feaa2f0f508f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp @@ -819,7 +819,7 @@ void print_reg(Register Reg, const MachineRegisterInfo &MRI, if (Name != "") { OS << '%' << Name; } else { - OS << '%' << Register::virtReg2Index(Reg); + OS << '%' << Reg.virtRegIndex(); } } else if (Reg < SIRI->getNumRegs()) { OS << '$'; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h index 52fa19a82b773..7aa053b9f7fe8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h @@ -52,7 +52,7 @@ using LiveSet = llvm::DenseMap; unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask, const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI); -void collectLiveSetPressure(const LiveSet &liveSet, +void collectLiveSetPressure(const LiveSet &LiveSet, const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI, unsigned &VPressure, unsigned &SPressure); From 9dbab90c299479105d4652935cfde044cf5c97c0 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Mon, 17 Mar 2025 22:16:43 -0700 Subject: [PATCH 22/25] Clang format --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 ++-- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index ec39b385ecbd2..d680e01e3f8fb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -397,8 +397,8 @@ static cl::opt // Enable Hot block rematerialize static cl::opt EnableHotBlockRemat("amdgpu-enable-hot-block-remat", - cl::desc("Enable HotBlock Rematerialize optimization"), - cl::init(false), cl::Hidden); + cl::desc("Enable HotBlock Rematerialize optimization"), + cl::init(false), cl::Hidden); // Enable GFX11+ VOPD static cl::opt diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 3c467c098a65e..14db2b39ef9d4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1332,7 +1332,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { bool isLowLatencyInstruction(const MachineInstr &MI) const; bool isHighLatencyDef(int Opc) const override; - bool isHighLatencyInstruction(const MachineInstr& MI) const { + bool isHighLatencyInstruction(const MachineInstr &MI) const { return isHighLatencyDef(MI.getOpcode()); } From ebcbb24c4f8123b5e34cfc3c0a3e01b0778f858e Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Tue, 18 Mar 2025 11:43:48 -0700 Subject: [PATCH 23/25] Fixing undef deprecator failures --- llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp | 4 ++-- llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp index 3feaa2f0f508f..55477bd39fb73 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp @@ -258,7 +258,7 @@ bool isExecUpdateForControlFlow(llvm::MachineInstr &MI) 
{ bool isSub0Sub1SingleDef(unsigned Reg, const MachineRegisterInfo &MRI) { // Support multi def for pattern of pointer: - // undef %808.sub0:sgpr_64 = COPY killed %795:sgpr_32 + // undef_ %808.sub0:sgpr_64 = COPY killed %795:sgpr_32 // %808.sub1:sgpr_64 = S_MOV_B32 0 bool HasSub0 = false; bool HasSub1 = false; @@ -296,7 +296,7 @@ bool isSub0Sub1SingleDef(unsigned Reg, const MachineRegisterInfo &MRI) { LaneBitmask getRegMask(const MachineOperand &MO, const MachineRegisterInfo &MRI) { - // We don't rely on read-undef flag because in case of tentative schedule + // We don't rely on read-undef_ flag because in case of tentative schedule // tracking it isn't set correctly yet. This works correctly however since // use mask has been tracked before using LIS. return MO.getSubReg() == 0 diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp index 19a63b7900645..0673346b11ab4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp @@ -420,7 +420,7 @@ void ExpDag::addDataDep() { Register Reg = MO.getReg(); // For case like: - // undef %808.sub0:sgpr_64 = COPY killed %795:sgpr_32 + // undef_ %808.sub0:sgpr_64 = COPY killed %795:sgpr_32 // %808.sub1:sgpr_64 = S_MOV_B32 0 // When partially write, link MI to previous def. if (MO.getSubReg() != 0) { From b5d143c2511f853732db1da1dc0a1b92be066ed2 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Tue, 18 Mar 2025 12:19:33 -0700 Subject: [PATCH 24/25] Ran latest format --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 40 ++++++++----------- llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp | 9 ++--- 2 files changed, 20 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index 3e691239ab2f1..8fecd9f7e2534 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -365,15 +365,13 @@ unsigned collectFnPressure(MachineFunction &MF, LiveIntervals *LIS, LLVM_DEBUG( const SIRegisterInfo *SIRI = ST->getRegisterInfo(); - dbgs() << "output live"; for (auto &It - : Status.MBBOutputLiveMap) { + dbgs() << "output live"; for (auto &It : Status.MBBOutputLiveMap) { unsigned Idx = It.first->getNumber(); auto LiveReg = It.second; dbgs() << "MBB" << Idx << ":"; llvm::dumpLiveSet(LiveReg, SIRI); } dbgs() << "input live"; - for (auto &It - : Status.MBBInputLiveMap) { + for (auto &It : Status.MBBInputLiveMap) { unsigned Idx = It.first->getNumber(); auto LiveReg = It.second; dbgs() << "MBB" << Idx << ":"; @@ -1811,10 +1809,9 @@ std::vector buildSubExpFromCandidates( Defs.emplace_back(&MI); } - LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; for (MachineInstr *MI - : Defs) { - MI->dump(); - } dbgs() << "\nFinished Candidate Defs End\n";); + LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; + for (MachineInstr *MI : Defs) { MI->dump(); } dbgs() + << "\nFinished Candidate Defs End\n";); // Build SubExp with CandidateDefs as Nodes, CandidateInput as input // Candidates as output. 
@@ -1999,13 +1996,11 @@ std::vector buildSubExpFromCandidatesTopBottom( Defs.emplace_back(&MI); } - LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; for (MachineInstr *MI - : Defs) { - MI->dump(); - } dbgs() << "\nFinished Candidate Defs End\n";); + LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; + for (MachineInstr *MI : Defs) { MI->dump(); } dbgs() + << "\nFinished Candidate Defs End\n";); - LLVM_DEBUG(dbgs() << "\nLocalCandidates:\n"; for (auto It - : LocalCandidates) { + LLVM_DEBUG(dbgs() << "\nLocalCandidates:\n"; for (auto It : LocalCandidates) { pressure::print_reg(It.first, MRI, SIRI, llvm::dbgs()); } dbgs() << "\nLocalCandidates End\n";); // Make sure all input reg are uniqueDef. @@ -3552,13 +3547,13 @@ groupPassThruByDefBlock(Remat *Remat, const GCNRPTracker::LiveRegSet &PassThrus, llvm::SmallVector> Result = Candidates.takeVector(); - LLVM_DEBUG(llvm::dbgs() << "Before sort candidates\n"; for (auto It - : Result) { - MachineBasicBlock *MBB = It.first; - auto &defInMBB = It.second; - MBB->dump(); - llvm::dumpLiveSet(defInMBB, SIRI); - } llvm::dbgs() << "end of candidates\n";); + LLVM_DEBUG( + llvm::dbgs() << "Before sort candidates\n"; for (auto It : Result) { + MachineBasicBlock *MBB = It.first; + auto &defInMBB = It.second; + MBB->dump(); + llvm::dumpLiveSet(defInMBB, SIRI); + } llvm::dbgs() << "end of candidates\n";); std::sort(Result.begin(), Result.end(), [](std::pair &It0, @@ -3566,8 +3561,7 @@ groupPassThruByDefBlock(Remat *Remat, const GCNRPTracker::LiveRegSet &PassThrus, return It0.first->getNumber() < It1.first->getNumber(); }); - LLVM_DEBUG(llvm::dbgs() << "After sort candidates\n"; for (auto It - : Result) { + LLVM_DEBUG(llvm::dbgs() << "After sort candidates\n"; for (auto It : Result) { MachineBasicBlock *MBB = It.first; auto &defInMBB = It.second; MBB->dump(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp index 0673346b11ab4..548bfa508c735 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp @@ -1023,8 +1023,7 @@ void HRB::buildLinear(std::vector &SUnits) { } LLVM_DEBUG( - dbgs() << "Chained Nodes:"; for (SUnit *SU - : ChainedNodes) { + dbgs() << "Chained Nodes:"; for (SUnit *SU : ChainedNodes) { dbgs() << " " << SU->NodeNum << "\n"; } for (unsigned i = 0; i < Lineages.size(); i++) { dbgs() << "Lineage" << i << ":"; @@ -1225,8 +1224,7 @@ void HRB::buildReachRelation(ArrayRef BotRoots) { } ReachMap.erase(&FakeEntry); - LLVM_DEBUG(for (Lineage &L - : Lineages) { + LLVM_DEBUG(for (Lineage &L : Lineages) { for (SUnit *SU : L.Nodes) { DenseSet &CurReach = ReachMap[SU]; dbgs() << SU->NodeNum << " reach: "; @@ -1687,8 +1685,7 @@ std::vector hrbSched(std::vector &SUnits, return ConfA > ConfB; }); - LLVM_DEBUG(dbgs() << "ReadyList:\n"; for (SUnit *SU - : ReadyList) { + LLVM_DEBUG(dbgs() << "ReadyList:\n"; for (SUnit *SU : ReadyList) { dbgs() << " " << SU->NodeNum; } dbgs() << "\n";); SUnit *Candidate = nullptr; From 87d9404f5a6f1e3921954380fc75ac1e88e72d59 Mon Sep 17 00:00:00 2001 From: Adam Yang Date: Mon, 31 Mar 2025 16:38:06 -0700 Subject: [PATCH 25/25] Fixed failing tests, and added tests --- .../AMDGPU/AMDGPUHotBlockRematerialize.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp | 35 +- .../test/CodeGen/AMDGPU/remat/phi_pacifist.ll | 88 +++ .../CodeGen/AMDGPU/remat/phi_pacifist.mir | 372 ++++++++++++ .../CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir | 565 ++++++++++++++++++ 5 files changed, 1055 insertions(+), 7 deletions(-) create mode 100644 
llvm/test/CodeGen/AMDGPU/remat/phi_pacifist.ll create mode 100644 llvm/test/CodeGen/AMDGPU/remat/phi_pacifist.mir create mode 100644 llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp index 8fecd9f7e2534..7d2e1a6d81db8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -2714,7 +2714,7 @@ bool tryHoldPacifist(MachineBasicBlock &MBB, LiveIntervals *LIS, SmallVector PacifistList; LLVM_DEBUG(dbgs() << "pacifist begin\n"); for (MachineInstr &MI : MBB) { - if (MI.isDebugInstr()) + if (MI.isDebugInstr() || MI.isPHI()) continue; if (collectPacifist(MI, InputLive, OutputLive, MRI)) { PacifistList.emplace_back(&MI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp index 55477bd39fb73..beace3a501a19 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp @@ -509,6 +509,14 @@ static void updateSubReg(MachineOperand &UseMO, } } +static unsigned getNumLanesIn32BitReg(Register Reg, const SIRegisterInfo *SIRI, + const MachineRegisterInfo &MRI) { + const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg); + const TargetRegisterClass *SubregRC = + SIRI->getSubRegisterClass(RC, AMDGPU::sub0); + return SubregRC->LaneMask.getNumLanes(); +} + bool reduceChannel(unsigned Offset, MachineInstr &MI, const MCInstrDesc &Desc, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, SlotIndexes *SlotIndexes) { @@ -526,7 +534,19 @@ bool reduceChannel(unsigned Offset, MachineInstr &MI, const MCInstrDesc &Desc, const llvm::TargetRegisterClass *NewRC = SIRI->getRegClass(Desc.operands().front().RegClass); - unsigned Size = NewRC->getLaneMask().getNumLanes(); + if (!NewRC->isAllocatable()) { + if (SIRI->isSGPRClass(NewRC)) + NewRC = SIRI->getSGPRClassForBitWidth(NewRC->MC->RegSizeInBits); + else if (SIRI->isVGPRClass(NewRC)) + NewRC = SIRI->getVGPRClassForBitWidth(NewRC->MC->RegSizeInBits); + else + return false; + + if (!NewRC->isAllocatable()) + return false; + } + + unsigned NumLanes = NewRC->getLaneMask().getNumLanes(); if (Offset > 0) { // Update offset operand in MI. MachineOperand *OffsetOp = @@ -573,8 +593,8 @@ bool reduceChannel(unsigned Offset, MachineInstr &MI, const MCInstrDesc &Desc, for (MachineOperand *UseMO : UseMOs) { updateSubReg(*UseMO, NewRC, Offset, SIRI); } - } else if (Size == 1) { - // Clear subReg when size is 1. + } else if (NumLanes == getNumLanesIn32BitReg(Reg, SIRI, MRI)) { + // Clear subReg when it's a single 32-bit reg. 
for (MachineOperand *UseMO : UseMOs) { UseMO->setSubReg(0); } @@ -630,7 +650,10 @@ bool removeUnusedLanes(llvm::MachineInstr &MI, MachineRegisterInfo &MRI, if (IsImm && Piece.Offset != 0) return false; - switch (Piece.Size) { + const unsigned Num32BitLanes = + Piece.Size / getNumLanesIn32BitReg(Reg, SIRI, MRI); + + switch (Num32BitLanes) { default: return false; case 1: @@ -645,7 +668,7 @@ bool removeUnusedLanes(llvm::MachineInstr &MI, MachineRegisterInfo &MRI, : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR), MRI, SIRI, SIII, SlotIndexes); case 3: - if (FullMask == 0xf) + if (FullMask == 0xff) return false; LLVM_FALLTHROUGH; case 4: @@ -657,7 +680,7 @@ bool removeUnusedLanes(llvm::MachineInstr &MI, MachineRegisterInfo &MRI, case 5: case 6: case 7: - if (FullMask == 0xff) + if (FullMask == 0xffff) return false; LLVM_FALLTHROUGH; case 8: diff --git a/llvm/test/CodeGen/AMDGPU/remat/phi_pacifist.ll b/llvm/test/CodeGen/AMDGPU/remat/phi_pacifist.ll new file mode 100644 index 0000000000000..3369486e0323a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/phi_pacifist.ll @@ -0,0 +1,88 @@ +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -amdgpu-enable-hot-block-remat -amdgpu-remat-enable-sub-exp-remat + +; Regression test for PHI being sinked to uses as a pacifist. +; Just checking that the test does not crash. + +; ModuleID = 'reduced.ll' +source_filename = "reduced.ll" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +target triple = "amdgcn--amdpal" + +define amdgpu_ps void @_amdgpu_ps_main(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, i32 %arg5, float %arg6, float %arg7, float %arg8, <2 x half> %arg9, i1 %arg10) #0 { +bb: + br label %bb19 + +bb11: ; preds = %bb19 + %i = bitcast i32 %i21 to float + %i12 = bitcast i32 %i23 to float + %i13 = fmul float 0.000000e+00, %i26 + %i14 = fmul float %i13, 0.000000e+00 + %i15 = fmul float %i12, %i + %i16 = fadd float %i15, %i14 + %i17 = select i1 false, float 0.000000e+00, float %i16 + %i18 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %arg4, float %arg8) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> %i18, <2 x half> %arg9, i1 false, i1 false) + ret void + +bb19: ; preds = %bb19, %bb + %i20 = phi i32 [ 0, %bb19 ], [ %arg5, %bb ] + %i21 = phi i32 [ %i35, %bb19 ], [ 0, %bb ] + %i22 = phi i32 [ %i38, %bb19 ], [ 0, %bb ] + %i23 = phi i32 [ %i60, %bb19 ], [ 0, %bb ] + %i24 = phi i32 [ %i61, %bb19 ], [ 0, %bb ] + %i25 = phi i32 [ %i62, %bb19 ], [ 0, %bb ] + %i26 = phi float [ %i39, %bb19 ], [ 0.000000e+00, %bb ] + %i27 = phi i32 [ %i49, %bb19 ], [ 0, %bb ] + %i28 = phi i32 [ %i50, %bb19 ], [ 0, %bb ] + %i29 = phi i32 [ %i51, %bb19 ], [ 0, %bb ] + %i30 = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32.v8i32(i32 1, i32 %i20, i32 0, <8 x i32> zeroinitializer, i32 0, i32 0) + %i31 = extractelement <4 x float> %i30, i64 0 + %i32 = fmul float %arg1, %i31 + %i33 = bitcast i32 %i22 to float + %i34 = fmul float %arg, %i32 + %i35 = select i1 %arg10, i32 %arg5, i32 %i21 + %i36 = fadd float 0.000000e+00, %i33 + %i37 = bitcast float %i36 to i32 + %i38 = select i1 %arg10, i32 %i22, i32 %i37 + %i39 = fadd float %i26, 1.000000e+00 + %i40 = bitcast i32 %i27 to float + %i41 = bitcast i32 %i28 to float + %i42 = bitcast i32 %i29 to float + %i43 = fadd float 0.000000e+00, %i40 + %i44 = fadd float 0.000000e+00, %i41 + %i45 = fadd float 
0.000000e+00, %i42 + %i46 = bitcast float %i43 to i32 + %i47 = bitcast float %i44 to i32 + %i48 = bitcast float %i45 to i32 + %i49 = select i1 %arg10, i32 %i27, i32 %i46 + %i50 = select i1 %arg10, i32 %i28, i32 %i47 + %i51 = select i1 %arg10, i32 %i29, i32 %i48 + %i52 = fmul float %i34, %arg7 + %i53 = bitcast i32 %i24 to float + %i54 = bitcast i32 %i25 to float + %i55 = fadd float %arg6, %i53 + %i56 = fadd float %arg2, %i54 + %i57 = bitcast float %i52 to i32 + %i58 = bitcast float %i55 to i32 + %i59 = bitcast float %i56 to i32 + %i60 = select i1 %arg10, i32 %i57, i32 %i23 + %i61 = select i1 %arg10, i32 %i58, i32 %i24 + %i62 = select i1 %arg10, i32 %i59, i32 %i25 + %i63 = sitofp i32 %i20 to float + %i64 = fcmp olt float %arg3, %i63 + br i1 %i64, label %bb11, label %bb19 +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) +declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #2 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read) +declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32.v8i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #3 + +attributes #0 = { "target-features"=",+wavefrontsize64,+cumode,-xnack" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } +attributes #3 = { nocallback nofree nosync nounwind willreturn memory(read) } diff --git a/llvm/test/CodeGen/AMDGPU/remat/phi_pacifist.mir b/llvm/test/CodeGen/AMDGPU/remat/phi_pacifist.mir new file mode 100644 index 0000000000000..e9a8486bfa6b1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/phi_pacifist.mir @@ -0,0 +1,372 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-sub-exp-remat | FileCheck %s + +# Regression test for PHI being sinked to uses as a pacifist. 
+ +# CHECK: bb.2.bb19: +# CHECK: PHI +# CHECK-NEXT: PHI +# CHECK-NEXT: PHI +# CHECK-NEXT: PHI +# CHECK-NEXT: PHI +# CHECK-NEXT: PHI +# CHECK-NEXT: PHI +# CHECK-NEXT: PHI +# CHECK-NEXT: PHI +# CHECK-NEXT: PHI +# CHECK-NEXT: PHI + +--- | + ; ModuleID = 'C:\llvm-project\llvm\test\CodeGen\AMDGPU\remat\phi_pacifist.ll' + source_filename = "reduced.ll" + target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" + target triple = "amdgcn" + + define amdgpu_ps void @_amdgpu_ps_main(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, i32 %arg5, float %arg6, float %arg7, float %arg8, <2 x half> %arg9, i1 %arg10) #0 { + bb: + br label %bb19, !amdgpu.uniform !0 + + bb11: ; preds = %bb19 + %i21.lcssa = phi i32 [ %i21, %bb19 ] + %i23.lcssa = phi i32 [ %i23, %bb19 ] + %i26.lcssa = phi float [ %i26, %bb19 ] + %.lcssa = phi i64 [ %0, %bb19 ] + call void @llvm.amdgcn.end.cf.i64(i64 %.lcssa) + %i = bitcast i32 %i21.lcssa to float + %i12 = bitcast i32 %i23.lcssa to float + %i13 = fmul float 0.000000e+00, %i26.lcssa + %i18 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %arg4, float %arg8) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> %i18, <2 x half> %arg9, i1 false, i1 false) + ret void + + bb19: ; preds = %bb19, %bb + %phi.broken = phi i64 [ %0, %bb19 ], [ 0, %bb ] + %i20 = phi i32 [ %arg5, %bb ], [ 0, %bb19 ] + %i21 = phi i32 [ 0, %bb ], [ %i35, %bb19 ] + %i22 = phi i32 [ 0, %bb ], [ %i38, %bb19 ] + %i23 = phi i32 [ 0, %bb ], [ %i60, %bb19 ] + %i24 = phi i32 [ 0, %bb ], [ %i61, %bb19 ] + %i25 = phi i32 [ 0, %bb ], [ %i62, %bb19 ] + %i26 = phi float [ 0.000000e+00, %bb ], [ %i39, %bb19 ] + %i27 = phi i32 [ 0, %bb ], [ %i49, %bb19 ] + %i28 = phi i32 [ 0, %bb ], [ %i50, %bb19 ] + %i29 = phi i32 [ 0, %bb ], [ %i51, %bb19 ] + %i30 = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32.v8i32(i32 1, i32 %i20, i32 0, <8 x i32> zeroinitializer, i32 0, i32 0) + %i31 = extractelement <4 x float> %i30, i64 0 + %i32 = fmul float %arg1, %i31 + %i33 = bitcast i32 %i22 to float + %i34 = fmul float %arg, %i32 + %i35 = select i1 %arg10, i32 %arg5, i32 %i21 + %i36 = fadd float 0.000000e+00, %i33 + %i37 = bitcast float %i36 to i32 + %i38 = select i1 %arg10, i32 %i22, i32 %i37 + %i39 = fadd float %i26, 1.000000e+00 + %i40 = bitcast i32 %i27 to float + %i41 = bitcast i32 %i28 to float + %i42 = bitcast i32 %i29 to float + %i43 = fadd float 0.000000e+00, %i40 + %i44 = fadd float 0.000000e+00, %i41 + %i45 = fadd float 0.000000e+00, %i42 + %i46 = bitcast float %i43 to i32 + %i47 = bitcast float %i44 to i32 + %i48 = bitcast float %i45 to i32 + %i49 = select i1 %arg10, i32 %i27, i32 %i46 + %i50 = select i1 %arg10, i32 %i28, i32 %i47 + %i51 = select i1 %arg10, i32 %i29, i32 %i48 + %i52 = fmul float %i34, %arg7 + %i53 = bitcast i32 %i24 to float + %i54 = bitcast i32 %i25 to float + %i55 = fadd float %arg6, %i53 + %i56 = fadd float %arg2, %i54 + %i57 = bitcast float %i52 to i32 + %i58 = bitcast float %i55 to i32 + %i59 = bitcast float %i56 to i32 + %i60 = select i1 %arg10, i32 %i57, i32 %i23 + %i61 = select i1 %arg10, i32 %i58, i32 %i24 + %i62 = select i1 %arg10, i32 %i59, i32 %i25 + %i63 = sitofp i32 %i20 to float + %i64 = fcmp olt float %arg3, %i63 + %0 = call i64 @llvm.amdgcn.if.break.i64(i1 %i64, i64 %phi.broken) + %1 = call i1 @llvm.amdgcn.loop.i64(i64 %0) + br i1 %1, label %bb11, label %bb19 + } + + ; Function 
Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) + declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 + + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) + declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #2 + + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read) + declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32.v8i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #3 + + ; Function Attrs: nocallback nofree nounwind willreturn memory(none) + declare i64 @llvm.amdgcn.if.break.i64(i1, i64) #4 + + ; Function Attrs: nocallback nofree nounwind willreturn + declare i1 @llvm.amdgcn.loop.i64(i64) #5 + + ; Function Attrs: nocallback nofree nounwind willreturn + declare void @llvm.amdgcn.end.cf.i64(i64) #5 + + attributes #0 = { "target-cpu"="gfx1010" "target-features"=",+wavefrontsize64,+cumode,-xnack" } + attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx1010" } + attributes #2 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) "target-cpu"="gfx1010" } + attributes #3 = { nocallback nofree nosync nounwind willreturn memory(read) "target-cpu"="gfx1010" } + attributes #4 = { nocallback nofree nounwind willreturn memory(none) } + attributes #5 = { nocallback nofree nounwind willreturn } + + !0 = !{} + +... +--- +name: _amdgpu_ps_main +alignment: 1 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +noPhis: false +isSSA: true +noVRegs: false +hasFakeUses: false +callsEHReturn: false +callsUnwindInit: false +hasEHContTarget: false +hasEHScopes: false +hasEHFunclets: false +isOutlined: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: false +registers: + - { id: 0, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 1, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 2, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 3, class: sreg_64, preferred-register: '', flags: [ ] } + - { id: 4, class: sreg_64, preferred-register: '', flags: [ ] } + - { id: 5, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 6, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 7, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 8, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 9, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 10, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 11, class: sgpr_32, preferred-register: '', flags: [ ] } + - { id: 12, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 13, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 14, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 15, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 16, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 17, class: sgpr_32, preferred-register: '', flags: [ ] } + - { id: 18, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 19, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 20, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 21, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 22, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 23, class: vgpr_32, preferred-register: '', flags: 
[ ] } + - { id: 24, class: sreg_64, preferred-register: '', flags: [ ] } + - { id: 25, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 26, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 27, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 28, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 29, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 30, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 31, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 32, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 33, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 34, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 35, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 36, class: sgpr_32, preferred-register: '', flags: [ ] } + - { id: 37, class: sreg_64_xexec, preferred-register: '', flags: [ ] } + - { id: 38, class: sreg_64, preferred-register: '', flags: [ ] } + - { id: 39, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 40, class: sgpr_32, preferred-register: '', flags: [ ] } + - { id: 41, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 42, class: sreg_64_xexec, preferred-register: '$vcc', flags: [ ] } + - { id: 43, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 44, class: sreg_32, preferred-register: '', flags: [ ] } + - { id: 45, class: sgpr_256, preferred-register: '', flags: [ ] } + - { id: 46, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 47, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 48, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 49, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 50, class: sreg_64_xexec, preferred-register: '', flags: [ ] } + - { id: 51, class: sgpr_32, preferred-register: '', flags: [ ] } + - { id: 52, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 53, class: sreg_64_xexec, preferred-register: '', flags: [ ] } + - { id: 54, class: sgpr_32, preferred-register: '', flags: [ ] } + - { id: 55, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 56, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 57, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 58, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 59, class: sreg_64_xexec, preferred-register: '', flags: [ ] } + - { id: 60, class: sreg_64_xexec, preferred-register: '', flags: [ ] } + - { id: 61, class: sreg_64_xexec, preferred-register: '', flags: [ ] } + - { id: 62, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 63, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 64, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 65, class: sreg_64_xexec, preferred-register: '', flags: [ ] } + - { id: 66, class: sreg_64_xexec, preferred-register: '', flags: [ ] } + - { id: 67, class: sreg_64_xexec, preferred-register: '', flags: [ ] } + - { id: 68, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 69, class: sreg_64, preferred-register: '$vcc', flags: [ ] } + - { id: 70, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 71, class: sreg_32, preferred-register: '', flags: [ ] } + - { id: 72, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 73, class: sreg_32, preferred-register: '', flags: [ ] } + - { id: 74, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 75, class: vgpr_32, preferred-register: '', flags: [ 
] } + - { id: 76, class: vgpr_32, preferred-register: '', flags: [ ] } +liveins: + - { reg: '$vgpr0', virtual-reg: '%25' } + - { reg: '$vgpr1', virtual-reg: '%26' } + - { reg: '$vgpr2', virtual-reg: '%27' } + - { reg: '$vgpr3', virtual-reg: '%28' } + - { reg: '$vgpr4', virtual-reg: '%29' } + - { reg: '$vgpr5', virtual-reg: '%30' } + - { reg: '$vgpr6', virtual-reg: '%31' } + - { reg: '$vgpr7', virtual-reg: '%32' } + - { reg: '$vgpr8', virtual-reg: '%33' } + - { reg: '$vgpr9', virtual-reg: '%34' } + - { reg: '$vgpr10', virtual-reg: '%35' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + isCalleeSavedInfoValid: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +entry_values: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: + explicitKernArgSize: 0 + maxKernArgAlign: 4 + ldsSize: 0 + gdsSize: 0 + dynLDSAlign: 1 + isEntryFunction: true + isChainFunction: false + noSignedZerosFPMath: false + memoryBound: false + waveLimiter: false + hasSpilledSGPRs: false + hasSpilledVGPRs: false + scratchRSrcReg: '$sgpr100_sgpr101_sgpr102_sgpr103' + frameOffsetReg: '$fp_reg' + stackPtrOffsetReg: '$sgpr32' + bytesInStackArgArea: 0 + returnsVoid: true + argumentInfo: + privateSegmentWaveByteOffset: { reg: '$sgpr0' } + psInputAddr: 2047 + psInputEnable: 2047 + maxMemoryClusterDWords: 8 + mode: + ieee: false + dx10-clamp: true + fp32-input-denormals: true + fp32-output-denormals: true + fp64-fp16-input-denormals: true + fp64-fp16-output-denormals: true + highBitsOf32BitAddress: 0 + occupancy: 20 + vgprForAGPRCopy: '' + sgprForEXECCopy: '$sgpr104_sgpr105' + longBranchReservedReg: '' + hasInitWholeWave: false +body: | + bb.0.bb: + successors: %bb.2(0x80000000) + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 + + %35:vgpr_32 = COPY $vgpr10 + %34:vgpr_32 = COPY $vgpr9 + %33:vgpr_32 = COPY $vgpr8 + %32:vgpr_32 = COPY $vgpr7 + %31:vgpr_32 = COPY $vgpr6 + %30:vgpr_32 = COPY $vgpr5 + %29:vgpr_32 = COPY $vgpr4 + %28:vgpr_32 = COPY $vgpr3 + %27:vgpr_32 = COPY $vgpr2 + %26:vgpr_32 = COPY $vgpr1 + %25:vgpr_32 = COPY $vgpr0 + %41:vgpr_32 = V_AND_B32_e64 1, %35, implicit $exec + %42:sreg_64_xexec = V_CMP_EQ_U32_e64 1, killed %41, implicit $exec + %39:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %38:sreg_64 = S_MOV_B64 0 + %76:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %51:sgpr_32 = S_MOV_B32 0 + %45:sgpr_256 = REG_SEQUENCE %51, %subreg.sub0, %51, %subreg.sub1, %51, %subreg.sub2, %51, %subreg.sub3, %51, %subreg.sub4, %51, %subreg.sub5, %51, %subreg.sub6, %51, %subreg.sub7 + S_BRANCH %bb.2 + + bb.1.bb11: + SI_END_CF %24, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + %70:vgpr_32 = nofpexcept V_CVT_PKRTZ_F16_F32_e64 0, %29, 0, %33, 0, 0, implicit $mode, implicit $exec + %71:sreg_32 = IMPLICIT_DEF + %72:vgpr_32 = COPY %71 + %73:sreg_32 = IMPLICIT_DEF + %74:vgpr_32 = COPY %73 + EXP 0, killed %70, %34, %72, %74, 0, -1, 0, implicit $exec + S_ENDPGM 0 + + bb.2.bb19: + successors: %bb.1(0x04000000), %bb.2(0x7c000000) + + %4:sreg_64 = PHI %38, %bb.0, %24, %bb.2 + %5:vgpr_32 = PHI %30, %bb.0, %76, %bb.2 + %6:vgpr_32 = PHI %39, 
%bb.0, %15, %bb.2 + %7:vgpr_32 = PHI %39, %bb.0, %16, %bb.2 + %8:vgpr_32 = PHI %39, %bb.0, %21, %bb.2 + %9:vgpr_32 = PHI %39, %bb.0, %22, %bb.2 + %10:vgpr_32 = PHI %39, %bb.0, %23, %bb.2 + %75:vgpr_32 = PHI %76, %bb.0, %55, %bb.2 + %12:vgpr_32 = PHI %39, %bb.0, %18, %bb.2 + %13:vgpr_32 = PHI %39, %bb.0, %19, %bb.2 + %14:vgpr_32 = PHI %39, %bb.0, %20, %bb.2 + %46:vgpr_32 = IMAGE_LOAD_V1_V2_nsa_gfx10 %5, %76, %45, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) + %48:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %26, 0, killed %46, 0, 0, implicit $mode, implicit $exec + %49:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %25, 0, killed %48, 0, 0, implicit $mode, implicit $exec + %15:vgpr_32 = V_CNDMASK_B32_e64 0, %6, 0, %30, %42, implicit $exec + %52:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %7, 0, 0, implicit $mode, implicit $exec + %16:vgpr_32 = V_CNDMASK_B32_e64 0, killed %52, 0, %7, %42, implicit $exec + %55:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 1065353216, 0, %75, 0, 0, implicit $mode, implicit $exec + %56:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %12, 0, 0, implicit $mode, implicit $exec + %57:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %13, 0, 0, implicit $mode, implicit $exec + %58:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %14, 0, 0, implicit $mode, implicit $exec + %18:vgpr_32 = V_CNDMASK_B32_e64 0, killed %56, 0, %12, %42, implicit $exec + %19:vgpr_32 = V_CNDMASK_B32_e64 0, killed %57, 0, %13, %42, implicit $exec + %20:vgpr_32 = V_CNDMASK_B32_e64 0, killed %58, 0, %14, %42, implicit $exec + %62:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %49, 0, %32, 0, 0, implicit $mode, implicit $exec + %63:vgpr_32 = nofpexcept V_ADD_F32_e64 0, %31, 0, %9, 0, 0, implicit $mode, implicit $exec + %64:vgpr_32 = nofpexcept V_ADD_F32_e64 0, %27, 0, %10, 0, 0, implicit $mode, implicit $exec + %21:vgpr_32 = V_CNDMASK_B32_e64 0, %8, 0, killed %62, %42, implicit $exec + %22:vgpr_32 = V_CNDMASK_B32_e64 0, %9, 0, killed %63, %42, implicit $exec + %23:vgpr_32 = V_CNDMASK_B32_e64 0, %10, 0, killed %64, %42, implicit $exec + %68:vgpr_32 = V_CVT_F32_I32_e64 %5, 0, 0, implicit $mode, implicit $exec + %69:sreg_64 = nofpexcept V_CMP_LT_F32_e64 0, %28, 0, killed %68, 0, implicit $mode, implicit $exec + %24:sreg_64 = SI_IF_BREAK killed %69, %4, implicit-def dead $scc + SI_LOOP %24, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.1 + +... diff --git a/llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir b/llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir new file mode 100644 index 0000000000000..9f5d402340329 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir @@ -0,0 +1,565 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-remat-enable-hot-block-remat-aggressive -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat | FileCheck %s + +# Check that the buffer loads have been moved to the use and the lanes are reduced +# correctly. 
+# +# CHECK: bb.2: +#========================================================================== +# X4_IMM, Using .x +# CHECK: %[[#reg0:]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %{{.+}}, 0, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg0]], %{{.+}}, 0, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg0]], %{{.+}}, 4, 0 +# X4_IMM, Using .xy +# CHECK: %[[#reg1:]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM %{{.+}}, 16, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg1]].sub0, %{{.+}}, 16, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg1]].sub1, %{{.+}}, 20, 0 +# X4_IMM, Using .xyz +# CHECK: %[[#reg2:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 32, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg2]].sub0, %{{.+}}, 32, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg2]].sub1, %{{.+}}, 36, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg2]].sub2, %{{.+}}, 40, 0 +# X4_IMM, Using .yz +# CHECK: %[[#reg3:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 48, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg3]].sub1, %{{.+}}, 48, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg3]].sub2, %{{.+}}, 52, 0 +# X4_IMM, Using .yzw +# CHECK: %[[#reg4:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 64, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg4]].sub1, %{{.+}}, 64, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg4]].sub2, %{{.+}}, 68, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg4]].sub3, %{{.+}}, 72, 0 +#========================================================================== +# X8_IMM, Using .x +# CHECK: %[[#reg5:]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %{{.+}}, 80, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg5]], %{{.+}}, 80, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg5]], %{{.+}}, 84, 0 +# X8_IMM, Using .xy +# CHECK: %[[#reg6:]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM %{{.+}}, 96, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg6]].sub0, %{{.+}}, 96, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg6]].sub1, %{{.+}}, 100, 0 +# X8_IMM, Using .xyz +# CHECK: %[[#reg7:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 112, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg7]].sub0, %{{.+}}, 112, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg7]].sub1, %{{.+}}, 116, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg7]].sub2, %{{.+}}, 120, 0 +# X8_IMM, Using .xyzw +# CHECK: %[[#reg8:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 128, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub0, %{{.+}}, 128, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub1, %{{.+}}, 132, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub2, %{{.+}}, 136, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub3, %{{.+}}, 140, 0 +# X8_IMM, Using .xyzw + 5th dword +# CHECK: %[[#reg9:]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %{{.+}}, 144, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub0, %{{.+}}, 144, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub1, %{{.+}}, 148, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub2, %{{.+}}, 152, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub3, %{{.+}}, 156, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub4, %{{.+}}, 160, 0 +#========================================================================== +# X16_IMM, Using .xy and .zw +# CHECK: %[[#reg10:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 160, 0 +# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg10]].sub0_sub1, %{{.+}}, 160, 0 +# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg10]].sub2_sub3, %{{.+}}, 164, 0 +#========================================================================== +# X4_SGPR, Using .x +# CHECK: %[[#reg11:]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %{{.+}}, %{{.+}}, 0 
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg11]], %{{.+}}, 176, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg11]], %{{.+}}, 180, 0 +# X8_SGPR, Using .xy +# CHECK: %[[#reg12:]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_SGPR %{{.+}}, %{{.+}}, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg12]].sub0, %{{.+}}, 192, 0 +# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg12]].sub1, %{{.+}}, 196, 0 +# X16_SGPR, Using .xy + .zw +# CHECK: %[[#reg13:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR %{{.+}}, %{{.+}}, 0 +# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg13]].sub0_sub1, %{{.+}}, 208, 0 +# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg13]].sub2_sub3, %{{.+}}, 216, 0 +#========================================================================== +# +# +# CHECK: %[[#reg14:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 224, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg14]], %{{.+}}, 224, 0 +# CHECK: %[[#reg15:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 240, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg15]], %{{.+}}, 240, 0 +# CHECK: %[[#reg16:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 256, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg16]], %{{.+}}, 256, 0 +# CHECK: %[[#reg17:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 272, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg17]], %{{.+}}, 272, 0 +# CHECK: %[[#reg18:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 288, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg18]], %{{.+}}, 288, 0 +# CHECK: %[[#reg19:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 304, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg19]], %{{.+}}, 304, 0 +# CHECK: %[[#reg20:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 320, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg20]], %{{.+}}, 320, 0 +# CHECK: %[[#reg21:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 336, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg21]], %{{.+}}, 336, 0 +# CHECK: %[[#reg22:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 352, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg22]], %{{.+}}, 352, 0 +# CHECK: %[[#reg23:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 368, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg23]], %{{.+}}, 368, 0 +# CHECK: %[[#reg24:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 384, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg24]], %{{.+}}, 384, 0 +# CHECK: %[[#reg25:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 400, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg25]], %{{.+}}, 400, 0 +# CHECK: %[[#reg26:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 416, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg26]], %{{.+}}, 416, 0 +# CHECK: %[[#reg27:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 432, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg27]], %{{.+}}, 432, 0 +# CHECK: %[[#reg28:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 448, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg28]], %{{.+}}, 448, 0 +# CHECK: %[[#reg29:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 464, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg29]], %{{.+}}, 464, 0 +# CHECK: %[[#reg30:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 480, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg30]], %{{.+}}, 480, 0 +# CHECK: %[[#reg31:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 496, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg31]], %{{.+}}, 496, 0 +# CHECK: %[[#reg32:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 512, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg32]], %{{.+}}, 512, 0 +# CHECK: %[[#reg33:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 528, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg33]], %{{.+}}, 
528, 0 +# CHECK: %[[#reg34:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 544, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg34]], %{{.+}}, 544, 0 +# CHECK: %[[#reg35:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 560, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg35]], %{{.+}}, 560, 0 +# CHECK: %[[#reg36:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 576, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg36]], %{{.+}}, 576, 0 +# CHECK: %[[#reg37:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 592, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg37]], %{{.+}}, 592, 0 +# CHECK: %[[#reg38:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 608, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg38]], %{{.+}}, 608, 0 +# CHECK: %[[#reg39:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 624, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg39]], %{{.+}}, 624, 0 +# CHECK: %[[#reg40:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 640, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg40]], %{{.+}}, 640, 0 +# CHECK: %[[#reg41:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 656, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg41]], %{{.+}}, 656, 0 +# CHECK: %[[#reg42:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 672, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg42]], %{{.+}}, 672, 0 +# CHECK: %[[#reg43:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 688, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg43]], %{{.+}}, 688, 0 +# CHECK: %[[#reg44:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 704, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg44]], %{{.+}}, 704, 0 +# CHECK: %[[#reg45:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 720, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg45]], %{{.+}}, 720, 0 +# CHECK: %[[#reg46:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 736, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg46]], %{{.+}}, 736, 0 +# CHECK: %[[#reg47:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 752, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg47]], %{{.+}}, 752, 0 +# CHECK: %[[#reg48:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 768, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg48]], %{{.+}}, 768, 0 +# CHECK: %[[#reg49:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 784, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg49]], %{{.+}}, 784, 0 +# CHECK: %[[#reg50:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 800, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg50]], %{{.+}}, 800, 0 +# CHECK: %[[#reg51:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 816, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg51]], %{{.+}}, 816, 0 +# CHECK: %[[#reg52:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 832, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg52]], %{{.+}}, 832, 0 +# CHECK: %[[#reg53:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 848, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg53]], %{{.+}}, 848, 0 +# CHECK: %[[#reg54:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 864, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg54]], %{{.+}}, 864, 0 +# CHECK: %[[#reg55:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 880, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg55]], %{{.+}}, 880, 0 +# CHECK: %[[#reg56:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 896, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg56]], %{{.+}}, 896, 0 +# CHECK: %[[#reg57:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 912, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg57]], %{{.+}}, 912, 0 +# CHECK: %[[#reg58:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 928, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg58]], %{{.+}}, 928, 
0 +# CHECK: %[[#reg59:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 944, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg59]], %{{.+}}, 944, 0 +# CHECK: %[[#reg60:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 960, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg60]], %{{.+}}, 960, 0 +# CHECK: %[[#reg61:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 976, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg61]], %{{.+}}, 976, 0 +# CHECK: %[[#reg62:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 992, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg62]], %{{.+}}, 992, 0 +# CHECK: %[[#reg63:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 1008, 0 +# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg63]], %{{.+}}, 1008, 0 + + +--- | + source_filename = ".\main.ll" + define amdgpu_ps void @main() #1 { + ret void + } + attributes #1 = { "target-cpu"="gfx1010" } + !llvm.ident = !{!0} + !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"} +... +--- +name: main +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } + - { reg: '$sgpr1' } + - { reg: '$sgpr2' } + - { reg: '$sgpr3' } + - { reg: '$sgpr4' } + - { reg: '$sgpr5' } + - { reg: '$sgpr6' } + - { reg: '$sgpr7' } + - { reg: '$sgpr8' } + - { reg: '$sgpr9' } + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $vgpr0, $vgpr1 + + %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1 + %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3 + %2:sgpr_128 = REG_SEQUENCE $sgpr8, %subreg.sub0, $sgpr9, %subreg.sub1, $sgpr10, %subreg.sub2, $sgpr11, %subreg.sub3 + + ; X4_IMM + %3000:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 0, 0 + %3001:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 16, 0 + %3002:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 32, 0 + %3003:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 48, 0 + %3004:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 64, 0 + + ; X8_IMM + %3005:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 80, 0 + %3006:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 96, 0 + %3007:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 112, 0 + %3008:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 128, 0 + %3009:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 144, 0 + + ; X16_IMM + %30010:sgpr_512 = S_BUFFER_LOAD_DWORDX16_IMM %2:sgpr_128, 160, 0 + + ; X4_SGPR + %50:sgpr_32 = COPY $sgpr0 + %30011:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR %2:sgpr_128, %50, 0 + + ; X8_SGPR + %51:sgpr_32 = COPY $sgpr1 + %30012:sgpr_256 = S_BUFFER_LOAD_DWORDX8_SGPR %2:sgpr_128, %51, 0 + + ; X16_SGPR + %52:sgpr_32 = COPY $sgpr2 + %30013:sgpr_512 = S_BUFFER_LOAD_DWORDX16_SGPR %2:sgpr_128, %52, 0 + + %30014:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 224, 0 + %30015:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 240, 0 + %30016:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 256, 0 + %30017:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 272, 0 + %30018:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 288, 0 + %30019:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 304, 0 + %30020:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 320, 0 + %30021:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 336, 0 + %30022:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 352, 0 + %30023:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 368, 0 + %30024:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 384, 0 + %30025:sgpr_128 =
S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 400, 0 + %30026:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 416, 0 + %30027:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 432, 0 + %30028:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 448, 0 + %30029:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 464, 0 + %30030:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 480, 0 + %30031:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 496, 0 + %30032:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 512, 0 + %30033:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 528, 0 + %30034:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 544, 0 + %30035:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 560, 0 + %30036:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 576, 0 + %30037:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 592, 0 + %30038:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 608, 0 + %30039:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 624, 0 + %30040:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 640, 0 + %30041:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 656, 0 + %30042:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 672, 0 + %30043:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 688, 0 + %30044:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 704, 0 + %30045:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 720, 0 + %30046:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 736, 0 + %30047:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 752, 0 + %30048:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 768, 0 + %30049:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 784, 0 + %30050:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 800, 0 + %30051:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 816, 0 + %30052:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 832, 0 + %30053:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 848, 0 + %30054:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 864, 0 + %30055:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 880, 0 + %30056:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 896, 0 + %30057:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 912, 0 + %30058:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 928, 0 + %30059:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 944, 0 + %30060:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 960, 0 + %30061:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 976, 0 + %30062:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 992, 0 + %30063:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 1008, 0 + + %100:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %101:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %102:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %103:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %104:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %105:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %106:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %107:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %108:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %109:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1010:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1011:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1012:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1013:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1014:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1015:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1016:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1017:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1018:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1019:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + 
%1020:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1021:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1022:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1023:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1024:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1025:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1026:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1027:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1028:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1029:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1030:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1031:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1032:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1033:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1034:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1035:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1036:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1037:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1038:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1039:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1040:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1041:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1042:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1043:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1044:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1045:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1046:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1047:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1048:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1049:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1050:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1051:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1052:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1053:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1054:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1055:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1056:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1057:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1058:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1059:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1060:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1061:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1062:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1063:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + + + %8000:vgpr_32 = IMPLICIT_DEF + %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %116:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + %8001:vgpr_32 = COPY %8000 + S_BRANCH %bb.2 + + bb.2: + + %3:vgpr_32 = IMPLICIT_DEF + ;========================================================================== + ; X4_IMM, Using .x + S_BUFFER_STORE_DWORD_IMM %3000.sub0, %1:sgpr_128, 0, 0 + S_BUFFER_STORE_DWORD_IMM %3000.sub0, %1:sgpr_128, 4, 0 ; Do it a second time, since the lane reduction triggers on clone, and clone only happens when there are multiple uses. 
+ + ; X4_IMM, Using .xy + S_BUFFER_STORE_DWORD_IMM %3001.sub0, %1:sgpr_128, 16, 0 + S_BUFFER_STORE_DWORD_IMM %3001.sub1, %1:sgpr_128, 20, 0 + + ; X4_IMM, Using .xyz + S_BUFFER_STORE_DWORD_IMM %3002.sub0, %1:sgpr_128, 32, 0 + S_BUFFER_STORE_DWORD_IMM %3002.sub1, %1:sgpr_128, 36, 0 + S_BUFFER_STORE_DWORD_IMM %3002.sub2, %1:sgpr_128, 40, 0 + + ; X4_IMM, Using .yz + S_BUFFER_STORE_DWORD_IMM %3003.sub1, %1:sgpr_128, 48, 0 + S_BUFFER_STORE_DWORD_IMM %3003.sub2, %1:sgpr_128, 52, 0 + + ; X4_IMM, Using .yzw + S_BUFFER_STORE_DWORD_IMM %3004.sub1, %1:sgpr_128, 64, 0 + S_BUFFER_STORE_DWORD_IMM %3004.sub2, %1:sgpr_128, 68, 0 + S_BUFFER_STORE_DWORD_IMM %3004.sub3, %1:sgpr_128, 72, 0 + + ;========================================================================== + ; X8_IMM, Using .x + S_BUFFER_STORE_DWORD_IMM %3005.sub0, %1:sgpr_128, 80, 0 + S_BUFFER_STORE_DWORD_IMM %3005.sub0, %1:sgpr_128, 84, 0 + + ; X8_IMM, Using .xy + S_BUFFER_STORE_DWORD_IMM %3006.sub0, %1:sgpr_128, 96, 0 + S_BUFFER_STORE_DWORD_IMM %3006.sub1, %1:sgpr_128, 100, 0 + + ; X8_IMM, Using .xyz + S_BUFFER_STORE_DWORD_IMM %3007.sub0, %1:sgpr_128, 112, 0 + S_BUFFER_STORE_DWORD_IMM %3007.sub1, %1:sgpr_128, 116, 0 + S_BUFFER_STORE_DWORD_IMM %3007.sub2, %1:sgpr_128, 120, 0 + + ; X8_IMM, Using .xyzw + S_BUFFER_STORE_DWORD_IMM %3008.sub0, %1:sgpr_128, 128, 0 + S_BUFFER_STORE_DWORD_IMM %3008.sub1, %1:sgpr_128, 132, 0 + S_BUFFER_STORE_DWORD_IMM %3008.sub2, %1:sgpr_128, 136, 0 + S_BUFFER_STORE_DWORD_IMM %3008.sub3, %1:sgpr_128, 140, 0 + + ; X8_IMM, Using .xyzw + 5th dword + S_BUFFER_STORE_DWORD_IMM %3009.sub0, %1:sgpr_128, 144, 0 + S_BUFFER_STORE_DWORD_IMM %3009.sub1, %1:sgpr_128, 148, 0 + S_BUFFER_STORE_DWORD_IMM %3009.sub2, %1:sgpr_128, 152, 0 + S_BUFFER_STORE_DWORD_IMM %3009.sub3, %1:sgpr_128, 156, 0 + S_BUFFER_STORE_DWORD_IMM %3009.sub4, %1:sgpr_128, 160, 0 + + ;========================================================================== + ; X16_IMM, Using .xy and .zw + S_BUFFER_STORE_DWORDX2_IMM %30010.sub0_sub1, %1:sgpr_128, 160, 0 + S_BUFFER_STORE_DWORDX2_IMM %30010.sub2_sub3, %1:sgpr_128, 164, 0 + + ;========================================================================== + ; X4_SGPR, Using .x + S_BUFFER_STORE_DWORD_IMM %30011.sub0, %1:sgpr_128, 176, 0 + S_BUFFER_STORE_DWORD_IMM %30011.sub0, %1:sgpr_128, 180, 0 + + ; X8_SGPR, Using .xy + S_BUFFER_STORE_DWORD_IMM %30012.sub0, %1:sgpr_128, 192, 0 + S_BUFFER_STORE_DWORD_IMM %30012.sub1, %1:sgpr_128, 196, 0 + + ; X16_SGPR, Using .xy + .zw + S_BUFFER_STORE_DWORDX2_IMM %30013.sub0_sub1, %1:sgpr_128, 208, 0 + S_BUFFER_STORE_DWORDX2_IMM %30013.sub2_sub3, %1:sgpr_128, 216, 0 + + ;========================================================================== + S_BUFFER_STORE_DWORDX4_IMM killed %30014:sgpr_128, %1:sgpr_128, 224, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30015:sgpr_128, %1:sgpr_128, 240, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30016:sgpr_128, %1:sgpr_128, 256, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30017:sgpr_128, %1:sgpr_128, 272, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30018:sgpr_128, %1:sgpr_128, 288, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30019:sgpr_128, %1:sgpr_128, 304, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30020:sgpr_128, %1:sgpr_128, 320, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30021:sgpr_128, %1:sgpr_128, 336, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30022:sgpr_128, %1:sgpr_128, 352, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30023:sgpr_128, %1:sgpr_128, 368, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30024:sgpr_128, %1:sgpr_128, 384, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30025:sgpr_128, 
%1:sgpr_128, 400, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30026:sgpr_128, %1:sgpr_128, 416, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30027:sgpr_128, %1:sgpr_128, 432, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30028:sgpr_128, %1:sgpr_128, 448, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30029:sgpr_128, %1:sgpr_128, 464, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30030:sgpr_128, %1:sgpr_128, 480, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30031:sgpr_128, %1:sgpr_128, 496, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30032:sgpr_128, %1:sgpr_128, 512, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30033:sgpr_128, %1:sgpr_128, 528, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30034:sgpr_128, %1:sgpr_128, 544, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30035:sgpr_128, %1:sgpr_128, 560, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30036:sgpr_128, %1:sgpr_128, 576, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30037:sgpr_128, %1:sgpr_128, 592, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30038:sgpr_128, %1:sgpr_128, 608, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30039:sgpr_128, %1:sgpr_128, 624, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30040:sgpr_128, %1:sgpr_128, 640, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30041:sgpr_128, %1:sgpr_128, 656, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30042:sgpr_128, %1:sgpr_128, 672, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30043:sgpr_128, %1:sgpr_128, 688, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30044:sgpr_128, %1:sgpr_128, 704, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30045:sgpr_128, %1:sgpr_128, 720, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30046:sgpr_128, %1:sgpr_128, 736, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30047:sgpr_128, %1:sgpr_128, 752, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30048:sgpr_128, %1:sgpr_128, 768, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30049:sgpr_128, %1:sgpr_128, 784, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30050:sgpr_128, %1:sgpr_128, 800, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30051:sgpr_128, %1:sgpr_128, 816, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30052:sgpr_128, %1:sgpr_128, 832, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30053:sgpr_128, %1:sgpr_128, 848, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30054:sgpr_128, %1:sgpr_128, 864, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30055:sgpr_128, %1:sgpr_128, 880, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30056:sgpr_128, %1:sgpr_128, 896, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30057:sgpr_128, %1:sgpr_128, 912, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30058:sgpr_128, %1:sgpr_128, 928, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30059:sgpr_128, %1:sgpr_128, 944, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30060:sgpr_128, %1:sgpr_128, 960, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30061:sgpr_128, %1:sgpr_128, 976, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30062:sgpr_128, %1:sgpr_128, 992, 0 + S_BUFFER_STORE_DWORDX4_IMM killed %30063:sgpr_128, %1:sgpr_128, 1008, 0 + + EXP 0, killed %100, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %101, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %102, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %103, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %104, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %105, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %106, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %107, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %108, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %109, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1010, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1011, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1012, 
%3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1013, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1014, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1015, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1016, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1017, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1018, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1019, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1020, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1021, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1022, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1023, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1024, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1025, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1026, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1027, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1028, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1029, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1030, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1031, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1032, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1033, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1034, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1035, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1036, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1037, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1038, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1039, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1040, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1041, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1042, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1043, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1044, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1045, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1046, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1047, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1048, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1049, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1050, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1051, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1052, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1053, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1054, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1055, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1056, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1057, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1058, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1059, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1060, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1061, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1062, %3, %3, %3, -1, -1, 15, implicit $exec + EXP 0, killed %1063, %3, %3, %3, -1, -1, 15, implicit $exec + + + S_ENDPGM 0 +... + + + +