[EraVM] Add support for splitting live ranges of PHI nodes in loops

vladimirradosavljevic · akiramenai · commit 4c59238f8453 · 2024-09-25T20:56:34.000+02:00
This is useful for loops with large switch statements where
PHI nodes are used frequently, and we want to keep these
variables in a single register.

Signed-off-by: Vladimir Radosavljevic <vr@matterlabs.dev>
diff --git a/llvm/lib/Target/EraVM/EraVMPostCodegenPrepare.cpp b/llvm/lib/Target/EraVM/EraVMPostCodegenPrepare.cpp
@@ -19,9 +19,12 @@
 
 #include "EraVM.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
 
 using namespace llvm;
 using namespace llvm::PatternMatch;
@@ -30,6 +33,14 @@ using namespace llvm::PatternMatch;
 #define ERAVM_POST_CODEGEN_PREPARE_NAME                                        \
   "EraVM optimizations after CodeGenPrepare pass"
 
+static cl::opt<bool> EnableSplitLoopPHILiveRanges(
+    "eravm-enable-split-loop-phi-live-ranges", cl::Hidden, cl::init(true),
+    cl::desc("Enable splitting live ranges of PHI nodes in loops"));
+// Only PHI nodes with strictly more uses than this value are considered.
+static cl::opt<unsigned> NumOfPHIUsesToSplitLiveRanges(
+    "eravm-num-of-phi-uses-to-split-live-ranges", cl::Hidden, cl::init(20),
+    cl::desc("Number of uses of PHI node to consider splitting live ranges"));
+
 namespace {
 struct EraVMPostCodegenPrepare : public FunctionPass {
 public:
@@ -44,6 +55,9 @@ struct EraVMPostCodegenPrepare : public FunctionPass {
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DominatorTreeWrapperPass>(); // Dominator lookups.
+    AU.addRequired<LoopInfoWrapperPass>();      // Loop headers to scan.
+    AU.addPreserved<LoopInfoWrapperPass>();
     FunctionPass::getAnalysisUsage(AU);
   }
 };
@@ -303,7 +317,229 @@ static bool rearrangeOverflowHandlingBranches(Function &F) {
   return Changed;
 }
 
-static bool runImpl(Function &F) {
+struct InstSplitInfo {
+  using InstInfo = std::pair<Instruction *, APInt>;
+  // Adds of one basic block, each paired with its original constant operand.
+  SmallVector<InstInfo, 4> Insts;
+  // Index of the last instruction that will be used to split the live range
+  // in the dominatee blocks. UINT64_MAX means no instruction was added yet.
+  uint64_t LastSplitIdx = UINT64_MAX;
+
+  // Record I (operand 1 must be a ConstantInt) and refresh LastSplitIdx.
+  void addInst(Instruction *I) {
+    assert(isa<ConstantInt>(I->getOperand(1)) && "Expected constant operand");
+
+    // Take over the index unless I comes before the current last split inst.
+    if (LastSplitIdx == UINT64_MAX ||
+        !I->comesBefore(Insts[LastSplitIdx].first))
+      LastSplitIdx = Insts.size();
+
+    Insts.push_back({I, cast<ConstantInt>(I->getOperand(1))->getValue()});
+  }
+
+  // Get the recorded instruction that is closest to the end of the block.
+  InstInfo &getLastSplitInst() {
+    assert(LastSplitIdx < Insts.size() && "Expected valid index");
+    return Insts[LastSplitIdx];
+  }
+};
+
+// Rebase the add in InstInfo onto the dominating add in DomInstInfo. Both
+// constants are the original offsets from the common input %In, so:
+//   %Dominator = add %In, DomImm
+//   %I         = add %In, Imm
+// becomes
+//   %Dominator = add %In, DomImm
+//   %I         = add %Dominator, Imm - DomImm
+static void updateInstOperands(const InstSplitInfo::InstInfo &InstInfo,
+                               const InstSplitInfo::InstInfo &DomInstInfo,
+                               const DominatorTree &DT) {
+  const auto &[Inst, Imm] = InstInfo;
+  const auto &[DomInst, DomImm] = DomInstInfo;
+  assert(DT.dominates(DomInst, Inst) && "Expected dominator instruction");
+
+  Inst->setOperand(0, DomInst);
+  Inst->setOperand(1, ConstantInt::get(Inst->getType(), Imm - DomImm));
+
+  // TODO: Relax this check, since we don't need to drop poison
+  // flags in all cases.
+  Inst->dropPoisonGeneratingFlags();
+}
+
+// This function splits PHI nodes live ranges, if users are add instructions
+// with a constant operand. This is useful for loops with large switch
+// statements where PHI nodes are used frequently, and we want to keep these
+// variables in a single register.
+// In case regalloc is not able to keep these variables in a single register,
+// we will get something like this in all cases of the switch where variable
+// is used:
+//   preheader:
+//     %r1 = def
+//   header:
+//     jump @JTI
+//   ...
+//   bb1:
+//     %r2 = add %r1, 1
+//     bcc bb2
+//   bb2:
+//     %r1 = copy %r2 <- regalloc is not able to keep variable in the same reg
+//     b latch
+//   ...
+//   latch:
+//     bcc header
+//
+// Ideally, we would like to have something like this:
+//   preheader:
+//     %r1 = def
+//   header:
+//     jump @JTI
+//   ...
+//   bb1:
+//     %r1 = add %r1, 1 <- regalloc managed to keep variable in the same reg
+//     bcc bb2
+//   bb2:
+//                      <- no need for copy instruction
+//     b latch
+//   ...
+//   latch:
+//     bcc header
+//
+// To help regalloc to try to preserve frequently used PHI nodes in a single
+// register we are finding add instructions with constant operands that are
+// users of the PHI, and changing first operand of the add instruction to the
+// nearest dominating add instruction while updating the constant operand. This
+// way, regalloc will have a better chance to keep the variable in the same
+// register, since we changed the intervals of the variable.
+// For example, we are transforming this:
+//   header:
+//     %phi = phi
+//     ...
+//   bb1:
+//     %add1 = add %phi, 64
+//     ...
+//     %add2 = add %phi, -64
+//     bcc bb2
+//   bb2:
+//     %add3 = add %phi, -32
+//     ...
+//     %add4 = add %phi, -96
+//     ...
+//     b latch
+//   latch:
+//     bcc header
+//
+// To this (where add3 and add4 are updated with the nearest dominating add,
+// which is add2):
+//   header:
+//     %phi = phi
+//     ...
+//   bb1:
+//     %add1 = add %phi, 64
+//     ...
+//     %add2 = add %phi, -64
+//     bcc bb2
+//   bb2:
+//     %add3 = add %add2, 32
+//     ...
+//     %add4 = add %add2, -32
+//     ...
+//     b latch
+//   latch:
+//     bcc header
+//
+// In order to do so, we are doing the following steps:
+//   1. Find all users of the PHI node in the loop that are add instructions
+//      with constant operands and, for each block, keep track of the one
+//      that is closest to the end of the block. This instruction will be
+//      used to split ranges in the dominatee blocks.
+//   2. For each block, find the nearest dominator block from which we can
+//      split the live range.
+//   3. Update the instructions in the block with the nearest dominator by
+//      changing the first operands to the dominator instruction and updating
+//      the constant operands.
+static bool splitPHILiveRange(PHINode &Phi, const LoopInfo &LI, const Loop &L,
+                              const DominatorTree &DT) {
+  assert(Phi.getParent() == L.getHeader() &&
+         "Expected PHI node in a loop header");
+  DomTreeNode *LoopHeaderNode = DT.getNode(L.getHeader());
+  if (!LoopHeaderNode)
+    return false;
+
+  DenseMap<BasicBlock *, InstSplitInfo> Splits;
+  for (auto *U : Phi.users()) {
+    // Only collect add instructions with constant operands.
+    auto *UI = dyn_cast<Instruction>(U);
+    if (!UI || UI->getOpcode() != Instruction::Add ||
+        !isa<ConstantInt>(UI->getOperand(1)) ||
+        LI.getLoopFor(UI->getParent()) != &L)
+      continue;
+    Splits[UI->getParent()].addInst(UI);
+  }
+
+  // If there are fewer than two blocks, we can't split the live range,
+  // since we need dominator and dominatee blocks to do so.
+  if (Splits.size() < 2)
+    return false;
+
+  // Rebase the adds of each block onto the nearest dominating block.
+  bool Changed = false;
+  for (auto &[BB, Infos] : Splits) {
+    // Skip blocks without a dominator tree node. The header is skipped too:
+    // its dominators are outside of the loop, so none of them is in Splits.
+    DomTreeNode *Node = DT.getNode(BB);
+    if (!Node || Node == LoopHeaderNode)
+      continue;
+
+    // Find the nearest dominator, from which we can split the live range.
+    InstSplitInfo::InstInfo *NearestDominator = nullptr;
+    while ((Node = Node->getIDom())) {
+      auto I = Splits.find(Node->getBlock());
+      if (I != Splits.end()) {
+        // Nearest dominator block found; take its last instruction.
+        NearestDominator = &I->second.getLastSplitInst();
+        break;
+      }
+
+      // Bail out if we reached the start of the loop.
+      if (Node == LoopHeaderNode)
+        break;
+    }
+
+    // If we didn't find any dominator, skip this BB.
+    if (!NearestDominator)
+      continue;
+
+    // Update instructions in the block with the nearest dominator.
+    for (auto &Info : Infos.Insts)
+      updateInstOperands(Info, *NearestDominator, DT);
+
+    // TODO: Relax this check, since we don't need to drop poison
+    // flags in all cases.
+    NearestDominator->first->dropPoisonGeneratingFlags();
+    Changed = true;
+  }
+  return Changed;
+}
+
+// This optimization tries to split live ranges of PHI nodes in a loop,
+// with a large number of users. Nested loops are visited as well.
+static bool splitLoopPHILiveRanges(Function &F, LoopInfo &LI,
+                                   DominatorTree &DT) {
+  if (!EnableSplitLoopPHILiveRanges)
+    return false;
+
+  bool Changed = false;
+  for (auto *L : LI.getLoopsInPreorder()) {
+    for (auto &Phi : L->getHeader()->phis()) {
+      if (Phi.getNumUses() <= NumOfPHIUsesToSplitLiveRanges)
+        continue;
+      Changed |= splitPHILiveRange(Phi, LI, *L, DT);
+    }
+  }
+  return Changed;
+}
+
+static bool runImpl(Function &F, LoopInfo &LI, DominatorTree &DT) {
   bool Changed = false;
   for (auto &BB : F) {
     for (auto &I : llvm::make_early_inc_range(BB)) {
@@ -320,28 +556,38 @@ static bool runImpl(Function &F) {
     }
   }
 
+  Changed |= splitLoopPHILiveRanges(F, LI, DT);
   Changed |= rearrangeOverflowHandlingBranches(F);
   return Changed;
 }
 
 bool EraVMPostCodegenPrepare::runOnFunction(Function &F) {
   if (skipFunction(F))
     return false;
-  return runImpl(F);
+  // Both analyses are forwarded to runImpl for the loop PHI splitting.
+  auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  return runImpl(F, LI, DT);
 }
 
 char EraVMPostCodegenPrepare::ID = 0;
 
-INITIALIZE_PASS(EraVMPostCodegenPrepare, DEBUG_TYPE,
-                ERAVM_POST_CODEGEN_PREPARE_NAME, false, false)
+INITIALIZE_PASS_BEGIN(EraVMPostCodegenPrepare, DEBUG_TYPE,
+                      ERAVM_POST_CODEGEN_PREPARE_NAME, false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(EraVMPostCodegenPrepare, DEBUG_TYPE,
+                    ERAVM_POST_CODEGEN_PREPARE_NAME, false, false)
 
 FunctionPass *llvm::createEraVMPostCodegenPreparePass() {
   return new EraVMPostCodegenPrepare();
 }
 
 PreservedAnalyses
 EraVMPostCodegenPreparePass::run(Function &F, FunctionAnalysisManager &AM) {
-  if (runImpl(F))
+  auto &LI = AM.getResult<LoopAnalysis>(F);          // Loop headers to scan.
+  auto &DT = AM.getResult<DominatorTreeAnalysis>(F); // Dominator lookups.
+  if (runImpl(F, LI, DT))
     return PreservedAnalyses::none();
   return PreservedAnalyses::all();
 }
diff --git a/llvm/test/CodeGen/EraVM/O3-pipeline.ll b/llvm/test/CodeGen/EraVM/O3-pipeline.ll
@@ -90,6 +90,8 @@ target triple = "eravm"
 ; CHECK-NEXT:       Natural Loop Information
 ; CHECK-NEXT:       TLS Variable Hoist
 ; CHECK-NEXT:       CodeGen Prepare
+; CHECK-NEXT:       Dominator Tree Construction
+; CHECK-NEXT:       Natural Loop Information
 ; CHECK-NEXT:       EraVM optimizations after CodeGenPrepare pass
 ; CHECK-NEXT:       Prepare callbr
 ; CHECK-NEXT:       Safe Stack instrumentation pass
diff --git a/llvm/test/CodeGen/EraVM/split-loop-phi-live-ranges.ll b/llvm/test/CodeGen/EraVM/split-loop-phi-live-ranges.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; RUN: opt -passes=eravm-post-codegen-prepare -S < %s | FileCheck %s
+; RUN: opt -passes=eravm-post-codegen-prepare -eravm-num-of-phi-uses-to-split-live-ranges=2 -S < %s | FileCheck %s
 
 target datalayout = "E-p:256:256-i256:256:256-S32-a:256:256"
 target triple = "eravm"
@@ -29,19 +29,19 @@ define i256 @test() {
 ; CHECK-NEXT:    [[ADD2:%.*]] = add nuw nsw i256 [[RESULT]], 64
 ; CHECK-NEXT:    [[INTTOPTR0:%.*]] = inttoptr i256 [[ADD2]] to ptr addrspace(1)
 ; CHECK-NEXT:    store i256 0, ptr addrspace(1) [[INTTOPTR0]], align 1
-; CHECK-NEXT:    [[ADD3:%.*]] = add nuw nsw i256 [[RESULT]], -64
+; CHECK-NEXT:    [[ADD3:%.*]] = add i256 [[RESULT]], -64
 ; CHECK-NEXT:    [[INTTOPTR1:%.*]] = inttoptr i256 [[ADD3]] to ptr addrspace(1)
 ; CHECK-NEXT:    store i256 0, ptr addrspace(1) [[INTTOPTR1]], align 1
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i256 [[RESULT]], 1000
 ; CHECK-NEXT:    br i1 [[CMP1]], label [[CASE2_BB1:%.*]], label [[EXIT]]
 ; CHECK:       case2_bb1:
-; CHECK-NEXT:    [[ADD4:%.*]] = add nuw nsw i256 [[RESULT]], -32
+; CHECK-NEXT:    [[ADD4:%.*]] = add i256 [[ADD3]], 32
 ; CHECK-NEXT:    [[INTTOPTR2:%.*]] = inttoptr i256 [[ADD4]] to ptr addrspace(1)
 ; CHECK-NEXT:    store i256 0, ptr addrspace(1) [[INTTOPTR2]], align 1
-; CHECK-NEXT:    [[ADD5:%.*]] = add nuw nsw i256 [[RESULT]], -96
+; CHECK-NEXT:    [[ADD5:%.*]] = add i256 [[ADD3]], -32
 ; CHECK-NEXT:    [[INTTOPTR3:%.*]] = inttoptr i256 [[ADD5]] to ptr addrspace(1)
 ; CHECK-NEXT:    store i256 0, ptr addrspace(1) [[INTTOPTR3]], align 1
-; CHECK-NEXT:    [[ADD6:%.*]] = add nuw nsw i256 [[RESULT]], 128
+; CHECK-NEXT:    [[ADD6:%.*]] = add i256 [[ADD3]], 192
 ; CHECK-NEXT:    [[INTTOPTR4:%.*]] = inttoptr i256 [[ADD6]] to ptr addrspace(1)
 ; CHECK-NEXT:    store i256 0, ptr addrspace(1) [[INTTOPTR4]], align 1
 ; CHECK-NEXT:    br label [[INCREMENT]]