Skip to content

Commit 4c59238

Browse files
vladimirradosavljevic authored and akiramenai committed
[EraVM] Add support for splitting live ranges of PHI nodes in loops
This is useful for loops with large switch statements where PHI nodes are used frequently, and we want to keep these variables in a single register. Signed-off-by: Vladimir Radosavljevic <[email protected]>
1 parent 72097e2 commit 4c59238

File tree

3 files changed

+258
-10
lines changed

3 files changed

+258
-10
lines changed

llvm/lib/Target/EraVM/EraVMPostCodegenPrepare.cpp

Lines changed: 251 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,12 @@
1919

2020
#include "EraVM.h"
2121
#include "llvm/ADT/STLExtras.h"
22+
#include "llvm/Analysis/LoopInfo.h"
23+
#include "llvm/IR/Dominators.h"
2224
#include "llvm/IR/IRBuilder.h"
2325
#include "llvm/IR/InstrTypes.h"
2426
#include "llvm/IR/PatternMatch.h"
27+
#include "llvm/InitializePasses.h"
2528

2629
using namespace llvm;
2730
using namespace llvm::PatternMatch;
@@ -30,6 +33,14 @@ using namespace llvm::PatternMatch;
3033
#define ERAVM_POST_CODEGEN_PREPARE_NAME \
3134
"EraVM optimizations after CodeGenPrepare pass"
3235

36+
// Switch for the loop-PHI live-range splitting. splitLoopPHILiveRanges()
// returns false without touching the function when this option is false.
// Defaults to true.
static cl::opt<bool> EnableSplitLoopPHILiveRanges(
37+
"eravm-enable-split-loop-phi-live-ranges", cl::Hidden, cl::init(true),
38+
cl::desc("Enable splitting live ranges of PHI nodes in loops"));
39+
40+
// Use-count threshold for the splitting. A loop-header PHI is processed only
// when its number of uses is strictly greater than this value (PHIs with a
// use count less than or equal to it are skipped). Defaults to 20.
static cl::opt<unsigned> NumOfPHIUsesToSplitLiveRanges(
41+
"eravm-num-of-phi-uses-to-split-live-ranges", cl::Hidden, cl::init(20),
42+
cl::desc("Number of uses of PHI node to consider splitting live ranges"));
43+
3344
namespace {
3445
struct EraVMPostCodegenPrepare : public FunctionPass {
3546
public:
@@ -44,6 +55,9 @@ struct EraVMPostCodegenPrepare : public FunctionPass {
4455
}
4556

4657
void getAnalysisUsage(AnalysisUsage &AU) const override {
58+
AU.addRequired<DominatorTreeWrapperPass>();
59+
AU.addRequired<LoopInfoWrapperPass>();
60+
AU.addPreserved<LoopInfoWrapperPass>();
4761
FunctionPass::getAnalysisUsage(AU);
4862
}
4963
};
@@ -303,7 +317,229 @@ static bool rearrangeOverflowHandlingBranches(Function &F) {
303317
return Changed;
304318
}
305319

306-
static bool runImpl(Function &F) {
320+
struct InstSplitInfo {
321+
using InstInfo = std::pair<Instruction *, APInt>;
322+
// Vector of instructions with their original constant operand.
323+
SmallVector<InstInfo, 4> Insts;
324+
// Index of the last instruction that will be used to split the live range
325+
// in the dominatee blocks.
326+
uint64_t LastSplitIdx = UINT64_MAX;
327+
328+
// Add instruction and update the last split index if needed.
329+
void addInst(Instruction *I) {
330+
assert(isa<ConstantInt>(I->getOperand(1)) && "Expected constant operand");
331+
332+
// Update the index if this instruction is closer to the end of the block.
333+
if (LastSplitIdx == UINT64_MAX ||
334+
!I->comesBefore(Insts[LastSplitIdx].first))
335+
LastSplitIdx = Insts.size();
336+
337+
Insts.push_back({I, cast<ConstantInt>(I->getOperand(1))->getValue()});
338+
}
339+
340+
// Get the last instruction that will be used to split the live range.
341+
InstInfo &getLastSplitInst() {
342+
assert(LastSplitIdx < Insts.size() && "Expected valid index");
343+
return Insts[LastSplitIdx];
344+
}
345+
};
346+
347+
// Update operands of instruction in InstInfo with the instruction in
348+
// DomInstInfo. Basically, we are doing the following transformation:
349+
// %Dominator = add %In, DomImm
350+
// %I = add %In, Imm
351+
// to
352+
// %Dominator = add %In, DomImm
353+
// %I = add %Dominator, Imm - DomImm
354+
void updateInstOperands(InstSplitInfo::InstInfo &InstInfo,
355+
InstSplitInfo::InstInfo &DomInstInfo,
356+
const DominatorTree &DT) {
357+
auto [Inst, Imm] = InstInfo;
358+
auto [DomInst, DomImm] = DomInstInfo;
359+
assert(DT.dominates(DomInst, Inst) && "Expected dominator instruction");
360+
361+
Inst->setOperand(0, DomInst);
362+
Inst->setOperand(1, ConstantInt::get(Inst->getType(), Imm - DomImm));
363+
364+
// TODO: Relax this check, since we don't need to drop poison
365+
// flags in all cases.
366+
Inst->dropPoisonGeneratingFlags();
367+
}
368+
369+
// This function splits PHI nodes live ranges, if users are add instructions
370+
// with a constant operand. This is useful for loops with large switch
371+
// statements where PHI nodes are used frequently, and we want to keep these
372+
// variables in a single register.
373+
// In case regalloc is not able to keep these variables in a single register,
374+
// we will get something like this in all cases of the switch where variable
375+
// is used:
376+
// preheader:
377+
// %r1 = def
378+
// header:
379+
// jump @JTI
380+
// ...
381+
// bb1:
382+
// %r2 = add %r1, 1
383+
// bcc bb2
384+
// bb2:
385+
// %r1 = copy %r2 <- regalloc is not able to keep variable in the same reg
386+
// b latch
387+
// ...
388+
// latch:
389+
// bcc header
390+
//
391+
// Ideally, we would like to have something like this:
392+
// preheader:
393+
// %r1 = def
394+
// header:
395+
// jump @JTI
396+
// ...
397+
// bb1:
398+
// %r1 = add %r1, 1 <- regalloc managed to keep variable in the same reg
399+
// bcc bb2
400+
// bb2:
401+
// <- no need for copy instruction
402+
// b latch
403+
// ...
404+
// latch:
405+
// bcc header
406+
//
407+
// To help regalloc to try to preserve frequently used PHI nodes in a single
408+
// register we are finding add instructions with constant operands that are
409+
// users of the PHI, and changing first operand of the add instruction to the
410+
// nearest dominating add instruction while updating the constant operand. This
411+
// way, regalloc will have a better chance to keep the variable in the same
412+
// register, since we changed the intervals of the variable.
413+
// For example, we are transforming this:
414+
// header:
415+
// %phi = phi
416+
// ...
417+
// bb1:
418+
// %add1 = add %phi, 64
419+
// ...
420+
// %add2 = add %phi, -64
421+
// bcc bb2
422+
// bb2:
423+
// %add3 = add %phi, -32
424+
// ...
425+
// %add4 = add %phi, -96
426+
// ...
427+
// b latch
428+
// latch:
429+
// bcc header
430+
//
431+
// To this (where add3 and add4 are updated with the nearest dominating add,
432+
// which is add2):
433+
// header:
434+
// %phi = phi
435+
// ...
436+
// bb1:
437+
// %add1 = add %phi, 64
438+
// ...
439+
// %add2 = add %phi, -64
440+
// bcc bb2
441+
// bb2:
442+
// %add3 = add %add2, 32
443+
// ...
444+
// %add4 = add %add2, -32
445+
// ...
446+
// b latch
447+
// latch:
448+
// bcc header
449+
//
450+
// In order to do so, we are doing the following steps:
451+
// 1. Find all users of the PHI node in the loop that are add instructions
452+
// with constant operands, and updating the index of the instruction that
453+
// is closer to the end of the block. This instruction will be used to
454+
// split ranges in the dominatee blocks.
455+
// 2. For each block, we are finding the nearest dominator block from which
456+
// we can split the live range.
457+
// 3. Update the instructions in the block with the nearest dominator by
458+
// changing the first operands to the dominator instruction and updating
459+
// the constant operands.
460+
/// Split the live range of \p Phi, which must be a PHI node in the header of
/// \p L. Returns true if at least one instruction was rewritten.
static bool splitPHILiveRange(PHINode &Phi, const LoopInfo &LI, const Loop &L,
                              const DominatorTree &DT) {
  assert(Phi.getParent() == L.getHeader() &&
         "Expected PHI node in a loop header");
  DomTreeNode *LoopHeaderNode = DT.getNode(L.getHeader());
  if (!LoopHeaderNode)
    return false;

  // Step 1: group the candidate users of the PHI by their parent block.
  DenseMap<BasicBlock *, InstSplitInfo> Splits;
  for (auto *U : Phi.users()) {
    // Only collect add instructions with a constant second operand whose
    // block has L as the loop returned by LoopInfo. dyn_cast (rather than
    // cast, which never returns null) makes the null check meaningful.
    auto *UI = dyn_cast<Instruction>(U);
    if (!UI || UI->getOpcode() != Instruction::Add ||
        !isa<ConstantInt>(UI->getOperand(1)) ||
        LI.getLoopFor(UI->getParent()) != &L)
      continue;
    Splits[UI->getParent()].addInst(UI);
  }

  // With fewer than two blocks there is nothing to split, since we need both
  // a dominator block and a dominatee block.
  if (Splits.size() < 2)
    return false;

  // Steps 2 and 3: for every block, find the nearest dominator block that
  // also has candidates and rebase this block's instructions on it.
  //
  // The DenseMap iteration order is unspecified, but the result does not
  // depend on it: updateInstOperands() works from the constants captured by
  // addInst(), not from the (possibly already rewritten) current operands.
  bool Changed = false;
  for (auto &[BB, Infos] : Splits) {
    DomTreeNode *Node = DT.getNode(BB);
    if (!Node)
      continue;

    // Walk up the dominator tree until a block with candidates is found.
    InstSplitInfo::InstInfo *NearestDominator = nullptr;
    while ((Node = Node->getIDom())) {
      auto I = Splits.find(Node->getBlock());
      if (I != Splits.end()) {
        // We found the nearest dominator block, so take the last
        // instruction from it.
        NearestDominator = &I->second.getLastSplitInst();
        break;
      }

      // Bail out if we reached the start of the loop.
      if (Node == LoopHeaderNode)
        break;
    }

    // If we didn't find any dominator, skip this BB.
    if (!NearestDominator)
      continue;

    // Update instructions in the block with the nearest dominator.
    for (auto &Info : Infos.Insts)
      updateInstOperands(Info, *NearestDominator, DT);

    // TODO: Relax this check, since we don't need to drop poison
    // flags in all cases.
    NearestDominator->first->dropPoisonGeneratingFlags();
    Changed = true;
  }
  return Changed;
}
523+
524+
// This optimization tries to split live ranges of PHI nodes in a loop,
525+
// with a large number of users.
526+
static bool splitLoopPHILiveRanges(Function &F, LoopInfo &LI,
527+
DominatorTree &DT) {
528+
if (!EnableSplitLoopPHILiveRanges)
529+
return false;
530+
531+
bool Changed = false;
532+
for (auto *L : LI) {
533+
for (auto &Phi : L->getHeader()->phis()) {
534+
if (Phi.getNumUses() <= NumOfPHIUsesToSplitLiveRanges)
535+
continue;
536+
Changed |= splitPHILiveRange(Phi, LI, *L, DT);
537+
}
538+
}
539+
return Changed;
540+
}
541+
542+
static bool runImpl(Function &F, LoopInfo &LI, DominatorTree &DT) {
307543
bool Changed = false;
308544
for (auto &BB : F) {
309545
for (auto &I : llvm::make_early_inc_range(BB)) {
@@ -320,28 +556,38 @@ static bool runImpl(Function &F) {
320556
}
321557
}
322558

559+
Changed |= splitLoopPHILiveRanges(F, LI, DT);
323560
Changed |= rearrangeOverflowHandlingBranches(F);
324561
return Changed;
325562
}
326563

327564
// Legacy pass manager entry point. Returns false without doing anything when
// skipFunction(F) is true; otherwise fetches LoopInfo and the dominator tree
// from the wrapper passes and returns the result of runImpl().
// NOTE(review): this span is a rendered diff. The line after the `330-`
// marker appears to be the pre-commit call that the `570+` line replaces.
bool EraVMPostCodegenPrepare::runOnFunction(Function &F) {
328565
if (skipFunction(F))
329566
return false;
330-
return runImpl(F);
567+
568+
auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
569+
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
570+
return runImpl(F, LI, DT);
331571
}
332572

333573
char EraVMPostCodegenPrepare::ID = 0;
334574

335-
INITIALIZE_PASS(EraVMPostCodegenPrepare, DEBUG_TYPE,
336-
ERAVM_POST_CODEGEN_PREPARE_NAME, false, false)
575+
INITIALIZE_PASS_BEGIN(EraVMPostCodegenPrepare, DEBUG_TYPE,
576+
ERAVM_POST_CODEGEN_PREPARE_NAME, false, false)
577+
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
578+
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
579+
INITIALIZE_PASS_END(EraVMPostCodegenPrepare, DEBUG_TYPE,
580+
ERAVM_POST_CODEGEN_PREPARE_NAME, false, false)
337581

338582
// Factory for the legacy pass: returns a newly allocated
// EraVMPostCodegenPrepare instance.
FunctionPass *llvm::createEraVMPostCodegenPreparePass() {
339583
return new EraVMPostCodegenPrepare();
340584
}
341585

342586
// New pass manager entry point. Requests the LoopAnalysis and
// DominatorTreeAnalysis results for F and passes them to runImpl(). Reports
// PreservedAnalyses::none() when runImpl() returned true and
// PreservedAnalyses::all() otherwise.
// NOTE(review): this span is a rendered diff. The line after the `344-`
// marker appears to be the pre-commit condition that the `590+` line replaces.
PreservedAnalyses
343587
EraVMPostCodegenPreparePass::run(Function &F, FunctionAnalysisManager &AM) {
344-
if (runImpl(F))
588+
auto &LI = AM.getResult<LoopAnalysis>(F);
589+
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
590+
if (runImpl(F, LI, DT))
345591
return PreservedAnalyses::none();
346592
return PreservedAnalyses::all();
347593
}

llvm/test/CodeGen/EraVM/O3-pipeline.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,8 @@ target triple = "eravm"
9090
; CHECK-NEXT: Natural Loop Information
9191
; CHECK-NEXT: TLS Variable Hoist
9292
; CHECK-NEXT: CodeGen Prepare
93+
; CHECK-NEXT: Dominator Tree Construction
94+
; CHECK-NEXT: Natural Loop Information
9395
; CHECK-NEXT: EraVM optimizations after CodeGenPrepare pass
9496
; CHECK-NEXT: Prepare callbr
9597
; CHECK-NEXT: Safe Stack instrumentation pass

llvm/test/CodeGen/EraVM/split-loop-phi-live-ranges.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
2-
; RUN: opt -passes=eravm-post-codegen-prepare -S < %s | FileCheck %s
2+
; RUN: opt -passes=eravm-post-codegen-prepare -eravm-num-of-phi-uses-to-split-live-ranges=2 -S < %s | FileCheck %s
33

44
target datalayout = "E-p:256:256-i256:256:256-S32-a:256:256"
55
target triple = "eravm"
@@ -29,19 +29,19 @@ define i256 @test() {
2929
; CHECK-NEXT: [[ADD2:%.*]] = add nuw nsw i256 [[RESULT]], 64
3030
; CHECK-NEXT: [[INTTOPTR0:%.*]] = inttoptr i256 [[ADD2]] to ptr addrspace(1)
3131
; CHECK-NEXT: store i256 0, ptr addrspace(1) [[INTTOPTR0]], align 1
32-
; CHECK-NEXT: [[ADD3:%.*]] = add nuw nsw i256 [[RESULT]], -64
32+
; CHECK-NEXT: [[ADD3:%.*]] = add i256 [[RESULT]], -64
3333
; CHECK-NEXT: [[INTTOPTR1:%.*]] = inttoptr i256 [[ADD3]] to ptr addrspace(1)
3434
; CHECK-NEXT: store i256 0, ptr addrspace(1) [[INTTOPTR1]], align 1
3535
; CHECK-NEXT: [[CMP1:%.*]] = icmp ugt i256 [[RESULT]], 1000
3636
; CHECK-NEXT: br i1 [[CMP1]], label [[CASE2_BB1:%.*]], label [[EXIT]]
3737
; CHECK: case2_bb1:
38-
; CHECK-NEXT: [[ADD4:%.*]] = add nuw nsw i256 [[RESULT]], -32
38+
; CHECK-NEXT: [[ADD4:%.*]] = add i256 [[ADD3]], 32
3939
; CHECK-NEXT: [[INTTOPTR2:%.*]] = inttoptr i256 [[ADD4]] to ptr addrspace(1)
4040
; CHECK-NEXT: store i256 0, ptr addrspace(1) [[INTTOPTR2]], align 1
41-
; CHECK-NEXT: [[ADD5:%.*]] = add nuw nsw i256 [[RESULT]], -96
41+
; CHECK-NEXT: [[ADD5:%.*]] = add i256 [[ADD3]], -32
4242
; CHECK-NEXT: [[INTTOPTR3:%.*]] = inttoptr i256 [[ADD5]] to ptr addrspace(1)
4343
; CHECK-NEXT: store i256 0, ptr addrspace(1) [[INTTOPTR3]], align 1
44-
; CHECK-NEXT: [[ADD6:%.*]] = add nuw nsw i256 [[RESULT]], 128
44+
; CHECK-NEXT: [[ADD6:%.*]] = add i256 [[ADD3]], 192
4545
; CHECK-NEXT: [[INTTOPTR4:%.*]] = inttoptr i256 [[ADD6]] to ptr addrspace(1)
4646
; CHECK-NEXT: store i256 0, ptr addrspace(1) [[INTTOPTR4]], align 1
4747
; CHECK-NEXT: br label [[INCREMENT]]

0 commit comments

Comments
 (0)