Skip to content

Commit 469702c

Browse files
authored
[LICM] Sink unused loop-invariant loads in preheader. (#157559)
Unused loop-invariant loads were not sunk from the preheader to the exit block, which increased their live ranges. This commit moves the sinkUnusedInvariants logic from IndVarSimplify to LICM, and also adds the ability to sink unused loads that are not clobbered by the loop body.
1 parent 8e6ef2d commit 469702c

36 files changed

+453
-323
lines changed

llvm/lib/Transforms/Scalar/IndVarSimplify.cpp

Lines changed: 0 additions & 85 deletions
Original file line number | Diff line number | Diff line change
@@ -162,8 +162,6 @@ class IndVarSimplify {
162162
const SCEV *ExitCount,
163163
PHINode *IndVar, SCEVExpander &Rewriter);
164164

165-
bool sinkUnusedInvariants(Loop *L);
166-
167165
public:
168166
IndVarSimplify(LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
169167
const DataLayout &DL, TargetLibraryInfo *TLI,
@@ -1079,85 +1077,6 @@ linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB,
10791077
return true;
10801078
}
10811079

1082-
//===----------------------------------------------------------------------===//
1083-
// sinkUnusedInvariants. A late subpass to cleanup loop preheaders.
1084-
//===----------------------------------------------------------------------===//
1085-
1086-
/// If there's a single exit block, sink any loop-invariant values that
1087-
/// were defined in the preheader but not used inside the loop into the
1088-
/// exit block to reduce register pressure in the loop.
1089-
bool IndVarSimplify::sinkUnusedInvariants(Loop *L) {
1090-
BasicBlock *ExitBlock = L->getExitBlock();
1091-
if (!ExitBlock) return false;
1092-
1093-
BasicBlock *Preheader = L->getLoopPreheader();
1094-
if (!Preheader) return false;
1095-
1096-
bool MadeAnyChanges = false;
1097-
for (Instruction &I : llvm::make_early_inc_range(llvm::reverse(*Preheader))) {
1098-
1099-
// Skip BB Terminator.
1100-
if (Preheader->getTerminator() == &I)
1101-
continue;
1102-
1103-
// New instructions were inserted at the end of the preheader.
1104-
if (isa<PHINode>(I))
1105-
break;
1106-
1107-
// Don't move instructions which might have side effects, since the side
1108-
// effects need to complete before instructions inside the loop. Also don't
1109-
// move instructions which might read memory, since the loop may modify
1110-
// memory. Note that it's okay if the instruction might have undefined
1111-
// behavior: LoopSimplify guarantees that the preheader dominates the exit
1112-
// block.
1113-
if (I.mayHaveSideEffects() || I.mayReadFromMemory())
1114-
continue;
1115-
1116-
// Skip debug or pseudo instructions.
1117-
if (I.isDebugOrPseudoInst())
1118-
continue;
1119-
1120-
// Skip eh pad instructions.
1121-
if (I.isEHPad())
1122-
continue;
1123-
1124-
// Don't sink alloca: we never want to sink static alloca's out of the
1125-
// entry block, and correctly sinking dynamic alloca's requires
1126-
// checks for stacksave/stackrestore intrinsics.
1127-
// FIXME: Refactor this check somehow?
1128-
if (isa<AllocaInst>(&I))
1129-
continue;
1130-
1131-
// Determine if there is a use in or before the loop (direct or
1132-
// otherwise).
1133-
bool UsedInLoop = false;
1134-
for (Use &U : I.uses()) {
1135-
Instruction *User = cast<Instruction>(U.getUser());
1136-
BasicBlock *UseBB = User->getParent();
1137-
if (PHINode *P = dyn_cast<PHINode>(User)) {
1138-
unsigned i =
1139-
PHINode::getIncomingValueNumForOperand(U.getOperandNo());
1140-
UseBB = P->getIncomingBlock(i);
1141-
}
1142-
if (UseBB == Preheader || L->contains(UseBB)) {
1143-
UsedInLoop = true;
1144-
break;
1145-
}
1146-
}
1147-
1148-
// If there is, the def must remain in the preheader.
1149-
if (UsedInLoop)
1150-
continue;
1151-
1152-
// Otherwise, sink it to the exit block.
1153-
I.moveBefore(ExitBlock->getFirstInsertionPt());
1154-
SE->forgetValue(&I);
1155-
MadeAnyChanges = true;
1156-
}
1157-
1158-
return MadeAnyChanges;
1159-
}
1160-
11611080
static void replaceExitCond(BranchInst *BI, Value *NewCond,
11621081
SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
11631082
auto *OldCond = BI->getCondition();
@@ -2065,10 +1984,6 @@ bool IndVarSimplify::run(Loop *L) {
20651984

20661985
// The Rewriter may not be used from this point on.
20671986

2068-
// Loop-invariant instructions in the preheader that aren't used in the
2069-
// loop may be sunk below the loop to reduce register pressure.
2070-
Changed |= sinkUnusedInvariants(L);
2071-
20721987
// rewriteFirstIterationLoopExitValues does not rely on the computation of
20731988
// trip count and therefore can further simplify exit values in addition to
20741989
// rewriteLoopExitValues.

llvm/lib/Transforms/Scalar/LICM.cpp

Lines changed: 80 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -211,9 +211,15 @@ static Instruction *cloneInstructionInExitBlock(
211211
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,
212212
MemorySSAUpdater &MSSAU);
213213

214-
static void moveInstructionBefore(Instruction &I, BasicBlock::iterator Dest,
215-
ICFLoopSafetyInfo &SafetyInfo,
216-
MemorySSAUpdater &MSSAU, ScalarEvolution *SE);
214+
static void moveInstructionBefore(
215+
Instruction &I, BasicBlock::iterator Dest, ICFLoopSafetyInfo &SafetyInfo,
216+
MemorySSAUpdater &MSSAU, ScalarEvolution *SE,
217+
MemorySSA::InsertionPlace Point = MemorySSA::BeforeTerminator);
218+
219+
static bool sinkUnusedInvariantsFromPreheaderToExit(
220+
Loop *L, AAResults *AA, ICFLoopSafetyInfo *SafetyInfo,
221+
MemorySSAUpdater &MSSAU, ScalarEvolution *SE, DominatorTree *DT,
222+
SinkAndHoistLICMFlags &SinkFlags, OptimizationRemarkEmitter *ORE);
217223

218224
static void foreachMemoryAccess(MemorySSA *MSSA, Loop *L,
219225
function_ref<void(Instruction *)> Fn);
@@ -471,6 +477,12 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI,
471477
TLI, TTI, L, MSSAU, &SafetyInfo, Flags, ORE)
472478
: sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, TTI, L,
473479
MSSAU, &SafetyInfo, Flags, ORE);
480+
481+
// sink pre-header defs that are unused in-loop into the unique exit to reduce
482+
// pressure.
483+
Changed |= sinkUnusedInvariantsFromPreheaderToExit(L, AA, &SafetyInfo, MSSAU,
484+
SE, DT, Flags, ORE);
485+
474486
Flags.setIsSink(false);
475487
if (Preheader)
476488
Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, AC, TLI, L,
@@ -1456,19 +1468,80 @@ static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,
14561468

14571469
static void moveInstructionBefore(Instruction &I, BasicBlock::iterator Dest,
14581470
ICFLoopSafetyInfo &SafetyInfo,
1459-
MemorySSAUpdater &MSSAU,
1460-
ScalarEvolution *SE) {
1471+
MemorySSAUpdater &MSSAU, ScalarEvolution *SE,
1472+
MemorySSA::InsertionPlace Point) {
14611473
SafetyInfo.removeInstruction(&I);
14621474
SafetyInfo.insertInstructionTo(&I, Dest->getParent());
14631475
I.moveBefore(*Dest->getParent(), Dest);
14641476
if (MemoryUseOrDef *OldMemAcc = cast_or_null<MemoryUseOrDef>(
14651477
MSSAU.getMemorySSA()->getMemoryAccess(&I)))
1466-
MSSAU.moveToPlace(OldMemAcc, Dest->getParent(),
1467-
MemorySSA::BeforeTerminator);
1478+
MSSAU.moveToPlace(OldMemAcc, Dest->getParent(), Point);
14681479
if (SE)
14691480
SE->forgetBlockAndLoopDispositions(&I);
14701481
}
14711482

1483+
// If there's a single exit block, sink any loop-invariant values that were
1484+
// defined in the preheader but not used inside the loop into the exit block
1485+
// to reduce register pressure in the loop.
1486+
static bool sinkUnusedInvariantsFromPreheaderToExit(
1487+
Loop *L, AAResults *AA, ICFLoopSafetyInfo *SafetyInfo,
1488+
MemorySSAUpdater &MSSAU, ScalarEvolution *SE, DominatorTree *DT,
1489+
SinkAndHoistLICMFlags &SinkFlags, OptimizationRemarkEmitter *ORE) {
1490+
BasicBlock *ExitBlock = L->getExitBlock();
1491+
if (!ExitBlock)
1492+
return false;
1493+
1494+
BasicBlock *Preheader = L->getLoopPreheader();
1495+
if (!Preheader)
1496+
return false;
1497+
1498+
bool MadeAnyChanges = false;
1499+
1500+
for (Instruction &I : llvm::make_early_inc_range(llvm::reverse(*Preheader))) {
1501+
1502+
// Skip terminator.
1503+
if (Preheader->getTerminator() == &I)
1504+
continue;
1505+
1506+
// New instructions were inserted at the end of the preheader.
1507+
if (isa<PHINode>(I))
1508+
break;
1509+
1510+
// Don't move instructions which might have side effects, since the side
1511+
// effects need to complete before instructions inside the loop. Note that
1512+
// it's okay if the instruction might have undefined behavior: LoopSimplify
1513+
// guarantees that the preheader dominates the exit block.
1514+
if (I.mayHaveSideEffects())
1515+
continue;
1516+
1517+
if (!canSinkOrHoistInst(I, AA, DT, L, MSSAU, true, SinkFlags, nullptr))
1518+
continue;
1519+
1520+
// Determine if there is a use in or before the loop (direct or
1521+
// otherwise).
1522+
bool UsedInLoopOrPreheader = false;
1523+
for (Use &U : I.uses()) {
1524+
auto *UserI = cast<Instruction>(U.getUser());
1525+
BasicBlock *UseBB = UserI->getParent();
1526+
if (auto *PN = dyn_cast<PHINode>(UserI)) {
1527+
UseBB = PN->getIncomingBlock(U);
1528+
}
1529+
if (UseBB == Preheader || L->contains(UseBB)) {
1530+
UsedInLoopOrPreheader = true;
1531+
break;
1532+
}
1533+
}
1534+
if (UsedInLoopOrPreheader)
1535+
continue;
1536+
1537+
moveInstructionBefore(I, ExitBlock->getFirstInsertionPt(), *SafetyInfo,
1538+
MSSAU, SE, MemorySSA::Beginning);
1539+
MadeAnyChanges = true;
1540+
}
1541+
1542+
return MadeAnyChanges;
1543+
}
1544+
14721545
static Instruction *sinkThroughTriviallyReplaceablePHI(
14731546
PHINode *TPN, Instruction *I, LoopInfo *LI,
14741547
SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies,

llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -73,10 +73,10 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
7373
}
7474

7575
; CHECK-LABEL: {{^}}excess_soft_clause_reg_pressure:
76-
; GFX908: NumSgprs: 64
77-
; GFX908-GCNTRACKERS: NumSgprs: 64
76+
; GFX908: NumSgprs: 56
77+
; GFX908-GCNTRACKERS: NumSgprs: 56
7878
; GFX908: NumVgprs: 43
79-
; GFX908-GCNTRACKERS: NumVgprs: 39
79+
; GFX908-GCNTRACKERS: NumVgprs: 40
8080
; GFX908: Occupancy: 5
8181
; GFX908-GCNTRACKERS: Occupancy: 6
8282

llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -22,8 +22,6 @@
2222
; GFX9-DAG: s_mov_b32 s[[DESC3:[0-9]+]], 0xe00000
2323

2424
; OFFREG is offset system SGPR
25-
; GCN: buffer_store_dword {{v[0-9]+}}, off, s[[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill
26-
; GCN: buffer_load_dword v{{[0-9]+}}, off, s[[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Reload
2725
; GCN: NumVgprs: 256
2826
; GCN: ScratchSize: 640
2927

0 commit comments

Comments
 (0)