Skip to content

Commit 82d62d8

Browse files
committed
[LICM] Sink unused l-invariant loads in preheader. (llvm#157559)
Unused loop-invariant loads were not sunk from the preheader to the exit block, which increased their live ranges. This commit moves the sinkUnusedInvariants logic from IndVarSimplify to LICM and also adds functionality to sink unused loads that are not clobbered by the loop body.
1 parent 94f12a2 commit 82d62d8

34 files changed

+455
-325
lines changed

llvm/lib/Transforms/Scalar/IndVarSimplify.cpp

Lines changed: 0 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -158,8 +158,6 @@ class IndVarSimplify {
158158
const SCEV *ExitCount,
159159
PHINode *IndVar, SCEVExpander &Rewriter);
160160

161-
bool sinkUnusedInvariants(Loop *L);
162-
163161
public:
164162
IndVarSimplify(LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
165163
const DataLayout &DL, TargetLibraryInfo *TLI,
@@ -1075,85 +1073,6 @@ linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB,
10751073
return true;
10761074
}
10771075

1078-
//===----------------------------------------------------------------------===//
1079-
// sinkUnusedInvariants. A late subpass to cleanup loop preheaders.
1080-
//===----------------------------------------------------------------------===//
1081-
1082-
/// If there's a single exit block, sink any loop-invariant values that
1083-
/// were defined in the preheader but not used inside the loop into the
1084-
/// exit block to reduce register pressure in the loop.
1085-
bool IndVarSimplify::sinkUnusedInvariants(Loop *L) {
1086-
BasicBlock *ExitBlock = L->getExitBlock();
1087-
if (!ExitBlock) return false;
1088-
1089-
BasicBlock *Preheader = L->getLoopPreheader();
1090-
if (!Preheader) return false;
1091-
1092-
bool MadeAnyChanges = false;
1093-
for (Instruction &I : llvm::make_early_inc_range(llvm::reverse(*Preheader))) {
1094-
1095-
// Skip BB Terminator.
1096-
if (Preheader->getTerminator() == &I)
1097-
continue;
1098-
1099-
// New instructions were inserted at the end of the preheader.
1100-
if (isa<PHINode>(I))
1101-
break;
1102-
1103-
// Don't move instructions which might have side effects, since the side
1104-
// effects need to complete before instructions inside the loop. Also don't
1105-
// move instructions which might read memory, since the loop may modify
1106-
// memory. Note that it's okay if the instruction might have undefined
1107-
// behavior: LoopSimplify guarantees that the preheader dominates the exit
1108-
// block.
1109-
if (I.mayHaveSideEffects() || I.mayReadFromMemory())
1110-
continue;
1111-
1112-
// Skip debug or pseudo instructions.
1113-
if (I.isDebugOrPseudoInst())
1114-
continue;
1115-
1116-
// Skip eh pad instructions.
1117-
if (I.isEHPad())
1118-
continue;
1119-
1120-
// Don't sink alloca: we never want to sink static alloca's out of the
1121-
// entry block, and correctly sinking dynamic alloca's requires
1122-
// checks for stacksave/stackrestore intrinsics.
1123-
// FIXME: Refactor this check somehow?
1124-
if (isa<AllocaInst>(&I))
1125-
continue;
1126-
1127-
// Determine if there is a use in or before the loop (direct or
1128-
// otherwise).
1129-
bool UsedInLoop = false;
1130-
for (Use &U : I.uses()) {
1131-
Instruction *User = cast<Instruction>(U.getUser());
1132-
BasicBlock *UseBB = User->getParent();
1133-
if (PHINode *P = dyn_cast<PHINode>(User)) {
1134-
unsigned i =
1135-
PHINode::getIncomingValueNumForOperand(U.getOperandNo());
1136-
UseBB = P->getIncomingBlock(i);
1137-
}
1138-
if (UseBB == Preheader || L->contains(UseBB)) {
1139-
UsedInLoop = true;
1140-
break;
1141-
}
1142-
}
1143-
1144-
// If there is, the def must remain in the preheader.
1145-
if (UsedInLoop)
1146-
continue;
1147-
1148-
// Otherwise, sink it to the exit block.
1149-
I.moveBefore(ExitBlock->getFirstInsertionPt());
1150-
SE->forgetValue(&I);
1151-
MadeAnyChanges = true;
1152-
}
1153-
1154-
return MadeAnyChanges;
1155-
}
1156-
11571076
static void replaceExitCond(BranchInst *BI, Value *NewCond,
11581077
SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
11591078
auto *OldCond = BI->getCondition();
@@ -2016,10 +1935,6 @@ bool IndVarSimplify::run(Loop *L) {
20161935

20171936
// The Rewriter may not be used from this point on.
20181937

2019-
// Loop-invariant instructions in the preheader that aren't used in the
2020-
// loop may be sunk below the loop to reduce register pressure.
2021-
Changed |= sinkUnusedInvariants(L);
2022-
20231938
// rewriteFirstIterationLoopExitValues does not rely on the computation of
20241939
// trip count and therefore can further simplify exit values in addition to
20251940
// rewriteLoopExitValues.

llvm/lib/Transforms/Scalar/LICM.cpp

Lines changed: 80 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -207,9 +207,15 @@ static Instruction *cloneInstructionInExitBlock(
207207
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,
208208
MemorySSAUpdater &MSSAU);
209209

210-
static void moveInstructionBefore(Instruction &I, BasicBlock::iterator Dest,
211-
ICFLoopSafetyInfo &SafetyInfo,
212-
MemorySSAUpdater &MSSAU, ScalarEvolution *SE);
210+
static void moveInstructionBefore(
211+
Instruction &I, BasicBlock::iterator Dest, ICFLoopSafetyInfo &SafetyInfo,
212+
MemorySSAUpdater &MSSAU, ScalarEvolution *SE,
213+
MemorySSA::InsertionPlace Point = MemorySSA::BeforeTerminator);
214+
215+
static bool sinkUnusedInvariantsFromPreheaderToExit(
216+
Loop *L, AAResults *AA, ICFLoopSafetyInfo *SafetyInfo,
217+
MemorySSAUpdater &MSSAU, ScalarEvolution *SE, DominatorTree *DT,
218+
SinkAndHoistLICMFlags &SinkFlags, OptimizationRemarkEmitter *ORE);
213219

214220
static void foreachMemoryAccess(MemorySSA *MSSA, Loop *L,
215221
function_ref<void(Instruction *)> Fn);
@@ -468,6 +474,12 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI,
468474
TLI, TTI, L, MSSAU, &SafetyInfo, Flags, ORE)
469475
: sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, TTI, L,
470476
MSSAU, &SafetyInfo, Flags, ORE);
477+
478+
// Sink preheader defs that are unused in the loop into the unique exit block to reduce
479+
// register pressure.
480+
Changed |= sinkUnusedInvariantsFromPreheaderToExit(L, AA, &SafetyInfo, MSSAU,
481+
SE, DT, Flags, ORE);
482+
471483
Flags.setIsSink(false);
472484
if (Preheader)
473485
Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, AC, TLI, L,
@@ -1441,19 +1453,80 @@ static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,
14411453

14421454
static void moveInstructionBefore(Instruction &I, BasicBlock::iterator Dest,
14431455
ICFLoopSafetyInfo &SafetyInfo,
1444-
MemorySSAUpdater &MSSAU,
1445-
ScalarEvolution *SE) {
1456+
MemorySSAUpdater &MSSAU, ScalarEvolution *SE,
1457+
MemorySSA::InsertionPlace Point) {
14461458
SafetyInfo.removeInstruction(&I);
14471459
SafetyInfo.insertInstructionTo(&I, Dest->getParent());
14481460
I.moveBefore(*Dest->getParent(), Dest);
14491461
if (MemoryUseOrDef *OldMemAcc = cast_or_null<MemoryUseOrDef>(
14501462
MSSAU.getMemorySSA()->getMemoryAccess(&I)))
1451-
MSSAU.moveToPlace(OldMemAcc, Dest->getParent(),
1452-
MemorySSA::BeforeTerminator);
1463+
MSSAU.moveToPlace(OldMemAcc, Dest->getParent(), Point);
14531464
if (SE)
14541465
SE->forgetBlockAndLoopDispositions(&I);
14551466
}
14561467

1468+
// If there's a single exit block, sink any loop-invariant values that were
1469+
// defined in the preheader but not used inside the loop into the exit block
1470+
// to reduce register pressure in the loop.
1471+
static bool sinkUnusedInvariantsFromPreheaderToExit(
1472+
Loop *L, AAResults *AA, ICFLoopSafetyInfo *SafetyInfo,
1473+
MemorySSAUpdater &MSSAU, ScalarEvolution *SE, DominatorTree *DT,
1474+
SinkAndHoistLICMFlags &SinkFlags, OptimizationRemarkEmitter *ORE) {
1475+
BasicBlock *ExitBlock = L->getExitBlock();
1476+
if (!ExitBlock)
1477+
return false;
1478+
1479+
BasicBlock *Preheader = L->getLoopPreheader();
1480+
if (!Preheader)
1481+
return false;
1482+
1483+
bool MadeAnyChanges = false;
1484+
1485+
for (Instruction &I : llvm::make_early_inc_range(llvm::reverse(*Preheader))) {
1486+
1487+
// Skip terminator.
1488+
if (Preheader->getTerminator() == &I)
1489+
continue;
1490+
1491+
// New instructions were inserted at the end of the preheader.
1492+
if (isa<PHINode>(I))
1493+
break;
1494+
1495+
// Don't move instructions which might have side effects, since the side
1496+
// effects need to complete before instructions inside the loop. Note that
1497+
// it's okay if the instruction might have undefined behavior: LoopSimplify
1498+
// guarantees that the preheader dominates the exit block.
1499+
if (I.mayHaveSideEffects())
1500+
continue;
1501+
1502+
if (!canSinkOrHoistInst(I, AA, DT, L, MSSAU, true, SinkFlags, nullptr))
1503+
continue;
1504+
1505+
// Determine if there is a use in or before the loop (direct or
1506+
// otherwise).
1507+
bool UsedInLoopOrPreheader = false;
1508+
for (Use &U : I.uses()) {
1509+
auto *UserI = cast<Instruction>(U.getUser());
1510+
BasicBlock *UseBB = UserI->getParent();
1511+
if (auto *PN = dyn_cast<PHINode>(UserI)) {
1512+
UseBB = PN->getIncomingBlock(U);
1513+
}
1514+
if (UseBB == Preheader || L->contains(UseBB)) {
1515+
UsedInLoopOrPreheader = true;
1516+
break;
1517+
}
1518+
}
1519+
if (UsedInLoopOrPreheader)
1520+
continue;
1521+
1522+
moveInstructionBefore(I, ExitBlock->getFirstInsertionPt(), *SafetyInfo,
1523+
MSSAU, SE, MemorySSA::Beginning);
1524+
MadeAnyChanges = true;
1525+
}
1526+
1527+
return MadeAnyChanges;
1528+
}
1529+
14571530
static Instruction *sinkThroughTriviallyReplaceablePHI(
14581531
PHINode *TPN, Instruction *I, LoopInfo *LI,
14591532
SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies,

llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -73,10 +73,10 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
7373
}
7474

7575
; CHECK-LABEL: {{^}}excess_soft_clause_reg_pressure:
76-
; GFX908: NumSgprs: 64
77-
; GFX908-GCNTRACKERS: NumSgprs: 64
76+
; GFX908: NumSgprs: 56
77+
; GFX908-GCNTRACKERS: NumSgprs: 56
7878
; GFX908: NumVgprs: 43
79-
; GFX908-GCNTRACKERS: NumVgprs: 39
79+
; GFX908-GCNTRACKERS: NumVgprs: 40
8080
; GFX908: Occupancy: 5
8181
; GFX908-GCNTRACKERS: Occupancy: 6
8282

llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,6 @@
2222
; GFX9-DAG: s_mov_b32 s[[DESC3:[0-9]+]], 0xe00000
2323

2424
; OFFREG is offset system SGPR
25-
; GCN: buffer_store_dword {{v[0-9]+}}, off, s[[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill
26-
; GCN: buffer_load_dword v{{[0-9]+}}, off, s[[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Reload
2725
; GCN: NumVgprs: 256
2826
; GCN: ScratchSize: 640
2927

0 commit comments

Comments
 (0)