Skip to content

Commit 9e3347c

Browse files
VigneshwarJluciechoi
authored and committed
[LICM] Sink unused loop-invariant loads in preheader. (llvm#157559)
Unused loop-invariant loads were not sunk from the preheader to the exit block, which increased their live range. This commit moves the sinkUnusedInvariants logic from IndVarSimplify to LICM and also adds the ability to sink unused loads that are not clobbered by the loop body.
1 parent 549f975 commit 9e3347c

36 files changed

+453
-323
lines changed

llvm/lib/Transforms/Scalar/IndVarSimplify.cpp

Lines changed: 0 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -163,8 +163,6 @@ class IndVarSimplify {
163163
const SCEV *ExitCount,
164164
PHINode *IndVar, SCEVExpander &Rewriter);
165165

166-
bool sinkUnusedInvariants(Loop *L);
167-
168166
public:
169167
IndVarSimplify(LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
170168
const DataLayout &DL, TargetLibraryInfo *TLI,
@@ -1080,85 +1078,6 @@ linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB,
10801078
return true;
10811079
}
10821080

1083-
//===----------------------------------------------------------------------===//
1084-
// sinkUnusedInvariants. A late subpass to cleanup loop preheaders.
1085-
//===----------------------------------------------------------------------===//
1086-
1087-
/// If there's a single exit block, sink any loop-invariant values that
1088-
/// were defined in the preheader but not used inside the loop into the
1089-
/// exit block to reduce register pressure in the loop.
1090-
bool IndVarSimplify::sinkUnusedInvariants(Loop *L) {
1091-
BasicBlock *ExitBlock = L->getExitBlock();
1092-
if (!ExitBlock) return false;
1093-
1094-
BasicBlock *Preheader = L->getLoopPreheader();
1095-
if (!Preheader) return false;
1096-
1097-
bool MadeAnyChanges = false;
1098-
for (Instruction &I : llvm::make_early_inc_range(llvm::reverse(*Preheader))) {
1099-
1100-
// Skip BB Terminator.
1101-
if (Preheader->getTerminator() == &I)
1102-
continue;
1103-
1104-
// New instructions were inserted at the end of the preheader.
1105-
if (isa<PHINode>(I))
1106-
break;
1107-
1108-
// Don't move instructions which might have side effects, since the side
1109-
// effects need to complete before instructions inside the loop. Also don't
1110-
// move instructions which might read memory, since the loop may modify
1111-
// memory. Note that it's okay if the instruction might have undefined
1112-
// behavior: LoopSimplify guarantees that the preheader dominates the exit
1113-
// block.
1114-
if (I.mayHaveSideEffects() || I.mayReadFromMemory())
1115-
continue;
1116-
1117-
// Skip debug or pseudo instructions.
1118-
if (I.isDebugOrPseudoInst())
1119-
continue;
1120-
1121-
// Skip eh pad instructions.
1122-
if (I.isEHPad())
1123-
continue;
1124-
1125-
// Don't sink alloca: we never want to sink static alloca's out of the
1126-
// entry block, and correctly sinking dynamic alloca's requires
1127-
// checks for stacksave/stackrestore intrinsics.
1128-
// FIXME: Refactor this check somehow?
1129-
if (isa<AllocaInst>(&I))
1130-
continue;
1131-
1132-
// Determine if there is a use in or before the loop (direct or
1133-
// otherwise).
1134-
bool UsedInLoop = false;
1135-
for (Use &U : I.uses()) {
1136-
Instruction *User = cast<Instruction>(U.getUser());
1137-
BasicBlock *UseBB = User->getParent();
1138-
if (PHINode *P = dyn_cast<PHINode>(User)) {
1139-
unsigned i =
1140-
PHINode::getIncomingValueNumForOperand(U.getOperandNo());
1141-
UseBB = P->getIncomingBlock(i);
1142-
}
1143-
if (UseBB == Preheader || L->contains(UseBB)) {
1144-
UsedInLoop = true;
1145-
break;
1146-
}
1147-
}
1148-
1149-
// If there is, the def must remain in the preheader.
1150-
if (UsedInLoop)
1151-
continue;
1152-
1153-
// Otherwise, sink it to the exit block.
1154-
I.moveBefore(ExitBlock->getFirstInsertionPt());
1155-
SE->forgetValue(&I);
1156-
MadeAnyChanges = true;
1157-
}
1158-
1159-
return MadeAnyChanges;
1160-
}
1161-
11621081
static void replaceExitCond(BranchInst *BI, Value *NewCond,
11631082
SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
11641083
auto *OldCond = BI->getCondition();
@@ -2078,10 +1997,6 @@ bool IndVarSimplify::run(Loop *L) {
20781997

20791998
// The Rewriter may not be used from this point on.
20801999

2081-
// Loop-invariant instructions in the preheader that aren't used in the
2082-
// loop may be sunk below the loop to reduce register pressure.
2083-
Changed |= sinkUnusedInvariants(L);
2084-
20852000
// rewriteFirstIterationLoopExitValues does not rely on the computation of
20862001
// trip count and therefore can further simplify exit values in addition to
20872002
// rewriteLoopExitValues.

llvm/lib/Transforms/Scalar/LICM.cpp

Lines changed: 80 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -211,9 +211,15 @@ static Instruction *cloneInstructionInExitBlock(
211211
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,
212212
MemorySSAUpdater &MSSAU);
213213

214-
static void moveInstructionBefore(Instruction &I, BasicBlock::iterator Dest,
215-
ICFLoopSafetyInfo &SafetyInfo,
216-
MemorySSAUpdater &MSSAU, ScalarEvolution *SE);
214+
static void moveInstructionBefore(
215+
Instruction &I, BasicBlock::iterator Dest, ICFLoopSafetyInfo &SafetyInfo,
216+
MemorySSAUpdater &MSSAU, ScalarEvolution *SE,
217+
MemorySSA::InsertionPlace Point = MemorySSA::BeforeTerminator);
218+
219+
static bool sinkUnusedInvariantsFromPreheaderToExit(
220+
Loop *L, AAResults *AA, ICFLoopSafetyInfo *SafetyInfo,
221+
MemorySSAUpdater &MSSAU, ScalarEvolution *SE, DominatorTree *DT,
222+
SinkAndHoistLICMFlags &SinkFlags, OptimizationRemarkEmitter *ORE);
217223

218224
static void foreachMemoryAccess(MemorySSA *MSSA, Loop *L,
219225
function_ref<void(Instruction *)> Fn);
@@ -471,6 +477,12 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI,
471477
TLI, TTI, L, MSSAU, &SafetyInfo, Flags, ORE)
472478
: sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, TTI, L,
473479
MSSAU, &SafetyInfo, Flags, ORE);
480+
481+
// sink pre-header defs that are unused in-loop into the unique exit to reduce
482+
// pressure.
483+
Changed |= sinkUnusedInvariantsFromPreheaderToExit(L, AA, &SafetyInfo, MSSAU,
484+
SE, DT, Flags, ORE);
485+
474486
Flags.setIsSink(false);
475487
if (Preheader)
476488
Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, AC, TLI, L,
@@ -1456,19 +1468,80 @@ static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,
14561468

14571469
static void moveInstructionBefore(Instruction &I, BasicBlock::iterator Dest,
14581470
ICFLoopSafetyInfo &SafetyInfo,
1459-
MemorySSAUpdater &MSSAU,
1460-
ScalarEvolution *SE) {
1471+
MemorySSAUpdater &MSSAU, ScalarEvolution *SE,
1472+
MemorySSA::InsertionPlace Point) {
14611473
SafetyInfo.removeInstruction(&I);
14621474
SafetyInfo.insertInstructionTo(&I, Dest->getParent());
14631475
I.moveBefore(*Dest->getParent(), Dest);
14641476
if (MemoryUseOrDef *OldMemAcc = cast_or_null<MemoryUseOrDef>(
14651477
MSSAU.getMemorySSA()->getMemoryAccess(&I)))
1466-
MSSAU.moveToPlace(OldMemAcc, Dest->getParent(),
1467-
MemorySSA::BeforeTerminator);
1478+
MSSAU.moveToPlace(OldMemAcc, Dest->getParent(), Point);
14681479
if (SE)
14691480
SE->forgetBlockAndLoopDispositions(&I);
14701481
}
14711482

1483+
// If there's a single exit block, sink any loop-invariant values that were
1484+
// defined in the preheader but not used inside the loop into the exit block
1485+
// to reduce register pressure in the loop.
1486+
static bool sinkUnusedInvariantsFromPreheaderToExit(
1487+
Loop *L, AAResults *AA, ICFLoopSafetyInfo *SafetyInfo,
1488+
MemorySSAUpdater &MSSAU, ScalarEvolution *SE, DominatorTree *DT,
1489+
SinkAndHoistLICMFlags &SinkFlags, OptimizationRemarkEmitter *ORE) {
1490+
BasicBlock *ExitBlock = L->getExitBlock();
1491+
if (!ExitBlock)
1492+
return false;
1493+
1494+
BasicBlock *Preheader = L->getLoopPreheader();
1495+
if (!Preheader)
1496+
return false;
1497+
1498+
bool MadeAnyChanges = false;
1499+
1500+
for (Instruction &I : llvm::make_early_inc_range(llvm::reverse(*Preheader))) {
1501+
1502+
// Skip terminator.
1503+
if (Preheader->getTerminator() == &I)
1504+
continue;
1505+
1506+
// New instructions were inserted at the end of the preheader.
1507+
if (isa<PHINode>(I))
1508+
break;
1509+
1510+
// Don't move instructions which might have side effects, since the side
1511+
// effects need to complete before instructions inside the loop. Note that
1512+
// it's okay if the instruction might have undefined behavior: LoopSimplify
1513+
// guarantees that the preheader dominates the exit block.
1514+
if (I.mayHaveSideEffects())
1515+
continue;
1516+
1517+
if (!canSinkOrHoistInst(I, AA, DT, L, MSSAU, true, SinkFlags, nullptr))
1518+
continue;
1519+
1520+
// Determine if there is a use in or before the loop (direct or
1521+
// otherwise).
1522+
bool UsedInLoopOrPreheader = false;
1523+
for (Use &U : I.uses()) {
1524+
auto *UserI = cast<Instruction>(U.getUser());
1525+
BasicBlock *UseBB = UserI->getParent();
1526+
if (auto *PN = dyn_cast<PHINode>(UserI)) {
1527+
UseBB = PN->getIncomingBlock(U);
1528+
}
1529+
if (UseBB == Preheader || L->contains(UseBB)) {
1530+
UsedInLoopOrPreheader = true;
1531+
break;
1532+
}
1533+
}
1534+
if (UsedInLoopOrPreheader)
1535+
continue;
1536+
1537+
moveInstructionBefore(I, ExitBlock->getFirstInsertionPt(), *SafetyInfo,
1538+
MSSAU, SE, MemorySSA::Beginning);
1539+
MadeAnyChanges = true;
1540+
}
1541+
1542+
return MadeAnyChanges;
1543+
}
1544+
14721545
static Instruction *sinkThroughTriviallyReplaceablePHI(
14731546
PHINode *TPN, Instruction *I, LoopInfo *LI,
14741547
SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies,

llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -73,10 +73,10 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
7373
}
7474

7575
; CHECK-LABEL: {{^}}excess_soft_clause_reg_pressure:
76-
; GFX908: NumSgprs: 64
77-
; GFX908-GCNTRACKERS: NumSgprs: 64
76+
; GFX908: NumSgprs: 56
77+
; GFX908-GCNTRACKERS: NumSgprs: 56
7878
; GFX908: NumVgprs: 43
79-
; GFX908-GCNTRACKERS: NumVgprs: 39
79+
; GFX908-GCNTRACKERS: NumVgprs: 40
8080
; GFX908: Occupancy: 5
8181
; GFX908-GCNTRACKERS: Occupancy: 6
8282

llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,6 @@
2222
; GFX9-DAG: s_mov_b32 s[[DESC3:[0-9]+]], 0xe00000
2323

2424
; OFFREG is offset system SGPR
25-
; GCN: buffer_store_dword {{v[0-9]+}}, off, s[[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill
26-
; GCN: buffer_load_dword v{{[0-9]+}}, off, s[[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Reload
2725
; GCN: NumVgprs: 256
2826
; GCN: ScratchSize: 640
2927

0 commit comments

Comments
 (0)