Skip to content

Commit 4ad1bda

Browse files
committed
Reland "[LICM] Sink unused l-invariant loads in preheader. #157559"
This fixes a stack overflow that occurred for large loops when the stack size is small, caused by getPreviousDefRecursive being called while updating MemorySSA through the MemorySSAUpdater (MSSAU).
1 parent 83fc85c commit 4ad1bda

36 files changed

+499
-318
lines changed

llvm/lib/Transforms/Scalar/IndVarSimplify.cpp

Lines changed: 0 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -162,8 +162,6 @@ class IndVarSimplify {
162162
const SCEV *ExitCount,
163163
PHINode *IndVar, SCEVExpander &Rewriter);
164164

165-
bool sinkUnusedInvariants(Loop *L);
166-
167165
public:
168166
IndVarSimplify(LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
169167
const DataLayout &DL, TargetLibraryInfo *TLI,
@@ -1093,85 +1091,6 @@ linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB,
10931091
return true;
10941092
}
10951093

1096-
//===----------------------------------------------------------------------===//
1097-
// sinkUnusedInvariants. A late subpass to cleanup loop preheaders.
1098-
//===----------------------------------------------------------------------===//
1099-
1100-
/// If there's a single exit block, sink any loop-invariant values that
1101-
/// were defined in the preheader but not used inside the loop into the
1102-
/// exit block to reduce register pressure in the loop.
1103-
bool IndVarSimplify::sinkUnusedInvariants(Loop *L) {
1104-
BasicBlock *ExitBlock = L->getExitBlock();
1105-
if (!ExitBlock) return false;
1106-
1107-
BasicBlock *Preheader = L->getLoopPreheader();
1108-
if (!Preheader) return false;
1109-
1110-
bool MadeAnyChanges = false;
1111-
for (Instruction &I : llvm::make_early_inc_range(llvm::reverse(*Preheader))) {
1112-
1113-
// Skip BB Terminator.
1114-
if (Preheader->getTerminator() == &I)
1115-
continue;
1116-
1117-
// New instructions were inserted at the end of the preheader.
1118-
if (isa<PHINode>(I))
1119-
break;
1120-
1121-
// Don't move instructions which might have side effects, since the side
1122-
// effects need to complete before instructions inside the loop. Also don't
1123-
// move instructions which might read memory, since the loop may modify
1124-
// memory. Note that it's okay if the instruction might have undefined
1125-
// behavior: LoopSimplify guarantees that the preheader dominates the exit
1126-
// block.
1127-
if (I.mayHaveSideEffects() || I.mayReadFromMemory())
1128-
continue;
1129-
1130-
// Skip debug or pseudo instructions.
1131-
if (I.isDebugOrPseudoInst())
1132-
continue;
1133-
1134-
// Skip eh pad instructions.
1135-
if (I.isEHPad())
1136-
continue;
1137-
1138-
// Don't sink alloca: we never want to sink static alloca's out of the
1139-
// entry block, and correctly sinking dynamic alloca's requires
1140-
// checks for stacksave/stackrestore intrinsics.
1141-
// FIXME: Refactor this check somehow?
1142-
if (isa<AllocaInst>(&I))
1143-
continue;
1144-
1145-
// Determine if there is a use in or before the loop (direct or
1146-
// otherwise).
1147-
bool UsedInLoop = false;
1148-
for (Use &U : I.uses()) {
1149-
Instruction *User = cast<Instruction>(U.getUser());
1150-
BasicBlock *UseBB = User->getParent();
1151-
if (PHINode *P = dyn_cast<PHINode>(User)) {
1152-
unsigned i =
1153-
PHINode::getIncomingValueNumForOperand(U.getOperandNo());
1154-
UseBB = P->getIncomingBlock(i);
1155-
}
1156-
if (UseBB == Preheader || L->contains(UseBB)) {
1157-
UsedInLoop = true;
1158-
break;
1159-
}
1160-
}
1161-
1162-
// If there is, the def must remain in the preheader.
1163-
if (UsedInLoop)
1164-
continue;
1165-
1166-
// Otherwise, sink it to the exit block.
1167-
I.moveBefore(ExitBlock->getFirstInsertionPt());
1168-
SE->forgetValue(&I);
1169-
MadeAnyChanges = true;
1170-
}
1171-
1172-
return MadeAnyChanges;
1173-
}
1174-
11751094
static void replaceExitCond(BranchInst *BI, Value *NewCond,
11761095
SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
11771096
auto *OldCond = BI->getCondition();
@@ -2079,10 +1998,6 @@ bool IndVarSimplify::run(Loop *L) {
20791998

20801999
// The Rewriter may not be used from this point on.
20812000

2082-
// Loop-invariant instructions in the preheader that aren't used in the
2083-
// loop may be sunk below the loop to reduce register pressure.
2084-
Changed |= sinkUnusedInvariants(L);
2085-
20862001
// rewriteFirstIterationLoopExitValues does not rely on the computation of
20872002
// trip count and therefore can further simplify exit values in addition to
20882003
// rewriteLoopExitValues.

llvm/lib/Transforms/Scalar/LICM.cpp

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,11 @@ static void moveInstructionBefore(Instruction &I, BasicBlock::iterator Dest,
215215
ICFLoopSafetyInfo &SafetyInfo,
216216
MemorySSAUpdater &MSSAU, ScalarEvolution *SE);
217217

218+
static bool sinkUnusedInvariantsFromPreheaderToExit(
219+
Loop *L, AAResults *AA, ICFLoopSafetyInfo *SafetyInfo,
220+
MemorySSAUpdater &MSSAU, ScalarEvolution *SE, DominatorTree *DT,
221+
SinkAndHoistLICMFlags &SinkFlags, OptimizationRemarkEmitter *ORE);
222+
218223
static void foreachMemoryAccess(MemorySSA *MSSA, Loop *L,
219224
function_ref<void(Instruction *)> Fn);
220225
using PointersAndHasReadsOutsideSet =
@@ -471,6 +476,12 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI,
471476
TLI, TTI, L, MSSAU, &SafetyInfo, Flags, ORE)
472477
: sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, TTI, L,
473478
MSSAU, &SafetyInfo, Flags, ORE);
479+
480+
// Sink preheader defs that are unused in the loop into the unique exit block
481+
// to reduce register pressure in the loop.
482+
Changed |= sinkUnusedInvariantsFromPreheaderToExit(L, AA, &SafetyInfo, MSSAU,
483+
SE, DT, Flags, ORE);
484+
474485
Flags.setIsSink(false);
475486
if (Preheader)
476487
Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, AC, TLI, L,
@@ -1469,6 +1480,118 @@ static void moveInstructionBefore(Instruction &I, BasicBlock::iterator Dest,
14691480
SE->forgetBlockAndLoopDispositions(&I);
14701481
}
14711482

1483+
// If there's a single exit block, sink any loop-invariant values that were
1484+
// defined in the preheader but not used inside the loop into the exit block
1485+
// to reduce register pressure in the loop.
1486+
static bool sinkUnusedInvariantsFromPreheaderToExit(
1487+
Loop *L, AAResults *AA, ICFLoopSafetyInfo *SafetyInfo,
1488+
MemorySSAUpdater &MSSAU, ScalarEvolution *SE, DominatorTree *DT,
1489+
SinkAndHoistLICMFlags &SinkFlags, OptimizationRemarkEmitter *ORE) {
1490+
BasicBlock *ExitBlock = L->getExitBlock();
1491+
if (!ExitBlock)
1492+
return false;
1493+
1494+
BasicBlock *Preheader = L->getLoopPreheader();
1495+
if (!Preheader)
1496+
return false;
1497+
1498+
bool MadeAnyChanges = false;
1499+
MemoryAccess *ExitDef = nullptr;
1500+
1501+
for (Instruction &I : llvm::make_early_inc_range(llvm::reverse(*Preheader))) {
1502+
1503+
// Skip terminator.
1504+
if (Preheader->getTerminator() == &I)
1505+
continue;
1506+
1507+
// New instructions were inserted at the end of the preheader.
1508+
if (isa<PHINode>(I))
1509+
break;
1510+
1511+
// Don't move instructions which might have side effects, since the side
1512+
// effects need to complete before instructions inside the loop. Note that
1513+
// it's okay if the instruction might have undefined behavior: LoopSimplify
1514+
// guarantees that the preheader dominates the exit block.
1515+
if (I.mayHaveSideEffects())
1516+
continue;
1517+
1518+
if (!canSinkOrHoistInst(I, AA, DT, L, MSSAU, true, SinkFlags, nullptr))
1519+
continue;
1520+
1521+
// Determine if there is a use in or before the loop (direct or
1522+
// otherwise).
1523+
bool UsedInLoopOrPreheader = false;
1524+
for (Use &U : I.uses()) {
1525+
auto *UserI = cast<Instruction>(U.getUser());
1526+
BasicBlock *UseBB = UserI->getParent();
1527+
if (auto *PN = dyn_cast<PHINode>(UserI)) {
1528+
UseBB = PN->getIncomingBlock(U);
1529+
}
1530+
if (UseBB == Preheader || L->contains(UseBB)) {
1531+
UsedInLoopOrPreheader = true;
1532+
break;
1533+
}
1534+
}
1535+
if (UsedInLoopOrPreheader)
1536+
continue;
1537+
1538+
// Move the instruction.
1539+
SafetyInfo->removeInstruction(&I);
1540+
SafetyInfo->insertInstructionTo(&I, ExitBlock);
1541+
I.moveBefore(*ExitBlock, ExitBlock->getFirstInsertionPt());
1542+
if (SE)
1543+
SE->forgetBlockAndLoopDispositions(&I);
1544+
1545+
// Update MemorySSA.
1546+
if (auto *OldMA = MSSAU.getMemorySSA()->getMemoryAccess(&I)) {
1547+
// Avoid the expensive getPreviousDefRecursive call by manually
1548+
// setting the defining access.
1549+
if (!ExitDef) {
1550+
if (auto *MPhi = MSSAU.getMemorySSA()->getMemoryAccess(ExitBlock)) {
1551+
ExitDef = MPhi;
1552+
} else {
1553+
BasicBlock *Current = *predecessors(ExitBlock).begin();
1554+
while (true) {
1555+
if (auto *Accesses =
1556+
MSSAU.getMemorySSA()->getBlockAccesses(Current)) {
1557+
if (!Accesses->empty()) {
1558+
MemoryAccess *Back =
1559+
const_cast<MemoryAccess *>(&Accesses->back());
1560+
if (isa<MemoryDef>(Back) || isa<MemoryPhi>(Back))
1561+
ExitDef = Back;
1562+
else
1563+
ExitDef = MSSAU.getMemorySSA()
1564+
->getWalker()
1565+
->getClobberingMemoryAccess(Back);
1566+
break;
1567+
}
1568+
}
1569+
1570+
if (Current == L->getHeader()) {
1571+
Current = Preheader;
1572+
continue;
1573+
}
1574+
1575+
if (pred_empty(Current)) {
1576+
ExitDef = MSSAU.getMemorySSA()->getLiveOnEntryDef();
1577+
break;
1578+
}
1579+
Current = *pred_begin(Current);
1580+
}
1581+
}
1582+
}
1583+
MemoryAccess *NewMA = MSSAU.createMemoryAccessInBB(&I, ExitDef, ExitBlock,
1584+
MemorySSA::Beginning);
1585+
OldMA->replaceAllUsesWith(NewMA);
1586+
MSSAU.removeMemoryAccess(OldMA);
1587+
}
1588+
1589+
MadeAnyChanges = true;
1590+
}
1591+
1592+
return MadeAnyChanges;
1593+
}
1594+
14721595
static Instruction *sinkThroughTriviallyReplaceablePHI(
14731596
PHINode *TPN, Instruction *I, LoopInfo *LI,
14741597
SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies,

llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,8 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
7373
}
7474

7575
; CHECK-LABEL: {{^}}excess_soft_clause_reg_pressure:
76-
; GFX908: NumSgprs: 64
77-
; GFX908-GCNTRACKERS: NumSgprs: 64
76+
; GFX908: NumSgprs: 56
77+
; GFX908-GCNTRACKERS: NumSgprs: 56
7878
; GFX908: NumVgprs: 41
7979
; GFX908-GCNTRACKERS: NumVgprs: 39
8080
; GFX908: Occupancy: 5

llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,6 @@
2222
; GFX9-DAG: s_mov_b32 s[[DESC3:[0-9]+]], 0xe00000
2323

2424
; OFFREG is offset system SGPR
25-
; GCN: buffer_store_dword {{v[0-9]+}}, off, s[[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill
26-
; GCN: buffer_load_dword v{{[0-9]+}}, off, s[[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Reload
2725
; GCN: NumVgprs: 256
2826
; GCN: ScratchSize: 640
2927

0 commit comments

Comments
 (0)