
Commit a17ec75

lioujheyuigcbot
authored and committed
Implement dynamic alloca-related memory operation cost reduction for loop unrolling
1 parent 0b245df commit a17ec75

File tree

3 files changed: +183 -72 lines
  IGC/Compiler/GenTTI.cpp
  IGC/Compiler/GenTTI.h
  IGC/common/igc_flags.h

IGC/Compiler/GenTTI.cpp

Lines changed: 166 additions & 70 deletions
@@ -26,6 +26,8 @@ SPDX-License-Identifier: MIT
 using namespace llvm;
 using namespace IGC;
 
+#define DEBUG_TYPE "GENtti"
+
 namespace llvm {
 
 bool GenIntrinsicsTTIImpl::isLoweredToCall(const Function *F) {
@@ -81,48 +83,78 @@ unsigned countTotalInstructions(const Function *F, bool CheckSendMsg = true) {
 
 unsigned GenIntrinsicsTTIImpl::getFlatAddressSpace() { return ADDRESS_SPACE_PRIVATE; }
 
-/// Returns true if load instruction source address calculation
-/// depends only on base address, constants, and loop induction variables
-bool canReplaceWithRegisters(const LoadInst *LI, const Loop *L, ScalarEvolution &SE) {
-  auto Pointer = LI->getPointerOperand();
-  auto Base = LI->getPointerOperand()->stripInBoundsOffsets();
+bool GenIntrinsicsTTIImpl::isGEPLoopConstDerived(GetElementPtrInst *GEP, const Loop *L, ScalarEvolution &SE) {
+  if (!GEP)
+    return false;
+
+  const SCEV *SGEP = SE.getSCEV(GEP);
+
+  if (auto *AR = dyn_cast<SCEVAddRecExpr>(SGEP)) {
+    if (AR->getLoop() == L)
+      return true;
+  }
 
-  // Start with load source address
-  SmallVector<const Value *, 16> WorkList = {Pointer};
+  // Don't let the pointer base interfere with the traversal; some frontends
+  // generate GEPs without inbounds.
+  const SCEV *SGEPMinusPointerBase = SE.removePointerBase(SGEP);
 
-  // Traverse the source address calculation dependency tree
-  while (!WorkList.empty()) {
-    auto V = WorkList.pop_back_val();
+  struct CheckConstDerived {
+    bool TraversalDone = false;
+    bool AddRecFound = false;
+    bool isConstDerived = true;
 
-    if (V == Base || isa<Constant>(V)) {
-      // Do nothing if we meet base address or some constant
-    } else if (isa<CallBase>(V)) {
-      // Stop at calls
+    const Loop *L = nullptr;
+    const SCEV *S = nullptr;
+
+    CheckConstDerived(const Loop *L) : L(L) {}
+
+    bool setNotConstDerived() {
+      TraversalDone = true;
+      isConstDerived = false;
       return false;
-    } else if (auto U = dyn_cast<User>(V)) {
-      // In case of Instuction/Operator append
-      // all the operands to the work list,
-      // skip PHI nodes to prevent infinite while-loop
-      for (unsigned i = 0; i < U->getNumOperands(); ++i) {
-        auto O = U->getOperand(i);
-        if (auto P = dyn_cast<PHINode>(O)) {
-          if (!L->isAuxiliaryInductionVariable(*P, SE)) {
-            // Stop at non-auxilary IV
-            return false;
-          }
-        } else
-          WorkList.push_back(O);
+    }
+
+    bool follow(const SCEV *S) {
+      switch (S->getSCEVType()) {
+      case scConstant:
+      case scPtrToInt:
+      case scTruncate:
+      case scZeroExtend:
+      case scSignExtend:
+      case scAddExpr:
+      case scMulExpr:
+      case scUMaxExpr:
+      case scSMaxExpr:
+      case scUMinExpr:
+      case scSMinExpr:
+      case scSequentialUMinExpr:
+      case scUDivExpr:
+        return true;
+
+      case scAddRecExpr: {
+        const auto *ARLoop = cast<SCEVAddRecExpr>(S)->getLoop();
+        if (L && (ARLoop == L || ARLoop->contains(L))) {
+          AddRecFound = true;
+          return false; // Don't traverse into it
+        }
+
+        return setNotConstDerived();
      }
-    } else {
-      // Stop if we meet something apart from
-      // base address, constant value, IV
-      return false;
+
+      case scUnknown:
+      case scCouldNotCompute:
+        return setNotConstDerived();
+      }
+      llvm_unreachable("Unknown SCEV kind!");
    }
-  }
 
-  // If nothing was found above, consider load instruction source
-  // being a candidate to be replaced by registers
-  return true;
+    bool isDone() { return TraversalDone; }
+  };
+
+  CheckConstDerived CCD(L);
+  SCEVTraversal<CheckConstDerived> ST(CCD);
+  ST.visitAll(SGEPMinusPointerBase);
+  return (CCD.isConstDerived && CCD.AddRecFound);
 }
 
 void GenIntrinsicsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
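As a quick illustration of what the new isGEPLoopConstDerived accepts and rejects (a hypothetical source-level sketch, not code from this commit): the GEP's SCEV, minus its pointer base, must reduce to constants, casts, arithmetic, min/max/udiv, and add-recurrences of this loop or an enclosing loop; anything that degenerates to SCEVUnknown, such as an index loaded from memory, fails the check.

    // Hypothetical C++ examples; buf and idx are made-up names.
    void fill(const int *idx) {
      float buf[64];
      for (int i = 0; i < 32; ++i) {
        buf[i] = 1.0f;      // index is an add-rec of this loop -> const-derived, accepted
        buf[i + 3] = 2.0f;  // add-rec plus a constant          -> accepted
        buf[idx[i]] = 3.0f; // index loaded from memory is SCEVUnknown -> rejected
                            // (assume idx[i] stays within bounds)
      }
    }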
@@ -230,10 +262,9 @@ void GenIntrinsicsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
     UP.Partial = true;
   } else // for high registry pressure shaders, limit the unrolling to small loops and only fully unroll
   {
-    if (IGC_GET_FLAG_VALUE(SetLoopUnrollThresholdForHighRegPressure) != 0)
-      UP.Threshold = IGC_GET_FLAG_VALUE(SetLoopUnrollThresholdForHighRegPressure);
-    else
-      UP.Threshold = 200;
+    UP.Threshold = IGC_GET_FLAG_VALUE(SetLoopUnrollThresholdForHighRegPressure);
+    // This is similar to the LLVM OptForSize scenario in LoopUnrollPass
+    UP.MaxPercentThresholdBoost = IGC_GET_FLAG_VALUE(SetLoopUnrollMaxPercentThresholdBoostForHighRegPressure);
   }
 
   unsigned MaxTripCount = SE.getSmallConstantMaxTripCount(L);
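A quick worked note on how the two values above interact (based on my reading of upstream LLVM's LoopUnrollPass, stated as an assumption rather than something this commit documents): when full unrolling is expected to simplify the loop body, LLVM can raise the effective limit to roughly Threshold * MaxPercentThresholdBoost / 100. With the new defaults (200 and 100) the boosted limit stays at 200, whereas LLVM's internal boost of 400 would have allowed up to 200 * 400 / 100 = 800.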
@@ -243,40 +274,90 @@ void GenIntrinsicsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
     UP.Force = true;
   }
 
-  const unsigned MaxTripCountToUseUpperBoundForLoopWithLoads = 16;
-  if (MaxTripCount && MaxTripCount <= MaxTripCountToUseUpperBoundForLoopWithLoads) {
-    // Check if loop contains LoadInst from an array
-    // that can potentially be replaced by registers
-
-    // Group all load instructions by base address
-    // of the source posinter
-    DenseMap<Value *, SmallSet<LoadInst *, 4>> LoadInstructions;
+  // Look for loads/stores (or the GEPs feeding them) that
+  // 1. access a fixed-size alloca, and
+  // 2. use an index derived only from the loop iteration.
+  // For example,
+  //
+  // bb:
+  //   %ALLOC = alloca [32 x float], align 4
+  // Loop1:
+  //   %i8 = phi i32 [ 0, %bb ], [ %i23, %Loop1 ]
+  //   %i19 = getelementptr [32 x float], ptr %ALLOC, i64 0, i64 %i8
+  //   %i23 = add i32 %i8, 1
+  //   %i14 = fmul ...
+  //   store float %i14, ptr %i19, align 4
+  //   %i24 = icmp eq i32 %i23, 32
+  //   br i1 %i24, label %..., label %Loop1
+  // ...
+  // Loop5:
+  //   %i93 = phi i32 [ %i115, %Loop5 ], [ 0, %... ]
+  //   %i99 = getelementptr [32 x float], ptr %ALLOC, i64 0, i64 %i93
+  //   %i103 = load float, ptr %i99, align 4
+  //   %i107 = fmul float %i103, 0x3F699999A0000000
+  //   %i115 = add i32 %i93, 1
+  //   %i116 = icmp eq i32 %i115, 32
+  //   br i1 %i116, label %bb117, label %Loop5
+  //
+  // Fully unrolling both loops lets the SROA pass eliminate the entire access chain of the alloca. This is one of the
+  // most impactful yet very common patterns across all application types. In many cases, especially when the only
+  // values stored into the alloca are compiler-detectable constants, these loops need to be unrolled regardless of
+  // how high the register pressure is.
+  //
+  // TODO: Add an analysis pass to link allocas with loops globally so that they are either unrolled together or not.
+  //       It could also do some global cost estimation.
+  // TODO: Have compilation retry enable loop unrolling for this case and determine whether unrolling actually helps
+  //       reduce register pressure.
+  const unsigned UnrollMaxCountForAlloca = 64; // May need to be higher for OpenCL
+  bool AllocaFound = false;
+  if (MaxTripCount && MaxTripCount <= UnrollMaxCountForAlloca &&
+      IGC_IS_FLAG_ENABLED(EnablePromoteLoopUnrollwithAlloca)) {
+    unsigned int ThresholdBoost = 0;
    for (auto BB : L->blocks()) {
      for (auto &I : *BB) {
-        if (auto LI = dyn_cast<LoadInst>(&I)) {
-          auto Base = LI->getPointerOperand()->stripInBoundsOffsets();
-          if (isa<AllocaInst>(Base)) {
-            auto LIIterator = LoadInstructions.find(Base);
-            if (LIIterator == LoadInstructions.end())
-              LIIterator = LoadInstructions.insert(std::make_pair(Base, SmallSet<LoadInst *, 4>())).first;
-            LIIterator->second.insert(LI);
-          }
-        }
-      }
-    }
+        AllocaInst *AI = nullptr;
+        GetElementPtrInst *GEP = nullptr;
+
+        if (auto *LI = dyn_cast<LoadInst>(&I))
+          AI = dyn_cast<AllocaInst>(LI->getPointerOperand());
+        else if ((GEP = dyn_cast<GetElementPtrInst>(&I))) {
+          // Test if the GEP index is a function of the loop induction variable.
+          if (!isGEPLoopConstDerived(GEP, L, SE))
+            continue;
 
-    // Find at least one base address, such that all loads
-    // from it can be replaced by registers
-    for (const auto &LIIterator : LoadInstructions) {
-      bool Found = true;
-      for (const auto &LI : LIIterator.second)
-        Found &= canReplaceWithRegisters(LI, L, SE);
-      if (Found) {
-        UP.UpperBound = true;
-        UP.Force = true;
-        break;
+          auto *SBase = dyn_cast<SCEVUnknown>(SE.getPointerBase(SE.getSCEV(GEP)));
+          AI = dyn_cast<AllocaInst>(SBase->getValue());
+        } else
+          continue;
+
+        if (!AI)
+          continue;
+
+        Type *Ty = AI->getAllocatedType();
+        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
+        if (AllocaSize > 1024 || AllocaSize == 0)
+          continue;
+
+        ThresholdBoost += AllocaSize;
+        if (GEP)
+          isGEPLoopInduction[GEP] = true;
+        AllocaFound = true;
      }
    }
+    if (AllocaFound) {
+      // LLVM defaults to only 10; boost to UnrollMaxCountForAlloca
+      UP.MaxIterationsCountToAnalyze = UnrollMaxCountForAlloca;
+      UP.Threshold += ThresholdBoost;
+      UP.Runtime = true;
+      UP.UpperBound = true;
+      UP.Force = true;
+
+      LLVM_DEBUG(dbgs() << "Increasing L:" << L->getName() << " threshold to " << UP.Threshold
+                        << " due to Alloca accessed by:");
+      for (const auto &pair : isGEPLoopInduction)
+        LLVM_DEBUG(dbgs() << " " << pair.first->getName());
+      LLVM_DEBUG(dbgs() << " \n");
+    }
  }
 
   unsigned sendMessage = 0;
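A worked example of the boost accumulated above, derived directly from the hunk: the [32 x float] alloca from the comment is 128 bytes by DL.getTypeAllocSize, so each qualifying load or loop-derived GEP into it adds 128 to ThresholdBoost, and a loop with a single such GEP raises UP.Threshold by 128 once AllocaFound is set. Unsized allocas and allocas larger than 1024 bytes are skipped entirely.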
@@ -439,7 +520,7 @@ void GenIntrinsicsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                         (instCount + sendMessage * 4 > unrollLimitInstCount);
 
   // if the loop doesn't have sample, skip the unrolling parameter change
-  if (!sendMessage) {
+  if (!sendMessage && !AllocaFound) {
     // if the estimated unrolled instruction count is larger than the unrolling threshold, limit unrolling.
     if (limitUnrolling) {
       UP.Count = MIN(unrollLimitInstCount / (instCount + sendMessage * 4), 4);
@@ -453,7 +534,7 @@ void GenIntrinsicsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
 
   // if the TripCount is known, and the estimated unrolled count exceed LoopUnrollThreshold, set the unrolling count to
   // 4
-  if (limitUnrolling) {
+  if (limitUnrolling && !AllocaFound) {
     UP.Count = MIN(TripCount, 4);
     UP.MaxCount = UP.Count;
   }
@@ -469,11 +550,13 @@ void GenIntrinsicsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
     }
   }
 
+  /*** TESTING for removal
   UP.Runtime = true;
   UP.Count = 4;
   UP.MaxCount = UP.Count;
   // The following is only available and required from LLVM 3.7+.
   UP.AllowExpensiveTripCount = true;
+  ***/
 
   if (MDNode *LoopID = L->getLoopID()) {
     const llvm::StringRef maxIterMetadataNames = "spv.loop.iterations.max";
@@ -559,6 +642,19 @@ llvm::InstructionCost GenIntrinsicsTTIImpl::internalCalculateCost(const User *U,
     }
   }
 
+  if (IGC_IS_FLAG_ENABLED(EnablePromoteLoopUnrollwithAlloca)) {
+    const GetElementPtrInst *GEP = nullptr;
+    if (Operator::getOpcode(U) == Instruction::Load)
+      GEP = dyn_cast<GetElementPtrInst>(cast<LoadInst>(U)->getPointerOperand());
+    if (Operator::getOpcode(U) == Instruction::Store)
+      GEP = dyn_cast<GetElementPtrInst>(cast<StoreInst>(U)->getPointerOperand());
+
+    if (GEP) {
+      if (isGEPLoopInduction.find(GEP) != isGEPLoopInduction.end())
+        return TTI::TCC_Free;
+    }
+  }
+
   const Function *F = dyn_cast<Function>(U);
   if (F != nullptr) {
     IGC::CodeGenContext *CGC = this->ctx;
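To make the motivating pattern concrete, here is a hypothetical source-level counterpart of the IR sketched in the unrolling-preferences comment above (illustration only; function and variable names are made up): a small fixed-size local array written in one loop and read back in another. Once both loops are fully unrolled, every access uses a constant index, SROA promotes the array to registers, and the alloca disappears, which is exactly the outcome the zero-cost accounting above is meant to enable even under high register pressure.

    // Hypothetical kernel-style snippet.
    void accumulate(const float *in, const float *w, float *out) {
      float acc[32];               // becomes a [32 x float] alloca
      for (int i = 0; i < 32; ++i)
        acc[i] = in[i] * w[i];     // store indexed only by the loop IV
      float sum = 0.0f;
      for (int i = 0; i < 32; ++i)
        sum += acc[i] * 0.0122f;   // load indexed only by the loop IV
      *out = sum;
    }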

IGC/Compiler/GenTTI.h

Lines changed: 4 additions & 0 deletions
@@ -29,6 +29,8 @@ class GenIntrinsicsTTIImpl : public IGCLLVM::TTIImplCRTPBase<GenIntrinsicsTTIImp
 public:
   GenIntrinsicsTTIImpl(IGC::CodeGenContext *pCtx) : BaseT(pCtx->getModule()->getDataLayout()), ctx(pCtx) {}
 
+  DenseMap<Value *, bool> isGEPLoopInduction;
+
   bool shouldBuildLookupTables();
 
   bool isLoweredToCall(const Function *F);
@@ -40,6 +42,8 @@ class GenIntrinsicsTTIImpl : public IGCLLVM::TTIImplCRTPBase<GenIntrinsicsTTIImp
   // from variables to memory operations.
   unsigned getFlatAddressSpace();
 
+  bool isGEPLoopConstDerived(GetElementPtrInst *GEP, const Loop *L, ScalarEvolution &SE);
+
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
                                OptimizationRemarkEmitter *ORE);
 
IGC/common/igc_flags.h

Lines changed: 13 additions & 2 deletions
@@ -432,8 +432,19 @@ DECLARE_IGC_REGKEY(
 DECLARE_IGC_REGKEY(DWORD, SetLoopUnrollThreshold, 0,
                    "Set the loop unroll threshold. Value 0 will use the default threshold.", false)
 DECLARE_IGC_REGKEY(
-    DWORD, SetLoopUnrollThresholdForHighRegPressure, 0,
-    "Set the loop unroll threshold for shaders with high reg pressure. Value 0 will use the default threshold.", false)
+    DWORD, SetLoopUnrollThresholdForHighRegPressure, 200,
+    "Set the loop unroll threshold for shaders with high reg pressure.", false)
+DECLARE_IGC_REGKEY(DWORD, SetLoopUnrollMaxPercentThresholdBoostForHighRegPressure, 100,
+                   "Set the maximum allowed loop unroll threshold boost, in percent, for shaders with high reg pressure. "
+                   "The LLVM internal value is 400. Setting 100 here because the cost model is currently based on "
+                   "latency instead of code size, and the latter is what we need.",
+                   false)
+DECLARE_IGC_REGKEY(
+    bool, EnablePromoteLoopUnrollwithAlloca, true,
+    "Loop cost estimation assumes that loads/stores accessing an alloca with an index deducible from the loop count have 0 cost. "
+    "Disabling this flag makes them always cost something and also disables the dynamic threshold increase based on the "
+    "size of the alloca and the number of GEPs to the alloca in the loop, making the loop less likely to be unrolled.",
+    false)
 DECLARE_IGC_REGKEY(DWORD, SetRegisterPressureThresholdForLoopUnroll, 96,
                    "Set the register pressure threshold for limiting the loop unroll to smaller loops", false)
 DECLARE_IGC_REGKEY(DWORD, SetBranchSwapThreshold, 400, "Set the branch swaping threshold.", false)
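For reference, the new regkeys are consumed through the existing IGC flag macros; the reads below are condensed from the GenTTI.cpp hunks earlier in this commit and are shown together here only for convenience.

    // How the new keys are read in GenTTI.cpp (condensed from the hunks above).
    UP.Threshold = IGC_GET_FLAG_VALUE(SetLoopUnrollThresholdForHighRegPressure);     // default 200
    UP.MaxPercentThresholdBoost =
        IGC_GET_FLAG_VALUE(SetLoopUnrollMaxPercentThresholdBoostForHighRegPressure); // default 100
    if (IGC_IS_FLAG_ENABLED(EnablePromoteLoopUnrollwithAlloca)) {
      // alloca-aware threshold boost in getUnrollingPreferences() and
      // TCC_Free loads/stores in internalCalculateCost()
    }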
