@@ -26,6 +26,8 @@ SPDX-License-Identifier: MIT
2626using namespace llvm ;
2727using namespace IGC ;
2828
29+ #define DEBUG_TYPE "GENtti"
30+
2931namespace llvm {
3032
3133bool GenIntrinsicsTTIImpl::isLoweredToCall (const Function *F) {
@@ -81,48 +83,78 @@ unsigned countTotalInstructions(const Function *F, bool CheckSendMsg = true) {
8183
8284unsigned GenIntrinsicsTTIImpl::getFlatAddressSpace () { return ADDRESS_SPACE_PRIVATE; }
8385
84- // / Returns true if load instruction source address calculation
85- // / depends only on base address, constants, and loop induction variables
86- bool canReplaceWithRegisters (const LoadInst *LI, const Loop *L, ScalarEvolution &SE) {
87- auto Pointer = LI->getPointerOperand ();
88- auto Base = LI->getPointerOperand ()->stripInBoundsOffsets ();
86+ bool GenIntrinsicsTTIImpl::isGEPLoopConstDerived (GetElementPtrInst *GEP, const Loop *L, ScalarEvolution &SE) {
87+ if (!GEP)
88+ return false ;
89+
90+ const SCEV *SGEP = SE.getSCEV (GEP);
91+
92+ if (auto *AR = dyn_cast<SCEVAddRecExpr>(SGEP)) {
93+ if (AR->getLoop () == L)
94+ return true ;
95+ }
8996
90- // Start with load source address
91- SmallVector<const Value *, 16 > WorkList = {Pointer};
97+ // Don't let the pointer base interfere with the traversal; some frontends
98+ // generate GEPs without the inbounds keyword.
99+ const SCEV *SGEPMinusPointerBase = SE.removePointerBase (SGEP);
92100
93- // Traverse the source address calculation dependency tree
94- while (!WorkList.empty ()) {
95- auto V = WorkList.pop_back_val ();
101+ struct CheckConstDerived {
102+ bool TraversalDone = false ;
103+ bool AddRecFound = false ;
104+ bool isConstDerived = true ;
96105
97- if (V == Base || isa<Constant>(V)) {
98- // Do nothing if we meet base address or some constant
99- } else if (isa<CallBase>(V)) {
100- // Stop at calls
106+ const Loop *L = nullptr ;
108+
109+ CheckConstDerived (const Loop *L) : L(L) {}
110+
111+ bool setNotConstDerived () {
112+ TraversalDone = true ;
113+ isConstDerived = false ;
101114 return false ;
102- } else if (auto U = dyn_cast<User>(V)) {
103- // In case of Instuction/Operator append
104- // all the operands to the work list,
105- // skip PHI nodes to prevent infinite while-loop
106- for (unsigned i = 0 ; i < U->getNumOperands (); ++i) {
107- auto O = U->getOperand (i);
108- if (auto P = dyn_cast<PHINode>(O)) {
109- if (!L->isAuxiliaryInductionVariable (*P, SE)) {
110- // Stop at non-auxilary IV
111- return false ;
112- }
113- } else
114- WorkList.push_back (O);
115+ }
116+
117+ bool follow (const SCEV *S) {
118+ switch (S->getSCEVType ()) {
119+ case scConstant:
120+ case scPtrToInt:
121+ case scTruncate:
122+ case scZeroExtend:
123+ case scSignExtend:
124+ case scAddExpr:
125+ case scMulExpr:
126+ case scUMaxExpr:
127+ case scSMaxExpr:
128+ case scUMinExpr:
129+ case scSMinExpr:
130+ case scSequentialUMinExpr:
131+ case scUDivExpr:
132+ return true ;
133+
134+ case scAddRecExpr: {
135+ const auto *ARLoop = cast<SCEVAddRecExpr>(S)->getLoop ();
136+ if (L && (ARLoop == L || ARLoop->contains (L))) {
137+ AddRecFound = true ;
138+ return false ; // Don't traverse into it
139+ }
140+
141+ return setNotConstDerived ();
115142 }
116- } else {
117- // Stop if we meet something apart from
118- // base address, constant value, IV
119- return false ;
143+
144+ case scUnknown:
145+ case scCouldNotCompute:
146+ return setNotConstDerived ();
147+ }
148+ llvm_unreachable ("Unknown SCEV kind!");
120149 }
121- }
122150
123- // If nothing was found above, consider load instruction source
124- // being a candidate to be replaced by registers
125- return true ;
151+ bool isDone () { return TraversalDone; }
152+ };
153+
154+ CheckConstDerived CCD (L);
155+ SCEVTraversal<CheckConstDerived> ST (CCD);
156+ ST.visitAll (SGEPMinusPointerBase);
157+ return (CCD.isConstDerived && CCD.AddRecFound );
126158}
127159
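
A note on the SCEVTraversal contract used above: the visitor's follow() returns true to descend into an expression's operands, and isDone() lets the walk stop early (here, as soon as something non-const-derived is seen). A minimal standalone sketch of the same pattern, a hypothetical helper that is not part of this patch, counting the add-recurrences that belong to a given loop:

    #include "llvm/Analysis/LoopInfo.h"
    #include "llvm/Analysis/ScalarEvolution.h"
    #include "llvm/Analysis/ScalarEvolutionExpressions.h"
    using namespace llvm;

    // Hypothetical illustration of the SCEVTraversal visitor contract.
    static unsigned countLoopAddRecs(const SCEV *Root, const Loop *L) {
      struct AddRecCounter {
        const Loop *L;
        unsigned Count = 0;
        AddRecCounter(const Loop *L) : L(L) {}
        bool follow(const SCEV *S) {
          if (const auto *AR = dyn_cast<SCEVAddRecExpr>(S))
            if (AR->getLoop() == L)
              ++Count;
          return true; // keep descending into operands
        }
        bool isDone() { return false; } // never abort early in this sketch
      };
      AddRecCounter Counter(L);
      SCEVTraversal<AddRecCounter> ST(Counter);
      ST.visitAll(Root);
      return Counter.Count;
    }
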
128160void GenIntrinsicsTTIImpl::getUnrollingPreferences (Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
@@ -230,10 +262,9 @@ void GenIntrinsicsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
230262 UP.Partial = true ;
231263 } else // for high register pressure shaders, limit the unrolling to small loops and only fully unroll
232264 {
233- if (IGC_GET_FLAG_VALUE (SetLoopUnrollThresholdForHighRegPressure) != 0 )
234- UP.Threshold = IGC_GET_FLAG_VALUE (SetLoopUnrollThresholdForHighRegPressure);
235- else
236- UP.Threshold = 200 ;
265+ UP.Threshold = IGC_GET_FLAG_VALUE (SetLoopUnrollThresholdForHighRegPressure);
266+ // This is similar to the LLVM OptForSize scenario in LoopUnrollPass
267+ UP.MaxPercentThresholdBoost = IGC_GET_FLAG_VALUE (SetLoopUnrollMaxPercentThresholdBoostForHighRegPressure);
237268 }
238269
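
For intuition on how MaxPercentThresholdBoost interacts with Threshold: in LLVM's LoopUnrollPass, a loop whose full unroll is expected to simplify away may be allowed an unrolled cost of up to Threshold * Boost / 100, where Boost is capped at MaxPercentThresholdBoost. As an illustration (the real values come from the IGC flags above, not from these numbers): with Threshold = 200 and a boost cap of 400, a fully simplifiable loop could be unrolled up to an estimated cost of 200 * 400 / 100 = 800.
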
239270 unsigned MaxTripCount = SE.getSmallConstantMaxTripCount (L);
@@ -243,40 +274,90 @@ void GenIntrinsicsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
243274 UP.Force = true ;
244275 }
245276
246- const unsigned MaxTripCountToUseUpperBoundForLoopWithLoads = 16 ;
247- if (MaxTripCount && MaxTripCount <= MaxTripCountToUseUpperBoundForLoopWithLoads) {
248- // Check if loop contains LoadInst from an array
249- // that can potentially be replaced by registers
250-
251- // Group all load instructions by base address
252- // of the source posinter
253- DenseMap<Value *, SmallSet<LoadInst *, 4 >> LoadInstructions;
277+ // For all loads/stores (possibly through a GEP) that are:
278+ // 1. Accessing a fixed-size alloca
279+ // 2. Indexed only by the loop induction variable
280+ // For example,
281+ //
282+ // bb:
283+ // %ALLOC = alloca [32 x float], align 4
284+ // Loop1:
285+ // %i8 = phi i32 [ 0, %bb ], [ %i23, %Loop1 ]
286+ // %i19 = getelementptr [32 x float], ptr %ALLOC, i64 0, i64 %i8
287+ // %i23 = add i32 %i8, 1
288+ // %i14 = fmul ...
289+ // store float %i14, ptr %i19, align 4
290+ // %i24 = icmp eq i32 %i23, 32
291+ // br i1 %i24, label %..., label %Loop1
292+ // ...
293+ // Loop5:
294+ // %i93 = phi i32 [ %i115, %Loop5 ], [ 0, %... ]
295+ // %i99 = getelementptr [32 x float], ptr %ALLOC, i64 0, i64 %i93
296+ // %i103 = load float, ptr %i99, align 4
297+ // %i107 = fmul float %i103, 0x3F699999A0000000
298+ // %i115 = add i32 %i93, 1
299+ // %i116 = icmp eq i32 %i115, 32
300+ // br i1 %i116, label %bb117, label %Loop5
301+ //
302+ // Fully unrolling both loops lets the SROA pass eliminate the entire access chain of the alloca. This is one of the
303+ // most impactful yet common patterns across all application types. In many cases, especially when the only values
304+ // stored into the alloca are compiler-detectable constants, these loops need to be unrolled regardless of how high
305+ // the register pressure is.
306+ //
307+ // TODO: Add an analysis pass that links allocas with loops globally so that they are either unrolled together or
308+ // not at all. It could potentially do some global cost estimation.
309+ // TODO: Have the compilation retry enable loop unrolling for this case and determine whether unrolling actually
310+ // helps reduce register pressure.
311+ const unsigned UnrollMaxCountForAlloca = 64 ; // May need to be higher for OpenCL
312+ bool AllocaFound = false ;
313+ if (MaxTripCount && MaxTripCount <= UnrollMaxCountForAlloca &&
314+ IGC_IS_FLAG_ENABLED (EnablePromoteLoopUnrollwithAlloca)) {
315+ unsigned int ThresholdBoost = 0 ;
254316 for (auto BB : L->blocks ()) {
255317 for (auto &I : *BB) {
256- if (auto LI = dyn_cast<LoadInst>(&I)) {
257- auto Base = LI->getPointerOperand ()->stripInBoundsOffsets ();
258- if (isa<AllocaInst>(Base)) {
259- auto LIIterator = LoadInstructions.find (Base);
260- if (LIIterator == LoadInstructions.end ())
261- LIIterator = LoadInstructions.insert (std::make_pair (Base, SmallSet<LoadInst *, 4 >())).first ;
262- LIIterator->second .insert (LI);
263- }
264- }
265- }
266- }
318+ AllocaInst *AI = nullptr ;
319+ GetElementPtrInst *GEP = nullptr ;
320+
321+ if (auto *LI = dyn_cast<LoadInst>(&I))
322+ AI = dyn_cast<AllocaInst>(LI->getPointerOperand ());
323+ else if ((GEP = dyn_cast<GetElementPtrInst>(&I))) {
324+ // Test if the GEP index is a function of the loop induction variable.
325+ if (!isGEPLoopConstDerived (GEP, L, SE))
326+ continue ;
267327
268- // Find at least one base address, such that all loads
269- // from it can be replaced by registers
270- for (const auto &LIIterator : LoadInstructions) {
271- bool Found = true ;
272- for (const auto &LI : LIIterator.second )
273- Found &= canReplaceWithRegisters (LI, L, SE);
274- if (Found) {
275- UP.UpperBound = true ;
276- UP.Force = true ;
277- break ;
328+ auto *SBase = dyn_cast<SCEVUnknown>(SE.getPointerBase (SE.getSCEV (GEP)));
329+ AI = SBase ? dyn_cast<AllocaInst>(SBase->getValue ()) : nullptr ;
330+ } else
331+ continue ;
332+
333+ if (!AI)
334+ continue ;
335+
336+ Type *Ty = AI->getAllocatedType ();
337+ unsigned AllocaSize = Ty->isSized () ? DL.getTypeAllocSize (Ty) : 0 ;
338+ if (AllocaSize > 1024 || AllocaSize == 0 )
339+ continue ;
340+
341+ ThresholdBoost += AllocaSize;
342+ if (GEP)
343+ isGEPLoopInduction[GEP] = true ;
344+ AllocaFound = true ;
278345 }
279346 }
347+ if (AllocaFound) {
348+ // LLVM defaults to only 10; boost it to UnrollMaxCountForAlloca
349+ UP.MaxIterationsCountToAnalyze = UnrollMaxCountForAlloca;
350+ UP.Threshold += ThresholdBoost;
351+ UP.Runtime = true ;
352+ UP.UpperBound = true ;
353+ UP.Force = true ;
354+
355+ LLVM_DEBUG (dbgs () << "Increasing L:" << L->getName () << " threshold to " << UP.Threshold
356+ << " due to Alloca accessed by:");
357+ for (const auto &pair : isGEPLoopInduction)
358+ LLVM_DEBUG (dbgs () << " " << pair.first ->getName ());
359+ LLVM_DEBUG (dbgs () << "\n");
360+ }
280361 }
281362
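
isGEPLoopInduction is a class member whose declaration lives in the header and is not part of this diff; from its use here it is presumably something along these lines (an assumption, not taken from the patch):

    // Assumed member of GenIntrinsicsTTIImpl (header not shown in this diff).
    // Records GEPs whose index is derived from a loop induction variable, so
    // the cost model can treat the corresponding loads/stores as free
    // (see internalCalculateCost below).
    llvm::DenseMap<const llvm::GetElementPtrInst *, bool> isGEPLoopInduction;
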
282363 unsigned sendMessage = 0 ;
@@ -439,7 +520,7 @@ void GenIntrinsicsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
439520 (instCount + sendMessage * 4 > unrollLimitInstCount);
440521
441522 // if the loop doesn't have sample, skip the unrolling parameter change
442- if (!sendMessage) {
523+ if (!sendMessage && !AllocaFound ) {
443524 // if the estimated unrolled instruction count is larger than the unrolling threshold, limit unrolling.
444525 if (limitUnrolling) {
445526 UP.Count = MIN (unrollLimitInstCount / (instCount + sendMessage * 4 ), 4 );
@@ -453,7 +534,7 @@ void GenIntrinsicsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
453534
454535 // if the TripCount is known, and the estimated unrolled count exceed LoopUnrollThreshold, set the unrolling count to
455536 // 4
456- if (limitUnrolling) {
537+ if (limitUnrolling && !AllocaFound ) {
457538 UP.Count = MIN (TripCount, 4 );
458539 UP.MaxCount = UP.Count ;
459540 }
@@ -469,11 +550,13 @@ void GenIntrinsicsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
469550 }
470551 }
471552
553+ /*** TESTING for removal
472554 UP.Runtime = true;
473555 UP.Count = 4;
474556 UP.MaxCount = UP.Count;
475557 // The following is only available and required from LLVM 3.7+.
476558 UP.AllowExpensiveTripCount = true;
559+ ***/
477560
478561 if (MDNode *LoopID = L->getLoopID ()) {
479562 const llvm::StringRef maxIterMetadataNames = "spv.loop.iterations.max";
@@ -559,6 +642,19 @@ llvm::InstructionCost GenIntrinsicsTTIImpl::internalCalculateCost(const User *U,
559642 }
560643 }
561644
645+ if (IGC_IS_FLAG_ENABLED (EnablePromoteLoopUnrollwithAlloca)) {
646+ const GetElementPtrInst *GEP = nullptr ;
647+ if (Operator::getOpcode (U) == Instruction::Load)
648+ GEP = dyn_cast<GetElementPtrInst>(cast<LoadInst>(U)->getPointerOperand ());
649+ else if (Operator::getOpcode (U) == Instruction::Store)
650+ GEP = dyn_cast<GetElementPtrInst>(cast<StoreInst>(U)->getPointerOperand ());
651+
652+ if (GEP) {
653+ if (isGEPLoopInduction.find (GEP) != isGEPLoopInduction.end ())
654+ return TTI::TCC_Free;
655+ }
656+ }
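
Why TCC_Free helps here: LLVM's unroll cost analysis sums per-instruction costs over the loop body, so reporting these alloca accesses as free keeps them from consuming the boosted unroll budget. A rough sketch of such a client loop (illustrative only; the actual logic lives in LLVM's LoopUnrollAnalyzer and is more involved):

    #include "llvm/Analysis/LoopInfo.h"
    #include "llvm/Analysis/TargetTransformInfo.h"
    using namespace llvm;

    // Illustrative only: instructions the target reports as TCC_Free
    // contribute nothing to the estimated loop-body size.
    static InstructionCost estimateLoopBodyCost(const Loop *L,
                                                const TargetTransformInfo &TTI) {
      InstructionCost Cost = 0;
      for (BasicBlock *BB : L->blocks())
        for (Instruction &I : *BB)
          Cost += TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
      return Cost;
    }
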
657+
562658 const Function *F = dyn_cast<Function>(U);
563659 if (F != nullptr ) {
564660 IGC::CodeGenContext *CGC = this ->ctx ;