@@ -26,6 +26,8 @@ SPDX-License-Identifier: MIT
using namespace llvm;
using namespace IGC;

+ #define DEBUG_TYPE "GENtti"
+
namespace llvm {

bool GenIntrinsicsTTIImpl::isLoweredToCall(const Function *F) {
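A note on the new define: `DEBUG_TYPE` gates LLVM's debug macros, so the `LLVM_DEBUG(dbgs() << ...)` statements added later in this patch print only in assertion-enabled builds when `-debug-only=GENtti` (or `-debug`) is passed. A minimal sketch of the pattern (illustrative, not from this patch):

```cpp
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "GENtti"

void reportAdjustment(unsigned Threshold) {
  // Printed only in +Asserts builds under -debug-only=GENtti.
  LLVM_DEBUG(llvm::dbgs() << "GENtti: threshold now " << Threshold << "\n");
}
```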
@@ -81,48 +83,78 @@ unsigned countTotalInstructions(const Function *F, bool CheckSendMsg = true) {

unsigned GenIntrinsicsTTIImpl::getFlatAddressSpace() { return ADDRESS_SPACE_PRIVATE; }

- /// Returns true if the load instruction's source address calculation
- /// depends only on the base address, constants, and loop induction variables
- bool canReplaceWithRegisters(const LoadInst *LI, const Loop *L, ScalarEvolution &SE) {
-   auto Pointer = LI->getPointerOperand();
-   auto Base = LI->getPointerOperand()->stripInBoundsOffsets();
+ bool GenIntrinsicsTTIImpl::isGEPLoopConstDerived(GetElementPtrInst *GEP, const Loop *L, ScalarEvolution &SE) {
+   if (!GEP)
+     return false;
+
+   const SCEV *SGEP = SE.getSCEV(GEP);
+
+   if (auto *AR = dyn_cast<SCEVAddRecExpr>(SGEP)) {
+     if (AR->getLoop() == L)
+       return true;
+   }

-   // Start with the load source address
-   SmallVector<const Value *, 16> WorkList = {Pointer};
+   // Don't let the pointer base interfere with the traversal; some frontends
+   // generate GEPs without inbounds.
+   const SCEV *SGEPMinusPointerBase = SE.removePointerBase(SGEP);

-   // Traverse the source address calculation dependency tree
-   while (!WorkList.empty()) {
-     auto V = WorkList.pop_back_val();
+   struct CheckConstDerived {
+     bool TraversalDone = false;
+     bool AddRecFound = false;
+     bool isConstDerived = true;

-     if (V == Base || isa<Constant>(V)) {
-       // Do nothing if we meet the base address or some constant
-     } else if (isa<CallBase>(V)) {
-       // Stop at calls
+     const Loop *L = nullptr;
+     const SCEV *S = nullptr;
+
+     CheckConstDerived(const Loop *L) : L(L) {}
+
+     bool setNotConstDerived() {
+       TraversalDone = true;
+       isConstDerived = false;
        return false;
-     } else if (auto U = dyn_cast<User>(V)) {
-       // In the case of an Instruction/Operator, append
-       // all of its operands to the work list;
-       // skip PHI nodes to prevent an infinite while-loop
-       for (unsigned i = 0; i < U->getNumOperands(); ++i) {
-         auto O = U->getOperand(i);
-         if (auto P = dyn_cast<PHINode>(O)) {
-           if (!L->isAuxiliaryInductionVariable(*P, SE)) {
-             // Stop at a non-auxiliary IV
-             return false;
-           }
-         } else
-           WorkList.push_back(O);
+     }
+
+     bool follow(const SCEV *S) {
+       switch (S->getSCEVType()) {
+       case scConstant:
+       case scPtrToInt:
+       case scTruncate:
+       case scZeroExtend:
+       case scSignExtend:
+       case scAddExpr:
+       case scMulExpr:
+       case scUMaxExpr:
+       case scSMaxExpr:
+       case scUMinExpr:
+       case scSMinExpr:
+       case scSequentialUMinExpr:
+       case scUDivExpr:
+         return true;
+
+       case scAddRecExpr: {
+         const auto *ARLoop = cast<SCEVAddRecExpr>(S)->getLoop();
+         if (L && (ARLoop == L || ARLoop->contains(L))) {
+           AddRecFound = true;
+           return false; // Don't traverse into it
+         }
+
+         return setNotConstDerived();
        }
-     } else {
-       // Stop if we meet something other than the
-       // base address, a constant value, or an IV
-       return false;
+
+       case scUnknown:
+       case scCouldNotCompute:
+         return setNotConstDerived();
+       }
+       llvm_unreachable("Unknown SCEV kind!");
      }
-   }

-   // If nothing was found above, consider the load instruction's source
-   // a candidate to be replaced by registers
-   return true;
+     bool isDone() { return TraversalDone; }
+   };
+
+   CheckConstDerived CCD(L);
+   SCEVTraversal<CheckConstDerived> ST(CCD);
+   ST.visitAll(SGEPMinusPointerBase);
+   return (CCD.isConstDerived && CCD.AddRecFound);
}

void GenIntrinsicsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
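For context on the helper above: `llvm::SCEVTraversal` (from ScalarEvolutionExpressions.h) drives a caller-supplied visitor through a SCEV expression tree via two callbacks, `follow` (whether to descend into an expression's operands) and `isDone` (early termination), which is exactly the contract `CheckConstDerived` implements. A minimal sketch of that pattern, with an illustrative visitor that is not part of this patch:

```cpp
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
using namespace llvm;

// Illustrative visitor: counts add-recurrences in a SCEV tree.
struct CountAddRecs {
  unsigned Found = 0;
  // Called once per reachable expression; return false to skip its operands.
  bool follow(const SCEV *S) {
    if (isa<SCEVAddRecExpr>(S))
      ++Found;
    return true;
  }
  // Returning true aborts the remaining traversal early.
  bool isDone() const { return false; }
};

unsigned countAddRecs(const SCEV *Root) {
  CountAddRecs V;
  SCEVTraversal<CountAddRecs> T(V);
  T.visitAll(Root); // visits Root and, subject to follow(), every operand
  return V.Found;
}
```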
@@ -230,10 +262,9 @@ void GenIntrinsicsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
    UP.Partial = true;
  } else // for high register pressure shaders, limit the unrolling to small loops and only fully unroll
  {
-   if (IGC_GET_FLAG_VALUE(SetLoopUnrollThresholdForHighRegPressure) != 0)
-     UP.Threshold = IGC_GET_FLAG_VALUE(SetLoopUnrollThresholdForHighRegPressure);
-   else
-     UP.Threshold = 200;
+   UP.Threshold = IGC_GET_FLAG_VALUE(SetLoopUnrollThresholdForHighRegPressure);
+   // This is similar to the LLVM OptForSize scenario in LoopUnrollPass
+   UP.MaxPercentThresholdBoost = IGC_GET_FLAG_VALUE(SetLoopUnrollMaxPercentThresholdBoostForHighRegPressure);
  }

  unsigned MaxTripCount = SE.getSmallConstantMaxTripCount(L);
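For a sense of how the two knobs above combine: in LLVM's LoopUnrollPass, a full unroll whose estimated size exceeds `Threshold` can still be accepted if it stays under a boosted threshold of roughly `Threshold * Boost / 100`, where `Boost` is capped by `MaxPercentThresholdBoost`. A rough sketch of that relationship (exact details vary by LLVM version):

```cpp
// Rough model only: with Threshold = 200 and MaxPercentThresholdBoost = 400,
// a fully-simplifiable loop may be unrolled as long as its estimated
// unrolled cost stays below 200 * 400 / 100 == 800.
unsigned maxBoostedThreshold(unsigned Threshold, unsigned MaxPercentThresholdBoost) {
  return Threshold * MaxPercentThresholdBoost / 100;
}
```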
@@ -243,40 +274,90 @@ void GenIntrinsicsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
    UP.Force = true;
  }

- const unsigned MaxTripCountToUseUpperBoundForLoopWithLoads = 16;
- if (MaxTripCount && MaxTripCount <= MaxTripCountToUseUpperBoundForLoopWithLoads) {
-   // Check if the loop contains a LoadInst from an array
-   // that can potentially be replaced by registers
-
-   // Group all load instructions by the base address
-   // of the source pointer
-   DenseMap<Value *, SmallSet<LoadInst *, 4>> LoadInstructions;
+ // Boost unrolling for loops whose loads/stores (through a GEP) are:
+ // 1. Accessing a fixed-size alloca
+ // 2. Indexed only by the loop induction variable
+ // For example,
+ //
+ // bb:
+ //   %ALLOC = alloca [32 x float], align 4
+ // Loop1:
+ //   %i8 = phi i32 [ 0, %bb ], [ %i23, %Loop1 ]
+ //   %i19 = getelementptr [32 x float], ptr %ALLOC, i64 0, i64 %i8
+ //   %i23 = add i32 %i8, 1
+ //   %i14 = fmul ...
+ //   store float %i14, ptr %i19, align 4
+ //   %i24 = icmp eq i32 %i23, 32
+ //   br i1 %i24, label %..., label %Loop1
+ // ...
+ // Loop5:
+ //   %i93 = phi i32 [ %i115, %Loop5 ], [ 0, %... ]
+ //   %i99 = getelementptr [32 x float], ptr %ALLOC, i64 0, i64 %i93
+ //   %i103 = load float, ptr %i99, align 4
+ //   %i107 = fmul float %i103, 0x3F699999A0000000
+ //   %i115 = add i32 %i93, 1
+ //   %i116 = icmp eq i32 %i115, 32
+ //   br i1 %i116, label %bb117, label %Loop5
+ //
+ // Fully unrolling both loops lets the SROA pass eliminate the entire access chain of the alloca. This is one of the
+ // most impactful yet very common patterns across all application types. In many cases, especially when the only
+ // values stored into the alloca are compiler-detectable constants, these loops need to be unrolled regardless of how
+ // high the register pressure is.
+ //
+ // TODO: Add an analysis pass that links allocas with loops globally, so that they are either unrolled together or
+ // not at all. It could potentially do some global cost estimation.
+ // TODO: Have compilation retry enable loop unrolling for this case and determine whether unrolling actually helps
+ // reduce register pressure.
+ const unsigned UnrollMaxCountForAlloca = 64; // May need to be higher for OpenCL
+ bool AllocaFound = false;
+ if (MaxTripCount && MaxTripCount <= UnrollMaxCountForAlloca &&
+     IGC_IS_FLAG_ENABLED(EnablePromoteLoopUnrollwithAlloca)) {
+   unsigned int ThresholdBoost = 0;
    for (auto BB : L->blocks()) {
      for (auto &I : *BB) {
-       if (auto LI = dyn_cast<LoadInst>(&I)) {
-         auto Base = LI->getPointerOperand()->stripInBoundsOffsets();
-         if (isa<AllocaInst>(Base)) {
-           auto LIIterator = LoadInstructions.find(Base);
-           if (LIIterator == LoadInstructions.end())
-             LIIterator = LoadInstructions.insert(std::make_pair(Base, SmallSet<LoadInst *, 4>())).first;
-           LIIterator->second.insert(LI);
-         }
-       }
-     }
-   }
+       AllocaInst *AI = nullptr;
+       GetElementPtrInst *GEP = nullptr;
+
+       if (auto *LI = dyn_cast<LoadInst>(&I))
+         AI = dyn_cast<AllocaInst>(LI->getPointerOperand());
+       else if ((GEP = dyn_cast<GetElementPtrInst>(&I))) {
+         // Test if the GEP index is a function of the loop induction variable.
+         if (!isGEPLoopConstDerived(GEP, L, SE))
+           continue;

-   // Find at least one base address, such that all loads
-   // from it can be replaced by registers
-   for (const auto &LIIterator : LoadInstructions) {
-     bool Found = true;
-     for (const auto &LI : LIIterator.second)
-       Found &= canReplaceWithRegisters(LI, L, SE);
-     if (Found) {
-       UP.UpperBound = true;
-       UP.Force = true;
-       break;
+         auto *SBase = dyn_cast<SCEVUnknown>(SE.getPointerBase(SE.getSCEV(GEP)));
+         if (!SBase) // the pointer base may not be a plain value
+           continue;
+         AI = dyn_cast<AllocaInst>(SBase->getValue());
+       } else
+         continue;
+
+       if (!AI)
+         continue;
+
+       Type *Ty = AI->getAllocatedType();
+       unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
+       if (AllocaSize > 1024 || AllocaSize == 0)
+         continue;
+
+       ThresholdBoost += AllocaSize;
+       if (GEP)
+         isGEPLoopInduction[GEP] = true;
+       AllocaFound = true;
      }
    }
+   if (AllocaFound) {
+     // LLVM defaults to only 10; boost to UnrollMaxCountForAlloca
+     UP.MaxIterationsCountToAnalyze = UnrollMaxCountForAlloca;
+     UP.Threshold += ThresholdBoost;
+     UP.Runtime = true;
+     UP.UpperBound = true;
+     UP.Force = true;
+
+     LLVM_DEBUG(dbgs() << "Increasing L:" << L->getName() << " threshold to " << UP.Threshold
+                       << " due to Alloca accessed by:");
+     for (const auto &pair : isGEPLoopInduction)
+       LLVM_DEBUG(dbgs() << " " << pair.first->getName());
+     LLVM_DEBUG(dbgs() << "\n");
+   }
  }

  unsigned sendMessage = 0;
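As a worked example of the ThresholdBoost accounting in the hunk above: the `[32 x float]` alloca from the IR comment occupies 32 * 4 = 128 bytes, which passes the (0, 1024] size filter, so each qualifying access raises the loop's unroll threshold by 128. A self-contained sketch of just that filter (the function name is illustrative):

```cpp
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"

// Sketch of the size filter above: an alloca qualifies for the boost only if
// its allocated type is sized and occupies (0, 1024] bytes.
static bool addAllocaBoost(const llvm::DataLayout &DL, const llvm::AllocaInst *AI,
                           unsigned &ThresholdBoost) {
  llvm::Type *Ty = AI->getAllocatedType();
  unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
  if (AllocaSize == 0 || AllocaSize > 1024)
    return false;
  ThresholdBoost += AllocaSize; // [32 x float] contributes 32 * 4 == 128
  return true;
}
```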
@@ -439,7 +520,7 @@ void GenIntrinsicsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
      (instCount + sendMessage * 4 > unrollLimitInstCount);

  // if the loop doesn't have sample messages, skip the unrolling parameter change
- if (!sendMessage) {
+ if (!sendMessage && !AllocaFound) {
    // if the estimated unrolled instruction count is larger than the unrolling threshold, limit unrolling.
    if (limitUnrolling) {
      UP.Count = MIN(unrollLimitInstCount / (instCount + sendMessage * 4), 4);
@@ -453,7 +534,7 @@ void GenIntrinsicsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,

  // if the TripCount is known, and the estimated unrolled count exceeds LoopUnrollThreshold, set the unrolling
  // count to 4
- if (limitUnrolling) {
+ if (limitUnrolling && !AllocaFound) {
    UP.Count = MIN(TripCount, 4);
    UP.MaxCount = UP.Count;
  }
@@ -469,11 +550,13 @@ void GenIntrinsicsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
    }
  }

+ /*** TESTING for removal
  UP.Runtime = true;
  UP.Count = 4;
  UP.MaxCount = UP.Count;
  // The following is only available and required from LLVM 3.7+.
  UP.AllowExpensiveTripCount = true;
+ ***/

  if (MDNode *LoopID = L->getLoopID()) {
    const llvm::StringRef maxIterMetadataNames = "spv.loop.iterations.max";
@@ -559,6 +642,19 @@ llvm::InstructionCost GenIntrinsicsTTIImpl::internalCalculateCost(const User *U,
    }
  }

+ if (IGC_IS_FLAG_ENABLED(EnablePromoteLoopUnrollwithAlloca)) {
+   const GetElementPtrInst *GEP = nullptr;
+   if (Operator::getOpcode(U) == Instruction::Load)
+     GEP = dyn_cast<GetElementPtrInst>(cast<LoadInst>(U)->getPointerOperand());
+   if (Operator::getOpcode(U) == Instruction::Store)
+     GEP = dyn_cast<GetElementPtrInst>(cast<StoreInst>(U)->getPointerOperand());
+
+   if (GEP) {
+     if (isGEPLoopInduction.find(GEP) != isGEPLoopInduction.end())
+       return TTI::TCC_Free;
+   }
+ }
+
  const Function *F = dyn_cast<Function>(U);
  if (F != nullptr) {
    IGC::CodeGenContext *CGC = this->ctx;
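The `TCC_Free` early-out in the last hunk feeds LLVM's unroll cost analysis: users reported as free contribute nothing to the estimated unrolled size, so the flagged GEPs and their loads/stores no longer block full unrolling of the loops boosted above. A simplified model of the consumer side (illustrative only, not LLVM's exact implementation):

```cpp
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"

// Illustrative only: a cost-based unroll analysis sums per-instruction costs;
// anything the TTI reports as TCC_Free (cost 0) does not inflate the estimate.
llvm::InstructionCost estimateUnrolledCost(const llvm::Loop *L,
                                           const llvm::TargetTransformInfo &TTI,
                                           unsigned TripCount) {
  llvm::InstructionCost Body = 0;
  for (llvm::BasicBlock *BB : L->blocks())
    for (llvm::Instruction &I : *BB)
      Body += TTI.getInstructionCost(&I, llvm::TargetTransformInfo::TCK_CodeSize);
  return Body * TripCount;
}
```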