@@ -308,7 +308,7 @@ void GenIntrinsicsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
308308 // It can potentially do some global cost estimations.
309309 // TODO: Having compilation retry enables loop unrolling for this case and determines if unrolling actually helps
310310 // reduce register pressure.
311- const unsigned UnrollMaxCountForAlloca = 64 ; // May need to be higher for OpenCL
311+ const unsigned UnrollMaxCountForAlloca = IGC_GET_FLAG_VALUE (PromoteLoopUnrollwithAllocaCountThreshold);
312312 bool AllocaFound = false ;
313313 if (MaxTripCount && MaxTripCount <= UnrollMaxCountForAlloca &&
314314 IGC_IS_FLAG_ENABLED (EnablePromoteLoopUnrollwithAlloca)) {
@@ -332,12 +332,16 @@ void GenIntrinsicsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
332332
333333 if (!AI)
334334 continue ;
335-
336- Type *Ty = AI->getAllocatedType ();
337- unsigned AllocaSize = Ty->isSized () ? DL.getTypeAllocSize (Ty) : 0 ;
338- if (AllocaSize > 1024 || AllocaSize == 0 )
335+ // Not fixed size or not in entry block
336+ // TODO: Can an alloca with a fixed size not reside in the entry block?
337+ if (!AI->isStaticAlloca ())
338+ continue ;
339+ // Assume every iteration consumes 1 alloca element.
340+ if (cast<ConstantInt>(AI->getArraySize ())->getZExtValue () > UnrollMaxCountForAlloca)
339341 continue ;
340342
343+ // Using alloca size in bytes as the threshold boost seems a bit tricky.
344+ unsigned AllocaSize = *(AI->getAllocationSizeInBits (DL)) / 8 ;
341345 ThresholdBoost += AllocaSize;
342346 if (GEP)
343347 isGEPLoopInduction[GEP] = true ;
@@ -348,7 +352,6 @@ void GenIntrinsicsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
348352 // LLVM defaults to only 10; boost to UnrollMaxCountForAlloca
349353 UP.MaxIterationsCountToAnalyze = UnrollMaxCountForAlloca;
350354 UP.Threshold += ThresholdBoost;
351- UP.Runtime = true ;
352355 UP.UpperBound = true ;
353356 UP.Force = true ;
354357
0 commit comments