@@ -4068,51 +4068,86 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
40684068
40694069 // Try to unroll small, single block loops, if they have load/store
40704070 // dependencies, to expose more parallel memory access streams.
4071- if (L->getHeader () != L->getLoopLatch () || Size > 8 )
4072- return ;
4071+ BasicBlock *Header = L->getHeader ();
4072+ if (Header == L->getLoopLatch ()) {
4073+ if (Size > 8 )
4074+ return ;
40734075
4074- SmallPtrSet<Value *, 8 > LoadedValues;
4075- SmallVector<StoreInst *> Stores;
4076- for (auto *BB : L->blocks ()) {
4077- for (auto &I : *BB) {
4078- Value *Ptr = getLoadStorePointerOperand (&I);
4079- if (!Ptr)
4080- continue ;
4081- const SCEV *PtrSCEV = SE.getSCEV (Ptr);
4082- if (SE.isLoopInvariant (PtrSCEV, L))
4083- continue ;
4084- if (isa<LoadInst>(&I))
4085- LoadedValues.insert (&I);
4086- else
4087- Stores.push_back (cast<StoreInst>(&I));
4076+ SmallPtrSet<Value *, 8 > LoadedValues;
4077+ SmallVector<StoreInst *> Stores;
4078+ for (auto *BB : L->blocks ()) {
4079+ for (auto &I : *BB) {
4080+ Value *Ptr = getLoadStorePointerOperand (&I);
4081+ if (!Ptr)
4082+ continue ;
4083+ const SCEV *PtrSCEV = SE.getSCEV (Ptr);
4084+ if (SE.isLoopInvariant (PtrSCEV, L))
4085+ continue ;
4086+ if (isa<LoadInst>(&I))
4087+ LoadedValues.insert (&I);
4088+ else
4089+ Stores.push_back (cast<StoreInst>(&I));
4090+ }
40884091 }
4089- }
40904092
4091- // Try to find an unroll count that maximizes the use of the instruction
4092- // window, i.e. trying to fetch as many instructions per cycle as possible.
4093- unsigned MaxInstsPerLine = 16 ;
4094- unsigned UC = 1 ;
4095- unsigned BestUC = 1 ;
4096- unsigned SizeWithBestUC = BestUC * Size;
4097- while (UC <= 8 ) {
4098- unsigned SizeWithUC = UC * Size;
4099- if (SizeWithUC > 48 )
4100- break ;
4101- if ((SizeWithUC % MaxInstsPerLine) == 0 ||
4102- (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
4103- BestUC = UC;
4104- SizeWithBestUC = BestUC * Size;
4093+ // Try to find an unroll count that maximizes the use of the instruction
4094+ // window, i.e. trying to fetch as many instructions per cycle as possible.
4095+ unsigned MaxInstsPerLine = 16 ;
4096+ unsigned UC = 1 ;
4097+ unsigned BestUC = 1 ;
4098+ unsigned SizeWithBestUC = BestUC * Size;
4099+ while (UC <= 8 ) {
4100+ unsigned SizeWithUC = UC * Size;
4101+ if (SizeWithUC > 48 )
4102+ break ;
4103+ if ((SizeWithUC % MaxInstsPerLine) == 0 ||
4104+ (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
4105+ BestUC = UC;
4106+ SizeWithBestUC = BestUC * Size;
4107+ }
4108+ UC++;
41054109 }
4106- UC++;
4110+
4111+ if (BestUC == 1 || none_of (Stores, [&LoadedValues](StoreInst *SI) {
4112+ return LoadedValues.contains (SI->getOperand (0 ));
4113+ }))
4114+ return ;
4115+
4116+ UP.Runtime = true ;
4117+ UP.DefaultUnrollRuntimeCount = BestUC;
4118+ return ;
41074119 }
41084120
4109- if (BestUC == 1 || none_of (Stores, [&LoadedValues](StoreInst *SI) {
4110- return LoadedValues.contains (SI->getOperand (0 ));
4111- }))
4121+ // Try to runtime-unroll loops with early-continues depending on loop-varying
4122+ // loads; this helps with branch-prediction for the early-continues.
4123+ auto *Term = dyn_cast<BranchInst>(Header->getTerminator ());
4124+ auto *Latch = L->getLoopLatch ();
4125+ SmallVector<BasicBlock *> Preds (predecessors (Latch));
4126+ if (!Term || !Term->isConditional () || Preds.size () == 1 ||
4127+ none_of (Preds, [Header](BasicBlock *Pred) { return Header == Pred; }) ||
4128+ none_of (Preds, [L](BasicBlock *Pred) { return L->contains (Pred); }))
41124129 return ;
41134130
4114- UP.Runtime = true ;
4115- UP.DefaultUnrollRuntimeCount = BestUC;
4131+ std::function<bool (Instruction *, unsigned )> DependsOnLoopLoad =
4132+ [&](Instruction *I, unsigned Depth) -> bool {
4133+ if (isa<PHINode>(I) || L->isLoopInvariant (I) || Depth > 8 )
4134+ return false ;
4135+
4136+ if (auto *LI = dyn_cast<LoadInst>(I))
4137+ return true ;
4138+
4139+ return any_of (I->operands (), [&](Value *V) {
4140+ auto *I = dyn_cast<Instruction>(V);
4141+ return I && DependsOnLoopLoad (I, Depth + 1 );
4142+ });
4143+ };
4144+ CmpInst::Predicate Pred;
4145+ Instruction *I;
4146+ if (match (Term, m_Br (m_ICmp (Pred, m_Instruction (I), m_Value ()), m_Value (),
4147+ m_Value ())) &&
4148+ DependsOnLoopLoad (I, 0 )) {
4149+ UP.Runtime = true ;
4150+ }
41164151}
41174152
41184153void AArch64TTIImpl::getUnrollingPreferences (Loop *L, ScalarEvolution &SE,
0 commit comments