@@ -3650,51 +3650,86 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
36503650
36513651 // Try to unroll small, single block loops, if they have load/store
36523652 // dependencies, to expose more parallel memory access streams.
3653- if (L->getHeader () != L->getLoopLatch () || Size > 8 )
3654- return ;
3653+ BasicBlock *Header = L->getHeader ();
3654+ if (Header == L->getLoopLatch ()) {
3655+ if (Size > 8 )
3656+ return ;
36553657
3656- SmallPtrSet<Value *, 8 > LoadedValues;
3657- SmallVector<StoreInst *> Stores;
3658- for (auto *BB : L->blocks ()) {
3659- for (auto &I : *BB) {
3660- Value *Ptr = getLoadStorePointerOperand (&I);
3661- if (!Ptr)
3662- continue ;
3663- const SCEV *PtrSCEV = SE.getSCEV (Ptr);
3664- if (SE.isLoopInvariant (PtrSCEV, L))
3665- continue ;
3666- if (isa<LoadInst>(&I))
3667- LoadedValues.insert (&I);
3668- else
3669- Stores.push_back (cast<StoreInst>(&I));
3658+ SmallPtrSet<Value *, 8 > LoadedValues;
3659+ SmallVector<StoreInst *> Stores;
3660+ for (auto *BB : L->blocks ()) {
3661+ for (auto &I : *BB) {
3662+ Value *Ptr = getLoadStorePointerOperand (&I);
3663+ if (!Ptr)
3664+ continue ;
3665+ const SCEV *PtrSCEV = SE.getSCEV (Ptr);
3666+ if (SE.isLoopInvariant (PtrSCEV, L))
3667+ continue ;
3668+ if (isa<LoadInst>(&I))
3669+ LoadedValues.insert (&I);
3670+ else
3671+ Stores.push_back (cast<StoreInst>(&I));
3672+ }
36703673 }
3671- }
36723674
3673- // Try to find an unroll count that maximizes the use of the instruction
3674- // window, i.e. trying to fetch as many instructions per cycle as possible.
3675- unsigned MaxInstsPerLine = 16 ;
3676- unsigned UC = 1 ;
3677- unsigned BestUC = 1 ;
3678- unsigned SizeWithBestUC = BestUC * Size;
3679- while (UC <= 8 ) {
3680- unsigned SizeWithUC = UC * Size;
3681- if (SizeWithUC > 48 )
3682- break ;
3683- if ((SizeWithUC % MaxInstsPerLine) == 0 ||
3684- (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
3685- BestUC = UC;
3686- SizeWithBestUC = BestUC * Size;
3675+ // Try to find an unroll count that maximizes the use of the instruction
3676+ // window, i.e. trying to fetch as many instructions per cycle as possible.
3677+ unsigned MaxInstsPerLine = 16 ;
3678+ unsigned UC = 1 ;
3679+ unsigned BestUC = 1 ;
3680+ unsigned SizeWithBestUC = BestUC * Size;
3681+ while (UC <= 8 ) {
3682+ unsigned SizeWithUC = UC * Size;
3683+ if (SizeWithUC > 48 )
3684+ break ;
3685+ if ((SizeWithUC % MaxInstsPerLine) == 0 ||
3686+ (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
3687+ BestUC = UC;
3688+ SizeWithBestUC = BestUC * Size;
3689+ }
3690+ UC++;
36873691 }
3688- UC++;
3692+
3693+ if (BestUC == 1 || none_of (Stores, [&LoadedValues](StoreInst *SI) {
3694+ return LoadedValues.contains (SI->getOperand (0 ));
3695+ }))
3696+ return ;
3697+
3698+ UP.Runtime = true ;
3699+ UP.DefaultUnrollRuntimeCount = BestUC;
3700+ return ;
36893701 }
36903702
3691- if (BestUC == 1 || none_of (Stores, [&LoadedValues](StoreInst *SI) {
3692- return LoadedValues.contains (SI->getOperand (0 ));
3693- }))
3703+ // Try to runtime-unroll loops with early-continues depending on loop-varying
3704+ // loads; this helps with branch-prediction for the early-continues.
3705+ auto *Term = dyn_cast<BranchInst>(Header->getTerminator ());
3706+ auto *Latch = L->getLoopLatch ();
3707+ SmallVector<BasicBlock *> Preds (predecessors (Latch));
3708+ if (!Term || !Term->isConditional () || Preds.size () == 1 ||
3709+ none_of (Preds, [Header](BasicBlock *Pred) { return Header == Pred; }) ||
3710+ none_of (Preds, [L](BasicBlock *Pred) { return L->contains (Pred); }))
36943711 return ;
36953712
3696- UP.Runtime = true ;
3697- UP.DefaultUnrollRuntimeCount = BestUC;
3713+ std::function<bool (Instruction *, unsigned )> DependsOnLoopLoad = // Recursive predicate: does I reach a load through its operand chain? std::function lets the body call DependsOnLoopLoad by name.
3714+ [&](Instruction *I, unsigned Depth) -> bool { // Depth = number of operand hops from the instruction the search started at.
3715+ if (isa<PHINode>(I) || L->isLoopInvariant (I) || Depth > 8 ) // Stop at PHIs, at values L reports as loop-invariant, and once more than 8 hops deep.
3716+ return false ;
3717+
3718+ if (isa<LoadInst>(I)) // Tested after the invariance check, so only loads that are not loop-invariant count.
3719+ return true ;
3720+
3721+ return any_of (I->operands (), [&](Value *V) { // Otherwise true if any operand that is an instruction depends on such a load.
3722+ auto *I = dyn_cast<Instruction>(V); // NOTE(review): this I shadows the outer parameter I; null for non-instruction operands.
3723+ return I && DependsOnLoopLoad (I, Depth + 1 );
3724+ });
3725+ };
3726+ CmpInst::Predicate Pred;
3727+ Instruction *I;
3728+ if (match (Term, m_Br (m_ICmp (Pred, m_Instruction (I), m_Value ()), m_Value (),
3729+ m_Value ())) &&
3730+ DependsOnLoopLoad (I, 0 )) {
3731+ UP.Runtime = true ;
3732+ }
36983733}
36993734
37003735void AArch64TTIImpl::getUnrollingPreferences (Loop *L, ScalarEvolution &SE,
0 commit comments