@@ -4085,51 +4085,86 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
40854085
40864086 // Try to unroll small, single block loops, if they have load/store
40874087 // dependencies, to expose more parallel memory access streams.
4088- if (L->getHeader () != L->getLoopLatch () || Size > 8 )
4089- return ;
4088+ BasicBlock *Header = L->getHeader ();
4089+ if (Header == L->getLoopLatch ()) {
4090+ if (Size > 8 )
4091+ return ;
40904092
4091- SmallPtrSet<Value *, 8 > LoadedValues;
4092- SmallVector<StoreInst *> Stores;
4093- for (auto *BB : L->blocks ()) {
4094- for (auto &I : *BB) {
4095- Value *Ptr = getLoadStorePointerOperand (&I);
4096- if (!Ptr)
4097- continue ;
4098- const SCEV *PtrSCEV = SE.getSCEV (Ptr);
4099- if (SE.isLoopInvariant (PtrSCEV, L))
4100- continue ;
4101- if (isa<LoadInst>(&I))
4102- LoadedValues.insert (&I);
4103- else
4104- Stores.push_back (cast<StoreInst>(&I));
4093+ SmallPtrSet<Value *, 8 > LoadedValues;
4094+ SmallVector<StoreInst *> Stores;
4095+ for (auto *BB : L->blocks ()) {
4096+ for (auto &I : *BB) {
4097+ Value *Ptr = getLoadStorePointerOperand (&I);
4098+ if (!Ptr)
4099+ continue ;
4100+ const SCEV *PtrSCEV = SE.getSCEV (Ptr);
4101+ if (SE.isLoopInvariant (PtrSCEV, L))
4102+ continue ;
4103+ if (isa<LoadInst>(&I))
4104+ LoadedValues.insert (&I);
4105+ else
4106+ Stores.push_back (cast<StoreInst>(&I));
4107+ }
41054108 }
4106- }
41074109
4108- // Try to find an unroll count that maximizes the use of the instruction
4109- // window, i.e. trying to fetch as many instructions per cycle as possible.
4110- unsigned MaxInstsPerLine = 16 ;
4111- unsigned UC = 1 ;
4112- unsigned BestUC = 1 ;
4113- unsigned SizeWithBestUC = BestUC * Size;
4114- while (UC <= 8 ) {
4115- unsigned SizeWithUC = UC * Size;
4116- if (SizeWithUC > 48 )
4117- break ;
4118- if ((SizeWithUC % MaxInstsPerLine) == 0 ||
4119- (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
4120- BestUC = UC;
4121- SizeWithBestUC = BestUC * Size;
4110+ // Try to find an unroll count that maximizes the use of the instruction
4111+ // window, i.e. trying to fetch as many instructions per cycle as possible.
4112+ unsigned MaxInstsPerLine = 16 ;
4113+ unsigned UC = 1 ;
4114+ unsigned BestUC = 1 ;
4115+ unsigned SizeWithBestUC = BestUC * Size;
4116+ while (UC <= 8 ) {
4117+ unsigned SizeWithUC = UC * Size;
4118+ if (SizeWithUC > 48 )
4119+ break ;
4120+ if ((SizeWithUC % MaxInstsPerLine) == 0 ||
4121+ (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
4122+ BestUC = UC;
4123+ SizeWithBestUC = BestUC * Size;
4124+ }
4125+ UC++;
41224126 }
4123- UC++;
4127+
4128+ if (BestUC == 1 || none_of (Stores, [&LoadedValues](StoreInst *SI) {
4129+ return LoadedValues.contains (SI->getOperand (0 ));
4130+ }))
4131+ return ;
4132+
4133+ UP.Runtime = true ;
4134+ UP.DefaultUnrollRuntimeCount = BestUC;
4135+ return ;
41244136 }
41254137
4126- if (BestUC == 1 || none_of (Stores, [&LoadedValues](StoreInst *SI) {
4127- return LoadedValues.contains (SI->getOperand (0 ));
4128- }))
4138+ // Try to runtime-unroll loops with early-continues depending on loop-varying
4139+ // loads; this helps with branch-prediction for the early-continues.
4140+ auto *Term = dyn_cast<BranchInst>(Header->getTerminator ());
4141+ auto *Latch = L->getLoopLatch ();
4142+ SmallVector<BasicBlock *> Preds (predecessors (Latch));
4143+ if (!Term || !Term->isConditional () || Preds.size () == 1 ||
4144+ none_of (Preds, [Header](BasicBlock *Pred) { return Header == Pred; }) ||
4145+ none_of (Preds, [L](BasicBlock *Pred) { return L->contains (Pred); }))
41294146 return ;
41304147
4131- UP.Runtime = true ;
4132- UP.DefaultUnrollRuntimeCount = BestUC;
4148+ std::function<bool (Instruction *, unsigned )> DependsOnLoopLoad =
4149+ [&](Instruction *I, unsigned Depth) -> bool {
4150+ if (isa<PHINode>(I) || L->isLoopInvariant (I) || Depth > 8 )
4151+ return false ;
4152+
4153+ if (isa<LoadInst>(I))
4154+ return true ;
4155+
4156+ return any_of (I->operands (), [&](Value *V) {
4157+ auto *I = dyn_cast<Instruction>(V);
4158+ return I && DependsOnLoopLoad (I, Depth + 1 );
4159+ });
4160+ };
4161+ CmpPredicate Pred;
4162+ Instruction *I;
4163+ if (match (Term, m_Br (m_ICmp (Pred, m_Instruction (I), m_Value ()), m_Value (),
4164+ m_Value ())) &&
4165+ DependsOnLoopLoad (I, 0 )) {
4166+ UP.Runtime = true ;
4167+ }
41334168}
41344169
41354170void AArch64TTIImpl::getUnrollingPreferences (Loop *L, ScalarEvolution &SE,
0 commit comments