@@ -4912,13 +4912,35 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
49124912 // load/store dependencies, to expose more parallel memory access streams,
49134913 // or if they do little work inside a block (i.e. load -> X -> store pattern).
49144914 BasicBlock *Header = L->getHeader ();
4915- if (Header == L->getLoopLatch ()) {
4915+ BasicBlock *Latch = L->getLoopLatch ();
4916+ if (Header == Latch) {
49164917 // Estimate the size of the loop.
49174918 unsigned Size;
49184919 unsigned Width = 10 ;
49194920 if (!isLoopSizeWithinBudget (L, TTI, Width, &Size))
49204921 return ;
49214922
4923+ // Try to find an unroll count that maximizes the use of the instruction
4924+ // window, i.e. trying to fetch as many instructions per cycle as possible.
4925+ unsigned MaxInstsPerLine = 16 ;
4926+ unsigned UC = 1 ;
4927+ unsigned BestUC = 1 ;
4928+ unsigned SizeWithBestUC = BestUC * Size;
4929+ while (UC <= 8 ) {
4930+ unsigned SizeWithUC = UC * Size;
4931+ if (SizeWithUC > 48 )
4932+ break ;
4933+ if ((SizeWithUC % MaxInstsPerLine) == 0 ||
4934+ (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
4935+ BestUC = UC;
4936+ SizeWithBestUC = BestUC * Size;
4937+ }
4938+ UC++;
4939+ }
4940+
4941+ if (BestUC == 1 )
4942+ return ;
4943+
49224944 SmallPtrSet<Value *, 8 > LoadedValuesPlus;
49234945 SmallVector<StoreInst *> Stores;
49244946 for (auto *BB : L->blocks ()) {
@@ -4940,25 +4962,7 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
49404962 }
49414963 }
49424964
4943- // Try to find an unroll count that maximizes the use of the instruction
4944- // window, i.e. trying to fetch as many instructions per cycle as possible.
4945- unsigned MaxInstsPerLine = 16 ;
4946- unsigned UC = 1 ;
4947- unsigned BestUC = 1 ;
4948- unsigned SizeWithBestUC = BestUC * Size;
4949- while (UC <= 8 ) {
4950- unsigned SizeWithUC = UC * Size;
4951- if (SizeWithUC > 48 )
4952- break ;
4953- if ((SizeWithUC % MaxInstsPerLine) == 0 ||
4954- (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
4955- BestUC = UC;
4956- SizeWithBestUC = BestUC * Size;
4957- }
4958- UC++;
4959- }
4960-
4961- if (BestUC == 1 || none_of (Stores, [&LoadedValuesPlus](StoreInst *SI) {
4965+ if (none_of (Stores, [&LoadedValuesPlus](StoreInst *SI) {
49624966 return LoadedValuesPlus.contains (SI->getOperand (0 ));
49634967 }))
49644968 return ;
@@ -4971,7 +4975,6 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
49714975 // Try to runtime-unroll loops with early-continues depending on loop-varying
49724976 // loads; this helps with branch-prediction for the early-continues.
49734977 auto *Term = dyn_cast<BranchInst>(Header->getTerminator ());
4974- auto *Latch = L->getLoopLatch ();
49754978 SmallVector<BasicBlock *> Preds (predecessors (Latch));
49764979 if (!Term || !Term->isConditional () || Preds.size () == 1 ||
49774980 !llvm::is_contained (Preds, Header) ||
0 commit comments