@@ -4912,13 +4912,35 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
4912
4912
// load/store dependencies, to expose more parallel memory access streams,
4913
4913
// or if they do little work inside a block (i.e. load -> X -> store pattern).
4914
4914
BasicBlock *Header = L->getHeader ();
4915
- if (Header == L->getLoopLatch ()) {
4915
+ BasicBlock *Latch = L->getLoopLatch ();
4916
+ if (Header == Latch) {
4916
4917
// Estimate the size of the loop.
4917
4918
unsigned Size;
4918
4919
unsigned Width = 10 ;
4919
4920
if (!isLoopSizeWithinBudget (L, TTI, Width, &Size))
4920
4921
return ;
4921
4922
4923
+ // Try to find an unroll count that maximizes the use of the instruction
4924
+ // window, i.e. trying to fetch as many instructions per cycle as possible.
4925
+ unsigned MaxInstsPerLine = 16 ;
4926
+ unsigned UC = 1 ;
4927
+ unsigned BestUC = 1 ;
4928
+ unsigned SizeWithBestUC = BestUC * Size;
4929
+ while (UC <= 8 ) {
4930
+ unsigned SizeWithUC = UC * Size;
4931
+ if (SizeWithUC > 48 )
4932
+ break ;
4933
+ if ((SizeWithUC % MaxInstsPerLine) == 0 ||
4934
+ (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
4935
+ BestUC = UC;
4936
+ SizeWithBestUC = BestUC * Size;
4937
+ }
4938
+ UC++;
4939
+ }
4940
+
4941
+ if (BestUC == 1 )
4942
+ return ;
4943
+
4922
4944
SmallPtrSet<Value *, 8 > LoadedValuesPlus;
4923
4945
SmallVector<StoreInst *> Stores;
4924
4946
for (auto *BB : L->blocks ()) {
@@ -4940,25 +4962,7 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
4940
4962
}
4941
4963
}
4942
4964
4943
- // Try to find an unroll count that maximizes the use of the instruction
4944
- // window, i.e. trying to fetch as many instructions per cycle as possible.
4945
- unsigned MaxInstsPerLine = 16 ;
4946
- unsigned UC = 1 ;
4947
- unsigned BestUC = 1 ;
4948
- unsigned SizeWithBestUC = BestUC * Size;
4949
- while (UC <= 8 ) {
4950
- unsigned SizeWithUC = UC * Size;
4951
- if (SizeWithUC > 48 )
4952
- break ;
4953
- if ((SizeWithUC % MaxInstsPerLine) == 0 ||
4954
- (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
4955
- BestUC = UC;
4956
- SizeWithBestUC = BestUC * Size;
4957
- }
4958
- UC++;
4959
- }
4960
-
4961
- if (BestUC == 1 || none_of (Stores, [&LoadedValuesPlus](StoreInst *SI) {
4965
+ if (none_of (Stores, [&LoadedValuesPlus](StoreInst *SI) {
4962
4966
return LoadedValuesPlus.contains (SI->getOperand (0 ));
4963
4967
}))
4964
4968
return ;
@@ -4971,7 +4975,6 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
4971
4975
// Try to runtime-unroll loops with early-continues depending on loop-varying
4972
4976
// loads; this helps with branch-prediction for the early-continues.
4973
4977
auto *Term = dyn_cast<BranchInst>(Header->getTerminator ());
4974
- auto *Latch = L->getLoopLatch ();
4975
4978
SmallVector<BasicBlock *> Preds (predecessors (Latch));
4976
4979
if (!Term || !Term->isConditional () || Preds.size () == 1 ||
4977
4980
!llvm::is_contained (Preds, Header) ||
0 commit comments