Skip to content

Commit 1b0bce9

Browse files
authored
Reorder checks to speed up getAppleRuntimeUnrollPreferences() (#154010)
- Delay the load/store value computation until a profitable unroll count (BestUC > 1) has been found.
- Remove the redundant getLoopLatch() call.
1 parent f7b09ad commit 1b0bce9

File tree

1 file changed

+24
-21
lines changed

1 file changed

+24
-21
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 24 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -4912,13 +4912,35 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
49124912
// load/store dependencies, to expose more parallel memory access streams,
49134913
// or if they do little work inside a block (i.e. load -> X -> store pattern).
49144914
BasicBlock *Header = L->getHeader();
4915-
if (Header == L->getLoopLatch()) {
4915+
BasicBlock *Latch = L->getLoopLatch();
4916+
if (Header == Latch) {
49164917
// Estimate the size of the loop.
49174918
unsigned Size;
49184919
unsigned Width = 10;
49194920
if (!isLoopSizeWithinBudget(L, TTI, Width, &Size))
49204921
return;
49214922

4923+
// Try to find an unroll count that maximizes the use of the instruction
4924+
// window, i.e. trying to fetch as many instructions per cycle as possible.
4925+
unsigned MaxInstsPerLine = 16;
4926+
unsigned UC = 1;
4927+
unsigned BestUC = 1;
4928+
unsigned SizeWithBestUC = BestUC * Size;
4929+
while (UC <= 8) {
4930+
unsigned SizeWithUC = UC * Size;
4931+
if (SizeWithUC > 48)
4932+
break;
4933+
if ((SizeWithUC % MaxInstsPerLine) == 0 ||
4934+
(SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
4935+
BestUC = UC;
4936+
SizeWithBestUC = BestUC * Size;
4937+
}
4938+
UC++;
4939+
}
4940+
4941+
if (BestUC == 1)
4942+
return;
4943+
49224944
SmallPtrSet<Value *, 8> LoadedValuesPlus;
49234945
SmallVector<StoreInst *> Stores;
49244946
for (auto *BB : L->blocks()) {
@@ -4940,25 +4962,7 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
49404962
}
49414963
}
49424964

4943-
// Try to find an unroll count that maximizes the use of the instruction
4944-
// window, i.e. trying to fetch as many instructions per cycle as possible.
4945-
unsigned MaxInstsPerLine = 16;
4946-
unsigned UC = 1;
4947-
unsigned BestUC = 1;
4948-
unsigned SizeWithBestUC = BestUC * Size;
4949-
while (UC <= 8) {
4950-
unsigned SizeWithUC = UC * Size;
4951-
if (SizeWithUC > 48)
4952-
break;
4953-
if ((SizeWithUC % MaxInstsPerLine) == 0 ||
4954-
(SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
4955-
BestUC = UC;
4956-
SizeWithBestUC = BestUC * Size;
4957-
}
4958-
UC++;
4959-
}
4960-
4961-
if (BestUC == 1 || none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
4965+
if (none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
49624966
return LoadedValuesPlus.contains(SI->getOperand(0));
49634967
}))
49644968
return;
@@ -4971,7 +4975,6 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
49714975
// Try to runtime-unroll loops with early-continues depending on loop-varying
49724976
// loads; this helps with branch-prediction for the early-continues.
49734977
auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
4974-
auto *Latch = L->getLoopLatch();
49754978
SmallVector<BasicBlock *> Preds(predecessors(Latch));
49764979
if (!Term || !Term->isConditional() || Preds.size() == 1 ||
49774980
!llvm::is_contained(Preds, Header) ||

0 commit comments

Comments
 (0)