Skip to content

Commit a6300f7

Browse files
[SLP] Improve block traversal in getSpillCost()
This is a WIP patch because of compile-time regressions of up to 7% on gcc_r/gcc_s.

Previously, getSpillCost() would skip the blocks in between when traversing instructions backward. If one of the skipped blocks contained a function call, the existing logic would compute an incorrect spill cost.

The new implementation:
- Uses post_order traversal to visit blocks
- Tracks live entries across basic blocks
- Computes reachable blocks once upfront using depth_first_ext
- Maintains correct cost calculation for diamond-shaped control flow

Performance improvements:
- Reduces execution time of SPEC CPU benchmark 544.nab_r by 9.92%
- Reduces code size of 508.namd by 1.73%

This optimization improves vectorization decisions by making spill cost estimation more accurate, particularly for code with complex control flow.
1 parent e55f1a7 commit a6300f7

File tree

3 files changed

+80
-84
lines changed

3 files changed

+80
-84
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 52 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -12415,68 +12415,48 @@ InstructionCost BoUpSLP::getSpillCost() {
1241512415
InstructionCost Cost = 0;
1241612416

1241712417
SmallPtrSet<const TreeEntry *, 4> LiveEntries;
12418-
const TreeEntry *Prev = nullptr;
12419-
12420-
// The entries in VectorizableTree are not necessarily ordered by their
12421-
// position in basic blocks. Collect them and order them by dominance so later
12422-
// instructions are guaranteed to be visited first. For instructions in
12423-
// different basic blocks, we only scan to the beginning of the block, so
12424-
// their order does not matter, as long as all instructions in a basic block
12425-
// are grouped together. Using dominance ensures a deterministic order.
12426-
SmallVector<TreeEntry *, 16> OrderedEntries;
12427-
for (const auto &TEPtr : VectorizableTree) {
12428-
if (TEPtr->isGather())
12429-
continue;
12430-
OrderedEntries.push_back(TEPtr.get());
12431-
}
12432-
llvm::stable_sort(OrderedEntries, [&](const TreeEntry *TA,
12433-
const TreeEntry *TB) {
12434-
Instruction &A = getLastInstructionInBundle(TA);
12435-
Instruction &B = getLastInstructionInBundle(TB);
12436-
auto *NodeA = DT->getNode(A.getParent());
12437-
auto *NodeB = DT->getNode(B.getParent());
12438-
assert(NodeA && "Should only process reachable instructions");
12439-
assert(NodeB && "Should only process reachable instructions");
12440-
assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
12441-
"Different nodes should have different DFS numbers");
12442-
if (NodeA != NodeB)
12443-
return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
12444-
return B.comesBefore(&A);
12445-
});
1244612418

12447-
for (const TreeEntry *TE : OrderedEntries) {
12448-
if (!Prev) {
12449-
Prev = TE;
12419+
const TreeEntry *Root = VectorizableTree.front().get();
12420+
BasicBlock *RootBB = cast<Instruction>(Root->Scalars[0])->getParent();
12421+
12422+
// Compute what nodes are reachable from the leaves to the roots
12423+
df_iterator_default_set<const BasicBlock *> ReachableFromLeaves;
12424+
for (auto &TE : VectorizableTree) {
12425+
if (TE->isGather())
1245012426
continue;
12451-
}
12427+
auto *BB = getLastInstructionInBundle(TE.get()).getParent();
12428+
for (const BasicBlock *X : depth_first_ext(BB, ReachableFromLeaves))
12429+
ReachableFromLeaves.insert(X);
12430+
}
1245212431

12453-
LiveEntries.erase(Prev);
12454-
for (unsigned I : seq<unsigned>(Prev->getNumOperands())) {
12455-
const TreeEntry *Op = getVectorizedOperand(Prev, I);
12456-
if (!Op)
12457-
continue;
12458-
assert(!Op->isGather() && "Expected vectorized operand.");
12459-
LiveEntries.insert(Op);
12460-
}
12432+
DenseSet<const BasicBlock *> Reachable;
12433+
for (const BasicBlock *X : inverse_depth_first(RootBB))
12434+
Reachable.insert(X);
12435+
set_intersect(Reachable, ReachableFromLeaves);
1246112436

12462-
LLVM_DEBUG({
12463-
dbgs() << "SLP: #LV: " << LiveEntries.size();
12464-
for (auto *X : LiveEntries)
12465-
X->dump();
12466-
dbgs() << ", Looking at ";
12467-
TE->dump();
12468-
});
12437+
DenseSet<const TreeEntry *> Defined;
1246912438

12470-
// Now find the sequence of instructions between PrevInst and Inst.
12471-
unsigned NumCalls = 0;
12472-
const Instruction *PrevInst = &getLastInstructionInBundle(Prev);
12473-
BasicBlock::const_reverse_iterator
12474-
InstIt = ++getLastInstructionInBundle(TE).getIterator().getReverse(),
12475-
PrevInstIt = PrevInst->getIterator().getReverse();
12476-
while (InstIt != PrevInstIt) {
12477-
if (PrevInstIt == PrevInst->getParent()->rend()) {
12478-
PrevInstIt = getLastInstructionInBundle(TE).getParent()->rbegin();
12479-
continue;
12439+
// Iterate the tree from the root, post order so that all uses appear before
12440+
// definitions.
12441+
// TODO: LiveEntries are shared across all paths, so this may overestimate.
12442+
for (BasicBlock *BB : post_order(RootBB->getParent())) {
12443+
if (!Reachable.contains(BB))
12444+
continue;
12445+
12446+
for (Instruction &I : reverse(*BB)) {
12447+
for (const auto *TE : getTreeEntries(&I)) {
12448+
if (TE->isGather())
12449+
continue;
12450+
LiveEntries.erase(TE);
12451+
Defined.insert(TE);
12452+
for (unsigned Idx : seq<unsigned>(TE->getNumOperands())) {
12453+
const TreeEntry *Op = getVectorizedOperand(TE, Idx);
12454+
if (!Op)
12455+
continue;
12456+
assert(!Op->isGather() && "Expected vectorized operand.");
12457+
if (!Defined.contains(Op))
12458+
LiveEntries.insert(Op);
12459+
}
1248012460
}
1248112461

1248212462
auto NoCallIntrinsic = [this](const Instruction *I) {
@@ -12497,26 +12477,24 @@ InstructionCost BoUpSLP::getSpillCost() {
1249712477
// Debug information does not impact spill cost.
1249812478
// Vectorized calls, represented as vector intrinsics, do not impact spill
1249912479
// cost.
12500-
if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
12501-
CB && !NoCallIntrinsic(CB) && !isVectorized(CB))
12502-
NumCalls++;
12503-
12504-
++PrevInstIt;
12505-
}
12480+
if (const auto *CB = dyn_cast<CallBase>(&I);
12481+
CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
12482+
SmallVector<Type *, 4> EntriesTypes;
12483+
for (const TreeEntry *TE : LiveEntries) {
12484+
auto *ScalarTy = TE->getMainOp()->getType();
12485+
auto It = MinBWs.find(TE);
12486+
if (It != MinBWs.end())
12487+
ScalarTy =
12488+
IntegerType::get(ScalarTy->getContext(), It->second.first);
12489+
EntriesTypes.push_back(
12490+
getWidenedType(ScalarTy, TE->getVectorFactor()));
12491+
}
1250612492

12507-
if (NumCalls) {
12508-
SmallVector<Type *, 4> EntriesTypes;
12509-
for (const TreeEntry *TE : LiveEntries) {
12510-
auto *ScalarTy = TE->getMainOp()->getType();
12511-
auto It = MinBWs.find(TE);
12512-
if (It != MinBWs.end())
12513-
ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
12514-
EntriesTypes.push_back(getWidenedType(ScalarTy, TE->getVectorFactor()));
12493+
LLVM_DEBUG(dbgs() << "SLP: " << LiveEntries.size()
12494+
<< " entries alive over call:" << I << "\n");
12495+
Cost += TTI->getCostOfKeepingLiveOverCall(EntriesTypes);
1251512496
}
12516-
Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(EntriesTypes);
1251712497
}
12518-
12519-
Prev = TE;
1252012498
}
1252112499

1252212500
return Cost;

llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1740,7 +1740,9 @@ entry:
17401740
define void @f(i1 %c, ptr %p, ptr %q, ptr %r) {
17411741
; CHECK-LABEL: define void @f
17421742
; CHECK-SAME: (i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[R:%.*]]) #[[ATTR1]] {
1743-
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 8
1743+
; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[P]], align 8
1744+
; CHECK-NEXT: [[P1:%.*]] = getelementptr i64, ptr [[P]], i64 1
1745+
; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[P1]], align 8
17441746
; CHECK-NEXT: br i1 [[C]], label [[FOO:%.*]], label [[BAR:%.*]]
17451747
; CHECK: foo:
17461748
; CHECK-NEXT: [[Y0:%.*]] = load float, ptr [[R]], align 4
@@ -1751,12 +1753,16 @@ define void @f(i1 %c, ptr %p, ptr %q, ptr %r) {
17511753
; CHECK-NEXT: [[Z1:%.*]] = call float @fabsf(float [[Z0]])
17521754
; CHECK-NEXT: br label [[BAZ]]
17531755
; CHECK: baz:
1754-
; CHECK-NEXT: store <2 x i64> [[TMP1]], ptr [[Q]], align 8
1756+
; CHECK-NEXT: store i64 [[X0]], ptr [[Q]], align 8
1757+
; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[Q]], i64 1
1758+
; CHECK-NEXT: store i64 [[X1]], ptr [[Q1]], align 8
17551759
; CHECK-NEXT: ret void
17561760
;
17571761
; DEFAULT-LABEL: define void @f
17581762
; DEFAULT-SAME: (i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[R:%.*]]) #[[ATTR1]] {
1759-
; DEFAULT-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 8
1763+
; DEFAULT-NEXT: [[X0:%.*]] = load i64, ptr [[P]], align 8
1764+
; DEFAULT-NEXT: [[P1:%.*]] = getelementptr i64, ptr [[P]], i64 1
1765+
; DEFAULT-NEXT: [[X1:%.*]] = load i64, ptr [[P1]], align 8
17601766
; DEFAULT-NEXT: br i1 [[C]], label [[FOO:%.*]], label [[BAR:%.*]]
17611767
; DEFAULT: foo:
17621768
; DEFAULT-NEXT: [[Y0:%.*]] = load float, ptr [[R]], align 4
@@ -1767,7 +1773,9 @@ define void @f(i1 %c, ptr %p, ptr %q, ptr %r) {
17671773
; DEFAULT-NEXT: [[Z1:%.*]] = call float @fabsf(float [[Z0]])
17681774
; DEFAULT-NEXT: br label [[BAZ]]
17691775
; DEFAULT: baz:
1770-
; DEFAULT-NEXT: store <2 x i64> [[TMP1]], ptr [[Q]], align 8
1776+
; DEFAULT-NEXT: store i64 [[X0]], ptr [[Q]], align 8
1777+
; DEFAULT-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[Q]], i64 1
1778+
; DEFAULT-NEXT: store i64 [[X1]], ptr [[Q1]], align 8
17711779
; DEFAULT-NEXT: ret void
17721780
;
17731781
%x0 = load i64, ptr %p

llvm/test/Transforms/SLPVectorizer/RISCV/spillcost.ll

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@ declare void @g()
77
define void @f0(i1 %c, ptr %p, ptr %q) {
88
; CHECK-LABEL: define void @f0(
99
; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]]) #[[ATTR0:[0-9]+]] {
10-
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 8
10+
; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[P]], align 8
11+
; CHECK-NEXT: [[P1:%.*]] = getelementptr i64, ptr [[P]], i64 1
12+
; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[P1]], align 8
1113
; CHECK-NEXT: br i1 [[C]], label %[[FOO:.*]], label %[[BAR:.*]]
1214
; CHECK: [[FOO]]:
1315
; CHECK-NEXT: call void @g()
@@ -20,7 +22,9 @@ define void @f0(i1 %c, ptr %p, ptr %q) {
2022
; CHECK-NEXT: call void @g()
2123
; CHECK-NEXT: br label %[[BAZ]]
2224
; CHECK: [[BAZ]]:
23-
; CHECK-NEXT: store <2 x i64> [[TMP1]], ptr [[Q]], align 8
25+
; CHECK-NEXT: store i64 [[X0]], ptr [[Q]], align 8
26+
; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[Q]], i64 1
27+
; CHECK-NEXT: store i64 [[X1]], ptr [[Q1]], align 8
2428
; CHECK-NEXT: ret void
2529
;
2630
%x0 = load i64, ptr %p
@@ -50,19 +54,25 @@ define void @f1(i1 %c, ptr %p, ptr %q, ptr %r) {
5054
; CHECK-LABEL: define void @f1(
5155
; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[R:%.*]]) #[[ATTR0]] {
5256
; CHECK-NEXT: [[ENTRY:.*:]]
53-
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[P]], align 8
57+
; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[P]], align 8
58+
; CHECK-NEXT: [[P1:%.*]] = getelementptr i64, ptr [[P]], i64 1
59+
; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[P1]], align 8
5460
; CHECK-NEXT: br i1 [[C]], label %[[FOO:.*]], label %[[BAR:.*]]
5561
; CHECK: [[FOO]]:
56-
; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[TMP0]], splat (i64 1)
62+
; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1
63+
; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 1
5764
; CHECK-NEXT: br label %[[BAZ:.*]]
5865
; CHECK: [[BAR]]:
5966
; CHECK-NEXT: call void @g()
6067
; CHECK-NEXT: call void @g()
6168
; CHECK-NEXT: call void @g()
6269
; CHECK-NEXT: br label %[[BAZ]]
6370
; CHECK: [[BAZ]]:
64-
; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x i64> [ [[TMP1]], %[[FOO]] ], [ [[TMP0]], %[[BAR]] ]
65-
; CHECK-NEXT: store <2 x i64> [[TMP2]], ptr [[Q]], align 8
71+
; CHECK-NEXT: [[PHI0:%.*]] = phi i64 [ [[Y0]], %[[FOO]] ], [ [[X0]], %[[BAR]] ]
72+
; CHECK-NEXT: [[PHI1:%.*]] = phi i64 [ [[Y1]], %[[FOO]] ], [ [[X1]], %[[BAR]] ]
73+
; CHECK-NEXT: store i64 [[PHI0]], ptr [[Q]], align 8
74+
; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[Q]], i64 1
75+
; CHECK-NEXT: store i64 [[PHI1]], ptr [[Q1]], align 8
6676
; CHECK-NEXT: ret void
6777
;
6878
entry:

0 commit comments

Comments
 (0)