Skip to content

Commit e0dfe4c

Browse files
committed
[𝘀𝗽𝗿] initial version
Created using spr 1.3.5
1 parent 69effe0 commit e0dfe4c

File tree

3 files changed

+201
-97
lines changed

3 files changed

+201
-97
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 182 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -12448,109 +12448,201 @@ InstructionCost BoUpSLP::getSpillCost() {
1244812448
// (for example, if spills and fills are required).
1244912449
InstructionCost Cost = 0;
1245012450

12451-
SmallPtrSet<const TreeEntry *, 4> LiveEntries;
12452-
const TreeEntry *Prev = nullptr;
12453-
12454-
// The entries in VectorizableTree are not necessarily ordered by their
12455-
// position in basic blocks. Collect them and order them by dominance so later
12456-
// instructions are guaranteed to be visited first. For instructions in
12457-
// different basic blocks, we only scan to the beginning of the block, so
12458-
// their order does not matter, as long as all instructions in a basic block
12459-
// are grouped together. Using dominance ensures a deterministic order.
12460-
SmallVector<TreeEntry *, 16> OrderedEntries;
12461-
for (const auto &TEPtr : VectorizableTree) {
12462-
if (TEPtr->isGather())
12463-
continue;
12464-
OrderedEntries.push_back(TEPtr.get());
12465-
}
12466-
llvm::stable_sort(OrderedEntries, [&](const TreeEntry *TA,
12467-
const TreeEntry *TB) {
12468-
Instruction &A = getLastInstructionInBundle(TA);
12469-
Instruction &B = getLastInstructionInBundle(TB);
12470-
auto *NodeA = DT->getNode(A.getParent());
12471-
auto *NodeB = DT->getNode(B.getParent());
12472-
assert(NodeA && "Should only process reachable instructions");
12473-
assert(NodeB && "Should only process reachable instructions");
12474-
assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
12475-
"Different nodes should have different DFS numbers");
12476-
if (NodeA != NodeB)
12477-
return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
12478-
return B.comesBefore(&A);
12479-
});
12480-
12481-
for (const TreeEntry *TE : OrderedEntries) {
12482-
if (!Prev) {
12483-
Prev = TE;
12484-
continue;
12485-
}
12451+
const TreeEntry *Root = VectorizableTree.front().get();
12452+
if (Root->isGather())
12453+
return Cost;
1248612454

12487-
LiveEntries.erase(Prev);
12488-
for (unsigned I : seq<unsigned>(Prev->getNumOperands())) {
12489-
const TreeEntry *Op = getVectorizedOperand(Prev, I);
12490-
if (!Op)
12491-
continue;
12492-
assert(!Op->isGather() && "Expected vectorized operand.");
12493-
LiveEntries.insert(Op);
12455+
SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
12456+
EntriesToOperands;
12457+
SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
12458+
SmallPtrSet<const Instruction *, 8> LastInstructions;
12459+
for (const auto &TEPtr : VectorizableTree) {
12460+
if (!TEPtr->isGather()) {
12461+
Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
12462+
EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
12463+
LastInstructions.insert(LastInst);
1249412464
}
12465+
if (TEPtr->UserTreeIndex)
12466+
EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
12467+
}
1249512468

12496-
LLVM_DEBUG({
12497-
dbgs() << "SLP: #LV: " << LiveEntries.size();
12498-
for (auto *X : LiveEntries)
12499-
X->dump();
12500-
dbgs() << ", Looking at ";
12501-
TE->dump();
12502-
});
12503-
12504-
// Now find the sequence of instructions between PrevInst and Inst.
12505-
unsigned NumCalls = 0;
12506-
const Instruction *PrevInst = &getLastInstructionInBundle(Prev);
12507-
BasicBlock::const_reverse_iterator
12508-
InstIt = ++getLastInstructionInBundle(TE).getIterator().getReverse(),
12509-
PrevInstIt = PrevInst->getIterator().getReverse();
12510-
while (InstIt != PrevInstIt) {
12511-
if (PrevInstIt == PrevInst->getParent()->rend()) {
12512-
PrevInstIt = getLastInstructionInBundle(TE).getParent()->rbegin();
12513-
continue;
12514-
}
12515-
12516-
auto NoCallIntrinsic = [this](const Instruction *I) {
12517-
const auto *II = dyn_cast<IntrinsicInst>(I);
12518-
if (!II)
12519-
return false;
12520-
if (II->isAssumeLikeIntrinsic())
12521-
return true;
12522-
IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
12523-
InstructionCost IntrCost =
12524-
TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
12525-
InstructionCost CallCost =
12526-
TTI->getCallInstrCost(nullptr, II->getType(), ICA.getArgTypes(),
12527-
TTI::TCK_RecipThroughput);
12528-
return IntrCost < CallCost;
12529-
};
12469+
auto NoCallIntrinsic = [this](const Instruction *I) {
12470+
const auto *II = dyn_cast<IntrinsicInst>(I);
12471+
if (!II)
12472+
return false;
12473+
if (II->isAssumeLikeIntrinsic())
12474+
return true;
12475+
IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
12476+
InstructionCost IntrCost =
12477+
TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
12478+
InstructionCost CallCost = TTI->getCallInstrCost(
12479+
nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
12480+
return IntrCost < CallCost;
12481+
};
1253012482

12483+
SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
12484+
CheckedInstructions;
12485+
unsigned Budget = 0;
12486+
const unsigned BudgetLimit =
12487+
ScheduleRegionSizeBudget / VectorizableTree.size();
12488+
auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
12489+
Instruction *Last) {
12490+
assert(First->getParent() == Last->getParent() &&
12491+
"Expected instructions in same block.");
12492+
if (Last == First || Last->comesBefore(First))
12493+
return true;
12494+
BasicBlock::const_reverse_iterator InstIt =
12495+
++First->getIterator().getReverse(),
12496+
PrevInstIt =
12497+
Last->getIterator().getReverse();
12498+
auto It = CheckedInstructions.find(Last);
12499+
if (It != CheckedInstructions.end()) {
12500+
const Instruction *Checked = It->second.getPointer();
12501+
if (Checked == First || Checked->comesBefore(First))
12502+
return It->second.getInt() != 0;
12503+
PrevInstIt = Checked->getIterator().getReverse();
12504+
}
12505+
SmallVector<const Instruction *> LastInstsInRange(1, Last);
12506+
while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
1253112507
// Debug information does not impact spill cost.
1253212508
// Vectorized calls, represented as vector intrinsics, do not impact spill
1253312509
// cost.
1253412510
if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
12535-
CB && !NoCallIntrinsic(CB) && !isVectorized(CB))
12536-
NumCalls++;
12511+
CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
12512+
for (const Instruction *LastInst : LastInstsInRange)
12513+
CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
12514+
return false;
12515+
}
12516+
if (LastInstructions.contains(&*PrevInstIt))
12517+
LastInstsInRange.push_back(&*PrevInstIt);
1253712518

1253812519
++PrevInstIt;
12520+
++Budget;
1253912521
}
12540-
12541-
if (NumCalls) {
12542-
SmallVector<Type *, 4> EntriesTypes;
12543-
for (const TreeEntry *TE : LiveEntries) {
12544-
auto *ScalarTy = TE->getMainOp()->getType();
12545-
auto It = MinBWs.find(TE);
12546-
if (It != MinBWs.end())
12547-
ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
12548-
EntriesTypes.push_back(getWidenedType(ScalarTy, TE->getVectorFactor()));
12522+
for (const Instruction *LastInst : LastInstsInRange)
12523+
CheckedInstructions.try_emplace(
12524+
LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
12525+
Budget <= BudgetLimit ? 1 : 0);
12526+
return Budget <= BudgetLimit;
12527+
};
12528+
auto AddCosts = [&](const TreeEntry *Op) {
12529+
Type *ScalarTy = Op->Scalars.front()->getType();
12530+
auto It = MinBWs.find(Op);
12531+
if (It != MinBWs.end())
12532+
ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
12533+
auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
12534+
Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
12535+
if (ScalarTy->isVectorTy()) {
12536+
// Handle revec dead vector instructions.
12537+
Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
12538+
}
12539+
};
12540+
SmallDenseMap<const BasicBlock *, bool> BlocksToCalls;
12541+
auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
12542+
BasicBlock *OpParent) {
12543+
SmallVector<BasicBlock *> Worklist;
12544+
if (Pred)
12545+
Worklist.push_back(Pred);
12546+
else
12547+
Worklist.append(pred_begin(Root), pred_end(Root));
12548+
SmallPtrSet<const BasicBlock *, 16> Visited;
12549+
while (!Worklist.empty()) {
12550+
BasicBlock *BB = Worklist.pop_back_val();
12551+
if (BB == OpParent || !Visited.insert(BB).second)
12552+
continue;
12553+
if (auto It = BlocksToCalls.find(BB); It != BlocksToCalls.end()) {
12554+
Worklist.append(pred_begin(BB), pred_end(BB));
12555+
if (!It->second)
12556+
return false;
12557+
continue;
12558+
}
12559+
BlocksToCalls[BB] = false;
12560+
if (BB->sizeWithoutDebug() > ScheduleRegionSizeBudget)
12561+
return false;
12562+
Budget += BB->sizeWithoutDebug();
12563+
if (Budget > BudgetLimit)
12564+
return false;
12565+
if (!CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
12566+
BB->getTerminator()))
12567+
return false;
12568+
BlocksToCalls[BB] = true;
12569+
Worklist.append(pred_begin(BB), pred_end(BB));
12570+
}
12571+
return true;
12572+
};
12573+
SmallVector<const TreeEntry *> LiveEntries(1, Root);
12574+
while (!LiveEntries.empty()) {
12575+
const TreeEntry *Entry = LiveEntries.pop_back_val();
12576+
SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
12577+
if (Operands.empty())
12578+
continue;
12579+
Instruction *LastInst = EntriesToLastInstruction.at(Entry);
12580+
for (const TreeEntry *Op : Operands) {
12581+
if (!Op->isGather())
12582+
LiveEntries.push_back(Op);
12583+
BasicBlock *Parent = Entry->getMainOp()->getParent();
12584+
if ((Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
12585+
(Op->isGather() && allConstant(Op->Scalars)))
12586+
continue;
12587+
Budget = 0;
12588+
BasicBlock *Pred = Entry->getOpcode() == Instruction::PHI
12589+
? cast<PHINode>(Entry->getMainOp())
12590+
->getIncomingBlock(Op->UserTreeIndex.EdgeIdx)
12591+
: nullptr;
12592+
BasicBlock *OpParent;
12593+
Instruction *OpLastInst;
12594+
if (Op->isGather()) {
12595+
assert(Entry->getOpcode() == Instruction::PHI &&
12596+
"Expected phi node only.");
12597+
OpParent = cast<PHINode>(Entry->getMainOp())
12598+
->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
12599+
OpLastInst = OpParent->getTerminator();
12600+
for (Value *V : Op->Scalars) {
12601+
auto *Inst = dyn_cast<Instruction>(V);
12602+
if (!Inst)
12603+
continue;
12604+
if (isVectorized(V)) {
12605+
OpParent = Inst->getParent();
12606+
OpLastInst = Inst;
12607+
break;
12608+
}
12609+
}
12610+
} else {
12611+
OpLastInst = EntriesToLastInstruction.at(Op);
12612+
OpParent = Op->getMainOp()->getParent();
12613+
}
12614+
// Check for call instructions within the same basic block.
12615+
if (OpParent == Parent) {
12616+
if (Entry->getOpcode() == Instruction::PHI) {
12617+
if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
12618+
AddCosts(Op);
12619+
continue;
12620+
}
12621+
if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
12622+
AddCosts(Op);
12623+
continue;
12624+
}
12625+
// Check for call instructions across the blocks between the operand and the entry.
12626+
// 1. Check the entry's block, from its last instruction back to the head of the block.
12627+
if (Entry->getOpcode() != Instruction::PHI &&
12628+
!CheckForNonVecCallsInSameBlock(
12629+
&*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
12630+
LastInst)) {
12631+
AddCosts(Op);
12632+
continue;
12633+
}
12634+
// 2. Check the operand's block, from the operand's last instruction to the terminator.
12635+
if (!CheckForNonVecCallsInSameBlock(OpLastInst,
12636+
OpParent->getTerminator())) {
12637+
AddCosts(Op);
12638+
continue;
12639+
}
12640+
// 3. Check the predecessors of the entry's block, up to the operand's block.
12641+
if (!CheckPredecessors(Parent, Pred, OpParent)) {
12642+
AddCosts(Op);
12643+
continue;
1254912644
}
12550-
Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(EntriesTypes);
1255112645
}
12552-
12553-
Prev = TE;
1255412646
}
1255512647

1255612648
return Cost;

llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1740,7 +1740,9 @@ entry:
17401740
define void @f(i1 %c, ptr %p, ptr %q, ptr %r) {
17411741
; CHECK-LABEL: define void @f
17421742
; CHECK-SAME: (i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[R:%.*]]) #[[ATTR1]] {
1743-
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 8
1743+
; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[P]], align 8
1744+
; CHECK-NEXT: [[P1:%.*]] = getelementptr i64, ptr [[P]], i64 1
1745+
; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[P1]], align 8
17441746
; CHECK-NEXT: br i1 [[C]], label [[FOO:%.*]], label [[BAR:%.*]]
17451747
; CHECK: foo:
17461748
; CHECK-NEXT: [[Y0:%.*]] = load float, ptr [[R]], align 4
@@ -1751,12 +1753,16 @@ define void @f(i1 %c, ptr %p, ptr %q, ptr %r) {
17511753
; CHECK-NEXT: [[Z1:%.*]] = call float @fabsf(float [[Z0]])
17521754
; CHECK-NEXT: br label [[BAZ]]
17531755
; CHECK: baz:
1754-
; CHECK-NEXT: store <2 x i64> [[TMP1]], ptr [[Q]], align 8
1756+
; CHECK-NEXT: store i64 [[X0]], ptr [[Q]], align 8
1757+
; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[Q]], i64 1
1758+
; CHECK-NEXT: store i64 [[X1]], ptr [[Q1]], align 8
17551759
; CHECK-NEXT: ret void
17561760
;
17571761
; DEFAULT-LABEL: define void @f
17581762
; DEFAULT-SAME: (i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[R:%.*]]) #[[ATTR1]] {
1759-
; DEFAULT-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 8
1763+
; DEFAULT-NEXT: [[X0:%.*]] = load i64, ptr [[P]], align 8
1764+
; DEFAULT-NEXT: [[P1:%.*]] = getelementptr i64, ptr [[P]], i64 1
1765+
; DEFAULT-NEXT: [[X1:%.*]] = load i64, ptr [[P1]], align 8
17601766
; DEFAULT-NEXT: br i1 [[C]], label [[FOO:%.*]], label [[BAR:%.*]]
17611767
; DEFAULT: foo:
17621768
; DEFAULT-NEXT: [[Y0:%.*]] = load float, ptr [[R]], align 4
@@ -1767,7 +1773,9 @@ define void @f(i1 %c, ptr %p, ptr %q, ptr %r) {
17671773
; DEFAULT-NEXT: [[Z1:%.*]] = call float @fabsf(float [[Z0]])
17681774
; DEFAULT-NEXT: br label [[BAZ]]
17691775
; DEFAULT: baz:
1770-
; DEFAULT-NEXT: store <2 x i64> [[TMP1]], ptr [[Q]], align 8
1776+
; DEFAULT-NEXT: store i64 [[X0]], ptr [[Q]], align 8
1777+
; DEFAULT-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[Q]], i64 1
1778+
; DEFAULT-NEXT: store i64 [[X1]], ptr [[Q1]], align 8
17711779
; DEFAULT-NEXT: ret void
17721780
;
17731781
%x0 = load i64, ptr %p

llvm/test/Transforms/SLPVectorizer/RISCV/spillcost.ll

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@ declare void @g()
77
define void @f0(i1 %c, ptr %p, ptr %q) {
88
; CHECK-LABEL: define void @f0(
99
; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]]) #[[ATTR0:[0-9]+]] {
10-
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 8
10+
; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[P]], align 8
11+
; CHECK-NEXT: [[P1:%.*]] = getelementptr i64, ptr [[P]], i64 1
12+
; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[P1]], align 8
1113
; CHECK-NEXT: br i1 [[C]], label %[[FOO:.*]], label %[[BAR:.*]]
1214
; CHECK: [[FOO]]:
1315
; CHECK-NEXT: call void @g()
@@ -20,7 +22,9 @@ define void @f0(i1 %c, ptr %p, ptr %q) {
2022
; CHECK-NEXT: call void @g()
2123
; CHECK-NEXT: br label %[[BAZ]]
2224
; CHECK: [[BAZ]]:
23-
; CHECK-NEXT: store <2 x i64> [[TMP1]], ptr [[Q]], align 8
25+
; CHECK-NEXT: store i64 [[X0]], ptr [[Q]], align 8
26+
; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[Q]], i64 1
27+
; CHECK-NEXT: store i64 [[X1]], ptr [[Q1]], align 8
2428
; CHECK-NEXT: ret void
2529
;
2630
%x0 = load i64, ptr %p
@@ -45,7 +49,7 @@ baz:
4549
ret void
4650
}
4751

48-
; Shouldn't be vectorized
52+
; Should be vectorized - just one spill of TMP0
4953
define void @f1(i1 %c, ptr %p, ptr %q, ptr %r) {
5054
; CHECK-LABEL: define void @f1(
5155
; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[R:%.*]]) #[[ATTR0]] {

0 commit comments

Comments
 (0)