Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
272 changes: 182 additions & 90 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12448,109 +12448,201 @@ InstructionCost BoUpSLP::getSpillCost() {
// (for example, if spills and fills are required).
InstructionCost Cost = 0;

SmallPtrSet<const TreeEntry *, 4> LiveEntries;
const TreeEntry *Prev = nullptr;

// The entries in VectorizableTree are not necessarily ordered by their
// position in basic blocks. Collect them and order them by dominance so later
// instructions are guaranteed to be visited first. For instructions in
// different basic blocks, we only scan to the beginning of the block, so
// their order does not matter, as long as all instructions in a basic block
// are grouped together. Using dominance ensures a deterministic order.
SmallVector<TreeEntry *, 16> OrderedEntries;
for (const auto &TEPtr : VectorizableTree) {
if (TEPtr->isGather())
continue;
OrderedEntries.push_back(TEPtr.get());
}
llvm::stable_sort(OrderedEntries, [&](const TreeEntry *TA,
const TreeEntry *TB) {
Instruction &A = getLastInstructionInBundle(TA);
Instruction &B = getLastInstructionInBundle(TB);
auto *NodeA = DT->getNode(A.getParent());
auto *NodeB = DT->getNode(B.getParent());
assert(NodeA && "Should only process reachable instructions");
assert(NodeB && "Should only process reachable instructions");
assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
"Different nodes should have different DFS numbers");
if (NodeA != NodeB)
return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
return B.comesBefore(&A);
});

for (const TreeEntry *TE : OrderedEntries) {
if (!Prev) {
Prev = TE;
continue;
}
const TreeEntry *Root = VectorizableTree.front().get();
if (Root->isGather())
return Cost;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just return 0 here.


LiveEntries.erase(Prev);
for (unsigned I : seq<unsigned>(Prev->getNumOperands())) {
const TreeEntry *Op = getVectorizedOperand(Prev, I);
if (!Op)
continue;
assert(!Op->isGather() && "Expected vectorized operand.");
LiveEntries.insert(Op);
SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
EntriesToOperands;
SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
SmallPtrSet<const Instruction *, 8> LastInstructions;
for (const auto &TEPtr : VectorizableTree) {
if (!TEPtr->isGather()) {
Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
LastInstructions.insert(LastInst);
}
if (TEPtr->UserTreeIndex)
EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
}

LLVM_DEBUG({
dbgs() << "SLP: #LV: " << LiveEntries.size();
for (auto *X : LiveEntries)
X->dump();
dbgs() << ", Looking at ";
TE->dump();
});

// Now find the sequence of instructions between PrevInst and Inst.
unsigned NumCalls = 0;
const Instruction *PrevInst = &getLastInstructionInBundle(Prev);
BasicBlock::const_reverse_iterator
InstIt = ++getLastInstructionInBundle(TE).getIterator().getReverse(),
PrevInstIt = PrevInst->getIterator().getReverse();
while (InstIt != PrevInstIt) {
if (PrevInstIt == PrevInst->getParent()->rend()) {
PrevInstIt = getLastInstructionInBundle(TE).getParent()->rbegin();
continue;
}

auto NoCallIntrinsic = [this](const Instruction *I) {
const auto *II = dyn_cast<IntrinsicInst>(I);
if (!II)
return false;
if (II->isAssumeLikeIntrinsic())
return true;
IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
InstructionCost IntrCost =
TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
InstructionCost CallCost =
TTI->getCallInstrCost(nullptr, II->getType(), ICA.getArgTypes(),
TTI::TCK_RecipThroughput);
return IntrCost < CallCost;
};
auto NoCallIntrinsic = [this](const Instruction *I) {
const auto *II = dyn_cast<IntrinsicInst>(I);
if (!II)
return false;
if (II->isAssumeLikeIntrinsic())
return true;
IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
InstructionCost IntrCost =
TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
InstructionCost CallCost = TTI->getCallInstrCost(
nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
return IntrCost < CallCost;
};

SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a comment or two explaining the CheckedInstructions scheme? I'm not following this from the code structure. It seems like you're tying the key to the LastInstructions above, but I'm not getting why? It seems like you should be able to just remember a range of instructions which are already scanned, and bypass them?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it means the range. But the range won't work good, e.g. if you have entry with several operands. We need to memoize the "upper-most" operand instructions for the current entry to skip the analysis for the previously analyzed instructions (which we analyzed already during previous analysis of the operands) and not repeat it again. Range won't allow doing this, because for the second operand (which is First) will be different for different operands

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see how this response relates to my comment.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll try to rephrase. It exactly remembers the range. But the range is represented as last instruction of the entry + topmost instruction checked (topmost operand last instruction). During the analysis, if it sees other entries (instructions, which are last instruction from other entries), it adds the same info for other entries in this map. It allows to avoid using of comesBefore, when checking if the range is analyzed already, and instead just perform a simple lookup in the CheckedInstructions map to skip the part of the instructions, which was analyzed already.

CheckedInstructions;
unsigned Budget = 0;
const unsigned BudgetLimit =
ScheduleRegionSizeBudget / VectorizableTree.size();
auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
Instruction *Last) {
assert(First->getParent() == Last->getParent() &&
"Expected instructions in same block.");
if (Last == First || Last->comesBefore(First))
return true;
BasicBlock::const_reverse_iterator InstIt =
++First->getIterator().getReverse(),
PrevInstIt =
Last->getIterator().getReverse();
auto It = CheckedInstructions.find(Last);
if (It != CheckedInstructions.end()) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that if I'm reading this right, you can move the CheckedInstructions logic up, and common the two
if (Checked == First || Checked->comesBefore(First))
return It->second.getInt() != 0;

cases into one, and then get the reversed iterators.

const Instruction *Checked = It->second.getPointer();
if (Checked == First || Checked->comesBefore(First))
return It->second.getInt() != 0;
PrevInstIt = Checked->getIterator().getReverse();
}
SmallVector<const Instruction *> LastInstsInRange;
while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
// Debug information does not impact spill cost.
// Vectorized calls, represented as vector intrinsics, do not impact spill
// cost.
if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
CB && !NoCallIntrinsic(CB) && !isVectorized(CB))
NumCalls++;
CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
for (const Instruction *LastInst : LastInstsInRange)
CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
return false;
}
if (LastInstructions.contains(&*PrevInstIt))
LastInstsInRange.push_back(&*PrevInstIt);

++PrevInstIt;
++Budget;
}

if (NumCalls) {
SmallVector<Type *, 4> EntriesTypes;
for (const TreeEntry *TE : LiveEntries) {
auto *ScalarTy = TE->getMainOp()->getType();
auto It = MinBWs.find(TE);
if (It != MinBWs.end())
ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
EntriesTypes.push_back(getWidenedType(ScalarTy, TE->getVectorFactor()));
for (const Instruction *LastInst : LastInstsInRange)
CheckedInstructions.try_emplace(
LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
Budget <= BudgetLimit ? 1 : 0);
return Budget <= BudgetLimit;
};
auto AddCosts = [&](const TreeEntry *Op) {
Type *ScalarTy = Op->Scalars.front()->getType();
auto It = MinBWs.find(Op);
if (It != MinBWs.end())
ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
if (ScalarTy->isVectorTy()) {
// Handle revec dead vector instructions.
Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
}
};
SmallDenseMap<const BasicBlock *, bool> BlocksToCalls;
auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
BasicBlock *OpParent) {
SmallVector<BasicBlock *> Worklist;
if (Pred)
Worklist.push_back(Pred);
else
Worklist.append(pred_begin(Root), pred_end(Root));
SmallPtrSet<const BasicBlock *, 16> Visited;
while (!Worklist.empty()) {
BasicBlock *BB = Worklist.pop_back_val();
if (BB == OpParent || !Visited.insert(BB).second)
continue;
if (auto It = BlocksToCalls.find(BB); It != BlocksToCalls.end()) {
Worklist.append(pred_begin(BB), pred_end(BB));
if (!It->second)
return false;
continue;
}
BlocksToCalls[BB] = false;
if (BB->sizeWithoutDebug() > ScheduleRegionSizeBudget)
return false;
Budget += BB->sizeWithoutDebug();
if (Budget > BudgetLimit)
return false;
if (!CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
BB->getTerminator()))
return false;
BlocksToCalls[BB] = true;
Worklist.append(pred_begin(BB), pred_end(BB));
}
return true;
};
SmallVector<const TreeEntry *> LiveEntries(1, Root);
while (!LiveEntries.empty()) {
const TreeEntry *Entry = LiveEntries.pop_back_val();
SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
if (Operands.empty())
continue;
Instruction *LastInst = EntriesToLastInstruction.at(Entry);
for (const TreeEntry *Op : Operands) {
if (!Op->isGather())
LiveEntries.push_back(Op);
BasicBlock *Parent = Entry->getMainOp()->getParent();
if ((Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
(Op->isGather() && allConstant(Op->Scalars)))
continue;
Budget = 0;
BasicBlock *Pred = Entry->getOpcode() == Instruction::PHI
? cast<PHINode>(Entry->getMainOp())
->getIncomingBlock(Op->UserTreeIndex.EdgeIdx)
: nullptr;
BasicBlock *OpParent;
Instruction *OpLastInst;
if (Op->isGather()) {
assert(Entry->getOpcode() == Instruction::PHI &&
"Expected phi node only.");
OpParent = cast<PHINode>(Entry->getMainOp())
->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
OpLastInst = OpParent->getTerminator();
for (Value *V : Op->Scalars) {
auto *Inst = dyn_cast<Instruction>(V);
if (!Inst)
continue;
if (isVectorized(V)) {
OpParent = Inst->getParent();
OpLastInst = Inst;
break;
}
}
} else {
OpLastInst = EntriesToLastInstruction.at(Op);
OpParent = OpLastInst->getParent();
}
// Check the call instructions within the same basic blocks.
if (OpParent == Parent) {
if (Entry->getOpcode() == Instruction::PHI) {
if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
AddCosts(Op);
continue;
}
if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
AddCosts(Op);
continue;
}
// Check for call instruction in between blocks.
// 1. Check entry's block to the head.
if (Entry->getOpcode() != Instruction::PHI &&
!CheckForNonVecCallsInSameBlock(
&*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
LastInst)) {
AddCosts(Op);
continue;
}
// 2. Check op's block from the end.
if (!CheckForNonVecCallsInSameBlock(OpLastInst,
OpParent->getTerminator())) {
AddCosts(Op);
continue;
}
// 3. Check the predecessors of entry's block till op's block.
if (!CheckPredecessors(Parent, Pred, OpParent)) {
AddCosts(Op);
continue;
}
Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(EntriesTypes);
}

Prev = TE;
}

return Cost;
Expand Down
16 changes: 12 additions & 4 deletions llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1740,7 +1740,9 @@ entry:
define void @f(i1 %c, ptr %p, ptr %q, ptr %r) {
; CHECK-LABEL: define void @f
; CHECK-SAME: (i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[R:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 8
; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[P]], align 8
; CHECK-NEXT: [[P1:%.*]] = getelementptr i64, ptr [[P]], i64 1
; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[P1]], align 8
; CHECK-NEXT: br i1 [[C]], label [[FOO:%.*]], label [[BAR:%.*]]
; CHECK: foo:
; CHECK-NEXT: [[Y0:%.*]] = load float, ptr [[R]], align 4
Expand All @@ -1751,12 +1753,16 @@ define void @f(i1 %c, ptr %p, ptr %q, ptr %r) {
; CHECK-NEXT: [[Z1:%.*]] = call float @fabsf(float [[Z0]])
; CHECK-NEXT: br label [[BAZ]]
; CHECK: baz:
; CHECK-NEXT: store <2 x i64> [[TMP1]], ptr [[Q]], align 8
; CHECK-NEXT: store i64 [[X0]], ptr [[Q]], align 8
; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[Q]], i64 1
; CHECK-NEXT: store i64 [[X1]], ptr [[Q1]], align 8
; CHECK-NEXT: ret void
;
; DEFAULT-LABEL: define void @f
; DEFAULT-SAME: (i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[R:%.*]]) #[[ATTR1]] {
; DEFAULT-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 8
; DEFAULT-NEXT: [[X0:%.*]] = load i64, ptr [[P]], align 8
; DEFAULT-NEXT: [[P1:%.*]] = getelementptr i64, ptr [[P]], i64 1
; DEFAULT-NEXT: [[X1:%.*]] = load i64, ptr [[P1]], align 8
; DEFAULT-NEXT: br i1 [[C]], label [[FOO:%.*]], label [[BAR:%.*]]
; DEFAULT: foo:
; DEFAULT-NEXT: [[Y0:%.*]] = load float, ptr [[R]], align 4
Expand All @@ -1767,7 +1773,9 @@ define void @f(i1 %c, ptr %p, ptr %q, ptr %r) {
; DEFAULT-NEXT: [[Z1:%.*]] = call float @fabsf(float [[Z0]])
; DEFAULT-NEXT: br label [[BAZ]]
; DEFAULT: baz:
; DEFAULT-NEXT: store <2 x i64> [[TMP1]], ptr [[Q]], align 8
; DEFAULT-NEXT: store i64 [[X0]], ptr [[Q]], align 8
; DEFAULT-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[Q]], i64 1
; DEFAULT-NEXT: store i64 [[X1]], ptr [[Q1]], align 8
; DEFAULT-NEXT: ret void
;
%x0 = load i64, ptr %p
Expand Down
10 changes: 7 additions & 3 deletions llvm/test/Transforms/SLPVectorizer/RISCV/spillcost.ll
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ declare void @g()
define void @f0(i1 %c, ptr %p, ptr %q) {
; CHECK-LABEL: define void @f0(
; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 8
; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[P]], align 8
; CHECK-NEXT: [[P1:%.*]] = getelementptr i64, ptr [[P]], i64 1
; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[P1]], align 8
; CHECK-NEXT: br i1 [[C]], label %[[FOO:.*]], label %[[BAR:.*]]
; CHECK: [[FOO]]:
; CHECK-NEXT: call void @g()
Expand All @@ -20,7 +22,9 @@ define void @f0(i1 %c, ptr %p, ptr %q) {
; CHECK-NEXT: call void @g()
; CHECK-NEXT: br label %[[BAZ]]
; CHECK: [[BAZ]]:
; CHECK-NEXT: store <2 x i64> [[TMP1]], ptr [[Q]], align 8
; CHECK-NEXT: store i64 [[X0]], ptr [[Q]], align 8
; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[Q]], i64 1
; CHECK-NEXT: store i64 [[X1]], ptr [[Q1]], align 8
; CHECK-NEXT: ret void
;
%x0 = load i64, ptr %p
Expand All @@ -45,7 +49,7 @@ baz:
ret void
}

; Shouldn't be vectorized
; Should be vectorized - just one spill of TMP0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for fixing the cost over multiple calls. I think this test was supposed to test diamond shaped control flow where the block with the calls wasn't in the tree though. Can we add more tree entires in entry + foo that are used in baz to trigger the cost?

define void @f1(i1 %c, ptr %p, ptr %q, ptr %r) {
; CHECK-LABEL: define void @f1(
; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[R:%.*]]) #[[ATTR0]] {
Expand Down