Skip to content
Closed

qq #41

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
309 changes: 215 additions & 94 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12449,111 +12449,224 @@ InstructionCost BoUpSLP::getSpillCost() {
// live. When we see a call instruction that is not part of our tree,
// query TTI to see if there is a cost to keeping values live over it
// (for example, if spills and fills are required).
InstructionCost Cost = 0;

SmallPtrSet<const TreeEntry *, 4> LiveEntries;
const TreeEntry *Prev = nullptr;
const TreeEntry *Root = VectorizableTree.front().get();
if (Root->isGather())
return 0;

// The entries in VectorizableTree are not necessarily ordered by their
// position in basic blocks. Collect them and order them by dominance so later
// instructions are guaranteed to be visited first. For instructions in
// different basic blocks, we only scan to the beginning of the block, so
// their order does not matter, as long as all instructions in a basic block
// are grouped together. Using dominance ensures a deterministic order.
SmallVector<TreeEntry *, 16> OrderedEntries;
InstructionCost Cost = 0;
SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
EntriesToOperands;
SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
SmallPtrSet<const Instruction *, 8> LastInstructions;
for (const auto &TEPtr : VectorizableTree) {
if (TEPtr->isGather())
continue;
OrderedEntries.push_back(TEPtr.get());
}
llvm::stable_sort(OrderedEntries, [&](const TreeEntry *TA,
const TreeEntry *TB) {
Instruction &A = getLastInstructionInBundle(TA);
Instruction &B = getLastInstructionInBundle(TB);
auto *NodeA = DT->getNode(A.getParent());
auto *NodeB = DT->getNode(B.getParent());
assert(NodeA && "Should only process reachable instructions");
assert(NodeB && "Should only process reachable instructions");
assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
"Different nodes should have different DFS numbers");
if (NodeA != NodeB)
return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
return B.comesBefore(&A);
});

for (const TreeEntry *TE : OrderedEntries) {
if (!Prev) {
Prev = TE;
continue;
}

LiveEntries.erase(Prev);
for (unsigned I : seq<unsigned>(Prev->getNumOperands())) {
const TreeEntry *Op = getVectorizedOperand(Prev, I);
if (!Op)
continue;
assert(!Op->isGather() && "Expected vectorized operand.");
LiveEntries.insert(Op);
if (!TEPtr->isGather()) {
Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
LastInstructions.insert(LastInst);
}
if (TEPtr->UserTreeIndex)
EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
}

LLVM_DEBUG({
dbgs() << "SLP: #LV: " << LiveEntries.size();
for (auto *X : LiveEntries)
X->dump();
dbgs() << ", Looking at ";
TE->dump();
});

// Now find the sequence of instructions between PrevInst and Inst.
unsigned NumCalls = 0;
const Instruction *PrevInst = &getLastInstructionInBundle(Prev);
BasicBlock::const_reverse_iterator
InstIt = ++getLastInstructionInBundle(TE).getIterator().getReverse(),
PrevInstIt = PrevInst->getIterator().getReverse();
while (InstIt != PrevInstIt) {
if (PrevInstIt == PrevInst->getParent()->rend()) {
PrevInstIt = getLastInstructionInBundle(TE).getParent()->rbegin();
continue;
}

auto NoCallIntrinsic = [this](const Instruction *I) {
const auto *II = dyn_cast<IntrinsicInst>(I);
if (!II)
return false;
if (II->isAssumeLikeIntrinsic())
return true;
IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
InstructionCost IntrCost =
TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
InstructionCost CallCost =
TTI->getCallInstrCost(nullptr, II->getType(), ICA.getArgTypes(),
TTI::TCK_RecipThroughput);
return IntrCost < CallCost;
};
// Predicate over instructions. Yields false for anything that is not an
// IntrinsicInst. For an intrinsic it yields true when the intrinsic is
// assume-like, or when TTI reports its intrinsic cost as strictly lower than
// the cost of a call with the same return and argument types.
auto NoCallIntrinsic = [this](const Instruction *I) {
  if (const auto *Intr = dyn_cast<IntrinsicInst>(I)) {
    if (Intr->isAssumeLikeIntrinsic())
      return true;
    IntrinsicCostAttributes Attrs(Intr->getIntrinsicID(), *Intr);
    // Both costs are queried with the reciprocal-throughput cost kind.
    InstructionCost AsIntrinsic =
        TTI->getIntrinsicInstrCost(Attrs, TTI::TCK_RecipThroughput);
    InstructionCost AsCall =
        TTI->getCallInstrCost(nullptr, Intr->getType(), Attrs.getArgTypes(),
                              TTI::TCK_RecipThroughput);
    return AsIntrinsic < AsCall;
  }
  return false;
};

// Maps the last instruction of an entry to the last instruction of one of its
// operand entries and a flag. If the flag is true, there are no calls in
// between these instructions.
SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
CheckedInstructions;
unsigned Budget = 0;
const unsigned BudgetLimit =
ScheduleRegionSizeBudget / VectorizableTree.size();
auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
const Instruction *Last) {
assert(First->getParent() == Last->getParent() &&
"Expected instructions in same block.");
if (auto It = CheckedInstructions.find(Last);
It != CheckedInstructions.end()) {
const Instruction *Checked = It->second.getPointer();
if (Checked == First || Checked->comesBefore(First))
return It->second.getInt() != 0;
Last = Checked;
} else if (Last == First || Last->comesBefore(First)) {
return true;
}
BasicBlock::const_reverse_iterator InstIt =
++First->getIterator().getReverse(),
PrevInstIt =
Last->getIterator().getReverse();
SmallVector<const Instruction *> LastInstsInRange;
while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
// Debug information does not impact spill cost.
// Vectorized calls, represented as vector intrinsics, do not impact spill
// cost.
if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
CB && !NoCallIntrinsic(CB) && !isVectorized(CB))
NumCalls++;
CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
for (const Instruction *LastInst : LastInstsInRange)
CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
return false;
}
if (LastInstructions.contains(&*PrevInstIt))
LastInstsInRange.push_back(&*PrevInstIt);

++PrevInstIt;
++Budget;
}

if (NumCalls) {
SmallVector<Type *, 4> EntriesTypes;
for (const TreeEntry *TE : LiveEntries) {
auto *ScalarTy = TE->getMainOp()->getType();
auto It = MinBWs.find(TE);
if (It != MinBWs.end())
ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
EntriesTypes.push_back(getWidenedType(ScalarTy, TE->getVectorFactor()));
for (const Instruction *LastInst : LastInstsInRange)
CheckedInstructions.try_emplace(
LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
Budget <= BudgetLimit ? 1 : 0);
return Budget <= BudgetLimit;
};
// Accumulates into Cost the TTI cost of keeping the widened (vector) value of
// \p Op live over a call. The element type is the type of the entry's first
// scalar, unless MinBWs holds a record for the entry, in which case an integer
// type of the recorded bit width is used instead.
auto AddCosts = [&](const TreeEntry *Op) {
  Type *ElemTy = Op->Scalars.front()->getType();
  if (auto MinBWIt = MinBWs.find(Op); MinBWIt != MinBWs.end())
    ElemTy = IntegerType::get(ElemTy->getContext(), MinBWIt->second.first);
  auto *WideTy = getWidenedType(ElemTy, Op->getVectorFactor());
  Cost += TTI->getCostOfKeepingLiveOverCall(WideTy);
  if (!ElemTy->isVectorTy())
    return;
  // Handle revec dead vector instructions: when the "scalars" are vectors
  // themselves, subtract the keep-live cost of each of them.
  Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ElemTy);
};
// Memoize the relationship between blocks, i.e. whether there is at least one
// non-vectorized call between the blocks. This allows skipping repeated
// analysis of the same block paths.
SmallDenseMap<std::pair<const BasicBlock *, const BasicBlock *>, bool>
ParentOpParentToPreds;
// Walks the CFG backwards from \p Root towards \p OpParent and reports whether
// the walk completed without any visited block failing
// CheckForNonVecCallsInSameBlock and without exhausting the budget.
//
// \param Root     Block the walk starts from; only its predecessors are
//                 visited, Root itself is not scanned here.
// \param Pred     If non-null, the single predecessor to start from; otherwise
//                 all predecessors of Root seed the worklist.
// \param OpParent Block at which the backward walk stops; it is never scanned.
// \returns true only if the worklist was drained; false on any early exit
//          (other than a cached hit, which returns the cached value).
//
// Results are memoized in ParentOpParentToPreds, keyed by (block, OpParent).
auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
BasicBlock *OpParent) {
// Fast path: a previous walk already recorded a result for this pair.
auto Key = std::make_pair(Root, OpParent);
if (auto It = ParentOpParentToPreds.find(Key);
It != ParentOpParentToPreds.end())
return It->second;
SmallVector<BasicBlock *> Worklist;
if (Pred)
Worklist.push_back(Pred);
else
Worklist.append(pred_begin(Root), pred_end(Root));
SmallPtrSet<const BasicBlock *, 16> Visited;
// (block, OpParent) pairs scanned during this walk; they are written to the
// memoization map only when the lambda exits.
SmallDenseSet<std::pair<const BasicBlock *, const BasicBlock *>>
ParentsPairsToAdd;
bool Res = false;
// On every exit path below, record the final value of Res (captured by
// reference) for each pair collected during this walk.
auto Cleanup = make_scope_exit([&]() {
for (const auto &KeyPair : ParentsPairsToAdd) {
assert(!ParentOpParentToPreds.contains(KeyPair) &&
"Should not have been added before.");
ParentOpParentToPreds.try_emplace(KeyPair, Res);
}
});
while (!Worklist.empty()) {
BasicBlock *BB = Worklist.pop_back_val();
// Stop at the operand's block and never scan a block twice.
if (BB == OpParent || !Visited.insert(BB).second)
continue;
// A cached result for BB ends the whole walk with that result.
// NOTE(review): when the cached value is true, the remaining worklist
// entries are not examined - confirm this is intended.
if (auto It = ParentOpParentToPreds.find(std::make_pair(BB, OpParent));
It != ParentOpParentToPreds.end()) {
Res = It->second;
return Res;
}
ParentsPairsToAdd.insert(std::make_pair(BB, OpParent));
// Give up (Res is still false here) on blocks larger than the whole
// scheduling-region budget, or once the accumulated Budget exceeds the limit.
unsigned BlockSize = BB->size();
if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
return Res;
Budget += BlockSize;
if (Budget > BudgetLimit)
return Res;
// Scan the block from its first non-PHI/debug/alloca instruction to its
// terminator; a false result ends the walk with false.
if (!CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
BB->getTerminator()))
return Res;
Worklist.append(pred_begin(BB), pred_end(BB));
}
// Worklist drained without any early exit.
Res = true;
return Res;
};
SmallVector<const TreeEntry *> LiveEntries(1, Root);
while (!LiveEntries.empty()) {
const TreeEntry *Entry = LiveEntries.pop_back_val();
SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
if (Operands.empty())
continue;
Instruction *LastInst = EntriesToLastInstruction.at(Entry);
BasicBlock *Parent = LastInst->getParent();
for (const TreeEntry *Op : Operands) {
if (!Op->isGather())
LiveEntries.push_back(Op);
if ((Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
(Op->isGather() && allConstant(Op->Scalars)))
continue;
Budget = 0;
BasicBlock *Pred = Entry->getOpcode() == Instruction::PHI
? cast<PHINode>(Entry->getMainOp())
->getIncomingBlock(Op->UserTreeIndex.EdgeIdx)
: nullptr;
BasicBlock *OpParent;
Instruction *OpLastInst;
if (Op->isGather()) {
assert(Entry->getOpcode() == Instruction::PHI &&
"Expected phi node only.");
OpParent = cast<PHINode>(Entry->getMainOp())
->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
OpLastInst = OpParent->getTerminator();
for (Value *V : Op->Scalars) {
auto *Inst = dyn_cast<Instruction>(V);
if (!Inst)
continue;
if (isVectorized(V)) {
OpParent = Inst->getParent();
OpLastInst = Inst;
break;
}
}
} else {
OpLastInst = EntriesToLastInstruction.at(Op);
OpParent = OpLastInst->getParent();
}
// Check the call instructions within the same basic blocks.
if (OpParent == Parent) {
if (Entry->getOpcode() == Instruction::PHI) {
if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
AddCosts(Op);
continue;
}
if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
AddCosts(Op);
continue;
}
// Check for call instruction in between blocks.
// 1. Check entry's block to the head.
if (Entry->getOpcode() != Instruction::PHI &&
!CheckForNonVecCallsInSameBlock(
&*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
LastInst)) {
AddCosts(Op);
continue;
}
// 2. Check op's block from the end.
if (!CheckForNonVecCallsInSameBlock(OpLastInst,
OpParent->getTerminator())) {
AddCosts(Op);
continue;
}
// 3. Check the predecessors of entry's block till op's block.
if (!CheckPredecessors(Parent, Pred, OpParent)) {
AddCosts(Op);
continue;
}
Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(EntriesTypes);
}

Prev = TE;
}

return Cost;
Expand Down Expand Up @@ -13061,8 +13174,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
}
}

InstructionCost SpillCost = getSpillCost();
Cost += SpillCost + ExtractCost;
Cost += ExtractCost;
auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
bool) {
InstructionCost C = 0;
Expand Down Expand Up @@ -13201,12 +13313,21 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
}
}

InstructionCost SpillCost = InstructionCost::getInvalid();
if (Cost < -SLPCostThreshold) {
SpillCost = getSpillCost();
Cost += SpillCost;
}
#ifndef NDEBUG
SmallString<256> Str;
{
raw_svector_ostream OS(Str);
OS << "SLP: Spill Cost = " << SpillCost << ".\n"
<< "SLP: Extract Cost = " << ExtractCost << ".\n"
OS << "SLP: Spill Cost = ";
if (SpillCost.isValid())
OS << SpillCost;
else
OS << "<skipped>";
OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
<< "SLP: Total Cost = " << Cost << ".\n";
}
LLVM_DEBUG(dbgs() << Str);
Expand Down
16 changes: 12 additions & 4 deletions llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1740,7 +1740,9 @@ entry:
define void @f(i1 %c, ptr %p, ptr %q, ptr %r) {
; CHECK-LABEL: define void @f
; CHECK-SAME: (i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[R:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 8
; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[P]], align 8
; CHECK-NEXT: [[P1:%.*]] = getelementptr i64, ptr [[P]], i64 1
; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[P1]], align 8
; CHECK-NEXT: br i1 [[C]], label [[FOO:%.*]], label [[BAR:%.*]]
; CHECK: foo:
; CHECK-NEXT: [[Y0:%.*]] = load float, ptr [[R]], align 4
Expand All @@ -1751,12 +1753,16 @@ define void @f(i1 %c, ptr %p, ptr %q, ptr %r) {
; CHECK-NEXT: [[Z1:%.*]] = call float @fabsf(float [[Z0]])
; CHECK-NEXT: br label [[BAZ]]
; CHECK: baz:
; CHECK-NEXT: store <2 x i64> [[TMP1]], ptr [[Q]], align 8
; CHECK-NEXT: store i64 [[X0]], ptr [[Q]], align 8
; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[Q]], i64 1
; CHECK-NEXT: store i64 [[X1]], ptr [[Q1]], align 8
; CHECK-NEXT: ret void
;
; DEFAULT-LABEL: define void @f
; DEFAULT-SAME: (i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[R:%.*]]) #[[ATTR1]] {
; DEFAULT-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 8
; DEFAULT-NEXT: [[X0:%.*]] = load i64, ptr [[P]], align 8
; DEFAULT-NEXT: [[P1:%.*]] = getelementptr i64, ptr [[P]], i64 1
; DEFAULT-NEXT: [[X1:%.*]] = load i64, ptr [[P1]], align 8
; DEFAULT-NEXT: br i1 [[C]], label [[FOO:%.*]], label [[BAR:%.*]]
; DEFAULT: foo:
; DEFAULT-NEXT: [[Y0:%.*]] = load float, ptr [[R]], align 4
Expand All @@ -1767,7 +1773,9 @@ define void @f(i1 %c, ptr %p, ptr %q, ptr %r) {
; DEFAULT-NEXT: [[Z1:%.*]] = call float @fabsf(float [[Z0]])
; DEFAULT-NEXT: br label [[BAZ]]
; DEFAULT: baz:
; DEFAULT-NEXT: store <2 x i64> [[TMP1]], ptr [[Q]], align 8
; DEFAULT-NEXT: store i64 [[X0]], ptr [[Q]], align 8
; DEFAULT-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[Q]], i64 1
; DEFAULT-NEXT: store i64 [[X1]], ptr [[Q1]], align 8
; DEFAULT-NEXT: ret void
;
%x0 = load i64, ptr %p
Expand Down
Loading