Skip to content

Commit 8a8462e

Browse files
committed
[VPlan] Implement VPWidenCastRecipe::computeCost(). (NFCI)
This patch implements VPWidenCastRecipe::computeCost() and skips more cast recipes in the in-loop reduction.
1 parent cd12ffb commit 8a8462e

File tree

4 files changed

+66
-2
lines changed

4 files changed

+66
-2
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7258,12 +7258,29 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
72587258
const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
72597259
SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
72607260
ChainOps.end());
7261+
auto isZExtOrSExt = [](const unsigned Opcode) -> bool {
7262+
return Opcode == Instruction::ZExt || Opcode == Instruction::SExt;
7263+
};
72617264
// Also include the operands of instructions in the chain, as the cost-model
72627265
// may mark extends as free.
7266+
//
7267+
// For ARM, some of the instructions can be folded into the reduction
7268+
// instruction. So we need to mark all folded instructions free.
7269+
// For example: We can fold reduce(mul(ext(A), ext(B))) into one
7270+
// instruction.
72637271
for (auto *ChainOp : ChainOps) {
72647272
for (Value *Op : ChainOp->operands()) {
7265-
if (auto *I = dyn_cast<Instruction>(Op))
7273+
if (auto *I = dyn_cast<Instruction>(Op)) {
72667274
ChainOpsAndOperands.insert(I);
7275+
if (I->getOpcode() == Instruction::Mul) {
7276+
auto *Ext0 = dyn_cast<Instruction>(I->getOperand(0));
7277+
if (Ext0 && isZExtOrSExt(Ext0->getOpcode()))
7278+
ChainOpsAndOperands.insert(Ext0);
7279+
auto *Ext1 = dyn_cast<Instruction>(I->getOperand(1));
7280+
if (Ext1 && isZExtOrSExt(Ext1->getOpcode()))
7281+
ChainOpsAndOperands.insert(Ext1);
7282+
}
7283+
}
72677284
}
72687285
}
72697286

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1571,6 +1571,10 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags {
15711571
/// Produce widened copies of the cast.
15721572
void execute(VPTransformState &State) override;
15731573

1574+
/// Return the cost of this VPWidenCastRecipe.
1575+
InstructionCost computeCost(ElementCount VF,
1576+
VPCostContext &Ctx) const override;
1577+
15741578
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
15751579
/// Print the recipe.
15761580
void print(raw_ostream &O, const Twine &Indent,

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1462,6 +1462,49 @@ void VPWidenCastRecipe::execute(VPTransformState &State) {
14621462
State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
14631463
}
14641464

1465+
/// Return the cost of this widened cast for vectorization factor \p VF.
///
/// The source and destination scalar types are widened to \p VF and handed to
/// TTI::getCastInstrCost together with a cast-context hint. The hint is derived
/// from an adjacent widened memory recipe when there is one:
///  - for Trunc/FPTrunc, the cast's only unique user;
///  - for ZExt/SExt/FPExt, the recipe that defines the cast's operand.
/// Otherwise the hint stays TTI::CastContextHint::None.
InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  auto *SrcTy = cast<VectorType>(
      ToVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
  auto *DestTy = cast<VectorType>(ToVectorTy(getResultType(), VF));

  // Computes the CastContextHint from a VPWidenMemoryRecipe instruction.
  // The checks are ordered: scalar VF, then non-consecutive, then reversed,
  // then masked; anything else is a normal access.
  auto ComputeCCH = [&](VPWidenMemoryRecipe *R) -> TTI::CastContextHint {
    assert((isa<VPWidenLoadRecipe>(R) || isa<VPWidenStoreRecipe>(R)) &&
           "Expected a load or a store!");

    if (VF.isScalar())
      return TTI::CastContextHint::Normal;
    if (!R->isConsecutive())
      return TTI::CastContextHint::GatherScatter;
    if (R->isReverse())
      return TTI::CastContextHint::Reversed;
    if (R->isMasked())
      return TTI::CastContextHint::Masked;
    return TTI::CastContextHint::Normal;
  };

  TTI::CastContextHint CCH = TTI::CastContextHint::None;
  if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
    // For Trunc, the context is the only user, which must be a
    // VPWidenStoreRecipe.
    // hasMoreThanOneUniqueUser() is also false when there are no users at all,
    // so check for at least one user before dereferencing user_begin().
    const VPValue *Def = cast<VPValue>(this);
    if (Def->getNumUsers() != 0 && !Def->hasMoreThanOneUniqueUser())
      if (VPWidenMemoryRecipe *Store =
              dyn_cast<VPWidenMemoryRecipe>(*this->user_begin()))
        CCH = ComputeCCH(Store);
  } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
             Opcode == Instruction::FPExt) {
    // For Z/Sext, the context is the operand, which must be a
    // VPWidenLoadRecipe.
    // The operand may have no defining recipe (getDefiningRecipe() returns
    // null), which plain dyn_cast would assert on; use the null-tolerant form.
    if (VPWidenMemoryRecipe *Load = dyn_cast_if_present<VPWidenMemoryRecipe>(
            this->getOperand(0)->getDefiningRecipe()))
      CCH = ComputeCCH(Load);
  }

  // Arm TTI will use the underlying instruction to determine the cost.
  return Ctx.TTI.getCastInstrCost(
      Opcode, DestTy, SrcTy, CCH, TTI::TCK_RecipThroughput,
      dyn_cast_if_present<Instruction>(getUnderlyingValue()));
}
1507+
14651508
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
14661509
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
14671510
VPSlotTracker &SlotTracker) const {

llvm/lib/Transforms/Vectorize/VPlanValue.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ class VPValue {
135135
}
136136

137137
/// Returns true if the value has more than one unique user.
138-
bool hasMoreThanOneUniqueUser() {
138+
bool hasMoreThanOneUniqueUser() const {
139139
if (getNumUsers() == 0)
140140
return false;
141141

0 commit comments

Comments
 (0)