Skip to content

Commit b3edc76

Browse files
authored
[VPlan] Implement VPWidenCastRecipe::computeCost(). (NFCI) (#111339)
This patch implements `VPWidenCastRecipe::computeCost()` and skips cast recipes in the in-loop reduction.
1 parent a4819bd commit b3edc76

File tree

4 files changed

+73
-2
lines changed

4 files changed

+73
-2
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7307,12 +7307,30 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
73077307
const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
73087308
SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
73097309
ChainOps.end());
7310+
auto IsZExtOrSExt = [](const unsigned Opcode) -> bool {
7311+
return Opcode == Instruction::ZExt || Opcode == Instruction::SExt;
7312+
};
73107313
// Also include the operands of instructions in the chain, as the cost-model
73117314
// may mark extends as free.
7315+
//
7316+
// For ARM, some of the instructions can be folded into the reduction
7317+
// instruction. So we need to mark all folded instructions free.
7318+
// For example: We can fold reduce(mul(ext(A), ext(B))) into one
7319+
// instruction.
73127320
for (auto *ChainOp : ChainOps) {
73137321
for (Value *Op : ChainOp->operands()) {
7314-
if (auto *I = dyn_cast<Instruction>(Op))
7322+
if (auto *I = dyn_cast<Instruction>(Op)) {
73157323
ChainOpsAndOperands.insert(I);
7324+
if (I->getOpcode() == Instruction::Mul) {
7325+
auto *Ext0 = dyn_cast<Instruction>(I->getOperand(0));
7326+
auto *Ext1 = dyn_cast<Instruction>(I->getOperand(1));
7327+
if (Ext0 && IsZExtOrSExt(Ext0->getOpcode()) && Ext1 &&
7328+
Ext0->getOpcode() == Ext1->getOpcode()) {
7329+
ChainOpsAndOperands.insert(Ext0);
7330+
ChainOpsAndOperands.insert(Ext1);
7331+
}
7332+
}
7333+
}
73167334
}
73177335
}
73187336

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1603,6 +1603,10 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags {
16031603
/// Produce widened copies of the cast.
16041604
void execute(VPTransformState &State) override;
16051605

1606+
/// Return the cost of this VPWidenCastRecipe.
1607+
InstructionCost computeCost(ElementCount VF,
1608+
VPCostContext &Ctx) const override;
1609+
16061610
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
16071611
/// Print the recipe.
16081612
void print(raw_ostream &O, const Twine &Indent,

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1522,6 +1522,55 @@ void VPWidenCastRecipe::execute(VPTransformState &State) {
15221522
State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
15231523
}
15241524

1525+
InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
1526+
VPCostContext &Ctx) const {
1527+
// Computes the CastContextHint from a recipe that may access memory.
1528+
auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
1529+
if (VF.isScalar())
1530+
return TTI::CastContextHint::Normal;
1531+
if (isa<VPInterleaveRecipe>(R))
1532+
return TTI::CastContextHint::Interleave;
1533+
if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R))
1534+
return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
1535+
: TTI::CastContextHint::Normal;
1536+
const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R);
1537+
if (WidenMemoryRecipe == nullptr)
1538+
return TTI::CastContextHint::None;
1539+
if (!WidenMemoryRecipe->isConsecutive())
1540+
return TTI::CastContextHint::GatherScatter;
1541+
if (WidenMemoryRecipe->isReverse())
1542+
return TTI::CastContextHint::Reversed;
1543+
if (WidenMemoryRecipe->isMasked())
1544+
return TTI::CastContextHint::Masked;
1545+
return TTI::CastContextHint::Normal;
1546+
};
1547+
1548+
VPValue *Operand = getOperand(0);
1549+
TTI::CastContextHint CCH = TTI::CastContextHint::None;
1550+
// For Trunc/FPTrunc, get the context from the only user.
1551+
if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) &&
1552+
!hasMoreThanOneUniqueUser() && getNumUsers() > 0) {
1553+
if (auto *StoreRecipe = dyn_cast<VPRecipeBase>(*user_begin()))
1554+
CCH = ComputeCCH(StoreRecipe);
1555+
}
1556+
// For ZExt/SExt/FPExt, get the context from the operand.
1557+
else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
1558+
Opcode == Instruction::FPExt) {
1559+
if (Operand->isLiveIn())
1560+
CCH = TTI::CastContextHint::Normal;
1561+
else if (Operand->getDefiningRecipe())
1562+
CCH = ComputeCCH(Operand->getDefiningRecipe());
1563+
}
1564+
1565+
auto *SrcTy =
1566+
cast<VectorType>(ToVectorTy(Ctx.Types.inferScalarType(Operand), VF));
1567+
auto *DestTy = cast<VectorType>(ToVectorTy(getResultType(), VF));
1568+
// Arm TTI will use the underlying instruction to determine the cost.
1569+
return Ctx.TTI.getCastInstrCost(
1570+
Opcode, DestTy, SrcTy, CCH, TTI::TCK_RecipThroughput,
1571+
dyn_cast_if_present<Instruction>(getUnderlyingValue()));
1572+
}
1573+
15251574
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
15261575
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
15271576
VPSlotTracker &SlotTracker) const {

llvm/lib/Transforms/Vectorize/VPlanValue.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ class VPValue {
135135
}
136136

137137
/// Returns true if the value has more than one unique user.
138-
bool hasMoreThanOneUniqueUser() {
138+
bool hasMoreThanOneUniqueUser() const {
139139
if (getNumUsers() == 0)
140140
return false;
141141

0 commit comments

Comments
 (0)