Skip to content

Commit 8f4e5fe

Browse files
committed
[VPlan] Implement VPWidenCastRecipe::computeCost(). (NFCI)
This patch implements VPWidenCastRecipe::computeCost() and skips more cast recipes in the in-loop reduction.
1 parent af47038 commit 8f4e5fe

File tree

4 files changed

+66
-2
lines changed

4 files changed

+66
-2
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7218,12 +7218,29 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
72187218
const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
72197219
SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
72207220
ChainOps.end());
7221+
auto isZExtOrSExt = [](const unsigned Opcode) -> bool {
7222+
return Opcode == Instruction::ZExt || Opcode == Instruction::SExt;
7223+
};
72217224
// Also include the operands of instructions in the chain, as the cost-model
72227225
// may mark extends as free.
7226+
//
7227+
// For ARM, some of the instructions can be folded into the reduction
7228+
// instruction. So we need to mark all folded instructions free.
7229+
// For example: We can fold reduce(mul(ext(A), ext(B))) into one
7230+
// instruction.
72237231
for (auto *ChainOp : ChainOps) {
72247232
for (Value *Op : ChainOp->operands()) {
7225-
if (auto *I = dyn_cast<Instruction>(Op))
7233+
if (auto *I = dyn_cast<Instruction>(Op)) {
72267234
ChainOpsAndOperands.insert(I);
7235+
if (I->getOpcode() == Instruction::Mul) {
7236+
auto *Ext0 = dyn_cast<Instruction>(I->getOperand(0));
7237+
if (Ext0 && isZExtOrSExt(Ext0->getOpcode()))
7238+
ChainOpsAndOperands.insert(Ext0);
7239+
auto *Ext1 = dyn_cast<Instruction>(I->getOperand(1));
7240+
if (Ext1 && isZExtOrSExt(Ext1->getOpcode()))
7241+
ChainOpsAndOperands.insert(Ext1);
7242+
}
7243+
}
72277244
}
72287245
}
72297246

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1557,6 +1557,10 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags {
15571557
/// Produce widened copies of the cast.
15581558
void execute(VPTransformState &State) override;
15591559

1560+
/// Return the cost of this VPWidenCastRecipe for vectorization factor \p VF,
/// evaluated against the cost context \p Ctx.
1561+
InstructionCost computeCost(ElementCount VF,
1562+
VPCostContext &Ctx) const override;
1563+
15601564
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
15611565
/// Print the recipe.
15621566
void print(raw_ostream &O, const Twine &Indent,

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1429,6 +1429,49 @@ void VPWidenCastRecipe::execute(VPTransformState &State) {
14291429
State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
14301430
}
14311431

1432+
/// Return the cost of this widened cast for vectorization factor \p VF.
///
/// The cost is queried from TTI::getCastInstrCost() on the source and
/// destination types widened to \p VF. A cast-context hint is derived from an
/// adjacent widened memory recipe when one exists: the single user for
/// truncates, the defining recipe of the operand for extends. Otherwise the
/// hint stays TTI::CastContextHint::None.
InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  auto *SrcTy = cast<VectorType>(
      ToVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
  auto *DestTy = cast<VectorType>(ToVectorTy(getResultType(), VF));

  // Computes the CastContextHint from a VPWidenMemoryRecipe instruction.
  auto ComputeCCH = [&](VPWidenMemoryRecipe *R) -> TTI::CastContextHint {
    assert((isa<VPWidenLoadRecipe>(R) || isa<VPWidenStoreRecipe>(R)) &&
           "Expected a load or a store!");

    if (VF.isScalar())
      return TTI::CastContextHint::Normal;
    if (!R->isConsecutive())
      return TTI::CastContextHint::GatherScatter;
    if (R->isReverse())
      return TTI::CastContextHint::Reversed;
    if (R->isMasked())
      return TTI::CastContextHint::Masked;
    return TTI::CastContextHint::Normal;
  };

  TTI::CastContextHint CCH = TTI::CastContextHint::None;
  if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
    // For Trunc, the context is the only user, when that user is a widened
    // memory recipe. hasMoreThanOneUniqueUser() is also false for a value
    // without any users, so check for at least one user before dereferencing
    // user_begin().
    if (getNumUsers() != 0 &&
        !cast<VPValue>(this)->hasMoreThanOneUniqueUser())
      if (VPWidenMemoryRecipe *Store =
              dyn_cast<VPWidenMemoryRecipe>(*this->user_begin()))
        CCH = ComputeCCH(Store);
  } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
             Opcode == Instruction::FPExt) {
    // For Z/Sext and FPExt, the context is the recipe defining the operand,
    // when that is a widened memory recipe. The operand may have no defining
    // recipe at all (getDefiningRecipe() returns null), so use the
    // null-tolerant cast.
    if (VPWidenMemoryRecipe *Load = dyn_cast_if_present<VPWidenMemoryRecipe>(
            this->getOperand(0)->getDefiningRecipe()))
      CCH = ComputeCCH(Load);
  }

  // Arm TTI will use the underlying instruction to determine the cost.
  return Ctx.TTI.getCastInstrCost(
      Opcode, DestTy, SrcTy, CCH, TTI::TCK_RecipThroughput,
      dyn_cast_if_present<Instruction>(getUnderlyingValue()));
}
1474+
14321475
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
14331476
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
14341477
VPSlotTracker &SlotTracker) const {

llvm/lib/Transforms/Vectorize/VPlanValue.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ class VPValue {
135135
}
136136

137137
/// Returns true if the value has more than one unique user.
138-
bool hasMoreThanOneUniqueUser() {
138+
bool hasMoreThanOneUniqueUser() const {
139139
if (getNumUsers() == 0)
140140
return false;
141141

0 commit comments

Comments
 (0)