Skip to content

Commit 071d1fb

Browse files
authored
[LV] Use VPReductionRecipe for partial reductions (#147513)
Partial reductions can easily be represented by the VPReductionRecipe class by setting their scale factor to something greater than 1. This PR merges the two together and gives VPReductionRecipe a VFScaleFactor so that it can choose to generate the partial reduction intrinsic at execute time. Stacked PRs: 1. #147026 2. #147255 3. #156976 4. #160154 5. #147302 6. #162503 7. -> #147513 Replaces #146073.
1 parent 96cbbeb commit 071d1fb

File tree

9 files changed

+181
-308
lines changed

9 files changed

+181
-308
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 25 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -7021,10 +7021,11 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
70217021
VPInstruction::FirstOrderRecurrenceSplice>())))
70227022
return true;
70237023
}
7024-
// The VPlan-based cost model is more accurate for partial reduction and
7024+
// The VPlan-based cost model is more accurate for partial reductions and
70257025
// comparing against the legacy cost isn't desirable.
7026-
if (isa<VPPartialReductionRecipe>(&R))
7027-
return true;
7026+
if (auto *VPR = dyn_cast<VPReductionRecipe>(&R))
7027+
if (VPR->isPartialReduction())
7028+
return true;
70287029

70297030
// The VPlan-based cost model can analyze if recipes are scalar
70307031
// recursively, but the legacy cost model cannot.
@@ -8207,11 +8208,15 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
82078208
Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
82088209

82098210
// If the PHI is used by a partial reduction, set the scale factor.
8211+
bool UseInLoopReduction = CM.isInLoopReduction(Phi);
8212+
bool UseOrderedReductions = CM.useOrderedReductions(RdxDesc);
82108213
unsigned ScaleFactor =
82118214
getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1);
8215+
82128216
PhiRecipe = new VPReductionPHIRecipe(
8213-
Phi, RdxDesc.getRecurrenceKind(), *StartV, CM.isInLoopReduction(Phi),
8214-
CM.useOrderedReductions(RdxDesc), ScaleFactor);
8217+
Phi, RdxDesc.getRecurrenceKind(), *StartV,
8218+
getReductionStyle(UseInLoopReduction, UseOrderedReductions,
8219+
ScaleFactor));
82158220
} else {
82168221
// TODO: Currently fixed-order recurrences are modeled as chains of
82178222
// first-order recurrences. If there are no users of the intermediate
@@ -8280,31 +8285,34 @@ VPRecipeBuilder::tryToCreatePartialReduction(VPInstruction *Reduction,
82808285

82818286
VPValue *BinOp = Reduction->getOperand(0);
82828287
VPValue *Accumulator = Reduction->getOperand(1);
8283-
if (isa<VPReductionPHIRecipe>(BinOp) || isa<VPPartialReductionRecipe>(BinOp))
8288+
VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
8289+
if (isa<VPReductionPHIRecipe>(BinOpRecipe) ||
8290+
(isa<VPReductionRecipe>(BinOpRecipe) &&
8291+
cast<VPReductionRecipe>(BinOpRecipe)->isPartialReduction()))
82848292
std::swap(BinOp, Accumulator);
82858293

82868294
assert(ScaleFactor ==
82878295
vputils::getVFScaleFactor(Accumulator->getDefiningRecipe()) &&
82888296
"all accumulators in chain must have same scale factor");
82898297

8290-
unsigned ReductionOpcode = Reduction->getOpcode();
82918298
auto *ReductionI = Reduction->getUnderlyingInstr();
8292-
if (ReductionOpcode == Instruction::Sub) {
8299+
if (Reduction->getOpcode() == Instruction::Sub) {
82938300
auto *const Zero = ConstantInt::get(ReductionI->getType(), 0);
82948301
SmallVector<VPValue *, 2> Ops;
82958302
Ops.push_back(Plan.getOrAddLiveIn(Zero));
82968303
Ops.push_back(BinOp);
82978304
BinOp = new VPWidenRecipe(*ReductionI, Ops, VPIRFlags(*ReductionI),
82988305
VPIRMetadata(), ReductionI->getDebugLoc());
82998306
Builder.insert(BinOp->getDefiningRecipe());
8300-
ReductionOpcode = Instruction::Add;
83018307
}
83028308

83038309
VPValue *Cond = nullptr;
83048310
if (CM.blockNeedsPredicationForAnyReason(ReductionI->getParent()))
83058311
Cond = getBlockInMask(Builder.getInsertBlock());
8306-
return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond,
8307-
ScaleFactor, ReductionI);
8312+
8313+
return new VPReductionRecipe(
8314+
RecurKind::Add, FastMathFlags(), ReductionI, Accumulator, BinOp, Cond,
8315+
RdxUnordered{/*VFScaleFactor=*/ScaleFactor}, ReductionI->getDebugLoc());
83088316
}
83098317

83108318
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
@@ -8795,9 +8803,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
87958803
if (CM.blockNeedsPredicationForAnyReason(CurrentLinkI->getParent()))
87968804
CondOp = RecipeBuilder.getBlockInMask(CurrentLink->getParent());
87978805

8798-
auto *RedRecipe = new VPReductionRecipe(
8799-
Kind, FMFs, CurrentLinkI, PreviousLink, VecOp, CondOp,
8800-
PhiR->isOrdered(), CurrentLinkI->getDebugLoc());
8806+
ReductionStyle Style = getReductionStyle(true, PhiR->isOrdered(), 1);
8807+
auto *RedRecipe =
8808+
new VPReductionRecipe(Kind, FMFs, CurrentLinkI, PreviousLink, VecOp,
8809+
CondOp, Style, CurrentLinkI->getDebugLoc());
88018810
// Append the recipe to the end of the VPBasicBlock because we need to
88028811
// ensure that it comes after all of its inputs, including CondOp.
88038812
// Delete CurrentLink as it will be invalid if its operand is replaced
@@ -8832,8 +8841,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
88328841
// Don't output selects for partial reductions because they have an output
88338842
// with fewer lanes than the VF. So the operands of the select would have
88348843
// different numbers of lanes. Partial reductions mask the input instead.
8844+
auto *RR = dyn_cast<VPReductionRecipe>(OrigExitingVPV->getDefiningRecipe());
88358845
if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
8836-
!isa<VPPartialReductionRecipe>(OrigExitingVPV)) {
8846+
(!RR || !RR->isPartialReduction())) {
88378847
VPValue *Cond = RecipeBuilder.getBlockInMask(PhiR->getParent());
88388848
std::optional<FastMathFlags> FMFs =
88398849
PhiTy->isFloatingPointTy()

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 75 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
#include <functional>
4545
#include <string>
4646
#include <utility>
47+
#include <variant>
4748

4849
namespace llvm {
4950

@@ -566,7 +567,6 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
566567
case VPRecipeBase::VPWidenIntOrFpInductionSC:
567568
case VPRecipeBase::VPWidenPointerInductionSC:
568569
case VPRecipeBase::VPReductionPHISC:
569-
case VPRecipeBase::VPPartialReductionSC:
570570
return true;
571571
case VPRecipeBase::VPBranchOnMaskSC:
572572
case VPRecipeBase::VPInterleaveEVLSC:
@@ -2392,6 +2392,29 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe {
23922392
#endif
23932393
};
23942394

2395+
/// Possible variants of a reduction.
2396+
2397+
/// This reduction is ordered and in-loop.
2398+
struct RdxOrdered {};
2399+
/// This reduction is in-loop.
2400+
struct RdxInLoop {};
2401+
/// This reduction is unordered with the partial result scaled down by some
2402+
/// factor.
2403+
struct RdxUnordered {
2404+
unsigned VFScaleFactor;
2405+
};
2406+
using ReductionStyle = std::variant<RdxOrdered, RdxInLoop, RdxUnordered>;
2407+
2408+
inline ReductionStyle getReductionStyle(bool InLoop, bool Ordered,
2409+
unsigned ScaleFactor) {
2410+
assert((!Ordered || InLoop) && "Ordered implies in-loop");
2411+
if (Ordered)
2412+
return RdxOrdered{};
2413+
if (InLoop)
2414+
return RdxInLoop{};
2415+
return RdxUnordered{/*VFScaleFactor=*/ScaleFactor};
2416+
}
2417+
23952418
/// A recipe for handling reduction phis. The start value is the first operand
23962419
/// of the recipe and the incoming value from the backedge is the second
23972420
/// operand.
@@ -2400,32 +2423,21 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
24002423
/// The recurrence kind of the reduction.
24012424
const RecurKind Kind;
24022425

2403-
/// The phi is part of an in-loop reduction.
2404-
bool IsInLoop;
2405-
2406-
/// The phi is part of an ordered reduction. Requires IsInLoop to be true.
2407-
bool IsOrdered;
2408-
2409-
/// When expanding the reduction PHI, the plan's VF element count is divided
2410-
/// by this factor to form the reduction phi's VF.
2411-
unsigned VFScaleFactor = 1;
2426+
ReductionStyle Style;
24122427

24132428
public:
24142429
/// Create a new VPReductionPHIRecipe for the reduction \p Phi.
24152430
VPReductionPHIRecipe(PHINode *Phi, RecurKind Kind, VPValue &Start,
2416-
bool IsInLoop = false, bool IsOrdered = false,
2417-
unsigned VFScaleFactor = 1)
2431+
ReductionStyle Style)
24182432
: VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start), Kind(Kind),
2419-
IsInLoop(IsInLoop), IsOrdered(IsOrdered), VFScaleFactor(VFScaleFactor) {
2420-
assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop");
2421-
}
2433+
Style(Style) {}
24222434

24232435
~VPReductionPHIRecipe() override = default;
24242436

24252437
VPReductionPHIRecipe *clone() override {
24262438
auto *R = new VPReductionPHIRecipe(
24272439
dyn_cast_or_null<PHINode>(getUnderlyingValue()), getRecurrenceKind(),
2428-
*getOperand(0), IsInLoop, IsOrdered, VFScaleFactor);
2440+
*getOperand(0), Style);
24292441
R->addOperand(getBackedgeValue());
24302442
return R;
24312443
}
@@ -2435,8 +2447,12 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
24352447
/// Generate the phi/select nodes.
24362448
void execute(VPTransformState &State) override;
24372449

2438-
/// Get the factor that the VF of this recipe's output should be scaled by.
2439-
unsigned getVFScaleFactor() const { return VFScaleFactor; }
2450+
/// Get the factor that the VF of this recipe's output should be scaled by, or
2451+
/// 1 if it isn't scaled.
2452+
unsigned getVFScaleFactor() const {
2453+
auto *Partial = std::get_if<RdxUnordered>(&Style);
2454+
return Partial ? Partial->VFScaleFactor : 1;
2455+
}
24402456

24412457
/// Returns the number of incoming values, also number of incoming blocks.
24422458
/// Note that at the moment, VPWidenPointerInductionRecipe only has a single
@@ -2447,10 +2463,16 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
24472463
RecurKind getRecurrenceKind() const { return Kind; }
24482464

24492465
/// Returns true, if the phi is part of an ordered reduction.
2450-
bool isOrdered() const { return IsOrdered; }
2466+
bool isOrdered() const { return std::holds_alternative<RdxOrdered>(Style); }
24512467

2452-
/// Returns true, if the phi is part of an in-loop reduction.
2453-
bool isInLoop() const { return IsInLoop; }
2468+
/// Returns true if the phi is part of an in-loop reduction.
2469+
bool isInLoop() const {
2470+
return std::holds_alternative<RdxInLoop>(Style) ||
2471+
std::holds_alternative<RdxOrdered>(Style);
2472+
}
2473+
2474+
/// Returns true if the reduction outputs a vector with a scaled down VF.
2475+
bool isPartialReduction() const { return getVFScaleFactor() > 1; }
24542476

24552477
/// Returns true if the recipe only uses the first lane of operand \p Op.
24562478
bool usesFirstLaneOnly(const VPValue *Op) const override {
@@ -2732,23 +2754,25 @@ class LLVM_ABI_FOR_TEST VPInterleaveEVLRecipe final : public VPInterleaveBase {
27322754
#endif
27332755
};
27342756

2735-
/// A recipe to represent inloop reduction operations, performing a reduction on
2736-
/// a vector operand into a scalar value, and adding the result to a chain.
2737-
/// The Operands are {ChainOp, VecOp, [Condition]}.
2757+
/// A recipe to represent inloop, ordered or partial reduction operations. It
2758+
/// performs a reduction on a vector operand into a scalar (vector in the case
2759+
/// of a partial reduction) value, and adds the result to a chain. The Operands
2760+
/// are {ChainOp, VecOp, [Condition]}.
27382761
class LLVM_ABI_FOR_TEST VPReductionRecipe : public VPRecipeWithIRFlags {
2762+
27392763
/// The recurrence kind for the reduction in question.
27402764
RecurKind RdxKind;
2741-
bool IsOrdered;
27422765
/// Whether the reduction is conditional.
27432766
bool IsConditional = false;
2767+
ReductionStyle Style;
27442768

27452769
protected:
27462770
VPReductionRecipe(const unsigned char SC, RecurKind RdxKind,
27472771
FastMathFlags FMFs, Instruction *I,
27482772
ArrayRef<VPValue *> Operands, VPValue *CondOp,
2749-
bool IsOrdered, DebugLoc DL)
2773+
ReductionStyle Style, DebugLoc DL)
27502774
: VPRecipeWithIRFlags(SC, Operands, FMFs, DL), RdxKind(RdxKind),
2751-
IsOrdered(IsOrdered) {
2775+
Style(Style) {
27522776
if (CondOp) {
27532777
IsConditional = true;
27542778
addOperand(CondOp);
@@ -2759,30 +2783,29 @@ class LLVM_ABI_FOR_TEST VPReductionRecipe : public VPRecipeWithIRFlags {
27592783
public:
27602784
VPReductionRecipe(RecurKind RdxKind, FastMathFlags FMFs, Instruction *I,
27612785
VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
2762-
bool IsOrdered, DebugLoc DL = DebugLoc::getUnknown())
2786+
ReductionStyle Style, DebugLoc DL = DebugLoc::getUnknown())
27632787
: VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, I,
2764-
ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
2765-
IsOrdered, DL) {}
2788+
ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp, Style,
2789+
DL) {}
27662790

27672791
VPReductionRecipe(const RecurKind RdxKind, FastMathFlags FMFs,
27682792
VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
2769-
bool IsOrdered, DebugLoc DL = DebugLoc::getUnknown())
2793+
ReductionStyle Style, DebugLoc DL = DebugLoc::getUnknown())
27702794
: VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, nullptr,
2771-
ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
2772-
IsOrdered, DL) {}
2795+
ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp, Style,
2796+
DL) {}
27732797

27742798
~VPReductionRecipe() override = default;
27752799

27762800
VPReductionRecipe *clone() override {
27772801
return new VPReductionRecipe(RdxKind, getFastMathFlags(),
27782802
getUnderlyingInstr(), getChainOp(), getVecOp(),
2779-
getCondOp(), IsOrdered, getDebugLoc());
2803+
getCondOp(), Style, getDebugLoc());
27802804
}
27812805

27822806
static inline bool classof(const VPRecipeBase *R) {
27832807
return R->getVPDefID() == VPRecipeBase::VPReductionSC ||
2784-
R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
2785-
R->getVPDefID() == VPRecipeBase::VPPartialReductionSC;
2808+
R->getVPDefID() == VPRecipeBase::VPReductionEVLSC;
27862809
}
27872810

27882811
static inline bool classof(const VPUser *U) {
@@ -2809,9 +2832,16 @@ class LLVM_ABI_FOR_TEST VPReductionRecipe : public VPRecipeWithIRFlags {
28092832
/// Return the recurrence kind for the in-loop reduction.
28102833
RecurKind getRecurrenceKind() const { return RdxKind; }
28112834
/// Return true if the in-loop reduction is ordered.
2812-
bool isOrdered() const { return IsOrdered; };
2835+
bool isOrdered() const { return std::holds_alternative<RdxOrdered>(Style); };
28132836
/// Return true if the in-loop reduction is conditional.
28142837
bool isConditional() const { return IsConditional; };
2838+
/// Returns true if the reduction outputs a vector with a scaled down VF.
2839+
bool isPartialReduction() const { return getVFScaleFactor() > 1; }
2840+
/// Returns true if the reduction is in-loop.
2841+
bool isInLoop() const {
2842+
return std::holds_alternative<RdxInLoop>(Style) ||
2843+
std::holds_alternative<RdxOrdered>(Style);
2844+
}
28152845
/// The VPValue of the scalar Chain being accumulated.
28162846
VPValue *getChainOp() const { return getOperand(0); }
28172847
/// The VPValue of the vector value to be reduced.
@@ -2820,69 +2850,12 @@ class LLVM_ABI_FOR_TEST VPReductionRecipe : public VPRecipeWithIRFlags {
28202850
VPValue *getCondOp() const {
28212851
return isConditional() ? getOperand(getNumOperands() - 1) : nullptr;
28222852
}
2823-
2824-
protected:
2825-
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2826-
/// Print the recipe.
2827-
void printRecipe(raw_ostream &O, const Twine &Indent,
2828-
VPSlotTracker &SlotTracker) const override;
2829-
#endif
2830-
};
2831-
2832-
/// A recipe for forming partial reductions. In the loop, an accumulator and
2833-
/// vector operand are added together and passed to the next iteration as the
2834-
/// next accumulator. After the loop body, the accumulator is reduced to a
2835-
/// scalar value.
2836-
class VPPartialReductionRecipe : public VPReductionRecipe {
2837-
unsigned Opcode;
2838-
2839-
/// The divisor by which the VF of this recipe's output should be divided
2840-
/// during execution.
2841-
unsigned VFScaleFactor;
2842-
2843-
public:
2844-
VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0,
2845-
VPValue *Op1, VPValue *Cond, unsigned VFScaleFactor)
2846-
: VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1, Cond,
2847-
VFScaleFactor, ReductionInst) {}
2848-
VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1,
2849-
VPValue *Cond, unsigned ScaleFactor,
2850-
Instruction *ReductionInst = nullptr)
2851-
: VPReductionRecipe(VPDef::VPPartialReductionSC, RecurKind::Add,
2852-
FastMathFlags(), ReductionInst,
2853-
ArrayRef<VPValue *>({Op0, Op1}), Cond, false, {}),
2854-
Opcode(Opcode), VFScaleFactor(ScaleFactor) {
2855-
[[maybe_unused]] auto *AccumulatorRecipe =
2856-
getChainOp()->getDefiningRecipe();
2857-
// When cloning as part of a VPExpressionRecipe the chain op could have
2858-
// replaced by a temporary VPValue, so it doesn't have a defining recipe.
2859-
assert((!AccumulatorRecipe ||
2860-
isa<VPReductionPHIRecipe>(AccumulatorRecipe) ||
2861-
isa<VPPartialReductionRecipe>(AccumulatorRecipe)) &&
2862-
"Unexpected operand order for partial reduction recipe");
2863-
}
2864-
~VPPartialReductionRecipe() override = default;
2865-
2866-
VPPartialReductionRecipe *clone() override {
2867-
return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1),
2868-
getCondOp(), VFScaleFactor,
2869-
getUnderlyingInstr());
2870-
}
2871-
2872-
VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
2873-
2874-
/// Generate the reduction in the loop.
2875-
void execute(VPTransformState &State) override;
2876-
2877-
/// Return the cost of this VPPartialReductionRecipe.
2878-
InstructionCost computeCost(ElementCount VF,
2879-
VPCostContext &Ctx) const override;
2880-
2881-
/// Get the binary op's opcode.
2882-
unsigned getOpcode() const { return Opcode; }
2883-
2884-
/// Get the factor that the VF of this recipe's output should be scaled by.
2885-
unsigned getVFScaleFactor() const { return VFScaleFactor; }
2853+
/// Get the factor that the VF of this recipe's output should be scaled by, or
2854+
/// 1 if it isn't scaled.
2855+
unsigned getVFScaleFactor() const {
2856+
auto *Partial = std::get_if<RdxUnordered>(&Style);
2857+
return Partial ? Partial->VFScaleFactor : 1;
2858+
}
28862859

28872860
protected:
28882861
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -2905,7 +2878,7 @@ class LLVM_ABI_FOR_TEST VPReductionEVLRecipe : public VPReductionRecipe {
29052878
R.getFastMathFlags(),
29062879
cast_or_null<Instruction>(R.getUnderlyingValue()),
29072880
ArrayRef<VPValue *>({R.getChainOp(), R.getVecOp(), &EVL}), CondOp,
2908-
R.isOrdered(), DL) {}
2881+
getReductionStyle(/*InLoop=*/true, R.isOrdered(), 1), DL) {}
29092882

29102883
~VPReductionEVLRecipe() override = default;
29112884

@@ -3173,7 +3146,7 @@ class VPExpressionRecipe : public VPSingleDefRecipe {
31733146
void decompose();
31743147

31753148
unsigned getVFScaleFactor() const {
3176-
auto *PR = dyn_cast<VPPartialReductionRecipe>(ExpressionRecipes.back());
3149+
auto *PR = dyn_cast<VPReductionRecipe>(ExpressionRecipes.back());
31773150
return PR ? PR->getVFScaleFactor() : 1;
31783151
}
31793152

0 commit comments

Comments
 (0)