-
Couldn't load subscription status.
- Fork 15k
[VPlan] Run narrowInterleaveGroups during general VPlan optimizations. #149706
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
dc6be02
fb75c97
4f7770e
3839b69
44cd6f3
d4b9e4f
7001a6c
55d9148
c279fb9
ad32e35
4838ab8
be31afc
22f4153
80d59c3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -4085,6 +4085,9 @@ class VPlan { | |||||||||
| /// Represents the vectorization factor of the loop. | ||||||||||
| VPValue VF; | ||||||||||
|
|
||||||||||
| /// Represents the symbolic unroll factor of the loop. | ||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
VF and VFxUF are also "symbolic", when VF is fixed. |
||||||||||
| VPValue UF; | ||||||||||
|
|
||||||||||
| /// Represents the loop-invariant VF * UF of the vector loop region. | ||||||||||
| VPValue VFxUF; | ||||||||||
|
|
||||||||||
|
|
@@ -4236,6 +4239,9 @@ class VPlan { | |||||||||
| /// Returns the VF of the vector loop region. | ||||||||||
| VPValue &getVF() { return VF; }; | ||||||||||
|
|
||||||||||
| /// Returns the symbolic UF of the vector loop region. | ||||||||||
| VPValue &getSymbolicUF() { return UF; }; | ||||||||||
|
||||||||||
| /// Returns the symbolic UF of the vector loop region. | |
| VPValue &getSymbolicUF() { return UF; }; | |
| /// Returns the UF of the vector loop region. | |
| VPValue &getUF() { return UF; }; |
to be consistent with VF and VFxUF, which may also be symbolic; or at least rename UF to SymbolicUF.
This would require renaming the existing getUF(), which returns unsigned, to, say, getFixedUF(). (Can also provide getFixedVF() and getFixedVFxUF() to support the fixed-VF case.)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3802,6 +3802,9 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, | |
| // used. | ||
| // TODO: Assert that they aren't used. | ||
|
Comment on lines
3960
to
3961
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Above comment and TODO apply to Plan.getUF as well? |
||
|
|
||
| VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF())); | ||
| Plan.getSymbolicUF().replaceAllUsesWith(UF); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Better rename |
||
|
|
||
| // If there are no users of the runtime VF, compute VFxUF by constant folding | ||
| // the multiplication of VF and UF. | ||
| if (VF.getNumUsers() == 0) { | ||
|
|
@@ -3821,7 +3824,6 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, | |
| } | ||
| VF.replaceAllUsesWith(RuntimeVF); | ||
|
|
||
| VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF())); | ||
| VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF}); | ||
| VFxUF.replaceAllUsesWith(MulByUF); | ||
| } | ||
|
|
@@ -3930,16 +3932,26 @@ static bool isAlreadyNarrow(VPValue *VPV) { | |
| return RepR && RepR->isSingleScalar(); | ||
| } | ||
|
|
||
| void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, | ||
| unsigned VectorRegWidth) { | ||
| std::unique_ptr<VPlan> | ||
| VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, | ||
| const TargetTransformInfo &TTI) { | ||
| using namespace llvm::VPlanPatternMatch; | ||
| VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion(); | ||
|
|
||
| if (!VectorLoop) | ||
| return; | ||
| return nullptr; | ||
|
|
||
| VPTypeAnalysis TypeInfo(Plan); | ||
| auto GetVectorWidthForVF = [&TTI](ElementCount VF) { | ||
| return TTI | ||
| .getRegisterBitWidth(VF.isFixed() | ||
| ? TargetTransformInfo::RGK_FixedWidthVector | ||
| : TargetTransformInfo::RGK_ScalableVector) | ||
| .getKnownMinValue(); | ||
| }; | ||
|
|
||
| unsigned VFMinVal = VF.getKnownMinValue(); | ||
| VPTypeAnalysis TypeInfo(Plan); | ||
| SmallVector<VPInterleaveRecipe *> StoreGroups; | ||
| std::optional<ElementCount> VFToOptimize; | ||
| for (auto &R : *VectorLoop->getEntryBasicBlock()) { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (Independent) Checking recipes of entry BB only? |
||
| if (isa<VPCanonicalIVPHIRecipe>(&R) || | ||
| match(&R, m_BranchOnCount(m_VPValue(), m_VPValue()))) | ||
|
|
@@ -3954,30 +3966,38 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, | |
| // * recipes writing to memory except interleave groups | ||
| // Only support plans with a canonical induction phi. | ||
| if (R.isPhi()) | ||
| return; | ||
| return nullptr; | ||
|
|
||
| auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R); | ||
| if (R.mayWriteToMemory() && !InterleaveR) | ||
| return; | ||
|
|
||
| // Do not narrow interleave groups if there are VectorPointer recipes and | ||
| // the plan was unrolled. The recipe implicitly uses VF from | ||
| // VPTransformState. | ||
| // TODO: Remove restriction once the VF for the VectorPointer offset is | ||
| // modeled explicitly as operand. | ||
| if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1) | ||
| return; | ||
|
Comment on lines
-4120
to
-4126
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This TODO taken care of? Below asserts that vector pointer recipes are absent. |
||
| return nullptr; | ||
|
|
||
| // All other ops are allowed, but we reject uses that cannot be converted | ||
| // when checking all allowed consumers (store interleave groups) below. | ||
| if (!InterleaveR) | ||
| continue; | ||
|
|
||
| // Bail out on non-consecutive interleave groups. | ||
| if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo, | ||
| VectorRegWidth)) | ||
| return; | ||
|
|
||
| // Try to find a single VF, where all interleave groups are consecutive and | ||
| // saturate the full vector width. If we already have a candidate VF, check | ||
| // if it is applicable for the current InterleaveR, otherwise look for a | ||
| // suitable VF across the Plan's VFs. | ||
| // | ||
| if (VFToOptimize) { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unify using |
||
| if (!isConsecutiveInterleaveGroup( | ||
| InterleaveR, VFToOptimize->getKnownMinValue(), TypeInfo, | ||
| GetVectorWidthForVF(*VFToOptimize))) | ||
| return nullptr; | ||
| } else { | ||
| for (ElementCount VF : Plan.vectorFactors()) { | ||
|
||
| if (isConsecutiveInterleaveGroup(InterleaveR, VF.getKnownMinValue(), | ||
| TypeInfo, GetVectorWidthForVF(VF))) { | ||
| VFToOptimize = VF; | ||
| break; | ||
| } | ||
| } | ||
| if (!VFToOptimize) | ||
|
||
| return nullptr; | ||
| } | ||
| // Skip read interleave groups. | ||
| if (InterleaveR->getStoredValues().empty()) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (Independent) May be good to rename |
||
| continue; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (Independent) What if below Member0 is already narrow but not all stored values are the same? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (Independent) Better to check indices of members in IG rather than match the order of VPValues defined by interleaved load recipe to the order of interleaved store operands? Or verify that these recipes retain these orders. |
||
|
|
@@ -4011,24 +4031,44 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, | |
| auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>( | ||
| InterleaveR->getStoredValues()[0]->getDefiningRecipe()); | ||
| if (!WideMember0) | ||
| return; | ||
| return nullptr; | ||
| for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) { | ||
| auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe()); | ||
| if (!R || R->getOpcode() != WideMember0->getOpcode() || | ||
| R->getNumOperands() > 2) | ||
| return; | ||
| return nullptr; | ||
| if (any_of(enumerate(R->operands()), | ||
| [WideMember0, Idx = I](const auto &P) { | ||
| const auto &[OpIdx, OpV] = P; | ||
| return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx); | ||
| })) | ||
| return; | ||
| return nullptr; | ||
| } | ||
| StoreGroups.push_back(InterleaveR); | ||
| } | ||
|
|
||
| if (StoreGroups.empty()) | ||
| return; | ||
| return nullptr; | ||
|
|
||
| // All interleave groups in Plan can be narrowed for VFToOptimize. Split the | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Worth adding a |
||
| // original Plan into 2: a) a new clone which contains all VFs of Plan, except | ||
| // VFToOptimize, and b) the original Plan with VFToOptimize as single VF. | ||
| std::unique_ptr<VPlan> NewPlan; | ||
| if (size(Plan.vectorFactors()) != 1) { | ||
| NewPlan = std::unique_ptr<VPlan>(Plan.duplicate()); | ||
| Plan.setVF(*VFToOptimize); | ||
| bool First = true; | ||
| for (ElementCount VF : NewPlan->vectorFactors()) { | ||
| if (VF == VFToOptimize) | ||
| continue; | ||
| if (First) { | ||
| NewPlan->setVF(VF); | ||
| First = false; | ||
| continue; | ||
| } | ||
| NewPlan->addVF(VF); | ||
|
||
| } | ||
| } | ||
|
|
||
| // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe. | ||
| SmallPtrSet<VPValue *, 4> NarrowedOps; | ||
|
|
@@ -4099,9 +4139,8 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, | |
| auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue()); | ||
| VPBuilder PHBuilder(Plan.getVectorPreheader()); | ||
|
|
||
| VPValue *UF = Plan.getOrAddLiveIn( | ||
| ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF())); | ||
| if (VF.isScalable()) { | ||
| VPValue *UF = &Plan.getSymbolicUF(); | ||
| if (VFToOptimize->isScalable()) { | ||
| VPValue *VScale = PHBuilder.createElementCount( | ||
| CanIV->getScalarType(), ElementCount::getScalable(1)); | ||
| VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF}); | ||
|
|
@@ -4113,6 +4152,10 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, | |
| Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1))); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The VF of Plan is set to 1 to affect the induction recipes that use it, in order to de-vectorize the loop, but the wide load and store recipes (that replace the interleaved loads and stores) are to still generate vector instructions according to the original VF. Would be good to clarify this discrepancy. |
||
| } | ||
| removeDeadRecipes(Plan); | ||
| assert(none_of(*VectorLoop->getEntryBasicBlock(), | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Again attention is given to entry BB only. |
||
| IsaPred<VPVectorPointerRecipe>) && | ||
| "All VPVectorPointerRecipes should have been removed"); | ||
|
Comment on lines
+4303
to
+4305
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This corresponds to the original constraint that UF must be 1 if vector pointer recipes are present? |
||
| return NewPlan; | ||
| } | ||
|
|
||
| /// Add branch weight metadata, if the \p Plan's middle block is terminated by a | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -333,14 +333,19 @@ struct VPlanTransforms { | |
| static DenseMap<const SCEV *, Value *> expandSCEVs(VPlan &Plan, | ||
| ScalarEvolution &SE); | ||
|
|
||
| /// Try to convert a plan with interleave groups with VF elements to a plan | ||
| /// with the interleave groups replaced by wide loads and stores processing VF | ||
| /// elements, if all transformed interleave groups access the full vector | ||
| /// width (checked via \o VectorRegWidth). This effectively is a very simple | ||
| /// form of loop-aware SLP, where we use interleave groups to identify | ||
| /// candidates. | ||
| static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, | ||
| unsigned VectorRegWidth); | ||
| /// Try to find a single VF among \p Plan's VFs for which all interleave | ||
| /// groups (with VF elements) can be replaced by wide loads and stores | |
|
||
| /// processing VF elements, if all transformed interleave groups access the | ||
| /// full vector width (checked via \o VectorRegWidth). If the transformation | ||
|
||
| /// can be applied, the original \p Plan will be split in 2, if it has | ||
|
||
| /// multiple VFs: a) a new clone which contains all VFs of Plan, except | ||
| /// VFToOptimize, and b) the original Plan with VFToOptimize as single VF. In | ||
| /// that case, the new clone is returned. | ||
| /// | ||
| /// This effectively is a very simple form of loop-aware SLP, where we use | ||
| /// interleave groups to identify candidates. | ||
| static std::unique_ptr<VPlan> | ||
| narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. More important than "narrowing" is the "pivoting" of the vectorization dimension from being loop-based to being SLP-based, thereby eliminating shuffle-de-shuffle redundancies. This can be achieved w/o narrowing, provided support for very-wide load/store recipes or emission of multiple wide load/store recipes instead of emitting only single ones. |
||
|
|
||
| /// Predicate and linearize the control-flow in the only loop region of | ||
| /// \p Plan. If \p FoldTail is true, create a mask guarding the loop | ||
|
|
||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add a scalable vector version of at least one of these tests please? I tested this file with this PR and ran There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I added a RUN line to the scalable test file w/o forced interleaving. I think that should add the missing coverage. Could also add additional tests there. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note that in addition to moving narrowInterleaveGroups from VPlan execution to planning, it also changes relative transform order - being moved from LVP::executePlan() after optimizing for final VF and UF, to be the last transform of buildVPlansWithVPRecipes(), skipping over several transforms in LVP::executePlan().
Perhaps it is worth first hoisting it to appear earlier/earliest in LVP::executePlan(), still operating on the final VPlan but before it is unrolled etc., and then moving it to the end of LVP::executePlan() where it operates on multiple VPlans?