[LV] Use ExtractLane(LastActiveLane, V) live outs when tail-folding. #149042
Changes from 3 commits
```diff
@@ -44,11 +44,6 @@ class VPPredicator {
   /// possibly inserting new recipes at \p Dst (using Builder's insertion point)
   VPValue *createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst);
 
-  /// Returns the *entry* mask for \p VPBB.
-  VPValue *getBlockInMask(VPBasicBlock *VPBB) const {
-    return BlockMaskCache.lookup(VPBB);
-  }
-
   /// Record \p Mask as the *entry* mask of \p VPBB, which is expected to not
   /// already have a mask.
   void setBlockInMask(VPBasicBlock *VPBB, VPValue *Mask) {
@@ -68,6 +63,11 @@ class VPPredicator {
   }
 
 public:
+  /// Returns the *entry* mask for \p VPBB.
+  VPValue *getBlockInMask(VPBasicBlock *VPBB) const {
+    return BlockMaskCache.lookup(VPBB);
+  }
+
   /// Returns the precomputed predicate of the edge from \p Src to \p Dst.
   VPValue *getEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst) const {
     return EdgeMaskCache.lookup({Src, Dst});
```
```diff
@@ -300,5 +300,39 @@ VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) {
     PrevVPBB = VPBB;
   }
+
+  // If we folded the tail and introduced a header mask, any extract of the
+  // last element must be updated to extract from the last active lane of the
+  // header mask instead (i.e., the lane corresponding to the last active
+  // iteration).
+  if (FoldTail) {
+    assert(Plan.getExitBlocks().size() == 1 &&
+           "only a single-exit block is supported currently");
+    VPBasicBlock *EB = Plan.getExitBlocks().front();
+    assert(EB->getSinglePredecessor() == Plan.getMiddleBlock() &&
+           "the exit block must have middle block as single predecessor");
+
+    VPValue *LastActiveLane = nullptr;
+    VPBuilder B(Plan.getMiddleBlock()->getTerminator());
+    for (auto &P : EB->phis()) {
+      auto *ExitIRI = cast<VPIRPhi>(&P);
+      VPValue *Inc = ExitIRI->getIncomingValue(0);
+      VPValue *Op;
+      if (!match(Inc, m_VPInstruction<VPInstruction::ExtractLastElement>(
+                          m_VPValue(Op))))
+        continue;
+
+      if (!LastActiveLane) {
+        // Compute the index of the last active lane.
+        VPValue *HeaderMask = Predicator.getBlockInMask(
+            Plan.getVectorLoopRegion()->getEntryBasicBlock());
+        LastActiveLane =
+            B.createNaryOp(VPInstruction::LastActiveLane, {HeaderMask});
```

Review comment (suggested change on the last line above):

```diff
-            B.createNaryOp(VPInstruction::LastActiveLane, {HeaderMask});
+            B.createNaryOp(VPInstruction::LastActiveLane, HeaderMask);
```

Author: Yep, updated thanks
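To make the intent of this hunk concrete, here is a schematic of the rewrite in the middle block, written with the debug mnemonics this patch prints ("last-active-lane", plus the existing "extract-last-element" and "extract-lane"); the value names are placeholders, not actual VPlan output:

```cpp
// Before the rewrite: the live-out unconditionally reads the last lane.
//   %res = extract-last-element %v
//
// After the rewrite: index the lane of the last active iteration instead.
//   %lal = last-active-lane %header-mask   // computed once, shared by all
//   %res = extract-lane %lal, %v           // matching exit phis
```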
```diff
@@ -513,6 +513,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
   case VPInstruction::ExtractLastElement:
   case VPInstruction::ExtractPenultimateElement:
   case VPInstruction::FirstActiveLane:
+  case VPInstruction::LastActiveLane:
   case VPInstruction::Not:
     return 1;
   case Instruction::ICmp:
@@ -638,7 +639,12 @@ Value *VPInstruction::generate(VPTransformState &State) {
     llvm_unreachable("should be handled by VPPhi::execute");
   }
   case Instruction::Select: {
-    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
+    bool OnlyFirstLaneUsed =
+        vputils::onlyFirstLaneUsed(this) ||
+        (isa<VPInstruction>(getOperand(1)) &&
+         cast<VPInstruction>(getOperand(1))->isVectorToScalar() &&
+         isa<VPInstruction>(getOperand(2)) &&
+         cast<VPInstruction>(getOperand(2))->isVectorToScalar());
     Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed);
     Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
     Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
@@ -1117,6 +1123,29 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
                                   {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
     return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
   }
+  case VPInstruction::LastActiveLane: {
+    Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
+    if (VF.isScalar())
+      return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
+                                        CmpInst::makeCmpResultType(ScalarTy),
+                                        CmpInst::ICMP_EQ, Ctx.CostKind);
+    // Calculate the cost of determining the lane index: NOT + cttz_elts + SUB.
+    auto *PredTy = toVectorTy(ScalarTy, VF);
+    IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
+                                  Type::getInt64Ty(Ctx.LLVMCtx),
+                                  {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
+    InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
+    // Add cost of NOT operation on the predicate.
+    Cost += Ctx.TTI.getArithmeticInstrCost(
+        Instruction::Xor, PredTy, Ctx.CostKind,
+        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+        {TargetTransformInfo::OK_UniformConstantValue,
+         TargetTransformInfo::OP_None});
+    // Add cost of SUB operation on the index.
+    Cost += Ctx.TTI.getArithmeticInstrCost(
+        Instruction::Sub, Type::getInt64Ty(Ctx.LLVMCtx), Ctx.CostKind);
+    return Cost;
+  }
   case VPInstruction::FirstOrderRecurrenceSplice: {
     assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
     SmallVector<int> Mask(VF.getKnownMinValue());
@@ -1170,6 +1199,7 @@ bool VPInstruction::isVectorToScalar() const {
          getOpcode() == Instruction::ExtractElement ||
          getOpcode() == VPInstruction::ExtractLane ||
          getOpcode() == VPInstruction::FirstActiveLane ||
+         getOpcode() == VPInstruction::LastActiveLane ||
          getOpcode() == VPInstruction::ComputeAnyOfResult ||
          getOpcode() == VPInstruction::ComputeFindIVResult ||
          getOpcode() == VPInstruction::ComputeReductionResult ||
@@ -1232,6 +1262,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
   case VPInstruction::ExtractPenultimateElement:
   case VPInstruction::ActiveLaneMask:
   case VPInstruction::FirstActiveLane:
+  case VPInstruction::LastActiveLane:
   case VPInstruction::FirstOrderRecurrenceSplice:
   case VPInstruction::LogicalAnd:
   case VPInstruction::Not:
@@ -1403,6 +1434,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::FirstActiveLane:
     O << "first-active-lane";
     break;
+  case VPInstruction::LastActiveLane:
+    O << "last-active-lane";
+    break;
   case VPInstruction::ReductionStartVector:
     O << "reduction-start-vector";
     break;
```
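One note on the Select change in generate() above: when both select arms are vector-to-scalar recipes, the select itself produces a scalar, so every operand, including the condition, can be consumed as a first-lane value. A schematic of the case this enables (placeholder names, my reading of the hunk, not code from the patch):

```cpp
// Both arms are vector-to-scalar, so the select is scalar and the
// condition is fetched as a single (first-lane) value:
//   %a = extract-lane %idx, %vec       // vector-to-scalar
//   %b = extract-last-element %for     // vector-to-scalar
//   %r = select %cond, %a, %b          // needs only lane 0 of %cond
```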
```diff
@@ -1950,6 +1950,34 @@ VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
     // Set the first operand of RecurSplice to FOR again, after replacing
     // all users.
     RecurSplice->setOperand(0, FOR);
+
+    // Check for users extracting the second-to-last active lane of the FOR. If
+    // only a single lane is active in the current iteration, we need to select
+    // the last element of the value from the previous iteration, directly from
+    // the FOR phi.
+    for (VPUser *U : RecurSplice->users()) {
+      if (!match(U, m_VPInstruction<VPInstruction::ExtractLane>(
+                        m_VPInstruction<VPInstruction::LastActiveLane>(
+                            m_VPValue()),
+                        m_Specific(RecurSplice))))
+        continue;
+
+      VPBuilder B(cast<VPInstruction>(U));
+      VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand(0);
+      Type *I64Ty = Type::getInt64Ty(Plan.getContext());
+      VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 0));
+      VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 1));
+      VPValue *PenultimateIndex =
+          B.createNaryOp(Instruction::Sub, {LastActiveLane, One});
+      VPValue *PenultimateLastIter =
+          B.createNaryOp(VPInstruction::ExtractLane,
+                         {PenultimateIndex, FOR->getBackedgeValue()});
+      VPValue *LastPrevIter =
+          B.createNaryOp(VPInstruction::ExtractLastElement, {FOR});
```

Review comment (suggested change on the last line above):

```diff
-          B.createNaryOp(VPInstruction::ExtractLastElement, {FOR});
+          B.createNaryOp(VPInstruction::ExtractLastElement, FOR);
```

Author: done thanks
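A worked example may help here (my reading of the hunk, which is truncated in this view; the Zero live-in created above suggests the remainder compares last-active-lane against 0 to select between the two extracts):

```cpp
// Assume VF = 4; FOR holds the previous iteration's vector and
// FOR->getBackedgeValue() the current one.
//
// Case 1: three lanes active in the final iteration.
//   last-active-lane = 2, penultimate index = 2 - 1 = 1, so
//   extract-lane(1, backedge-value) yields the wanted element.
//
// Case 2: a single lane active in the final iteration.
//   last-active-lane = 0 and the Sub would underflow; the wanted element
//   is the last element of the *previous* iteration's vector, i.e.
//   extract-last-element(FOR), hence the select between the two values.
```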
Reviewer: I think we have to be careful about this transformation because it's making the assumption that there are no holes in the mask. For example, you'd presumably get something unexpected if you transform LastActiveLane(11100111000) -> FirstActiveLane(00011000111).

It might require adding documentation to the opcodes saying that holes are not permitted when using the LastActiveLane opcode. I don't know if there is a way to add an assert here that the mask doesn't contain holes? Presumably it should only be used when the mask has originally come from get.active.lane.mask?
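To see the failure mode concretely, here is a small scalar model of the NOT + cttz_elts + SUB sequence that the cost hunk above prices (an editorial sketch, not code from the patch; helper names are ours, lane 0 is the least significant bit, and VF < 64 is assumed):

```cpp
#include <cassert>
#include <cstdint>

// Scalar stand-in for FirstActiveLane: index of the first set lane,
// or VF if no lane is set.
static unsigned firstActiveLane(uint64_t Mask, unsigned VF) {
  for (unsigned I = 0; I != VF; ++I)
    if (Mask & (1ULL << I))
      return I;
  return VF;
}

// LastActiveLane lowered as FirstActiveLane(NOT Mask) - 1. This is only
// correct for prefix masks, where active lanes are contiguous from lane 0.
static unsigned lastActiveLaneViaNot(uint64_t Mask, unsigned VF) {
  uint64_t NotMask = ~Mask & ((1ULL << VF) - 1);
  return firstActiveLane(NotMask, VF) - 1;
}

int main() {
  // Prefix mask: lanes {0,1,2} of VF=8 active -> last active lane is 2.
  assert(lastActiveLaneViaNot(0b00000111, 8) == 2);
  // Mask with a hole: lanes {0,1,4} active. The true last active lane
  // is 4, but the lowering returns firstActiveLane(~mask) - 1 = 1.
  assert(lastActiveLaneViaNot(0b00010011, 8) == 1);
}
```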
Author: Yep, for now the operands will always be header masks (or EVL-based masks), so we can verify that (added to VPlanVerifier). Updated and extended the comment for the opcode, thanks
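For reference, a hypothetical sketch of such a check, as a fragment inside a per-recipe verifier loop (assuming `vputils::isHeaderMask` is the predicate used; the patch's actual check may differ):

```cpp
// Hypothetical fragment: LastActiveLane must take the header mask (or an
// EVL-based mask), which is guaranteed to have no holes.
if (auto *VPI = dyn_cast<VPInstruction>(&R);
    VPI && VPI->getOpcode() == VPInstruction::LastActiveLane) {
  if (!vputils::isHeaderMask(VPI->getOperand(0), Plan)) {
    errs() << "LastActiveLane operand is not a header mask\n";
    return false;
  }
}
```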
Reviewer: So I've added a similar opcode (ExtractLastActive) as part of https://github.com/llvm/llvm-project/pull/158088/files#diff-dabd8bf9956285f5ddab712af4e0494647a106234f07283b0bd9f8b4d9b887ca, and that one does expect that there might be holes. Should we keep the opcodes separate, or express one in terms of the other and enforce the 'contiguousness' of active lanes directly for your code?
Author: Having the LastActiveLane check separate has the potential advantage that it could expose more CSE/simplification opportunities. The restriction to prefix masks at the moment is only there due to how it gets lowered using FirstActiveLane. I think we can just remove the restriction once we have a use case for non-prefix masks and add dedicated lowering that supports them.
Reviewer: I really can't think of a case where we'd end up using LastActiveLane with anything but the header mask. Would it be simpler to make LastActiveLane a nullary op and just fetch the header mask with findHeaderMask when converting to a concrete recipe? That way we don't have to worry about the non-contiguous invariant.
Author: Unfortunately I don't think that's possible at the moment: at the point when we call convertToConcreteRecipes, the region may have been removed (or, if the LastActiveLane were the only user of the header mask, the mask itself may have been removed), so I've left it as-is for now.
Reviewer: I'm also realising now that we need to model the use of the header mask to prevent removeDeadRecipes from stripping it if, e.g., it's EVL tail-folded and all other uses of it are removed.
Reviewer: Does the PR handle the IV live-out user as well?
Author: This patch still leaves inductions untouched for now.