-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[LoopVectorize] Generate wide active lane masks #147535
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 9 commits
ddf8971
5b43b47
5c7c30d
4f13c3c
2e2a3f6
c810c48
03a5815
d1419db
1277ce9
31f83bc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4214,9 +4214,15 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { | |
| } | ||
| } | ||
| } | ||
| [[fallthrough]]; | ||
| break; | ||
| } | ||
| case VPInstruction::ActiveLaneMask: { | ||
| unsigned Multiplier = | ||
| cast<ConstantInt>(VPI->getOperand(2)->getLiveInIRValue()) | ||
| ->getZExtValue(); | ||
| C += VPI->cost(VF * Multiplier, CostCtx); | ||
|
||
| break; | ||
| } | ||
| case VPInstruction::ActiveLaneMask: | ||
| case VPInstruction::ExplicitVectorLength: | ||
| C += VPI->cost(VF, CostCtx); | ||
| break; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -471,7 +471,6 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { | |
| case Instruction::ICmp: | ||
| case Instruction::FCmp: | ||
| case Instruction::Store: | ||
| case VPInstruction::ActiveLaneMask: | ||
| case VPInstruction::BranchOnCount: | ||
| case VPInstruction::ComputeReductionResult: | ||
| case VPInstruction::FirstOrderRecurrenceSplice: | ||
|
|
@@ -481,6 +480,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { | |
| case VPInstruction::WideIVStep: | ||
| return 2; | ||
| case Instruction::Select: | ||
| case VPInstruction::ActiveLaneMask: | ||
|
||
| case VPInstruction::ComputeAnyOfResult: | ||
| case VPInstruction::ReductionStartVector: | ||
| return 3; | ||
|
|
@@ -620,7 +620,9 @@ Value *VPInstruction::generate(VPTransformState &State) { | |
| Name); | ||
|
|
||
| auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); | ||
| auto *PredTy = VectorType::get(Int1Ty, State.VF); | ||
| auto PredTy = VectorType::get( | ||
| Int1Ty, State.VF * cast<ConstantInt>(getOperand(2)->getLiveInIRValue()) | ||
| ->getZExtValue()); | ||
|
||
| return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, | ||
| {PredTy, ScalarTC->getType()}, | ||
| {VIVElem0, ScalarTC}, nullptr, Name); | ||
|
|
@@ -1091,7 +1093,9 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, | |
| } | ||
| case VPInstruction::ActiveLaneMask: { | ||
| Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0)); | ||
| Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF); | ||
| unsigned Multiplier = | ||
| cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue(); | ||
| Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier); | ||
| IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy, | ||
| {ArgTy, ArgTy}); | ||
| return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -39,6 +39,10 @@ | |
| using namespace llvm; | ||
| using namespace VPlanPatternMatch; | ||
|
|
||
| cl::opt<bool> EnableWideActiveLaneMask( | ||
| "enable-wide-lane-mask", cl::init(false), cl::Hidden, | ||
| cl::desc("Enable use of wide get active lane mask instructions")); | ||
|
|
||
| bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( | ||
| VPlanPtr &Plan, | ||
| function_ref<const InductionDescriptor *(PHINode *)> | ||
|
|
@@ -1467,6 +1471,102 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, | |
| return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C); | ||
| } | ||
|
|
||
| /// Try to replace multiple active lane masks used for control flow with | ||
| /// a single, wide active lane mask instruction followed by multiple | ||
| /// extract subvector intrinsics. This applies to the active lane mask | ||
| /// instructions both in the loop and in the preheader. | ||
| /// Incoming values of all ActiveLaneMaskPHIs are updated to use the | ||
| /// new extracts from the first active lane mask, which has its last | ||
| /// operand (multiplier) set to UF. | ||
| static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, | ||
| unsigned UF) { | ||
| if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1) | ||
| return false; | ||
|
|
||
| VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); | ||
|
||
| VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock(); | ||
| auto *Term = &ExitingVPBB->back(); | ||
|
|
||
| using namespace llvm::VPlanPatternMatch; | ||
| if (!match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask( | ||
| m_VPValue(), m_VPValue(), m_VPValue()))))) | ||
| return false; | ||
|
|
||
| auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry()); | ||
| LLVMContext &Ctx = Plan.getContext(); | ||
|
|
||
| auto ExtractFromALM = [&](VPInstruction *ALM, | ||
| SmallVectorImpl<VPValue *> &Extracts) { | ||
| DebugLoc DL = ALM->getDebugLoc(); | ||
| for (unsigned Part = 0; Part < UF; ++Part) { | ||
| SmallVector<VPValue *> Ops; | ||
| Ops.append({ALM, Plan.getOrAddLiveIn( | ||
| ConstantInt::get(IntegerType::getInt64Ty(Ctx), | ||
| VF.getKnownMinValue() * Part))}); | ||
| auto *Ext = new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops, | ||
| IntegerType::getInt1Ty(Ctx), DL); | ||
| Extracts[Part] = Ext; | ||
| Ext->insertAfter(ALM); | ||
| } | ||
| }; | ||
|
|
||
| // Create a list of each active lane mask phi, ordered by unroll part. | ||
| SmallVector<VPActiveLaneMaskPHIRecipe *> Phis(UF, nullptr); | ||
| for (VPRecipeBase &R : Header->phis()) { | ||
| auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R); | ||
| if (!Phi) | ||
| continue; | ||
| VPValue *Index = nullptr; | ||
| match(Phi->getBackedgeValue(), | ||
| m_ActiveLaneMask(m_VPValue(Index), m_VPValue(), m_VPValue())); | ||
| assert(Index && "Expected index from ActiveLaneMask instruction"); | ||
|
|
||
| auto *II = dyn_cast<VPInstruction>(Index); | ||
| if (II && II->getOpcode() == VPInstruction::CanonicalIVIncrementForPart) { | ||
| auto Part = cast<ConstantInt>(II->getOperand(1)->getLiveInIRValue()); | ||
| Phis[Part->getZExtValue()] = Phi; | ||
| } else | ||
| // Anything other than a CanonicalIVIncrementForPart is part 0 | ||
| Phis[0] = Phi; | ||
| } | ||
|
|
||
| assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) && | ||
| "Expected one VPActiveLaneMaskPHIRecipe for each unroll part"); | ||
|
|
||
| auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue()); | ||
| auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue()); | ||
|
|
||
| assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask && | ||
kmclaughlin-arm marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) && | ||
| "Expected incoming values of Phi to be ActiveLaneMasks"); | ||
|
|
||
| // When using wide lane masks, the return type of the get.active.lane.mask | ||
| // intrinsic is VF x UF (last operand). | ||
| VPValue *ALMMultiplier = | ||
| Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF)); | ||
| EntryALM->setOperand(2, ALMMultiplier); | ||
| LoopALM->setOperand(2, ALMMultiplier); | ||
|
|
||
| // Create UF x extract vectors and insert into preheader. | ||
| SmallVector<VPValue *> EntryExtracts(UF); | ||
| ExtractFromALM(EntryALM, EntryExtracts); | ||
|
|
||
| // Create UF x extract vectors and insert before the loop compare & branch, | ||
| // updating the compare to use the first extract. | ||
| SmallVector<VPValue *> LoopExtracts(UF); | ||
| ExtractFromALM(LoopALM, LoopExtracts); | ||
| VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0)); | ||
| Not->setOperand(0, LoopExtracts[0]); | ||
|
|
||
| // Update the incoming values of active lane mask phis. | ||
| for (unsigned Part = 0; Part < UF; ++Part) { | ||
| Phis[Part]->setStartValue(EntryExtracts[Part]); | ||
| Phis[Part]->setBackedgeValue(LoopExtracts[Part]); | ||
| } | ||
|
|
||
| return true; | ||
| } | ||
|
|
||
| /// Try to simplify the branch condition of \p Plan. This may restrict the | ||
| /// resulting plan to \p BestVF and \p BestUF. | ||
| static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, | ||
|
|
@@ -1478,8 +1578,8 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, | |
| VPValue *Cond; | ||
| ScalarEvolution &SE = *PSE.getSE(); | ||
| if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) || | ||
| match(Term, m_BranchOnCond( | ||
| m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue()))))) { | ||
| match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask( | ||
| m_VPValue(), m_VPValue(), m_VPValue()))))) { | ||
| // Try to simplify the branch condition if TC <= VF * UF when the latch | ||
| // terminator is BranchOnCount or BranchOnCond where the input is | ||
| // Not(ActiveLaneMask). | ||
|
|
@@ -1558,8 +1658,8 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, | |
| assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan"); | ||
| assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan"); | ||
|
|
||
| bool MadeChange = | ||
| simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE); | ||
| bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF); | ||
| MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE); | ||
| MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF); | ||
|
|
||
| if (MadeChange) { | ||
|
|
@@ -2042,9 +2142,11 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( | |
| "index.part.next"); | ||
|
|
||
| // Create the active lane mask instruction in the VPlan preheader. | ||
| auto *EntryALM = | ||
| Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC}, | ||
| DL, "active.lane.mask.entry"); | ||
| VPValue *ALMMultiplier = Plan.getOrAddLiveIn( | ||
| ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1)); | ||
| auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, | ||
| {EntryIncrement, TC, ALMMultiplier}, DL, | ||
| "active.lane.mask.entry"); | ||
|
|
||
| // Now create the ActiveLaneMaskPhi recipe in the main loop using the | ||
| // preheader ActiveLaneMask instruction. | ||
|
|
@@ -2059,8 +2161,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( | |
| Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart, | ||
| {IncrementValue}, {false, false}, DL); | ||
| auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, | ||
| {InLoopIncrement, TripCount}, DL, | ||
| "active.lane.mask.next"); | ||
| {InLoopIncrement, TripCount, ALMMultiplier}, | ||
| DL, "active.lane.mask.next"); | ||
| LaneMaskPhi->addOperand(ALM); | ||
|
|
||
| // Replace the original terminator with BranchOnCond. We have to invert the | ||
|
|
@@ -2139,9 +2241,12 @@ void VPlanTransforms::addActiveLaneMask( | |
| Plan, DataAndControlFlowWithoutRuntimeCheck); | ||
| } else { | ||
| VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV); | ||
| LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask, | ||
| {WideCanonicalIV, Plan.getTripCount()}, nullptr, | ||
| "active.lane.mask"); | ||
| VPValue *ALMMultiplier = Plan.getOrAddLiveIn( | ||
| ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1)); | ||
| LaneMask = | ||
| B.createNaryOp(VPInstruction::ActiveLaneMask, | ||
| {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier}, | ||
| nullptr, "active.lane.mask"); | ||
| } | ||
|
|
||
| // Walk users of WideCanonicalIV and replace the header mask of the form | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm, I don't think this is right because previously when falling through it was hitting the VPInstruction::ExplicitVectorLength case below and adding on the cost of the instruction, i.e.
C += VPI->cost(VF, CostCtx). If you change this to
then it will be the same as before.