 using namespace llvm;
 using namespace VPlanPatternMatch;
 
+cl::opt<bool> EnableWideActiveLaneMask(
+    "enable-wide-lane-mask", cl::init(false), cl::Hidden,
+    cl::desc("Enable use of wide get active lane mask instructions"));
+
 bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
     VPlanPtr &Plan,
     function_ref<const InductionDescriptor *(PHINode *)>
@@ -1475,6 +1479,102 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
   return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
 }
 
+/// Try to replace multiple active lane masks used for control flow with
+/// a single, wide active lane mask instruction followed by multiple
+/// extract subvector intrinsics. This applies to the active lane mask
+/// instructions both in the loop and in the preheader.
+/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
+/// new extracts from the first active lane mask, which has its last
+/// operand (multiplier) set to UF.
+static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF,
+                                       unsigned UF) {
+  if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
+    return false;
+
+  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
+  VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
+  auto *Term = &ExitingVPBB->back();
+
+  using namespace llvm::VPlanPatternMatch;
+  if (!match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
+                       m_VPValue(), m_VPValue(), m_VPValue())))))
+    return false;
+
+  auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
+  LLVMContext &Ctx = Plan.getContext();
+
+  auto ExtractFromALM = [&](VPInstruction *ALM,
+                            SmallVectorImpl<VPValue *> &Extracts) {
+    DebugLoc DL = ALM->getDebugLoc();
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      SmallVector<VPValue *> Ops;
+      Ops.append({ALM, Plan.getOrAddLiveIn(
+                           ConstantInt::get(IntegerType::getInt64Ty(Ctx),
+                                            VF.getKnownMinValue() * Part))});
+      auto *Ext = new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
+                                             IntegerType::getInt1Ty(Ctx), DL);
+      Extracts[Part] = Ext;
+      Ext->insertAfter(ALM);
+    }
+  };
+
+  // Create a list of each active lane mask phi, ordered by unroll part.
+  SmallVector<VPActiveLaneMaskPHIRecipe *> Phis(UF, nullptr);
+  for (VPRecipeBase &R : Header->phis()) {
+    auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R);
+    if (!Phi)
+      continue;
+    VPValue *Index = nullptr;
+    match(Phi->getBackedgeValue(),
+          m_ActiveLaneMask(m_VPValue(Index), m_VPValue(), m_VPValue()));
+    assert(Index && "Expected index from ActiveLaneMask instruction");
+
+    auto *II = dyn_cast<VPInstruction>(Index);
+    if (II && II->getOpcode() == VPInstruction::CanonicalIVIncrementForPart) {
+      auto Part = cast<ConstantInt>(II->getOperand(1)->getLiveInIRValue());
+      Phis[Part->getZExtValue()] = Phi;
+    } else
+      // Anything other than a CanonicalIVIncrementForPart is part 0.
+      Phis[0] = Phi;
+  }
+
+  assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
+         "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
+
+  auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
+  auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
+
+  assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
+          LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
+         "Expected incoming values of Phi to be ActiveLaneMasks");
+
+  // When using wide lane masks, the return type of the get.active.lane.mask
+  // intrinsic is VF x UF (last operand).
+  VPValue *ALMMultiplier =
+      Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF));
+  EntryALM->setOperand(2, ALMMultiplier);
+  LoopALM->setOperand(2, ALMMultiplier);
+
+  // Create UF x extract vectors and insert into preheader.
+  SmallVector<VPValue *> EntryExtracts(UF);
+  ExtractFromALM(EntryALM, EntryExtracts);
+
+  // Create UF x extract vectors and insert before the loop compare & branch,
+  // updating the compare to use the first extract.
+  SmallVector<VPValue *> LoopExtracts(UF);
+  ExtractFromALM(LoopALM, LoopExtracts);
+  VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
+  Not->setOperand(0, LoopExtracts[0]);
+
+  // Update the incoming values of active lane mask phis.
+  for (unsigned Part = 0; Part < UF; ++Part) {
+    Phis[Part]->setStartValue(EntryExtracts[Part]);
+    Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
+  }
+
+  return true;
+}
+
 /// Try to simplify the branch condition of \p Plan. This may restrict the
 /// resulting plan to \p BestVF and \p BestUF.
 static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
@@ -1486,8 +1586,8 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
   VPValue *Cond;
   ScalarEvolution &SE = *PSE.getSE();
   if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
-      match(Term, m_BranchOnCond(
-                      m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue()))))) {
+      match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
+                      m_VPValue(), m_VPValue(), m_VPValue()))))) {
     // Try to simplify the branch condition if TC <= VF * UF when the latch
     // terminator is BranchOnCount or BranchOnCond where the input is
     // Not(ActiveLaneMask).
@@ -1566,8 +1666,8 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
   assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
   assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
 
-  bool MadeChange =
-      simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
+  bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
+  MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
   MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
 
   if (MadeChange) {
@@ -2050,9 +2150,11 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
       "index.part.next");
 
   // Create the active lane mask instruction in the VPlan preheader.
-  auto *EntryALM =
-      Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
-                           DL, "active.lane.mask.entry");
+  VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
+      ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
+  auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
+                                        {EntryIncrement, TC, ALMMultiplier}, DL,
+                                        "active.lane.mask.entry");
 
   // Now create the ActiveLaneMaskPhi recipe in the main loop using the
   // preheader ActiveLaneMask instruction.
@@ -2067,8 +2169,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
       Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
                                   {IncrementValue}, {false, false}, DL);
   auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
-                                   {InLoopIncrement, TripCount}, DL,
-                                   "active.lane.mask.next");
+                                   {InLoopIncrement, TripCount, ALMMultiplier},
+                                   DL, "active.lane.mask.next");
   LaneMaskPhi->addOperand(ALM);
 
   // Replace the original terminator with BranchOnCond. We have to invert the
@@ -2144,9 +2246,12 @@ void VPlanTransforms::addActiveLaneMask(
         Plan, DataAndControlFlowWithoutRuntimeCheck);
   } else {
     VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
-    LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask,
-                              {WideCanonicalIV, Plan.getTripCount()}, nullptr,
-                              "active.lane.mask");
+    VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
+        ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
+    LaneMask =
+        B.createNaryOp(VPInstruction::ActiveLaneMask,
+                       {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
+                       nullptr, "active.lane.mask");
   }
 
   // Walk users of WideCanonicalIV and replace the header mask of the form
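Note (not part of the patch): when the new flag is enabled (a cl::opt, so e.g. -mllvm -enable-wide-lane-mask when driving LLVM through clang), tryToReplaceALMWithWideALM folds the UF per-part lane masks into one wide mask plus per-part subvector extracts. A rough sketch of the IR shape this aims for, assuming VF = vscale x 4 and UF = 2; value names are illustrative, not taken from the patch:

  ; Before: one get.active.lane.mask per unrolled part
  %alm.part0 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index, i64 %tc)
  %alm.part1 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.part.next, i64 %tc)

  ; After: a single mask covering VF * UF lanes, split back up with vector.extract
  %wide.alm  = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %index, i64 %tc)
  %alm.part0 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv8i1(<vscale x 8 x i1> %wide.alm, i64 0)
  %alm.part1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv8i1(<vscale x 8 x i1> %wide.alm, i64 4)

The extract offsets correspond to VF.getKnownMinValue() * Part (0 and 4 here), matching the constants built in ExtractFromALM. The wide mask carries the new multiplier operand set to UF, while the existing non-wide code paths now pass an explicit multiplier of 1.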