4040using  namespace  llvm ; 
4141using  namespace  VPlanPatternMatch ; 
4242
43+ cl::opt<bool > EnableWideActiveLaneMask (
44+     " enable-wide-lane-mask"  , cl::init(false ), cl::Hidden,
45+     cl::desc(" Enable use of wide get active lane mask instructions"  ));
46+ 
4347bool  VPlanTransforms::tryToConvertVPInstructionsToVPRecipes (
4448    VPlanPtr &Plan,
4549    function_ref<const  InductionDescriptor *(PHINode *)>
@@ -1475,6 +1479,102 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
14751479  return  SE.isKnownPredicate (CmpInst::ICMP_EQ, VectorTripCount, C);
14761480}
14771481
1482+ // / Try to replace multiple active lane masks used for control flow with
1483+ // / a single, wide active lane mask instruction followed by multiple
1484+ // / extract subvector intrinsics. This applies to the active lane mask
1485+ // / instructions both in the loop and in the preheader.
1486+ // / Incoming values of all ActiveLaneMaskPHIs are updated to use the
1487+ // / new extracts from the first active lane mask, which has it's last
1488+ // / operand (multiplier) set to UF.
1489+ static  bool  tryToReplaceALMWithWideALM (VPlan &Plan, ElementCount VF,
1490+                                        unsigned  UF) {
1491+   if  (!EnableWideActiveLaneMask || !VF.isVector () || UF == 1 )
1492+     return  false ;
1493+ 
1494+   VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion ();
1495+   VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock ();
1496+   auto  *Term = &ExitingVPBB->back ();
1497+ 
1498+   using  namespace  llvm ::VPlanPatternMatch; 
1499+   if  (!match (Term, m_BranchOnCond (m_Not (m_ActiveLaneMask (
1500+                        m_VPValue (), m_VPValue (), m_VPValue ())))))
1501+     return  false ;
1502+ 
1503+   auto  *Header = cast<VPBasicBlock>(VectorRegion->getEntry ());
1504+   LLVMContext &Ctx = Plan.getContext ();
1505+ 
1506+   auto  ExtractFromALM = [&](VPInstruction *ALM,
1507+                             SmallVectorImpl<VPValue *> &Extracts) {
1508+     DebugLoc DL = ALM->getDebugLoc ();
1509+     for  (unsigned  Part = 0 ; Part < UF; ++Part) {
1510+       SmallVector<VPValue *> Ops;
1511+       Ops.append ({ALM, Plan.getOrAddLiveIn (
1512+                            ConstantInt::get (IntegerType::getInt64Ty (Ctx),
1513+                                             VF.getKnownMinValue () * Part))});
1514+       auto  *Ext = new  VPWidenIntrinsicRecipe (Intrinsic::vector_extract, Ops,
1515+                                              IntegerType::getInt1Ty (Ctx), DL);
1516+       Extracts[Part] = Ext;
1517+       Ext->insertAfter (ALM);
1518+     }
1519+   };
1520+ 
1521+   //  Create a list of each active lane mask phi, ordered by unroll part.
1522+   SmallVector<VPActiveLaneMaskPHIRecipe *> Phis (UF, nullptr );
1523+   for  (VPRecipeBase &R : Header->phis ()) {
1524+     auto  *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R);
1525+     if  (!Phi)
1526+       continue ;
1527+     VPValue *Index = nullptr ;
1528+     match (Phi->getBackedgeValue (),
1529+           m_ActiveLaneMask (m_VPValue (Index), m_VPValue (), m_VPValue ()));
1530+     assert (Index && " Expected index from ActiveLaneMask instruction"  );
1531+ 
1532+     auto  *II = dyn_cast<VPInstruction>(Index);
1533+     if  (II && II->getOpcode () == VPInstruction::CanonicalIVIncrementForPart) {
1534+       auto  Part = cast<ConstantInt>(II->getOperand (1 )->getLiveInIRValue ());
1535+       Phis[Part->getZExtValue ()] = Phi;
1536+     } else 
1537+       //  Anything other than a CanonicalIVIncrementForPart is part 0
1538+       Phis[0 ] = Phi;
1539+   }
1540+ 
1541+   assert (all_of (Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return  Phi; }) &&
1542+          " Expected one VPActiveLaneMaskPHIRecipe for each unroll part"  );
1543+ 
1544+   auto  *EntryALM = cast<VPInstruction>(Phis[0 ]->getStartValue ());
1545+   auto  *LoopALM = cast<VPInstruction>(Phis[0 ]->getBackedgeValue ());
1546+ 
1547+   assert ((EntryALM->getOpcode () == VPInstruction::ActiveLaneMask &&
1548+           LoopALM->getOpcode () == VPInstruction::ActiveLaneMask) &&
1549+          " Expected incoming values of Phi to be ActiveLaneMasks"  );
1550+ 
1551+   //  When using wide lane masks, the return type of the get.active.lane.mask
1552+   //  intrinsic is VF x UF (last operand).
1553+   VPValue *ALMMultiplier =
1554+       Plan.getOrAddLiveIn (ConstantInt::get (IntegerType::getInt64Ty (Ctx), UF));
1555+   EntryALM->setOperand (2 , ALMMultiplier);
1556+   LoopALM->setOperand (2 , ALMMultiplier);
1557+ 
1558+   //  Create UF x extract vectors and insert into preheader.
1559+   SmallVector<VPValue *> EntryExtracts (UF);
1560+   ExtractFromALM (EntryALM, EntryExtracts);
1561+ 
1562+   //  Create UF x extract vectors and insert before the loop compare & branch,
1563+   //  updating the compare to use the first extract.
1564+   SmallVector<VPValue *> LoopExtracts (UF);
1565+   ExtractFromALM (LoopALM, LoopExtracts);
1566+   VPInstruction *Not = cast<VPInstruction>(Term->getOperand (0 ));
1567+   Not->setOperand (0 , LoopExtracts[0 ]);
1568+ 
1569+   //  Update the incoming values of active lane mask phis.
1570+   for  (unsigned  Part = 0 ; Part < UF; ++Part) {
1571+     Phis[Part]->setStartValue (EntryExtracts[Part]);
1572+     Phis[Part]->setBackedgeValue (LoopExtracts[Part]);
1573+   }
1574+ 
1575+   return  true ;
1576+ }
1577+ 
14781578// / Try to simplify the branch condition of \p Plan. This may restrict the
14791579// / resulting plan to \p BestVF and \p BestUF.
14801580static  bool  simplifyBranchConditionForVFAndUF (VPlan &Plan, ElementCount BestVF,
@@ -1486,8 +1586,8 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
14861586  VPValue *Cond;
14871587  ScalarEvolution &SE = *PSE.getSE ();
14881588  if  (match (Term, m_BranchOnCount (m_VPValue (), m_VPValue ())) ||
1489-       match (Term, m_BranchOnCond (
1490-                       m_Not ( m_ActiveLaneMask ( m_VPValue (), m_VPValue ()))))) {
1589+       match (Term, m_BranchOnCond (m_Not ( m_ActiveLaneMask ( 
1590+                       m_VPValue (),  m_VPValue (), m_VPValue ()))))) {
14911591    //  Try to simplify the branch condition if TC <= VF * UF when the latch
14921592    //  terminator is   BranchOnCount or BranchOnCond where the input is
14931593    //  Not(ActiveLaneMask).
@@ -1566,8 +1666,8 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
15661666  assert (Plan.hasVF (BestVF) && " BestVF is not available in Plan"  );
15671667  assert (Plan.hasUF (BestUF) && " BestUF is not available in Plan"  );
15681668
1569-   bool  MadeChange =
1570-        simplifyBranchConditionForVFAndUF (Plan, BestVF, BestUF, PSE);
1669+   bool  MadeChange =  tryToReplaceALMWithWideALM (Plan, BestVF, BestUF); 
1670+   MadeChange |=  simplifyBranchConditionForVFAndUF (Plan, BestVF, BestUF, PSE);
15711671  MadeChange |= optimizeVectorInductionWidthForTCAndVFUF (Plan, BestVF, BestUF);
15721672
15731673  if  (MadeChange) {
@@ -2050,9 +2150,11 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
20502150      " index.part.next"  );
20512151
20522152  //  Create the active lane mask instruction in the VPlan preheader.
2053-   auto  *EntryALM =
2054-       Builder.createNaryOp (VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
2055-                            DL, " active.lane.mask.entry"  );
2153+   VPValue *ALMMultiplier = Plan.getOrAddLiveIn (
2154+       ConstantInt::get (Plan.getCanonicalIV ()->getScalarType (), 1 ));
2155+   auto  *EntryALM = Builder.createNaryOp (VPInstruction::ActiveLaneMask,
2156+                                         {EntryIncrement, TC, ALMMultiplier}, DL,
2157+                                         " active.lane.mask.entry"  );
20562158
20572159  //  Now create the ActiveLaneMaskPhi recipe in the main loop using the
20582160  //  preheader ActiveLaneMask instruction.
@@ -2067,8 +2169,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
20672169      Builder.createOverflowingOp (VPInstruction::CanonicalIVIncrementForPart,
20682170                                  {IncrementValue}, {false , false }, DL);
20692171  auto  *ALM = Builder.createNaryOp (VPInstruction::ActiveLaneMask,
2070-                                    {InLoopIncrement, TripCount}, DL ,
2071-                                    " active.lane.mask.next"  );
2172+                                    {InLoopIncrement, TripCount, ALMMultiplier} ,
2173+                                    DL,  " active.lane.mask.next"  );
20722174  LaneMaskPhi->addOperand (ALM);
20732175
20742176  //  Replace the original terminator with BranchOnCond. We have to invert the
@@ -2144,9 +2246,12 @@ void VPlanTransforms::addActiveLaneMask(
21442246        Plan, DataAndControlFlowWithoutRuntimeCheck);
21452247  } else  {
21462248    VPBuilder B = VPBuilder::getToInsertAfter (WideCanonicalIV);
2147-     LaneMask = B.createNaryOp (VPInstruction::ActiveLaneMask,
2148-                               {WideCanonicalIV, Plan.getTripCount ()}, nullptr ,
2149-                               " active.lane.mask"  );
2249+     VPValue *ALMMultiplier = Plan.getOrAddLiveIn (
2250+         ConstantInt::get (Plan.getCanonicalIV ()->getScalarType (), 1 ));
2251+     LaneMask =
2252+         B.createNaryOp (VPInstruction::ActiveLaneMask,
2253+                        {WideCanonicalIV, Plan.getTripCount (), ALMMultiplier},
2254+                        nullptr , " active.lane.mask"  );
21502255  }
21512256
21522257  //  Walk users of WideCanonicalIV and replace the header mask of the form
0 commit comments