@@ -861,8 +861,8 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
861861 VPValue *Op,
862862 ScalarEvolution &SE) {
863863 VPValue *Incoming, *Mask;
864- if (!match (Op, m_VPInstruction<VPInstruction::ExtractLane>(
865- m_FirstActiveLane ( m_VPValue (Mask)), m_VPValue (Incoming))))
864+ if (!match (Op, m_ExtractLane ( m_FirstActiveLane ( m_VPValue (Mask)),
865+ m_VPValue (Incoming))))
866866 return nullptr ;
867867
868868 auto *WideIV = getOptimizableIVOf (Incoming, SE);
@@ -1362,8 +1362,7 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
13621362 }
13631363
13641364 // Look through ExtractPenultimateElement (BuildVector ....).
1365- if (match (Def, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
1366- m_BuildVector ()))) {
1365+ if (match (Def, m_ExtractPenultimateElement (m_BuildVector ()))) {
13671366 auto *BuildVector = cast<VPInstruction>(Def->getOperand (0 ));
13681367 Def->replaceAllUsesWith (
13691368 BuildVector->getOperand (BuildVector->getNumOperands () - 2 ));
@@ -2175,6 +2174,32 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
21752174 // Set the first operand of RecurSplice to FOR again, after replacing
21762175 // all users.
21772176 RecurSplice->setOperand (0 , FOR);
2177+
2178+ // Check for users extracting at the penultimate active lane of the FOR.
2179+ // If only a single lane is active in the current iteration, we need to
2180+ // select the last element from the previous iteration (from the FOR phi
2181+ // directly).
2182+ for (VPUser *U : RecurSplice->users ()) {
2183+ if (!match (U, m_ExtractLane (m_LastActiveLane (m_VPValue ()),
2184+ m_Specific (RecurSplice))))
2185+ continue ;
2186+
2187+ VPBuilder B (cast<VPInstruction>(U));
2188+ VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand (0 );
2189+ Type *I64Ty = Type::getInt64Ty (Plan.getContext ());
2190+ VPValue *Zero = Plan.getOrAddLiveIn (ConstantInt::get (I64Ty, 0 ));
2191+ VPValue *One = Plan.getOrAddLiveIn (ConstantInt::get (I64Ty, 1 ));
2192+ VPValue *PenultimateIndex =
2193+ B.createNaryOp (Instruction::Sub, {LastActiveLane, One});
2194+ VPValue *PenultimateLastIter =
2195+ B.createNaryOp (VPInstruction::ExtractLane,
2196+ {PenultimateIndex, FOR->getBackedgeValue ()});
2197+ VPValue *LastPrevIter =
2198+ B.createNaryOp (VPInstruction::ExtractLastElement, FOR);
2199+ VPValue *Cmp = B.createICmp (CmpInst::ICMP_EQ, LastActiveLane, Zero);
2200+ VPValue *Sel = B.createSelect (Cmp, LastPrevIter, PenultimateLastIter);
2201+ cast<VPInstruction>(U)->replaceAllUsesWith (Sel);
2202+ }
21782203 }
21792204 return true ;
21802205}
@@ -3563,6 +3588,34 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
35633588 ToRemove.push_back (Expr);
35643589 }
35653590
3591+ // Expand LastActiveLane into Not + FirstActiveLane + Sub.
3592+ auto *LastActiveL = dyn_cast<VPInstruction>(&R);
3593+ if (LastActiveL &&
3594+ LastActiveL->getOpcode () == VPInstruction::LastActiveLane) {
3595+ // Create Not(Mask) for all operands.
3596+ SmallVector<VPValue *, 2 > NotMasks;
3597+ for (VPValue *Op : LastActiveL->operands ()) {
3598+ VPValue *NotMask = Builder.createNot (Op, LastActiveL->getDebugLoc ());
3599+ NotMasks.push_back (NotMask);
3600+ }
3601+
3602+ // Create FirstActiveLane on the inverted masks.
3603+ VPValue *FirstInactiveLane = Builder.createNaryOp (
3604+ VPInstruction::FirstActiveLane, NotMasks,
3605+ LastActiveL->getDebugLoc (), " first.inactive.lane" );
3606+
3607+ // Subtract 1 to get the last active lane.
3608+ VPValue *One = Plan.getOrAddLiveIn (
3609+ ConstantInt::get (Type::getInt64Ty (Plan.getContext ()), 1 ));
3610+ VPValue *LastLane = Builder.createNaryOp (
3611+ Instruction::Sub, {FirstInactiveLane, One},
3612+ LastActiveL->getDebugLoc (), " last.active.lane" );
3613+
3614+ LastActiveL->replaceAllUsesWith (LastLane);
3615+ ToRemove.push_back (LastActiveL);
3616+ continue ;
3617+ }
3618+
35663619 VPValue *VectorStep;
35673620 VPValue *ScalarStep;
35683621 if (!match (&R, m_VPInstruction<VPInstruction::WideIVStep>(
0 commit comments