@@ -826,8 +826,8 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
826826 VPValue *Op,
827827 ScalarEvolution &SE) {
828828 VPValue *Incoming, *Mask;
829- if (!match (Op, m_VPInstruction<VPInstruction::ExtractLane>(
830- m_FirstActiveLane ( m_VPValue (Mask)), m_VPValue (Incoming))))
829+ if (!match (Op, m_ExtractLane ( m_FirstActiveLane ( m_VPValue (Mask)),
830+ m_VPValue (Incoming))))
831831 return nullptr ;
832832
833833 auto *WideIV = getOptimizableIVOf (Incoming, SE);
@@ -1295,8 +1295,7 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
12951295 }
12961296
12971297 // Look through ExtractPenultimateElement (BuildVector ....).
1298- if (match (Def, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
1299- m_BuildVector ()))) {
1298+ if (match (Def, m_ExtractPenultimateElement (m_BuildVector ()))) {
13001299 auto *BuildVector = cast<VPInstruction>(Def->getOperand (0 ));
13011300 Def->replaceAllUsesWith (
13021301 BuildVector->getOperand (BuildVector->getNumOperands () - 2 ));
@@ -2106,6 +2105,32 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
21062105 // Set the first operand of RecurSplice to FOR again, after replacing
21072106 // all users.
21082107 RecurSplice->setOperand (0 , FOR);
2108+
2109+ // Check for users extracting at the penultimate active lane of the FOR.
2110+ // If only a single lane is active in the current iteration, we need to
2111+ // select the last element from the previous iteration (from the FOR phi
2112+ // directly).
2113+ for (VPUser *U : RecurSplice->users ()) {
2114+ if (!match (U, m_ExtractLane (m_LastActiveLane (m_VPValue ()),
2115+ m_Specific (RecurSplice))))
2116+ continue ;
2117+
2118+ VPBuilder B (cast<VPInstruction>(U));
2119+ VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand (0 );
2120+ Type *I64Ty = Type::getInt64Ty (Plan.getContext ());
2121+ VPValue *Zero = Plan.getOrAddLiveIn (ConstantInt::get (I64Ty, 0 ));
2122+ VPValue *One = Plan.getOrAddLiveIn (ConstantInt::get (I64Ty, 1 ));
2123+ VPValue *PenultimateIndex =
2124+ B.createNaryOp (Instruction::Sub, {LastActiveLane, One});
2125+ VPValue *PenultimateLastIter =
2126+ B.createNaryOp (VPInstruction::ExtractLane,
2127+ {PenultimateIndex, FOR->getBackedgeValue ()});
2128+ VPValue *LastPrevIter =
2129+ B.createNaryOp (VPInstruction::ExtractLastElement, FOR);
2130+ VPValue *Cmp = B.createICmp (CmpInst::ICMP_EQ, LastActiveLane, Zero);
2131+ VPValue *Sel = B.createSelect (Cmp, LastPrevIter, PenultimateLastIter);
2132+ cast<VPInstruction>(U)->replaceAllUsesWith (Sel);
2133+ }
21092134 }
21102135 return true ;
21112136}
@@ -3492,6 +3517,34 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
34923517 ToRemove.push_back (Expr);
34933518 }
34943519
3520+ // Expand LastActiveLane into Not + FirstActiveLane + Sub.
3521+ auto *LastActiveL = dyn_cast<VPInstruction>(&R);
3522+ if (LastActiveL &&
3523+ LastActiveL->getOpcode () == VPInstruction::LastActiveLane) {
3524+ // Create Not(Mask) for all operands.
3525+ SmallVector<VPValue *, 2 > NotMasks;
3526+ for (VPValue *Op : LastActiveL->operands ()) {
3527+ VPValue *NotMask = Builder.createNot (Op, LastActiveL->getDebugLoc ());
3528+ NotMasks.push_back (NotMask);
3529+ }
3530+
3531+ // Create FirstActiveLane on the inverted masks.
3532+ VPValue *FirstInactiveLane = Builder.createNaryOp (
3533+ VPInstruction::FirstActiveLane, NotMasks,
3534+ LastActiveL->getDebugLoc (), " first.inactive.lane" );
3535+
3536+ // Subtract 1 to get the last active lane.
3537+ VPValue *One = Plan.getOrAddLiveIn (
3538+ ConstantInt::get (Type::getInt64Ty (Plan.getContext ()), 1 ));
3539+ VPValue *LastLane = Builder.createNaryOp (
3540+ Instruction::Sub, {FirstInactiveLane, One},
3541+ LastActiveL->getDebugLoc (), " last.active.lane" );
3542+
3543+ LastActiveL->replaceAllUsesWith (LastLane);
3544+ ToRemove.push_back (LastActiveL);
3545+ continue ;
3546+ }
3547+
34953548 VPValue *VectorStep;
34963549 VPValue *ScalarStep;
34973550 if (!match (&R, m_VPInstruction<VPInstruction::WideIVStep>(
0 commit comments