@@ -683,7 +683,8 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) {
683683 // scalars.
684684 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
685685 if (HasOnlyVectorVFs && none_of (WideIV->users (), [WideIV](VPUser *U) {
686- return U->usesScalars (WideIV);
686+ return U->usesScalars (WideIV) ||
687+ match (U, m_ExtractLastElement (m_VPValue ()));
687688 }))
688689 continue ;
689690
@@ -694,12 +695,19 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) {
694695 WideIV->getTruncInst (), WideIV->getStartValue (), WideIV->getStepValue (),
695696 WideIV->getDebugLoc (), Builder);
696697
698+ bool HasWideOps = any_of (WideIV->users (), [WideIV](VPUser *U) {
699+ return !U->usesScalars (WideIV) &&
700+ !match (U, m_ExtractLastElement (m_VPValue ()));
701+ });
702+
697703 // Update scalar users of IV to use Step instead.
698704 if (!HasOnlyVectorVFs)
699705 WideIV->replaceAllUsesWith (Steps);
700706 else
701- WideIV->replaceUsesWithIf (Steps, [WideIV](VPUser &U, unsigned ) {
702- return U.usesScalars (WideIV);
707+ WideIV->replaceUsesWithIf (Steps, [WideIV, HasWideOps](VPUser &U,
708+ unsigned ) {
709+ return U.usesScalars (WideIV) ||
710+ (!HasWideOps && match (&U, m_ExtractLastElement (m_VPValue ())));
703711 });
704712 }
705713}
@@ -1209,6 +1217,20 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
12091217 continue ;
12101218
12111219 auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
1220+
1221+ if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr ()) &&
1222+ vputils::isSingleScalar (RepR->getOperand (1 ))) {
1223+ auto *Clone = new VPReplicateRecipe (
1224+ RepOrWidenR->getUnderlyingInstr (), RepOrWidenR->operands (),
1225+ true /* IsSingleScalar*/ , nullptr /* Mask*/ , *RepR /* Metadata*/ );
1226+ Clone->insertBefore (RepOrWidenR);
1227+ auto *Ext = new VPInstruction (VPInstruction::ExtractLastElement,
1228+ {Clone->getOperand (0 )});
1229+ Ext->insertBefore (Clone);
1230+ Clone->setOperand (0 , Ext);
1231+ RepR->eraseFromParent ();
1232+ continue ;
1233+ }
12121234 // Skip recipes that aren't single scalars or don't have only their
12131235 // scalar results used. In the latter case, we would introduce extra
12141236 // broadcasts.
0 commit comments