@@ -4878,6 +4878,16 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
48784878 }
48794879}
48804880
4881+ // / Get the VF scaling factor applied to the recipe's output, if the recipe has
4882+ // / one.
4883+ static unsigned getVFScaleFactor (VPRecipeBase *R) {
4884+ if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
4885+ return RR->getVFScaleFactor ();
4886+ if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
4887+ return RR->getVFScaleFactor ();
4888+ return 1 ;
4889+ }
4890+
48814891// / Estimate the register usage for \p Plan and vectorization factors in \p VFs
48824892// / by calculating the highest number of values that are live at a single
48834893// / location as a rough estimate. Returns the register usage for each VF in \p
@@ -5032,10 +5042,19 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
50325042 // even in the scalar case.
50335043 RegUsage[ClassID] += 1 ;
50345044 } else {
5045+ // The output from scaled phis and scaled reductions actually has
5046+ // fewer lanes than the VF.
5047+ unsigned ScaleFactor = getVFScaleFactor (R);
5048+ ElementCount VF = VFs[J].divideCoefficientBy (ScaleFactor);
5049+ LLVM_DEBUG (if (VF != VFs[J]) {
5050+ dbgs () << " LV(REG): Scaled down VF from " << VFs[J] << " to " << VF
5051+ << " for " << *R << " \n " ;
5052+ });
5053+
50355054 for (VPValue *DefV : R->definedValues ()) {
50365055 Type *ScalarTy = TypeInfo.inferScalarType (DefV);
50375056 unsigned ClassID = TTI.getRegisterClassForType (true , ScalarTy);
5038- RegUsage[ClassID] += GetRegUsage (ScalarTy, VFs[J] );
5057+ RegUsage[ClassID] += GetRegUsage (ScalarTy, VF );
50395058 }
50405059 }
50415060 }
@@ -9141,8 +9160,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
91419160 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
91429161 return tryToWidenMemory (Instr, Operands, Range);
91439162
9144- if (getScalingForReduction (Instr))
9145- return tryToCreatePartialReduction (Instr, Operands);
9163+ if (std::optional< unsigned > ScaleFactor = getScalingForReduction (Instr))
9164+ return tryToCreatePartialReduction (Instr, Operands, ScaleFactor. value () );
91469165
91479166 if (!shouldWiden (Instr, Range))
91489167 return nullptr ;
@@ -9166,7 +9185,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
91669185
91679186VPRecipeBase *
91689187VPRecipeBuilder::tryToCreatePartialReduction (Instruction *Reduction,
9169- ArrayRef<VPValue *> Operands) {
9188+ ArrayRef<VPValue *> Operands,
9189+ unsigned ScaleFactor) {
91709190 assert (Operands.size () == 2 &&
91719191 " Unexpected number of operands for partial reduction" );
91729192
@@ -9199,7 +9219,7 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
91999219 BinOp = Builder.createSelect (Mask, BinOp, Zero, Reduction->getDebugLoc ());
92009220 }
92019221 return new VPPartialReductionRecipe (ReductionOpcode, BinOp, Accumulator,
9202- Reduction);
9222+ ScaleFactor, Reduction);
92039223}
92049224
92059225void LoopVectorizationPlanner::buildVPlansWithVPRecipes (ElementCount MinVF,
0 commit comments