@@ -1371,6 +1371,18 @@ class BoUpSLP {
     return MinBWs.at(VectorizableTree.front().get()).second;
   }
 
+  /// Returns the reduction bitwidth and signedness, if they do not match the
+  /// original requested size.
+  std::optional<std::pair<unsigned, bool>> getReductionBitWidthAndSign() const {
+    if (ReductionBitWidth == 0 ||
+        ReductionBitWidth ==
+            DL->getTypeSizeInBits(
+                VectorizableTree.front()->Scalars.front()->getType()))
+      return std::nullopt;
+    return std::make_pair(ReductionBitWidth,
+                          MinBWs.at(VectorizableTree.front().get()).second);
+  }
+
   /// Builds external uses of the vectorized scalars, i.e. the list of
   /// vectorized scalars to be extracted, their lanes and their scalar users. \p
   /// ExternallyUsedValues contains additional list of external uses to handle
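
The new accessor deliberately returns std::nullopt when ReductionBitWidth is unset or already equals the scalar type's width, so callers can tell "reduce at full width" apart from "reduce at a narrowed width". A minimal standalone sketch of that contract (a hypothetical stand-in for illustration, not part of the patch):

```cpp
#include <cstdio>
#include <optional>
#include <utility>

// Mirrors the shape of BoUpSLP::getReductionBitWidthAndSign():
// std::nullopt means the reduction already runs at the scalar type's width.
static std::optional<std::pair<unsigned, bool>>
reductionBitWidthAndSign(unsigned ReductionBitWidth, unsigned ScalarBits,
                         bool IsSigned) {
  if (ReductionBitWidth == 0 || ReductionBitWidth == ScalarBits)
    return std::nullopt;
  return std::make_pair(ReductionBitWidth, IsSigned);
}

int main() {
  // Narrowed from i32 down to 1 bit: the (width, sign) pair is returned.
  if (auto BW = reductionBitWidthAndSign(1, 32, false))
    std::printf("narrowed to i%u, signed=%d\n", BW->first, (int)BW->second);
  // Width matches the scalar type: callers keep the generic path.
  if (!reductionBitWidthAndSign(32, 32, false))
    std::printf("no narrowing\n");
}
```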
@@ -17887,24 +17899,37 @@ void BoUpSLP::computeMinimumValueSizes() {
   // Add reduction ops sizes, if any.
   if (UserIgnoreList &&
       isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
-    for (Value *V : *UserIgnoreList) {
-      auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
-      auto NumTypeBits = DL->getTypeSizeInBits(V->getType());
-      unsigned BitWidth1 = NumTypeBits - NumSignBits;
-      if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
-        ++BitWidth1;
-      unsigned BitWidth2 = BitWidth1;
-      if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
-        auto Mask = DB->getDemandedBits(cast<Instruction>(V));
-        BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
+    // Convert vector_reduce_add(ZExt(<n x i1>)) to
+    // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
+    if (all_of(*UserIgnoreList,
+               [](Value *V) {
+                 return cast<Instruction>(V)->getOpcode() == Instruction::Add;
+               }) &&
+        VectorizableTree.front()->State == TreeEntry::Vectorize &&
+        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
+        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
+            Builder.getInt1Ty()) {
+      ReductionBitWidth = 1;
+    } else {
+      for (Value *V : *UserIgnoreList) {
+        unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
+        TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
+        unsigned BitWidth1 = NumTypeBits - NumSignBits;
+        if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
+          ++BitWidth1;
+        unsigned BitWidth2 = BitWidth1;
+        if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
+          APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
+          BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
+        }
+        ReductionBitWidth =
+            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
       }
-      ReductionBitWidth =
-          std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
-    }
-    if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
-      ReductionBitWidth = 8;
+      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
+        ReductionBitWidth = 8;
 
-    ReductionBitWidth = bit_ceil(ReductionBitWidth);
+      ReductionBitWidth = bit_ceil(ReductionBitWidth);
+    }
   }
   bool IsTopRoot = NodeIdx == 0;
   while (NodeIdx < VectorizableTree.size() &&
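
The new early case sets ReductionBitWidth to 1 when every reduction operation is an integer add and the vectorized root zero-extends from i1: add-reducing a zero-extended mask is exactly a population count of the packed bits. A standalone check of that identity (illustrative only, independent of LLVM):

```cpp
#include <bitset>
#include <cassert>
#include <cstdint>

int main() {
  // Eight i1 lanes, as produced by a vectorized <8 x i1> comparison.
  const bool Lanes[8] = {true, false, true, true, false, false, true, false};

  // vector_reduce_add(zext <8 x i1> %m to <8 x i32>): sum of the lanes.
  uint32_t Sum = 0;
  for (bool L : Lanes)
    Sum += L;

  // The rewrite: bitcast <8 x i1> to i8, then ctpop on the packed mask.
  uint8_t Packed = 0;
  for (int I = 0; I < 8; ++I)
    Packed |= static_cast<uint8_t>(Lanes[I]) << I;
  const auto Pop = static_cast<uint32_t>(std::bitset<8>(Packed).count());

  assert(Sum == Pop && "reduce_add over zext(i1) == ctpop of the mask");
  return 0;
}
```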
@@ -19760,8 +19785,8 @@ class HorizontalReduction {
 
       // Estimate cost.
       InstructionCost TreeCost = V.getTreeCost(VL);
-      InstructionCost ReductionCost =
-          getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
+      InstructionCost ReductionCost = getReductionCost(
+          TTI, VL, IsCmpSelMinMax, RdxFMF, V.getReductionBitWidthAndSign());
       InstructionCost Cost = TreeCost + ReductionCost;
       LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                         << " for reduction\n");
@@ -19866,10 +19891,12 @@ class HorizontalReduction {
               createStrideMask(I, ScalarTyNumElements, VL.size());
           Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
           ReducedSubTree = Builder.CreateInsertElement(
-              ReducedSubTree, emitReduction(Lane, Builder, TTI), I);
+              ReducedSubTree,
+              emitReduction(Lane, Builder, TTI, RdxRootInst->getType()), I);
         }
       } else {
-        ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI);
+        ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI,
+                                       RdxRootInst->getType());
       }
       if (ReducedSubTree->getType() != VL.front()->getType()) {
         assert(ReducedSubTree->getType() != VL.front()->getType() &&
@@ -20050,12 +20077,13 @@ class HorizontalReduction {
 
 private:
   /// Calculate the cost of a reduction.
-  InstructionCost getReductionCost(TargetTransformInfo *TTI,
-                                   ArrayRef<Value *> ReducedVals,
-                                   bool IsCmpSelMinMax, unsigned ReduxWidth,
-                                   FastMathFlags FMF) {
+  InstructionCost getReductionCost(
+      TargetTransformInfo *TTI, ArrayRef<Value *> ReducedVals,
+      bool IsCmpSelMinMax, FastMathFlags FMF,
+      const std::optional<std::pair<unsigned, bool>> BitwidthAndSign) {
     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
     Type *ScalarTy = ReducedVals.front()->getType();
+    unsigned ReduxWidth = ReducedVals.size();
     FixedVectorType *VectorTy = getWidenedType(ScalarTy, ReduxWidth);
     InstructionCost VectorCost = 0, ScalarCost;
     // If all of the reduced values are constant, the vector cost is 0, since
@@ -20114,8 +20142,22 @@ class HorizontalReduction {
             VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
             /*Extract*/ false, TTI::TCK_RecipThroughput);
       } else {
-        VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF,
-                                                     CostKind);
+        auto [Bitwidth, IsSigned] =
+            BitwidthAndSign.value_or(std::make_pair(0u, false));
+        if (RdxKind == RecurKind::Add && Bitwidth == 1) {
+          // Represent vector_reduce_add(ZExt(<n x i1>)) as
+          // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
+          auto *IntTy = IntegerType::get(ScalarTy->getContext(), ReduxWidth);
+          IntrinsicCostAttributes ICA(Intrinsic::ctpop, IntTy, {IntTy}, FMF);
+          VectorCost =
+              TTI->getCastInstrCost(Instruction::BitCast, IntTy,
+                                    getWidenedType(ScalarTy, ReduxWidth),
+                                    TTI::CastContextHint::None, CostKind) +
+              TTI->getIntrinsicInstrCost(ICA, CostKind);
+        } else {
+          VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
+                                                       FMF, CostKind);
+        }
       }
     }
     ScalarCost = EvaluateScalarCost([&]() {
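
With BitwidthAndSign threaded into the cost query, the i1 add-reduction is costed as a bitcast of the mask plus a ctpop call instead of a generic arithmetic reduction, which is what lets the vectorizer prefer the mask form when the target counts bits cheaply. A toy model of that selection (illustrative numbers only; the real values come from TargetTransformInfo):

```cpp
#include <cstdio>

// Toy stand-in for the two vector-cost paths in getReductionCost().
struct ToyCosts {
  unsigned BitCast, Ctpop, ReduceAdd;
};

static unsigned vectorReductionCost(bool IsI1AddReduction, const ToyCosts &C) {
  // i1 add reduction: bitcast <n x i1> -> iN, then ctpop(iN).
  if (IsI1AddReduction)
    return C.BitCast + C.Ctpop;
  // Everything else keeps the generic reduction cost.
  return C.ReduceAdd;
}

int main() {
  const ToyCosts C{/*BitCast=*/0, /*Ctpop=*/1, /*ReduceAdd=*/4};
  std::printf("i1 mask: %u vs generic: %u\n", vectorReductionCost(true, C),
              vectorReductionCost(false, C));
}
```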
@@ -20152,11 +20194,22 @@ class HorizontalReduction {
 
   /// Emit a horizontal reduction of the vectorized value.
   Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
-                       const TargetTransformInfo *TTI) {
+                       const TargetTransformInfo *TTI, Type *DestTy) {
     assert(VectorizedValue && "Need to have a vectorized tree node");
     assert(RdxKind != RecurKind::FMulAdd &&
            "A call to the llvm.fmuladd intrinsic is not handled yet");
 
+    auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
+    if (FTy->getScalarType() == Builder.getInt1Ty() &&
+        RdxKind == RecurKind::Add &&
+        DestTy->getScalarType() != FTy->getScalarType()) {
+      // Convert vector_reduce_add(ZExt(<n x i1>)) to
+      // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
+      Value *V = Builder.CreateBitCast(
+          VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
+      ++NumVectorInstructions;
+      return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
+    }
     ++NumVectorInstructions;
     return createSimpleReduction(Builder, VectorizedValue, RdxKind);
   }