@@ -7056,19 +7056,16 @@ bool BoUpSLP::areAllUsersVectorized(
70567056
70577057static std::pair<InstructionCost, InstructionCost>
70587058getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
7059- TargetTransformInfo *TTI, TargetLibraryInfo *TLI) {
7059+ TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7060+ ArrayRef<Type *> ArgTys) {
70607061 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
70617062
70627063 // Calculate the cost of the scalar and vector calls.
7063- SmallVector<Type *, 4> VecTys;
7064- for (Use &Arg : CI->args())
7065- VecTys.push_back(
7066- FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
70677064 FastMathFlags FMF;
70687065 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
70697066 FMF = FPCI->getFastMathFlags();
70707067 SmallVector<const Value *> Arguments(CI->args());
7071- IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, VecTys , FMF,
7068+ IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys , FMF,
70727069 dyn_cast<IntrinsicInst>(CI));
70737070 auto IntrinsicCost =
70747071 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
@@ -7081,8 +7078,8 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
70817078 if (!CI->isNoBuiltin() && VecFunc) {
70827079 // Calculate the cost of the vector library call.
70837080 // If the corresponding vector call is cheaper, return its cost.
7084- LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys,
7085- TTI::TCK_RecipThroughput);
7081+ LibCost =
7082+ TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
70867083 }
70877084 return {IntrinsicCost, LibCost};
70887085}
@@ -8508,6 +8505,30 @@ TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
85088505 return TTI::CastContextHint::None;
85098506}
85108507
8508+ /// Builds the arguments types vector for the given call instruction with the
8509+ /// given \p ID for the specified vector factor.
8510+ static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI,
8511+ const Intrinsic::ID ID,
8512+ const unsigned VF,
8513+ unsigned MinBW) {
8514+ SmallVector<Type *> ArgTys;
8515+ for (auto [Idx, Arg] : enumerate(CI->args())) {
8516+ if (ID != Intrinsic::not_intrinsic) {
8517+ if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx)) {
8518+ ArgTys.push_back(Arg->getType());
8519+ continue;
8520+ }
8521+ if (MinBW > 0) {
8522+ ArgTys.push_back(FixedVectorType::get(
8523+ IntegerType::get(CI->getContext(), MinBW), VF));
8524+ continue;
8525+ }
8526+ }
8527+ ArgTys.push_back(FixedVectorType::get(Arg->getType(), VF));
8528+ }
8529+ return ArgTys;
8530+ }
8531+
85118532InstructionCost
85128533BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
85138534 SmallPtrSetImpl<Value *> &CheckedExtracts) {
@@ -9074,7 +9095,11 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
90749095 };
90759096 auto GetVectorCost = [=](InstructionCost CommonCost) {
90769097 auto *CI = cast<CallInst>(VL0);
9077- auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
9098+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9099+ SmallVector<Type *> ArgTys =
9100+ buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
9101+ It != MinBWs.end() ? It->second.first : 0);
9102+ auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
90789103 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
90799104 };
90809105 return GetCostDiff(GetScalarCost, GetVectorCost);
@@ -12546,7 +12571,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
1254612571
1254712572 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1254812573
12549- auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
12574+ SmallVector<Type *> ArgTys =
12575+ buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
12576+ It != MinBWs.end() ? It->second.first : 0);
12577+ auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
1255012578 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
1255112579 VecCallCosts.first <= VecCallCosts.second;
1255212580
@@ -12555,16 +12583,20 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
1255512583 SmallVector<Type *, 2> TysForDecl;
1255612584 // Add return type if intrinsic is overloaded on it.
1255712585 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1))
12558- TysForDecl.push_back(
12559- FixedVectorType::get(CI->getType(), E->Scalars.size()));
12586+ TysForDecl.push_back(VecTy);
1256012587 auto *CEI = cast<CallInst>(VL0);
1256112588 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
1256212589 ValueList OpVL;
1256312590 // Some intrinsics have scalar arguments. This argument should not be
1256412591 // vectorized.
1256512592 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
1256612593 ScalarArg = CEI->getArgOperand(I);
12567- OpVecs.push_back(CEI->getArgOperand(I));
12594+ // If we decided to reduce the bitwidth of the abs intrinsic, its second
12595+ // argument must be set to false (do not return poison if the value is the signed minimum).
12596+ if (ID == Intrinsic::abs && It != MinBWs.end() &&
12597+ It->second.first < DL->getTypeSizeInBits(CEI->getType()))
12598+ ScalarArg = Builder.getFalse();
12599+ OpVecs.push_back(ScalarArg);
1256812600 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
1256912601 TysForDecl.push_back(ScalarArg->getType());
1257012602 continue;
@@ -12577,10 +12609,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
1257712609 }
1257812610 ScalarArg = CEI->getArgOperand(I);
1257912611 if (cast<VectorType>(OpVec->getType())->getElementType() !=
12580- ScalarArg->getType()) {
12612+ ScalarArg->getType() &&
12613+ It == MinBWs.end()) {
1258112614 auto *CastTy = FixedVectorType::get(ScalarArg->getType(),
1258212615 VecTy->getNumElements());
1258312616 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
12617+ } else if (It != MinBWs.end()) {
12618+ OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
1258412619 }
1258512620 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
1258612621 OpVecs.push_back(OpVec);
@@ -14324,6 +14359,45 @@ bool BoUpSLP::collectValuesToDemote(
1432414359 return TryProcessInstruction(I, *ITE, BitWidth, Ops);
1432514360 }
1432614361
14362+ case Instruction::Call: {
14363+ auto *IC = dyn_cast<IntrinsicInst>(I);
14364+ if (!IC)
14365+ break;
14366+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
14367+ if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
14368+ ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
14369+ break;
14370+ SmallVector<Value *> Operands(1, I->getOperand(0));
14371+ End = 1;
14372+ if (ID != Intrinsic::abs) {
14373+ Operands.push_back(I->getOperand(1));
14374+ End = 2;
14375+ }
14376+ InstructionCost BestCost =
14377+ std::numeric_limits<InstructionCost::CostType>::max();
14378+ unsigned BestBitWidth = BitWidth;
14379+ unsigned VF = ITE->Scalars.size();
14380+ // Choose the best bitwidth based on cost estimations.
14381+ auto Checker = [&](unsigned BitWidth, unsigned) {
14382+ unsigned MinBW = PowerOf2Ceil(BitWidth);
14383+ SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(IC, ID, VF, MinBW);
14384+ auto VecCallCosts = getVectorCallCosts(
14385+ IC,
14386+ FixedVectorType::get(IntegerType::get(IC->getContext(), MinBW), VF),
14387+ TTI, TLI, ArgTys);
14388+ InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
14389+ if (Cost < BestCost) {
14390+ BestCost = Cost;
14391+ BestBitWidth = BitWidth;
14392+ }
14393+ return false;
14394+ };
14395+ [[maybe_unused]] bool NeedToExit;
14396+ (void)AttemptCheckBitwidth(Checker, NeedToExit);
14397+ BitWidth = BestBitWidth;
14398+ return TryProcessInstruction(I, *ITE, BitWidth, Operands);
14399+ }
14400+
1432714401 // Otherwise, conservatively give up.
1432814402 default:
1432914403 break;
0 commit comments