@@ -4846,8 +4846,21 @@ getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
48464846 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
48474847 int Index = 0, VectorType *SubTp = nullptr,
48484848 ArrayRef<const Value *> Args = {}) {
4849- if (Kind != TTI::SK_PermuteTwoSrc)
4849+ if (Kind != TTI::SK_PermuteTwoSrc) {
4850+ int SplatIdx = PoisonMaskElem;
4851+ if (!Mask.empty() && all_of(Mask, [&](int Idx) {
4852+ if (Idx == PoisonMaskElem)
4853+ return true;
4854+ if (SplatIdx == PoisonMaskElem) {
4855+ SplatIdx = Idx;
4856+ return true;
4857+ }
4858+ return SplatIdx == Idx;
4859+ }))
4860+ return TTI.getShuffleCost(TTI::SK_Broadcast, Tp, Mask, CostKind, Index,
4861+ SubTp, Args);
48504862 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4863+ }
48514864 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
48524865 int NumSubElts;
48534866 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
@@ -10257,10 +10270,10 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
1025710270 Idx = EMask[Idx];
1025810271 }
1025910272 CommonVF = E->Scalars.size();
10260- } else if (std::optional< unsigned> Factor = E->getInterleaveFactor();
10261- Factor && E->Scalars.size() != Mask.size() &&
10273+ } else if (unsigned Factor = E->getInterleaveFactor();
10274+ Factor > 0 && E->Scalars.size() != Mask.size() &&
1026210275 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
10263- * Factor)) {
10276+ Factor)) {
1026410277 // Deinterleaved nodes are free.
1026510278 std::iota(CommonMask.begin(), CommonMask.end(), 0);
1026610279 }
@@ -12935,6 +12948,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
1293512948 // No perfect match, just shuffle, so choose the first tree node from the
1293612949 // tree.
1293712950 Entries.push_back(FirstEntries.front());
12951+ VF = FirstEntries.front()->getVectorFactor();
1293812952 } else {
1293912953 // Try to find nodes with the same vector factor.
1294012954 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
@@ -12975,6 +12989,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
1297512989 Entries.push_back(SecondEntries.front());
1297612990 VF = std::max(Entries.front()->getVectorFactor(),
1297712991 Entries.back()->getVectorFactor());
12992+ } else {
12993+ VF = Entries.front()->getVectorFactor();
1297812994 }
1297912995 }
1298012996
@@ -13077,26 +13093,149 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
1307713093 // Pair.first is the offset to the vector, while Pair.second is the index of
1307813094 // scalar in the list.
1307913095 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
13080- unsigned Idx = Part * VL.size() + Pair.second;
13096+ int Idx = Part * VL.size() + Pair.second;
1308113097 Mask[Idx] =
1308213098 Pair.first * VF +
1308313099 (ForOrder ? std::distance(
1308413100 Entries[Pair.first]->Scalars.begin(),
1308513101 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
1308613102 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
13087- IsIdentity &= Mask[Idx] == Pair.second ;
13103+ IsIdentity &= Mask[Idx] % VL.size() == Idx % VL.size() ;
1308813104 }
13089- switch (Entries.size()) {
13090- case 1:
13091- if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
13092- return TargetTransformInfo::SK_PermuteSingleSrc;
13093- break;
13094- case 2:
13095- if (EntryLanes.size() > 2 || VL.size() <= 2)
13096- return TargetTransformInfo::SK_PermuteTwoSrc;
13097- break;
13098- default:
13099- break;
13105+ if (ForOrder || IsIdentity || Entries.empty()) {
13106+ switch (Entries.size()) {
13107+ case 1:
13108+ if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
13109+ return TargetTransformInfo::SK_PermuteSingleSrc;
13110+ break;
13111+ case 2:
13112+ if (EntryLanes.size() > 2 || VL.size() <= 2)
13113+ return TargetTransformInfo::SK_PermuteTwoSrc;
13114+ break;
13115+ default:
13116+ break;
13117+ }
13118+ } else if (!isa<VectorType>(VL.front()->getType()) &&
13119+ (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
13120+ // Estimate the costs to check whether a shuffle is more beneficial than a buildvector.
13121+ SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
13122+ std::next(Mask.begin(), (Part + 1) * VL.size()));
13123+ int MinElement = SubMask.front(), MaxElement = SubMask.front();
13124+ for (int Idx : SubMask) {
13125+ if (Idx == PoisonMaskElem)
13126+ continue;
13127+ if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
13128+ MinElement = Idx;
13129+ if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
13130+ MaxElement = Idx;
13131+ }
13132+ assert(MaxElement >= 0 && MinElement >= 0 &&
13133+ "Expected at least single element.");
13134+ unsigned NewVF = std::max<unsigned>(
13135+ VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
13136+ (MaxElement % VF) -
13137+ (MinElement % VF) + 1));
13138+ if (NewVF < VF) {
13139+ for_each(SubMask, [&](int &Idx) {
13140+ if (Idx == PoisonMaskElem)
13141+ return;
13142+ Idx = (Idx % VF) - (MinElement % VF) +
13143+ (Idx >= static_cast<int>(VF) ? NewVF : 0);
13144+ });
13145+ VF = NewVF;
13146+ }
13147+
13148+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13149+ auto *VecTy = getWidenedType(VL.front()->getType(), VF);
13150+ auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
13151+ auto GetShuffleCost = [&,
13152+ &TTI = *TTI](ArrayRef<int> Mask,
13153+ ArrayRef<const TreeEntry *> Entries,
13154+ VectorType *VecTy) -> InstructionCost {
13155+ if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
13156+ ShuffleVectorInst::isDeInterleaveMaskOfFactor(
13157+ Mask, Entries.front()->getInterleaveFactor()))
13158+ return TTI::TCC_Free;
13159+ return ::getShuffleCost(TTI,
13160+ Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
13161+ : TTI::SK_PermuteSingleSrc,
13162+ VecTy, Mask, CostKind);
13163+ };
13164+ InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
13165+ InstructionCost FirstShuffleCost = 0;
13166+ SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
13167+ if (Entries.size() == 1 || !Entries[0]->isGather()) {
13168+ FirstShuffleCost = ShuffleCost;
13169+ } else {
13170+ // Transform mask to include only first entry.
13171+ APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13172+ bool IsIdentity = true;
13173+ for (auto [I, Idx] : enumerate(FirstMask)) {
13174+ if (Idx >= static_cast<int>(VF)) {
13175+ Idx = PoisonMaskElem;
13176+ } else {
13177+ DemandedElts.clearBit(I);
13178+ if (Idx != PoisonMaskElem)
13179+ IsIdentity &= static_cast<int>(I) == Idx;
13180+ }
13181+ }
13182+ if (!IsIdentity)
13183+ FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
13184+ FirstShuffleCost += TTI->getScalarizationOverhead(
13185+ MaskVecTy, DemandedElts, /*Insert=*/true,
13186+ /*Extract=*/false, CostKind);
13187+ }
13188+ InstructionCost SecondShuffleCost = 0;
13189+ SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
13190+ if (Entries.size() == 1 || !Entries[1]->isGather()) {
13191+ SecondShuffleCost = ShuffleCost;
13192+ } else {
13193+ // Transform mask to include only second entry.
13194+ APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13195+ bool IsIdentity = true;
13196+ for (auto [I, Idx] : enumerate(SecondMask)) {
13197+ if (Idx < static_cast<int>(VF) && Idx >= 0) {
13198+ Idx = PoisonMaskElem;
13199+ } else {
13200+ DemandedElts.clearBit(I);
13201+ if (Idx != PoisonMaskElem) {
13202+ Idx -= VF;
13203+ IsIdentity &= static_cast<int>(I) == Idx;
13204+ }
13205+ }
13206+ }
13207+ if (!IsIdentity)
13208+ SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
13209+ SecondShuffleCost += TTI->getScalarizationOverhead(
13210+ MaskVecTy, DemandedElts, /*Insert=*/true,
13211+ /*Extract=*/false, CostKind);
13212+ }
13213+ APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13214+ for (auto [I, Idx] : enumerate(SubMask))
13215+ if (Idx == PoisonMaskElem)
13216+ DemandedElts.clearBit(I);
13217+ InstructionCost BuildVectorCost =
13218+ TTI->getScalarizationOverhead(MaskVecTy, DemandedElts, /*Insert=*/true,
13219+ /*Extract=*/false, CostKind);
13220+ const TreeEntry *BestEntry = nullptr;
13221+ if (FirstShuffleCost < ShuffleCost) {
13222+ copy(FirstMask, std::next(Mask.begin(), Part * VL.size()));
13223+ BestEntry = Entries.front();
13224+ ShuffleCost = FirstShuffleCost;
13225+ }
13226+ if (SecondShuffleCost < ShuffleCost) {
13227+ copy(SecondMask, std::next(Mask.begin(), Part * VL.size()));
13228+ BestEntry = Entries[1];
13229+ ShuffleCost = SecondShuffleCost;
13230+ }
13231+ if (BuildVectorCost >= ShuffleCost) {
13232+ if (BestEntry) {
13233+ Entries.clear();
13234+ Entries.push_back(BestEntry);
13235+ }
13236+ return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
13237+ : TargetTransformInfo::SK_PermuteSingleSrc;
13238+ }
1310013239 }
1310113240 Entries.clear();
1310213241 // Clear the corresponding mask elements.
0 commit comments