Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
173 changes: 156 additions & 17 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4846,8 +4846,21 @@ getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
int Index = 0, VectorType *SubTp = nullptr,
ArrayRef<const Value *> Args = {}) {
if (Kind != TTI::SK_PermuteTwoSrc)
if (Kind != TTI::SK_PermuteTwoSrc) {
int SplatIdx = PoisonMaskElem;
if (!Mask.empty() && all_of(Mask, [&](int Idx) {
if (Idx == PoisonMaskElem)
return true;
if (SplatIdx == PoisonMaskElem) {
SplatIdx = Idx;
return true;
}
return SplatIdx == Idx;
}))
return TTI.getShuffleCost(TTI::SK_Broadcast, Tp, Mask, CostKind, Index,
SubTp, Args);
return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
}
int NumSrcElts = Tp->getElementCount().getKnownMinValue();
int NumSubElts;
if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
Expand Down Expand Up @@ -10257,10 +10270,10 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
Idx = EMask[Idx];
}
CommonVF = E->Scalars.size();
} else if (std::optional<unsigned> Factor = E->getInterleaveFactor();
Factor && E->Scalars.size() != Mask.size() &&
} else if (unsigned Factor = E->getInterleaveFactor();
Factor > 0 && E->Scalars.size() != Mask.size() &&
ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
*Factor)) {
Factor)) {
// Deinterleaved nodes are free.
std::iota(CommonMask.begin(), CommonMask.end(), 0);
}
Expand Down Expand Up @@ -12935,6 +12948,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
// No perfect match, just shuffle, so choose the first tree node from the
// tree.
Entries.push_back(FirstEntries.front());
VF = FirstEntries.front()->getVectorFactor();
} else {
// Try to find nodes with the same vector factor.
assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
Expand Down Expand Up @@ -12975,6 +12989,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
Entries.push_back(SecondEntries.front());
VF = std::max(Entries.front()->getVectorFactor(),
Entries.back()->getVectorFactor());
} else {
VF = Entries.front()->getVectorFactor();
}
}

Expand Down Expand Up @@ -13077,26 +13093,149 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
// Pair.first is the offset to the vector, while Pair.second is the index of
// scalar in the list.
for (const std::pair<unsigned, int> &Pair : EntryLanes) {
unsigned Idx = Part * VL.size() + Pair.second;
int Idx = Part * VL.size() + Pair.second;
Mask[Idx] =
Pair.first * VF +
(ForOrder ? std::distance(
Entries[Pair.first]->Scalars.begin(),
find(Entries[Pair.first]->Scalars, VL[Pair.second]))
: Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
IsIdentity &= Mask[Idx] == Pair.second;
IsIdentity &= Mask[Idx] % VL.size() == Idx % VL.size();
}
switch (Entries.size()) {
case 1:
if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
return TargetTransformInfo::SK_PermuteSingleSrc;
break;
case 2:
if (EntryLanes.size() > 2 || VL.size() <= 2)
return TargetTransformInfo::SK_PermuteTwoSrc;
break;
default:
break;
if (ForOrder || IsIdentity || Entries.empty()) {
switch (Entries.size()) {
case 1:
if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
return TargetTransformInfo::SK_PermuteSingleSrc;
break;
case 2:
if (EntryLanes.size() > 2 || VL.size() <= 2)
return TargetTransformInfo::SK_PermuteTwoSrc;
break;
default:
break;
}
} else if (!isa<VectorType>(VL.front()->getType()) &&
(EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
// Do the cost estimation to check whether a shuffle is more beneficial
// than a buildvector.
SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
std::next(Mask.begin(), (Part + 1) * VL.size()));
int MinElement = SubMask.front(), MaxElement = SubMask.front();
for (int Idx : SubMask) {
if (Idx == PoisonMaskElem)
continue;
if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
MinElement = Idx;
if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
MaxElement = Idx;
}
assert(MaxElement >= 0 && MinElement >= 0 &&
"Expected at least single element.");
unsigned NewVF = std::max<unsigned>(
VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
(MaxElement % VF) -
(MinElement % VF) + 1));
if (NewVF < VF) {
for_each(SubMask, [&](int &Idx) {
if (Idx == PoisonMaskElem)
return;
Idx = (Idx % VF) - (MinElement % VF) +
(Idx >= static_cast<int>(VF) ? NewVF : 0);
});
VF = NewVF;
}

constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
auto *VecTy = getWidenedType(VL.front()->getType(), VF);
auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
auto GetShuffleCost = [&,
&TTI = *TTI](ArrayRef<int> Mask,
ArrayRef<const TreeEntry *> Entries,
VectorType *VecTy) -> InstructionCost {
if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
ShuffleVectorInst::isDeInterleaveMaskOfFactor(
Mask, Entries.front()->getInterleaveFactor()))
return TTI::TCC_Free;
return ::getShuffleCost(TTI,
Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
: TTI::SK_PermuteSingleSrc,
VecTy, Mask, CostKind);
};
InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
InstructionCost FirstShuffleCost = 0;
SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
if (Entries.size() == 1 || !Entries[0]->isGather()) {
FirstShuffleCost = ShuffleCost;
} else {
// Transform mask to include only first entry.
APInt DemandedElts = APInt::getAllOnes(SubMask.size());
bool IsIdentity = true;
for (auto [I, Idx] : enumerate(FirstMask)) {
if (Idx >= static_cast<int>(VF)) {
Idx = PoisonMaskElem;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is Idx a reference? Make it explicit in the auto?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cannot do it here, since only Idx is a reference. The syntax does not allow expressing that only Idx is a reference here.

} else {
DemandedElts.clearBit(I);
if (Idx != PoisonMaskElem)
IsIdentity &= static_cast<int>(I) == Idx;
}
}
if (!IsIdentity)
FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
FirstShuffleCost += TTI->getScalarizationOverhead(
MaskVecTy, DemandedElts, /*Insert=*/true,
/*Extract=*/false, CostKind);
}
InstructionCost SecondShuffleCost = 0;
SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
if (Entries.size() == 1 || !Entries[1]->isGather()) {
SecondShuffleCost = ShuffleCost;
} else {
// Transform mask to include only second entry.
APInt DemandedElts = APInt::getAllOnes(SubMask.size());
bool IsIdentity = true;
for (auto [I, Idx] : enumerate(SecondMask)) {
if (Idx < static_cast<int>(VF) && Idx >= 0) {
Idx = PoisonMaskElem;
} else {
DemandedElts.clearBit(I);
if (Idx != PoisonMaskElem) {
Idx -= VF;
IsIdentity &= static_cast<int>(I) == Idx;
}
}
}
if (!IsIdentity)
SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
SecondShuffleCost += TTI->getScalarizationOverhead(
MaskVecTy, DemandedElts, /*Insert=*/true,
/*Extract=*/false, CostKind);
}
APInt DemandedElts = APInt::getAllOnes(SubMask.size());
for (auto [I, Idx] : enumerate(SubMask))
if (Idx == PoisonMaskElem)
DemandedElts.clearBit(I);
InstructionCost BuildVectorCost =
TTI->getScalarizationOverhead(MaskVecTy, DemandedElts, /*Insert=*/true,
/*Extract=*/false, CostKind);
const TreeEntry *BestEntry = nullptr;
if (FirstShuffleCost < ShuffleCost) {
copy(FirstMask, std::next(Mask.begin(), Part * VL.size()));
BestEntry = Entries.front();
ShuffleCost = FirstShuffleCost;
}
if (SecondShuffleCost < ShuffleCost) {
copy(SecondMask, std::next(Mask.begin(), Part * VL.size()));
BestEntry = Entries[1];
ShuffleCost = SecondShuffleCost;
}
if (BuildVectorCost >= ShuffleCost) {
if (BestEntry) {
Entries.clear();
Entries.push_back(BestEntry);
}
return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
: TargetTransformInfo::SK_PermuteSingleSrc;
}
}
Entries.clear();
// Clear the corresponding mask elements.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,12 @@ define void @test() {
; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x float> [ poison, %[[BB77]] ], [ [[TMP31:%.*]], %[[BB78]] ]
; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 0, i32 3, i32 1, i32 2, i32 3, i32 0, i32 2, i32 3, i32 2, i32 6, i32 2, i32 3, i32 0, i32 7, i32 6, i32 6>
; CHECK-NEXT: [[TMP18:%.*]] = fmul fast <16 x float> [[TMP17]], [[TMP13]]
; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 1, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 6, i32 7, i32 7>
; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 1, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP20]], <16 x i32> <i32 0, i32 17, i32 2, i32 16, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP20]], <16 x i32> <i32 0, i32 17, i32 2, i32 16, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <16 x float> [[TMP21]], <16 x float> [[TMP22]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 17, i32 6, i32 7, i32 8, i32 23, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x float> [[TMP23]], <16 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 5, i32 3, i32 1, i32 3, i32 9, i32 3, i32 1, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <16 x float> [[TMP21]], <16 x float> [[TMP22]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 17, i32 6, i32 7, i32 8, i32 23, i32 10, i32 11, i32 12, i32 22, i32 14, i32 15>
; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x float> [[TMP23]], <16 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 5, i32 3, i32 1, i32 3, i32 9, i32 3, i32 1, i32 5, i32 13, i32 9, i32 9>
; CHECK-NEXT: [[TMP25:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v2f32(<16 x float> [[TMP14]], <2 x float> [[TMP0]], i64 2)
; CHECK-NEXT: [[TMP26:%.*]] = fmul fast <16 x float> [[TMP24]], [[TMP25]]
; CHECK-NEXT: [[TMP27:%.*]] = fadd fast <16 x float> [[TMP26]], [[TMP18]]
Expand Down
Loading
Loading