Skip to content

Commit 9e240ac

Browse files
committed
[𝘀𝗽𝗿] initial version
Created using spr 1.3.5
1 parent 18d2539 commit 9e240ac

File tree

9 files changed

+509
-305
lines changed

9 files changed

+509
-305
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 156 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -4846,8 +4846,21 @@ getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
48464846
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
48474847
int Index = 0, VectorType *SubTp = nullptr,
48484848
ArrayRef<const Value *> Args = {}) {
4849-
if (Kind != TTI::SK_PermuteTwoSrc)
4849+
if (Kind != TTI::SK_PermuteTwoSrc) {
4850+
int SplatIdx = PoisonMaskElem;
4851+
if (!Mask.empty() && all_of(Mask, [&](int Idx) {
4852+
if (Idx == PoisonMaskElem)
4853+
return true;
4854+
if (SplatIdx == PoisonMaskElem) {
4855+
SplatIdx = Idx;
4856+
return true;
4857+
}
4858+
return SplatIdx == Idx;
4859+
}))
4860+
return TTI.getShuffleCost(TTI::SK_Broadcast, Tp, Mask, CostKind, Index,
4861+
SubTp, Args);
48504862
return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4863+
}
48514864
int NumSrcElts = Tp->getElementCount().getKnownMinValue();
48524865
int NumSubElts;
48534866
if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
@@ -10257,10 +10270,10 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
1025710270
Idx = EMask[Idx];
1025810271
}
1025910272
CommonVF = E->Scalars.size();
10260-
} else if (std::optional<unsigned> Factor = E->getInterleaveFactor();
10261-
Factor && E->Scalars.size() != Mask.size() &&
10273+
} else if (unsigned Factor = E->getInterleaveFactor();
10274+
Factor > 0 && E->Scalars.size() != Mask.size() &&
1026210275
ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
10263-
*Factor)) {
10276+
Factor)) {
1026410277
// Deinterleaved nodes are free.
1026510278
std::iota(CommonMask.begin(), CommonMask.end(), 0);
1026610279
}
@@ -12935,6 +12948,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
1293512948
// No perfect match, just shuffle, so choose the first tree node from the
1293612949
// tree.
1293712950
Entries.push_back(FirstEntries.front());
12951+
VF = FirstEntries.front()->getVectorFactor();
1293812952
} else {
1293912953
// Try to find nodes with the same vector factor.
1294012954
assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
@@ -12975,6 +12989,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
1297512989
Entries.push_back(SecondEntries.front());
1297612990
VF = std::max(Entries.front()->getVectorFactor(),
1297712991
Entries.back()->getVectorFactor());
12992+
} else {
12993+
VF = Entries.front()->getVectorFactor();
1297812994
}
1297912995
}
1298012996

@@ -13077,26 +13093,149 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
1307713093
// Pair.first is the offset to the vector, while Pair.second is the index of
1307813094
// scalar in the list.
1307913095
for (const std::pair<unsigned, int> &Pair : EntryLanes) {
13080-
unsigned Idx = Part * VL.size() + Pair.second;
13096+
int Idx = Part * VL.size() + Pair.second;
1308113097
Mask[Idx] =
1308213098
Pair.first * VF +
1308313099
(ForOrder ? std::distance(
1308413100
Entries[Pair.first]->Scalars.begin(),
1308513101
find(Entries[Pair.first]->Scalars, VL[Pair.second]))
1308613102
: Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
13087-
IsIdentity &= Mask[Idx] == Pair.second;
13103+
IsIdentity &= Mask[Idx] % VL.size() == Idx % VL.size();
1308813104
}
13089-
switch (Entries.size()) {
13090-
case 1:
13091-
if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
13092-
return TargetTransformInfo::SK_PermuteSingleSrc;
13093-
break;
13094-
case 2:
13095-
if (EntryLanes.size() > 2 || VL.size() <= 2)
13096-
return TargetTransformInfo::SK_PermuteTwoSrc;
13097-
break;
13098-
default:
13099-
break;
13105+
if (ForOrder || IsIdentity || Entries.empty()) {
13106+
switch (Entries.size()) {
13107+
case 1:
13108+
if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
13109+
return TargetTransformInfo::SK_PermuteSingleSrc;
13110+
break;
13111+
case 2:
13112+
if (EntryLanes.size() > 2 || VL.size() <= 2)
13113+
return TargetTransformInfo::SK_PermuteTwoSrc;
13114+
break;
13115+
default:
13116+
break;
13117+
}
13118+
} else if (!isa<VectorType>(VL.front()->getType()) &&
13119+
(EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
13120+
// Do the cost estimation to check whether the shuffle is more beneficial than a buildvector.
13121+
SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
13122+
std::next(Mask.begin(), (Part + 1) * VL.size()));
13123+
int MinElement = SubMask.front(), MaxElement = SubMask.front();
13124+
for (int Idx : SubMask) {
13125+
if (Idx == PoisonMaskElem)
13126+
continue;
13127+
if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
13128+
MinElement = Idx;
13129+
if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
13130+
MaxElement = Idx;
13131+
}
13132+
assert(MaxElement >= 0 && MinElement >= 0 &&
13133+
"Expected at least single element.");
13134+
unsigned NewVF = std::max<unsigned>(
13135+
VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
13136+
(MaxElement % VF) -
13137+
(MinElement % VF) + 1));
13138+
if (NewVF < VF) {
13139+
for_each(SubMask, [&](int &Idx) {
13140+
if (Idx == PoisonMaskElem)
13141+
return;
13142+
Idx = (Idx % VF) - (MinElement % VF) +
13143+
(Idx >= static_cast<int>(VF) ? NewVF : 0);
13144+
});
13145+
VF = NewVF;
13146+
}
13147+
13148+
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13149+
auto *VecTy = getWidenedType(VL.front()->getType(), VF);
13150+
auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
13151+
auto GetShuffleCost = [&,
13152+
&TTI = *TTI](ArrayRef<int> Mask,
13153+
ArrayRef<const TreeEntry *> Entries,
13154+
VectorType *VecTy) -> InstructionCost {
13155+
if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
13156+
ShuffleVectorInst::isDeInterleaveMaskOfFactor(
13157+
Mask, Entries.front()->getInterleaveFactor()))
13158+
return TTI::TCC_Free;
13159+
return ::getShuffleCost(TTI,
13160+
Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
13161+
: TTI::SK_PermuteSingleSrc,
13162+
VecTy, Mask, CostKind);
13163+
};
13164+
InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
13165+
InstructionCost FirstShuffleCost = 0;
13166+
SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
13167+
if (Entries.size() == 1 || !Entries[0]->isGather()) {
13168+
FirstShuffleCost = ShuffleCost;
13169+
} else {
13170+
// Transform mask to include only first entry.
13171+
APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13172+
bool IsIdentity = true;
13173+
for (auto [I, Idx] : enumerate(FirstMask)) {
13174+
if (Idx >= static_cast<int>(VF)) {
13175+
Idx = PoisonMaskElem;
13176+
} else {
13177+
DemandedElts.clearBit(I);
13178+
if (Idx != PoisonMaskElem)
13179+
IsIdentity &= static_cast<int>(I) == Idx;
13180+
}
13181+
}
13182+
if (!IsIdentity)
13183+
FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
13184+
FirstShuffleCost += TTI->getScalarizationOverhead(
13185+
MaskVecTy, DemandedElts, /*Insert=*/true,
13186+
/*Extract=*/false, CostKind);
13187+
}
13188+
InstructionCost SecondShuffleCost = 0;
13189+
SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
13190+
if (Entries.size() == 1 || !Entries[1]->isGather()) {
13191+
SecondShuffleCost = ShuffleCost;
13192+
} else {
13193+
// Transform mask to include only second entry.
13194+
APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13195+
bool IsIdentity = true;
13196+
for (auto [I, Idx] : enumerate(SecondMask)) {
13197+
if (Idx < static_cast<int>(VF) && Idx >= 0) {
13198+
Idx = PoisonMaskElem;
13199+
} else {
13200+
DemandedElts.clearBit(I);
13201+
if (Idx != PoisonMaskElem) {
13202+
Idx -= VF;
13203+
IsIdentity &= static_cast<int>(I) == Idx;
13204+
}
13205+
}
13206+
}
13207+
if (!IsIdentity)
13208+
SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
13209+
SecondShuffleCost += TTI->getScalarizationOverhead(
13210+
MaskVecTy, DemandedElts, /*Insert=*/true,
13211+
/*Extract=*/false, CostKind);
13212+
}
13213+
APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13214+
for (auto [I, Idx] : enumerate(SubMask))
13215+
if (Idx == PoisonMaskElem)
13216+
DemandedElts.clearBit(I);
13217+
InstructionCost BuildVectorCost =
13218+
TTI->getScalarizationOverhead(MaskVecTy, DemandedElts, /*Insert=*/true,
13219+
/*Extract=*/false, CostKind);
13220+
const TreeEntry *BestEntry = nullptr;
13221+
if (FirstShuffleCost < ShuffleCost) {
13222+
copy(FirstMask, std::next(Mask.begin(), Part * VL.size()));
13223+
BestEntry = Entries.front();
13224+
ShuffleCost = FirstShuffleCost;
13225+
}
13226+
if (SecondShuffleCost < ShuffleCost) {
13227+
copy(SecondMask, std::next(Mask.begin(), Part * VL.size()));
13228+
BestEntry = Entries[1];
13229+
ShuffleCost = SecondShuffleCost;
13230+
}
13231+
if (BuildVectorCost >= ShuffleCost) {
13232+
if (BestEntry) {
13233+
Entries.clear();
13234+
Entries.push_back(BestEntry);
13235+
}
13236+
return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
13237+
: TargetTransformInfo::SK_PermuteSingleSrc;
13238+
}
1310013239
}
1310113240
Entries.clear();
1310213241
// Clear the corresponding mask elements.

llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -46,12 +46,12 @@ define void @test() {
4646
; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x float> [ poison, %[[BB77]] ], [ [[TMP31:%.*]], %[[BB78]] ]
4747
; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 0, i32 3, i32 1, i32 2, i32 3, i32 0, i32 2, i32 3, i32 2, i32 6, i32 2, i32 3, i32 0, i32 7, i32 6, i32 6>
4848
; CHECK-NEXT: [[TMP18:%.*]] = fmul fast <16 x float> [[TMP17]], [[TMP13]]
49-
; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 1, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 6, i32 7, i32 7>
49+
; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 1, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
5050
; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
51-
; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP20]], <16 x i32> <i32 0, i32 17, i32 2, i32 16, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 12, i32 13, i32 14, i32 15>
51+
; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP20]], <16 x i32> <i32 0, i32 17, i32 2, i32 16, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
5252
; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
53-
; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <16 x float> [[TMP21]], <16 x float> [[TMP22]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 17, i32 6, i32 7, i32 8, i32 23, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
54-
; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x float> [[TMP23]], <16 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 5, i32 3, i32 1, i32 3, i32 9, i32 3, i32 1, i32 12, i32 13, i32 14, i32 15>
53+
; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <16 x float> [[TMP21]], <16 x float> [[TMP22]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 17, i32 6, i32 7, i32 8, i32 23, i32 10, i32 11, i32 12, i32 22, i32 14, i32 15>
54+
; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x float> [[TMP23]], <16 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 5, i32 3, i32 1, i32 3, i32 9, i32 3, i32 1, i32 5, i32 13, i32 9, i32 9>
5555
; CHECK-NEXT: [[TMP25:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v2f32(<16 x float> [[TMP14]], <2 x float> [[TMP0]], i64 2)
5656
; CHECK-NEXT: [[TMP26:%.*]] = fmul fast <16 x float> [[TMP24]], [[TMP25]]
5757
; CHECK-NEXT: [[TMP27:%.*]] = fadd fast <16 x float> [[TMP26]], [[TMP18]]

0 commit comments

Comments
 (0)