Skip to content

Commit ebd516d

Browse files
committed
[𝘀𝗽𝗿] initial version
Created using spr 1.3.5
1 parent a1ab5b4 commit ebd516d

File tree

2 files changed

+62
-54
lines changed

2 files changed

+62
-54
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 52 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1314,6 +1314,22 @@ static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
13141314
Sz % NumParts == 0;
13151315
}
13161316

1317+
/// Returns the number of parts the type \p VecTy will be split into at the
1318+
/// codegen phase. If the type is going to be scalarized or does not use whole
1319+
/// registers, returns 1. Also returns 1 when the part count reported by
/// \p TTI is 0 or is not below \p Limit (by default the maximum unsigned
/// value).
1320+
static unsigned
1321+
getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
1322+
const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1323+
unsigned NumParts = TTI.getNumberOfParts(VecTy);
// A zero part count, or one that reaches the caller-supplied limit, is
// reported as a single part.
1324+
if (NumParts == 0 || NumParts >= Limit)
1325+
return 1;
1326+
unsigned Sz = getNumElements(VecTy);
// Fall back to a single part unless there are fewer parts than elements, the
// elements divide evenly among the parts, and the per-part element count
// passes hasFullVectorsOrPowerOf2.
1327+
if (NumParts >= Sz || Sz % NumParts != 0 ||
1328+
!hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
1329+
return 1;
1330+
return NumParts;
1331+
}
1332+
13171333
namespace slpvectorizer {
13181334

13191335
/// Bottom Up SLP Vectorizer.
@@ -4618,12 +4634,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
46184634
if (!isValidElementType(ScalarTy))
46194635
return std::nullopt;
46204636
auto *VecTy = getWidenedType(ScalarTy, NumScalars);
4621-
int NumParts = TTI->getNumberOfParts(VecTy);
4622-
if (NumParts == 0 || NumParts >= NumScalars ||
4623-
VecTy->getNumElements() % NumParts != 0 ||
4624-
!hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
4625-
VecTy->getNumElements() / NumParts))
4626-
NumParts = 1;
4637+
unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
46274638
SmallVector<int> ExtractMask;
46284639
SmallVector<int> Mask;
46294640
SmallVector<SmallVector<const TreeEntry *>> Entries;
@@ -5574,8 +5585,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
55745585
}
55755586
}
55765587
if (Sz == 2 && TE.getVectorFactor() == 4 &&
5577-
TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(),
5578-
2 * TE.getVectorFactor())) == 1)
5588+
::getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(),
5589+
2 * TE.getVectorFactor())) == 1)
55795590
return std::nullopt;
55805591
if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
55815592
Sz)) {
@@ -9847,12 +9858,15 @@ void BoUpSLP::transformNodes() {
98479858
// only with the single non-undef element).
98489859
bool IsSplat = isSplat(Slice);
98499860
if (Slices.empty() || !IsSplat ||
9850-
(VF <= 2 && 2 * std::clamp(TTI->getNumberOfParts(getWidenedType(
9851-
Slice.front()->getType(), VF)),
9852-
1U, VF - 1) !=
9853-
std::clamp(TTI->getNumberOfParts(getWidenedType(
9854-
Slice.front()->getType(), 2 * VF)),
9855-
1U, 2 * VF)) ||
9861+
(VF <= 2 &&
9862+
2 * std::clamp(
9863+
::getNumberOfParts(
9864+
*TTI, getWidenedType(Slice.front()->getType(), VF)),
9865+
1U, VF - 1) !=
9866+
std::clamp(::getNumberOfParts(
9867+
*TTI, getWidenedType(Slice.front()->getType(),
9868+
2 * VF)),
9869+
1U, 2 * VF)) ||
98569870
count(Slice, Slice.front()) ==
98579871
static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
98589872
: 1)) {
@@ -10793,12 +10807,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
1079310807
}
1079410808
assert(!CommonMask.empty() && "Expected non-empty common mask.");
1079510809
auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
10796-
unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
10797-
if (NumParts == 0 || NumParts >= Mask.size() ||
10798-
MaskVecTy->getNumElements() % NumParts != 0 ||
10799-
!hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
10800-
MaskVecTy->getNumElements() / NumParts))
10801-
NumParts = 1;
10810+
unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
1080210811
unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
1080310812
const auto *It =
1080410813
find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
@@ -10813,12 +10822,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
1081310822
}
1081410823
assert(!CommonMask.empty() && "Expected non-empty common mask.");
1081510824
auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
10816-
unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
10817-
if (NumParts == 0 || NumParts >= Mask.size() ||
10818-
MaskVecTy->getNumElements() % NumParts != 0 ||
10819-
!hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
10820-
MaskVecTy->getNumElements() / NumParts))
10821-
NumParts = 1;
10825+
unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
1082210826
unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
1082310827
const auto *It =
1082410828
find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
@@ -11351,7 +11355,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
1135111355
unsigned const NumElts = SrcVecTy->getNumElements();
1135211356
unsigned const NumScalars = VL.size();
1135311357

11354-
unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
11358+
unsigned NumOfParts = ::getNumberOfParts(*TTI, SrcVecTy);
1135511359

1135611360
SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
1135711361
unsigned OffsetBeg = *getElementIndex(VL.front());
@@ -14862,12 +14866,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
1486214866
SmallVector<SmallVector<const TreeEntry *>> Entries;
1486314867
Type *OrigScalarTy = GatheredScalars.front()->getType();
1486414868
auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
14865-
unsigned NumParts = TTI->getNumberOfParts(VecTy);
14866-
if (NumParts == 0 || NumParts >= GatheredScalars.size() ||
14867-
VecTy->getNumElements() % NumParts != 0 ||
14868-
!hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
14869-
VecTy->getNumElements() / NumParts))
14870-
NumParts = 1;
14869+
unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
1487114870
if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
1487214871
// Check for gathered extracts.
1487314872
bool Resized = false;
@@ -14899,12 +14898,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
1489914898
Resized = true;
1490014899
GatheredScalars.append(VF - GatheredScalars.size(),
1490114900
PoisonValue::get(OrigScalarTy));
14902-
NumParts = TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF));
14903-
if (NumParts == 0 || NumParts >= GatheredScalars.size() ||
14904-
VecTy->getNumElements() % NumParts != 0 ||
14905-
!hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
14906-
VecTy->getNumElements() / NumParts))
14907-
NumParts = 1;
14901+
NumParts =
14902+
::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF), VF);
1490814903
}
1490914904
}
1491014905
}
@@ -17049,10 +17044,10 @@ void BoUpSLP::optimizeGatherSequence() {
1704917044
// Check if the last undefs actually change the final number of used vector
1705017045
// registers.
1705117046
return SM1.size() - LastUndefsCnt > 1 &&
17052-
TTI->getNumberOfParts(SI1->getType()) ==
17053-
TTI->getNumberOfParts(
17054-
getWidenedType(SI1->getType()->getElementType(),
17055-
SM1.size() - LastUndefsCnt));
17047+
::getNumberOfParts(*TTI, SI1->getType()) ==
17048+
::getNumberOfParts(
17049+
*TTI, getWidenedType(SI1->getType()->getElementType(),
17050+
SM1.size() - LastUndefsCnt));
1705617051
};
1705717052
// Perform O(N^2) search over the gather/shuffle sequences and merge identical
1705817053
// instructions. TODO: We can further optimize this scan if we split the
@@ -17829,9 +17824,12 @@ bool BoUpSLP::collectValuesToDemote(
1782917824
const unsigned VF = E.Scalars.size();
1783017825
Type *OrigScalarTy = E.Scalars.front()->getType();
1783117826
if (UniqueBases.size() <= 2 ||
17832-
TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF)) ==
17833-
TTI->getNumberOfParts(getWidenedType(
17834-
IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
17827+
::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) ==
17828+
::getNumberOfParts(
17829+
*TTI,
17830+
getWidenedType(
17831+
IntegerType::get(OrigScalarTy->getContext(), BitWidth),
17832+
VF)))
1783517833
ToDemote.push_back(E.Idx);
1783617834
}
1783717835
return Res;
@@ -18241,8 +18239,8 @@ void BoUpSLP::computeMinimumValueSizes() {
1824118239
[&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
1824218240
return 0u;
1824318241

18244-
unsigned NumParts = TTI->getNumberOfParts(
18245-
getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
18242+
unsigned NumParts = ::getNumberOfParts(
18243+
*TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
1824618244

1824718245
// The maximum bit width required to represent all the values that can be
1824818246
// demoted without loss of precision. It would be safe to truncate the roots
@@ -18302,8 +18300,10 @@ void BoUpSLP::computeMinimumValueSizes() {
1830218300
// use - ignore it.
1830318301
if (NumParts > 1 &&
1830418302
NumParts ==
18305-
TTI->getNumberOfParts(getWidenedType(
18306-
IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
18303+
::getNumberOfParts(
18304+
*TTI, getWidenedType(IntegerType::get(F->getContext(),
18305+
bit_ceil(MaxBitWidth)),
18306+
VF)))
1830718307
return 0u;
1830818308

1830918309
unsigned Opcode = E.getOpcode();
@@ -20086,14 +20086,14 @@ class HorizontalReduction {
2008620086
ReduxWidth =
2008720087
getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
2008820088
VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
20089-
NumParts = TTI.getNumberOfParts(Tp);
20089+
NumParts = ::getNumberOfParts(TTI, Tp);
2009020090
NumRegs =
2009120091
TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
2009220092
while (NumParts > NumRegs) {
2009320093
assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
2009420094
ReduxWidth = bit_floor(ReduxWidth - 1);
2009520095
VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
20096-
NumParts = TTI.getNumberOfParts(Tp);
20096+
NumParts = ::getNumberOfParts(TTI, Tp);
2009720097
NumRegs =
2009820098
TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
2009920099
}

llvm/test/Transforms/SLPVectorizer/RISCV/partial-vec-invalid-cost.ll

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,17 @@ define void @partial_vec_invalid_cost() #0 {
77
; CHECK-LABEL: define void @partial_vec_invalid_cost(
88
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
99
; CHECK-NEXT: entry:
10-
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> zeroinitializer)
10+
; CHECK-NEXT: [[LSHR_1:%.*]] = lshr i96 0, 0
11+
; CHECK-NEXT: [[LSHR_2:%.*]] = lshr i96 0, 0
12+
; CHECK-NEXT: [[TRUNC_I96_1:%.*]] = trunc i96 [[LSHR_1]] to i32
13+
; CHECK-NEXT: [[TRUNC_I96_2:%.*]] = trunc i96 [[LSHR_2]] to i32
14+
; CHECK-NEXT: [[TRUNC_I96_3:%.*]] = trunc i96 0 to i32
15+
; CHECK-NEXT: [[TRUNC_I96_4:%.*]] = trunc i96 0 to i32
1116
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> zeroinitializer)
12-
; CHECK-NEXT: [[OP_RDX3:%.*]] = or i32 [[TMP0]], [[TMP1]]
17+
; CHECK-NEXT: [[OP_RDX:%.*]] = or i32 [[TMP1]], [[TRUNC_I96_1]]
18+
; CHECK-NEXT: [[OP_RDX1:%.*]] = or i32 [[TRUNC_I96_2]], [[TRUNC_I96_3]]
19+
; CHECK-NEXT: [[OP_RDX2:%.*]] = or i32 [[OP_RDX]], [[OP_RDX1]]
20+
; CHECK-NEXT: [[OP_RDX3:%.*]] = or i32 [[OP_RDX2]], [[TRUNC_I96_4]]
1321
; CHECK-NEXT: [[STORE_THIS:%.*]] = zext i32 [[OP_RDX3]] to i96
1422
; CHECK-NEXT: store i96 [[STORE_THIS]], ptr null, align 16
1523
; CHECK-NEXT: ret void

0 commit comments

Comments
 (0)