Skip to content

Commit 57b0008

Browse files
committed
[𝘀𝗽𝗿] initial version
Created using spr 1.3.5
1 parent f74879c commit 57b0008

File tree

2 files changed

+57
-38
lines changed

2 files changed

+57
-38
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 50 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,22 @@ static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
278278
return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
279279
}
280280

281+
/// Returns the largest number of elements of type \p Ty, not greater than \p Sz,
282+
/// that is a whole multiple of the per-part element count, where the number of
283+
/// parts is reported by \p TTI for the widened type; returns bit_floor(\p Sz) if \p Ty is not a valid element type or the part count is 0 or >= \p Sz.
284+
static unsigned
285+
getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
286+
unsigned Sz) {
287+
if (!isValidElementType(Ty))
288+
return bit_floor(Sz);
289+
// Find the number of elements that forms full vectors: round Sz down to a multiple of the power-of-2 per-part element count.
290+
unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
291+
if (NumParts == 0 || NumParts >= Sz)
292+
return bit_floor(Sz);
293+
unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
294+
return (Sz / RegVF) * RegVF;
295+
}
296+
281297
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
282298
SmallVectorImpl<int> &Mask) {
283299
// The ShuffleBuilder implementation uses shufflevector to splat an "element".
@@ -7651,7 +7667,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
76517667
}
76527668
size_t NumUniqueScalarValues = UniqueValues.size();
76537669
bool IsFullVectors = hasFullVectorsOrPowerOf2(
7654-
*TTI, UniqueValues.front()->getType(), NumUniqueScalarValues);
7670+
*TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
76557671
if (NumUniqueScalarValues == VL.size() &&
76567672
(VectorizeNonPowerOf2 || IsFullVectors)) {
76577673
ReuseShuffleIndices.clear();
@@ -17385,7 +17401,11 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
1738517401
const unsigned Sz = R.getVectorElementSize(Chain[0]);
1738617402
unsigned VF = Chain.size();
1738717403

17388-
if (!has_single_bit(Sz) || !has_single_bit(VF) || VF < 2 || VF < MinVF) {
17404+
if (!has_single_bit(Sz) ||
17405+
!hasFullVectorsOrPowerOf2(
17406+
*TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
17407+
VF) ||
17408+
VF < 2 || VF < MinVF) {
1738917409
// Check if vectorizing with a non-power-of-2 VF should be considered. At
1739017410
// the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
1739117411
// all vector lanes are used.
@@ -17403,10 +17423,12 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
1740317423
InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
1740417424
if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
1740517425
DenseSet<Value *> Stores(Chain.begin(), Chain.end());
17406-
bool IsPowerOf2 =
17407-
has_single_bit(ValOps.size()) ||
17426+
bool IsAllowedSize =
17427+
hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
17428+
ValOps.size()) ||
1740817429
(VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
17409-
if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load &&
17430+
if ((!IsAllowedSize && S.getOpcode() &&
17431+
S.getOpcode() != Instruction::Load &&
1741017432
(!S.MainOp->isSafeToRemove() ||
1741117433
any_of(ValOps.getArrayRef(),
1741217434
[&](Value *V) {
@@ -17417,7 +17439,7 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
1741717439
}));
1741817440
}))) ||
1741917441
(ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
17420-
Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2;
17442+
Size = (!IsAllowedSize && S.getOpcode()) ? 1 : 2;
1742117443
return false;
1742217444
}
1742317445
}
@@ -17545,15 +17567,11 @@ bool SLPVectorizerPass::vectorizeStores(
1754517567

1754617568
unsigned MaxVF =
1754717569
std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
17548-
unsigned MaxRegVF = MaxVF;
1754917570
auto *Store = cast<StoreInst>(Operands[0]);
1755017571
Type *StoreTy = Store->getValueOperand()->getType();
1755117572
Type *ValueTy = StoreTy;
1755217573
if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
1755317574
ValueTy = Trunc->getSrcTy();
17554-
if (ValueTy == StoreTy &&
17555-
R.getVectorElementSize(Store->getValueOperand()) <= EltSize)
17556-
MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
1755717575
unsigned MinVF = std::max<unsigned>(
1755817576
2, PowerOf2Ceil(TTI->getStoreMinimumVF(
1755917577
R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
@@ -17571,10 +17589,21 @@ bool SLPVectorizerPass::vectorizeStores(
1757117589
// First try vectorizing with a non-power-of-2 VF. At the moment, only
1757217590
// consider cases where VF + 1 is a power-of-2, i.e. almost all vector
1757317591
// lanes are used.
17574-
unsigned CandVF =
17575-
std::clamp<unsigned>(Operands.size(), MaxVF, MaxRegVF);
17576-
if (has_single_bit(CandVF + 1))
17592+
unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
17593+
if (has_single_bit(CandVF + 1)) {
1757717594
NonPowerOf2VF = CandVF;
17595+
assert(NonPowerOf2VF != MaxVF &&
17596+
"Non-power-of-2 VF should not be equal to MaxVF");
17597+
}
17598+
}
17599+
17600+
unsigned MaxRegVF = MaxVF;
17601+
MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
17602+
if (MaxVF < MinVF) {
17603+
LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
17604+
<< ") < "
17605+
<< "MinVF (" << MinVF << ")\n");
17606+
continue;
1757817607
}
1757917608

1758017609
unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
@@ -17742,16 +17771,21 @@ bool SLPVectorizerPass::vectorizeStores(
1774217771
(Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
1774317772
break;
1774417773
constexpr unsigned StoresLimit = 64;
17745-
const unsigned MaxTotalNum = bit_floor(std::min<unsigned>(
17774+
const unsigned MaxTotalNum = std::min<unsigned>(
1774617775
Operands.size(),
1774717776
static_cast<unsigned>(
1774817777
End -
1774917778
std::distance(
1775017779
RangeSizes.begin(),
1775117780
find_if(RangeSizes, std::bind(IsNotVectorized, true,
1775217781
std::placeholders::_1))) +
17753-
1)));
17754-
unsigned VF = PowerOf2Ceil(CandidateVFs.front()) * 2;
17782+
1));
17783+
unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
17784+
unsigned Limit =
17785+
getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
17786+
CandidateVFs.clear();
17787+
if (bit_floor(Limit) == VF)
17788+
CandidateVFs.push_back(Limit);
1775517789
if (VF > MaxTotalNum || VF >= StoresLimit)
1775617790
break;
1775717791
for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
@@ -17760,7 +17794,6 @@ bool SLPVectorizerPass::vectorizeStores(
1776017794
});
1776117795
// Last attempt to vectorize max number of elements, if all previous
1776217796
// attempts were unsuccessful because of the cost issues.
17763-
CandidateVFs.clear();
1776417797
CandidateVFs.push_back(VF);
1776517798
}
1776617799
}

llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll

Lines changed: 7 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -4,30 +4,16 @@
44
define void @test(ptr noalias %0, ptr noalias %1) {
55
; CHECK-LABEL: define void @test(
66
; CHECK-SAME: ptr noalias [[TMP0:%.*]], ptr noalias [[TMP1:%.*]]) {
7-
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i64 24
8-
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP1]], i64 48
97
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP1]], i64 8
10-
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP1]], i64 16
11-
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 24
12-
; CHECK-NEXT: [[TMP8:%.*]] = load double, ptr [[TMP7]], align 8
13-
; CHECK-NEXT: store double [[TMP8]], ptr [[TMP5]], align 8
148
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP0]], i64 48
15-
; CHECK-NEXT: [[TMP10:%.*]] = load double, ptr [[TMP9]], align 16
16-
; CHECK-NEXT: store double [[TMP10]], ptr [[TMP6]], align 16
179
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 8
18-
; CHECK-NEXT: [[TMP12:%.*]] = load double, ptr [[TMP11]], align 8
19-
; CHECK-NEXT: store double [[TMP12]], ptr [[TMP3]], align 8
20-
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP0]], i64 32
21-
; CHECK-NEXT: [[TMP14:%.*]] = load double, ptr [[TMP13]], align 16
22-
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP1]], i64 32
23-
; CHECK-NEXT: store double [[TMP14]], ptr [[TMP15]], align 16
24-
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP0]], i64 56
25-
; CHECK-NEXT: [[TMP17:%.*]] = load double, ptr [[TMP16]], align 8
26-
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP1]], i64 40
27-
; CHECK-NEXT: store double [[TMP17]], ptr [[TMP18]], align 8
28-
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP0]], i64 16
29-
; CHECK-NEXT: [[TMP20:%.*]] = load double, ptr [[TMP19]], align 16
30-
; CHECK-NEXT: store double [[TMP20]], ptr [[TMP4]], align 16
10+
; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[TMP9]], align 16
11+
; CHECK-NEXT: [[TMP7:%.*]] = load <4 x double>, ptr [[TMP11]], align 8
12+
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
13+
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
14+
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
15+
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP10]], <6 x i32> <i32 2, i32 4, i32 0, i32 3, i32 5, i32 1>
16+
; CHECK-NEXT: store <6 x double> [[TMP13]], ptr [[TMP5]], align 8
3117
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP0]], i64 40
3218
; CHECK-NEXT: [[TMP22:%.*]] = load double, ptr [[TMP21]], align 8
3319
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[TMP1]], i64 56

0 commit comments

Comments
 (0)