@@ -1926,6 +1926,19 @@ class BoUpSLP {
  class ShuffleCostEstimator;
  class ShuffleInstructionBuilder;

+  /// If we decide to generate a strided load / store, this struct contains
+  /// all the necessary info. Its fields are calculated by
+  /// analyzeRtStrideCandidate and analyzeConstantStrideCandidate. Note that
+  /// the stride can be given either as a SCEV or as a Value if it already
+  /// exists. To get the stride in bytes, StrideVal (or the value obtained
+  /// from StrideSCEV) has to be multiplied by the size of an element of Ty.
+  struct StridedPtrInfo {
+    Value *StrideVal = nullptr;
+    const SCEV *StrideSCEV = nullptr;
+    FixedVectorType *Ty = nullptr;
+  };
+  SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
+
public:
  /// Tracks the state we can represent the loads in the given sequence.
  enum class LoadsState {
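A minimal sketch (illustrative helper, not part of the patch) of the convention described above: the recorded stride counts elements of Ty, so consumers scale it by the element's alloc size to obtain the byte stride expected by the strided-load intrinsic, mirroring what vectorizeTree does further down.

// Hypothetical helper; assumes the SCEV form (StrideSCEV) has already been
// expanded to a Value, so only StrideVal is consulted here.
static Value *computeByteStride(IRBuilderBase &Builder, const DataLayout &DL,
                                const StridedPtrInfo &SPtrInfo,
                                Type *StrideTy) {
  Type *ElemTy = SPtrInfo.Ty->getElementType();
  Value *StrideElems =
      Builder.CreateIntCast(SPtrInfo.StrideVal, StrideTy, /*isSigned=*/true);
  return Builder.CreateMul(
      StrideElems, ConstantInt::get(StrideTy, DL.getTypeAllocSize(ElemTy)));
}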
@@ -2221,6 +2234,11 @@ class BoUpSLP {
  /// TODO: If load combining is allowed in the IR optimizer, this analysis
  /// may not be necessary.
  bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
+  bool isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
+                     ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
+                     const DataLayout &DL, ScalarEvolution &SE,
+                     const bool IsAnyPointerUsedOutGraph, const int64_t Diff,
+                     StridedPtrInfo &SPtrInfo) const;

  /// Checks if the given array of loads can be represented as a vectorized,
  /// scatter or just simple gather.
@@ -2235,6 +2253,7 @@ class BoUpSLP {
  LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                               SmallVectorImpl<unsigned> &Order,
                               SmallVectorImpl<Value *> &PointerOps,
+                               StridedPtrInfo &SPtrInfo,
                               unsigned *BestVF = nullptr,
                               bool TryRecursiveCheck = true) const;

@@ -4479,11 +4498,10 @@ class BoUpSLP {

  /// Checks if the specified list of the instructions/values can be vectorized
  /// and fills required data before actual scheduling of the instructions.
-  TreeEntry::EntryState
-  getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
-                               bool IsScatterVectorizeUserTE,
-                               OrdersType &CurrentOrder,
-                               SmallVectorImpl<Value *> &PointerOps);
+  TreeEntry::EntryState getScalarsVectorizationState(
+      const InstructionsState &S, ArrayRef<Value *> VL,
+      bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
+      SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);

  /// Maps a specific scalar to its tree entry(ies).
  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
@@ -6800,12 +6818,13 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
/// 4. Any pointer operand is an instruction with the users outside of the
/// current graph (for masked gathers extra extractelement instructions
/// might be required).
-static bool isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
-                          ArrayRef<unsigned> Order,
-                          const TargetTransformInfo &TTI, const DataLayout &DL,
-                          ScalarEvolution &SE,
-                          const bool IsAnyPointerUsedOutGraph,
-                          const int64_t Diff) {
+bool BoUpSLP::isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
+                            ArrayRef<unsigned> Order,
+                            const TargetTransformInfo &TTI,
+                            const DataLayout &DL, ScalarEvolution &SE,
+                            const bool IsAnyPointerUsedOutGraph,
+                            const int64_t Diff,
+                            StridedPtrInfo &SPtrInfo) const {
  const size_t Sz = VL.size();
  const uint64_t AbsoluteDiff = std::abs(Diff);
  Type *ScalarTy = VL.front()->getType();
@@ -6847,17 +6866,20 @@ static bool isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
      if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
        break;
    }
-    if (Dists.size() == Sz)
+    if (Dists.size() == Sz) {
+      Type *StrideTy = DL.getIndexType(Ptr0->getType());
+      SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
+      SPtrInfo.Ty = getWidenedType(ScalarTy, Sz);
      return true;
+    }
  }
  return false;
}

-BoUpSLP::LoadsState
-BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
-                           SmallVectorImpl<unsigned> &Order,
-                           SmallVectorImpl<Value *> &PointerOps,
-                           unsigned *BestVF, bool TryRecursiveCheck) const {
+BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
+    ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
+    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo,
+    unsigned *BestVF, bool TryRecursiveCheck) const {
  // Check that a vectorized load would load the same memory as a scalar
  // load. For example, we don't want to vectorize loads that are smaller
  // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
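A toy illustration (plain C++, not SLP code) of the constant-stride acceptance test above: a candidate stride S is kept only if every element-scaled distance from the first pointer is a distinct multiple of S, so an S-strided access touches each lane exactly once; for distances {0, 2, 4, 6}, S = 2 passes and the patch records StrideVal = 2 with the widened vector type in SPtrInfo.

#include <cstdint>
#include <set>
#include <vector>

// Mirrors, in isolation, the divisibility + distinctness check on Dists above.
static bool isAcceptableStride(const std::vector<int64_t> &Dists, int64_t S) {
  std::set<int64_t> Seen;
  for (int64_t D : Dists)
    if (D % S != 0 || !Seen.insert(D).second)
      return false;
  return true;
}
// isAcceptableStride({0, 2, 4, 6}, 2) == true
// isAcceptableStride({0, 2, 2, 6}, 2) == false (duplicate lane)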
@@ -6895,9 +6917,13 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
  if (!IsSorted) {
    if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) {
-      if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
-          calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
+      if (const SCEV *Stride =
+              calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order);
+          Stride && TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
+        SPtrInfo.Ty = getWidenedType(ScalarTy, PointerOps.size());
+        SPtrInfo.StrideSCEV = Stride;
        return LoadsState::StridedVectorize;
+      }
    }

    if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
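On this runtime-stride path the stride is only known symbolically, so SPtrInfo.StrideSCEV carries it until codegen; a small sketch (illustrative helper, assumed insertion point) of how such a SCEV is later materialized, matching the SCEVExpander use in vectorizeTree below.

// Expand a symbolic stride to an IR Value at the current insertion point;
// the result is then cast and scaled into a byte stride like any other.
static Value *materializeStride(const SCEV *StrideSCEV, ScalarEvolution &SE,
                                const DataLayout &DL, IRBuilderBase &Builder) {
  SCEVExpander Expander(SE, DL, "strided-load-vec");
  return Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
                                &*Builder.GetInsertPoint());
}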
@@ -6941,7 +6967,7 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
        });
    if (IsPossibleStrided &&
        isStridedLoad(VL, PointerOps, Order, *TTI, *DL, *SE,
-                      IsAnyPointerUsedOutGraph, *Diff))
+                      IsAnyPointerUsedOutGraph, *Diff, SPtrInfo))
      return LoadsState::StridedVectorize;
  }
  if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
@@ -7025,9 +7051,9 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
        ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
        SmallVector<unsigned> Order;
        SmallVector<Value *> PointerOps;
-        LoadsState LS =
-            canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF,
-                              /*TryRecursiveCheck=*/false);
+        LoadsState LS = canVectorizeLoads(Slice, Slice.front(), Order,
+                                          PointerOps, SPtrInfo, BestVF,
+                                          /*TryRecursiveCheck=*/false);
        // Check that the sorted loads are consecutive.
        if (LS == LoadsState::Gather) {
          if (BestVF) {
@@ -7699,9 +7725,10 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
  // extra analysis later, so include such nodes into a special list.
  if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
    SmallVector<Value *> PointerOps;
+    StridedPtrInfo SPtrInfo;
    OrdersType CurrentOrder;
    LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
-                                       CurrentOrder, PointerOps);
+                                       CurrentOrder, PointerOps, SPtrInfo);
    if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
        Res == LoadsState::CompressVectorize)
      return std::move(CurrentOrder);
@@ -9207,8 +9234,9 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
      // Try to build vector load.
      ArrayRef<Value *> Values(
          reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
+      StridedPtrInfo SPtrInfo;
      LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
-                                        PointerOps, &BestVF);
+                                        PointerOps, SPtrInfo, &BestVF);
      if (LS != LoadsState::Gather ||
          (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
        if (LS == LoadsState::ScatterVectorize) {
@@ -9402,6 +9430,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
          unsigned VF = *CommonVF;
          OrdersType Order;
          SmallVector<Value *> PointerOps;
+          StridedPtrInfo SPtrInfo;
          // Segmented load detected - vectorize at maximum vector factor.
          if (InterleaveFactor <= Slice.size() &&
              TTI.isLegalInterleavedAccessType(
@@ -9410,8 +9439,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
                  cast<LoadInst>(Slice.front())->getAlign(),
                  cast<LoadInst>(Slice.front())
                      ->getPointerAddressSpace()) &&
-              canVectorizeLoads(Slice, Slice.front(), Order,
-                                PointerOps) == LoadsState::Vectorize) {
+              canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
+                                SPtrInfo) == LoadsState::Vectorize) {
            UserMaxVF = InterleaveFactor * VF;
          } else {
            InterleaveFactor = 0;
@@ -9433,8 +9462,9 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
              ArrayRef<Value *> VL = TE.Scalars;
              OrdersType Order;
              SmallVector<Value *> PointerOps;
+              StridedPtrInfo SPtrInfo;
              LoadsState State = canVectorizeLoads(
-                  VL, VL.front(), Order, PointerOps);
+                  VL, VL.front(), Order, PointerOps, SPtrInfo);
              if (State == LoadsState::ScatterVectorize ||
                  State == LoadsState::CompressVectorize)
                return false;
@@ -9452,11 +9482,11 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
                [&, Slice = Slice](unsigned Idx) {
                  OrdersType Order;
                  SmallVector<Value *> PointerOps;
+                  StridedPtrInfo SPtrInfo;
                  return canVectorizeLoads(
                             Slice.slice(Idx * UserMaxVF, UserMaxVF),
-                             Slice[Idx * UserMaxVF], Order,
-                             PointerOps) ==
-                         LoadsState::ScatterVectorize;
+                             Slice[Idx * UserMaxVF], Order, PointerOps,
+                             SPtrInfo) == LoadsState::ScatterVectorize;
                }))
          UserMaxVF = MaxVF;
        if (Slice.size() != ConsecutiveNodesSize)
@@ -9813,7 +9843,7 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
    const InstructionsState &S, ArrayRef<Value *> VL,
    bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
-    SmallVectorImpl<Value *> &PointerOps) {
+    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
  assert(S.getMainOp() &&
         "Expected instructions with same/alternate opcodes only.");

@@ -9915,7 +9945,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
      });
    });
  };
-  switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
+  switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
  case LoadsState::Vectorize:
    return TreeEntry::Vectorize;
  case LoadsState::CompressVectorize:
@@ -11385,8 +11415,9 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  OrdersType CurrentOrder;
  SmallVector<Value *> PointerOps;
+  StridedPtrInfo SPtrInfo;
  TreeEntry::EntryState State = getScalarsVectorizationState(
-      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
+      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
  if (State == TreeEntry::NeedToGather) {
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    return;
@@ -11546,6 +11577,7 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
      // Vectorizing non-consecutive loads with `llvm.masked.gather`.
      TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
+      TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
                 TE->dump());
      break;
@@ -12934,8 +12966,9 @@ void BoUpSLP::transformNodes() {
        if (S.getOpcode() == Instruction::Load) {
          OrdersType Order;
          SmallVector<Value *> PointerOps;
-          LoadsState Res =
-              canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
+          StridedPtrInfo SPtrInfo;
+          LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order,
+                                             PointerOps, SPtrInfo);
          AllStrided &= Res == LoadsState::StridedVectorize ||
                        Res == LoadsState::ScatterVectorize ||
                        Res == LoadsState::Gather;
@@ -13041,10 +13074,18 @@ void BoUpSLP::transformNodes() {
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
-        if (StridedCost < OriginalVecCost || ForceStridedLoads)
+        if (StridedCost < OriginalVecCost || ForceStridedLoads) {
          // Strided load is more profitable than consecutive load + reverse -
          // transform the node to strided load.
+          Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
+                                                ->getPointerOperand()
+                                                ->getType());
+          StridedPtrInfo SPtrInfo;
+          SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
+          SPtrInfo.Ty = VecTy;
+          TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
          E.State = TreeEntry::StridedVectorize;
+        }
      }
      break;
    }
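To make the values recorded in this branch concrete (illustrative numbers, not part of the patch): a reversed group of consecutive i32 loads keeps StrideVal = 1 element and Ty = the node's vector type; the direction flip and byte scaling happen only at codegen, combining exactly as in the small arithmetic sketch below.

#include <cstdint>

// Byte stride ultimately fed to the intrinsic: element stride, direction,
// and element alloc size, mirroring the CreateMul in vectorizeTree below.
static int64_t byteStride(int64_t StrideElems, bool IsReverseOrder,
                          uint64_t ElemAllocSize) {
  return (IsReverseOrder ? -1 : 1) * StrideElems *
         static_cast<int64_t>(ElemAllocSize);
}
// byteStride(/*StrideElems=*/1, /*IsReverseOrder=*/true, /*i32=*/4) == -4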
@@ -19485,6 +19526,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {

      LoadInst *LI = cast<LoadInst>(VL0);
      Instruction *NewLI;
+      FixedVectorType *StridedLoadTy = nullptr;
      Value *PO = LI->getPointerOperand();
      if (E->State == TreeEntry::Vectorize) {
        NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
@@ -19522,43 +19564,36 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
      Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
      Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
      PO = IsReverseOrder ? PtrN : Ptr0;
-      std::optional<int64_t> Diff = getPointersDiff(
-          VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
      Type *StrideTy = DL->getIndexType(PO->getType());
      Value *StrideVal;
-      if (Diff) {
-        int64_t Stride =
-            *Diff / (static_cast<int64_t>(E->Scalars.size()) - 1);
-        StrideVal =
-            ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
-                                           DL->getTypeAllocSize(ScalarTy));
-      } else {
-        SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
-        transform(E->Scalars, PointerOps.begin(), [](Value *V) {
-          return cast<LoadInst>(V)->getPointerOperand();
-        });
-        OrdersType Order;
-        const SCEV *StrideSCEV =
-            calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order);
-        assert(StrideSCEV && "At this point stride should be known");
+      const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
+      StridedLoadTy = SPtrInfo.Ty;
+      assert(StridedLoadTy && "Missing StridedPtrInfo for tree entry.");
+      unsigned StridedLoadEC =
+          StridedLoadTy->getElementCount().getKnownMinValue();
+
+      Value *Stride = SPtrInfo.StrideVal;
+      if (!Stride) {
+        const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
+        assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
        SCEVExpander Expander(*SE, *DL, "strided-load-vec");
-        Value *Stride = Expander.expandCodeFor(
-            StrideSCEV, StrideSCEV->getType(), &*Builder.GetInsertPoint());
-        Value *NewStride =
-            Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
-        StrideVal = Builder.CreateMul(
-            NewStride,
-            ConstantInt::get(
-                StrideTy,
-                (IsReverseOrder ? -1 : 1) *
-                    static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
-      }
+        Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
+                                        &*Builder.GetInsertPoint());
+      }
+      Value *NewStride =
+          Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
+      StrideVal = Builder.CreateMul(
+          NewStride, ConstantInt::get(
+                         StrideTy, (IsReverseOrder ? -1 : 1) *
+                                       static_cast<int>(
+                                           DL->getTypeAllocSize(ScalarTy))));
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_load,
-          {VecTy, PO->getType(), StrideTy},
-          {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
-           Builder.getInt32(E->Scalars.size())});
+          {StridedLoadTy, PO->getType(), StrideTy},
+          {PO, StrideVal,
+           Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)),
+           Builder.getInt32(StridedLoadEC)});
      Inst->addParamAttr(
          /*ArgNo=*/0,
          Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
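For reference, a hedged sketch (standalone helper with an illustrative fixed <4 x i32> shape, not the patch's code path) of the llvm.experimental.vp.strided.load call built above: its operands are the base pointer, a signed byte stride, an all-ones mask, and the explicit vector length.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;

// Build the same intrinsic shape as above for a <4 x i32> load from Base
// with the given byte stride; the alignment attribute is attached by the
// caller, as in the patch.
static CallInst *emitStridedLoad(IRBuilderBase &Builder, const DataLayout &DL,
                                 Value *Base, Value *ByteStride) {
  auto *VecTy = FixedVectorType::get(Builder.getInt32Ty(), 4);
  Type *StrideTy = DL.getIndexType(Base->getType());
  Value *Stride = Builder.CreateSExtOrTrunc(ByteStride, StrideTy);
  return Builder.CreateIntrinsic(
      Intrinsic::experimental_vp_strided_load,
      {VecTy, Base->getType(), StrideTy},
      {Base, Stride, Builder.getAllOnesMask(ElementCount::getFixed(4)),
       Builder.getInt32(4)});
}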