@@ -3259,13 +3259,14 @@ class BoUpSLP {
32593259#endif
32603260
32613261 /// Create a new VectorizableTree entry.
3262- TreeEntry *
3263- newTreeEntry(ArrayRef<Value *> VL, std::optional<ScheduleData *> Bundle,
3264- const InstructionsState &S, const EdgeInfo &UserTreeIdx,
3265- ArrayRef<int> ReuseShuffleIndices = std::nullopt,
3266- ArrayRef<unsigned> ReorderIndices = std::nullopt,
3267- unsigned InterleaveFactor = 0,
3268- const DenseSet<const TreeEntry *> &Nodes = {}) {
3262+ TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3263+ std::optional<ScheduleData *> Bundle,
3264+ const InstructionsState &S,
3265+ const EdgeInfo &UserTreeIdx,
3266+ ArrayRef<int> ReuseShuffleIndices = std::nullopt,
3267+ ArrayRef<unsigned> ReorderIndices = std::nullopt,
3268+ unsigned InterleaveFactor = 0,
3269+ const DenseSet<const TreeEntry *> &Nodes = {}) {
32693270 TreeEntry::EntryState EntryState =
32703271 Bundle ? ((InterleaveFactor > 0 && !Nodes.empty())
32713272 ? TreeEntry::InterleavedVectorize
@@ -5532,11 +5533,12 @@ void BoUpSLP::reorderTopToBottom() {
55325533 // need to take into account their order when looking for the most used
55335534 // order.
55345535 if (TE->isAltShuffle()) {
5535- VectorType *VecTy =
5536- getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
5536+ VectorType *VecTy =
5537+ getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
55375538 unsigned Opcode0 = TE->getOpcode();
55385539 unsigned Opcode1 = TE->getAltOpcode();
5539- SmallBitVector OpcodeMask(getAltInstrMask(TE->Scalars, Opcode0, Opcode1));
5540+ SmallBitVector OpcodeMask(
5541+ getAltInstrMask(TE->Scalars, Opcode0, Opcode1));
55405542 // If this pattern is supported by the target then we consider the
55415543 // order.
55425544 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
@@ -6739,173 +6741,170 @@ void BoUpSLP::tryToVectorizeGatheredLoads() {
67396741 }
67406742 return Results;
67416743 };
6742- auto ProcessGatheredLoads =
6743- [&](ArrayRef< SmallVector<std::pair<LoadInst *, int>>> GatheredLoads) {
6744- SmallVector<LoadInst *> NonVectorized;
6745- for (ArrayRef<std::pair< LoadInst *, int>> LoadsDists : GatheredLoads) {
6746- SmallVector <std::pair<LoadInst *, int>> LocalLoadsDists( LoadsDists);
6747- SmallVector<LoadInst *> OriginalLoads( LocalLoadsDists.size() );
6748- transform(
6749- LoadsDists, OriginalLoads.begin(),
6750- [](const std::pair<LoadInst *, int> &L) { return L.first; });
6751- stable_sort(LocalLoadsDists, LoadSorter);
6752- SmallVector<LoadInst *> Loads;
6753- for (const std::pair<LoadInst *, int> &L : LocalLoadsDists) {
6754- if (!getTreeEntry(L.first))
6755- Loads.push_back(L.first);
6756- }
6757- if (Loads.empty())
6758- continue;
6759- BoUpSLP::ValueSet VectorizedLoads;
6760- SmallVector<LoadInst *> SortedNonVectorized;
6761- SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
6762- GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized);
6763- if (!Results.empty() && !SortedNonVectorized.empty() &&
6764- all_of(Results,
6765- [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
6766- return P.second == LoadsState::ScatterVectorize;
6767- })) {
6768- VectorizedLoads.clear();
6769- SmallVector<LoadInst *> UnsortedNonVectorized;
6770- SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
6771- UnsortedResults = GetVectorizedRanges(
6772- OriginalLoads, VectorizedLoads, UnsortedNonVectorized);
6773- if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
6774- SortedNonVectorized.swap(UnsortedNonVectorized);
6775- Results.swap(UnsortedResults);
6776- }
6777- }
6778- for (auto [Slice, _] : Results) {
6779- LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
6780- << Slice.size() << ")\n");
6781- if (any_of(Slice, [&](Value *V) { return getTreeEntry(V); })) {
6782- for (Value *L : Slice)
6783- if (!getTreeEntry(L))
6784- SortedNonVectorized.push_back(cast<LoadInst>(L));
6785- continue;
6786- }
6744+ auto ProcessGatheredLoads = [&](ArrayRef<
6745+ SmallVector<std::pair<LoadInst *, int>>>
6746+ GatheredLoads) {
6747+ SmallVector< LoadInst *> NonVectorized;
6748+ for (ArrayRef <std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
6749+ SmallVector<std::pair< LoadInst *, int>> LocalLoadsDists(LoadsDists );
6750+ SmallVector<LoadInst *> OriginalLoads(LocalLoadsDists.size());
6751+ transform( LoadsDists, OriginalLoads.begin(),
6752+ [](const std::pair<LoadInst *, int> &L) { return L.first; });
6753+ stable_sort(LocalLoadsDists, LoadSorter);
6754+ SmallVector<LoadInst *> Loads;
6755+ for (const std::pair<LoadInst *, int> &L : LocalLoadsDists) {
6756+ if (!getTreeEntry(L.first))
6757+ Loads.push_back(L.first);
6758+ }
6759+ if (Loads.empty())
6760+ continue;
6761+ BoUpSLP::ValueSet VectorizedLoads;
6762+ SmallVector<LoadInst *> SortedNonVectorized;
6763+ SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
6764+ GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized);
6765+ if (!Results.empty() && !SortedNonVectorized.empty() &&
6766+ all_of(Results,
6767+ [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
6768+ return P.second == LoadsState::ScatterVectorize;
6769+ })) {
6770+ VectorizedLoads.clear();
6771+ SmallVector<LoadInst *> UnsortedNonVectorized;
6772+ SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> UnsortedResults =
6773+ GetVectorizedRanges(OriginalLoads, VectorizedLoads,
6774+ UnsortedNonVectorized);
6775+ if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
6776+ SortedNonVectorized.swap(UnsortedNonVectorized);
6777+ Results.swap(UnsortedResults);
6778+ }
6779+ }
6780+ for (auto [Slice, _] : Results) {
6781+ LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
6782+ << Slice.size() << ")\n");
6783+ if (any_of(Slice, [&](Value *V) { return getTreeEntry(V); })) {
6784+ for (Value *L : Slice)
6785+ if (!getTreeEntry(L))
6786+ SortedNonVectorized.push_back(cast<LoadInst>(L));
6787+ continue;
6788+ }
67876789
6788- // Select maximum VF as a maximum of user gathered nodes and
6789- // distance between scalar loads in these nodes.
6790- unsigned MaxVF = Slice.size();
6791- unsigned UserMaxVF = 0;
6792- std::optional<unsigned> SegmentedLoadsDistance = 0;
6793- std::optional<unsigned> CommonVF = 0;
6794- unsigned Order = 0;
6795- DenseMap<const TreeEntry *, unsigned> EntryToPosition;
6796- DenseSet<const TreeEntry *> DeinterleavedNodes;
6797- for (auto [Idx, V] : enumerate(Slice)) {
6798- for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
6799- UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
6800- unsigned Pos =
6801- EntryToPosition.try_emplace(E, Idx).first->second;
6802- UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
6803- if (CommonVF) {
6804- if (*CommonVF == 0) {
6805- CommonVF = E->Scalars.size();
6806- continue;
6807- }
6808- if (*CommonVF != E->Scalars.size())
6809- CommonVF.reset();
6810- }
6811- if (Pos != Idx && SegmentedLoadsDistance) {
6812- DeinterleavedNodes.insert(E);
6813- if (*SegmentedLoadsDistance == 0) {
6814- SegmentedLoadsDistance = Idx - Pos;
6815- continue;
6816- }
6817- if ((Idx - Pos) % *SegmentedLoadsDistance != 0 ||
6818- (Idx - Pos) / *SegmentedLoadsDistance < Order) {
6819- SegmentedLoadsDistance.reset();
6820- DeinterleavedNodes.clear();
6821- }
6822- Order = (Idx - Pos) / SegmentedLoadsDistance.value_or(1);
6823- }
6790+ // Select maximum VF as a maximum of user gathered nodes and
6791+ // distance between scalar loads in these nodes.
6792+ unsigned MaxVF = Slice.size();
6793+ unsigned UserMaxVF = 0;
6794+ std::optional<unsigned> SegmentedLoadsDistance = 0;
6795+ std::optional<unsigned> CommonVF = 0;
6796+ unsigned Order = 0;
6797+ DenseMap<const TreeEntry *, unsigned> EntryToPosition;
6798+ DenseSet<const TreeEntry *> DeinterleavedNodes;
6799+ for (auto [Idx, V] : enumerate(Slice)) {
6800+ for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
6801+ UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
6802+ unsigned Pos = EntryToPosition.try_emplace(E, Idx).first->second;
6803+ UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
6804+ if (CommonVF) {
6805+ if (*CommonVF == 0) {
6806+ CommonVF = E->Scalars.size();
6807+ continue;
68246808 }
6809+ if (*CommonVF != E->Scalars.size())
6810+ CommonVF.reset();
68256811 }
6826- unsigned Limit = 2;
6827- unsigned InterleaveFactor = 0;
6828- // Check if the large load represents segmented load operation.
6829- if (SegmentedLoadsDistance.value_or(0) > 1 &&
6830- CommonVF.value_or(0) != 0) {
6831- InterleaveFactor = PowerOf2Ceil(*SegmentedLoadsDistance);
6832- unsigned VF = *CommonVF;
6833- SmallVector<unsigned> Order;
6834- SmallVector<Value *> PointerOps;
6835- // Segmented load detected - vectorize at maximum vector factor.
6836- if (TTI->isLegalInterleavedAccessType(
6837- getWidenedType(Slice.front()->getType(), VF),
6838- InterleaveFactor,
6839- cast<LoadInst>(Slice.front())->getAlign(),
6840- cast<LoadInst>(Slice.front())
6841- ->getPointerAddressSpace()) &&
6842- canVectorizeLoads(Slice, Slice.front(), Order, PointerOps) ==
6843- LoadsState::Vectorize) {
6844- UserMaxVF = InterleaveFactor * VF;
6845- Limit = UserMaxVF;
6846- } else {
6847- UserMaxVF = VF;
6812+ if (Pos != Idx && SegmentedLoadsDistance) {
6813+ DeinterleavedNodes.insert(E);
6814+ if (*SegmentedLoadsDistance == 0) {
6815+ SegmentedLoadsDistance = Idx - Pos;
6816+ continue;
6817+ }
6818+ if ((Idx - Pos) % *SegmentedLoadsDistance != 0 ||
6819+ (Idx - Pos) / *SegmentedLoadsDistance < Order) {
6820+ SegmentedLoadsDistance.reset();
68486821 DeinterleavedNodes.clear();
68496822 }
6850- } else {
6851- DeinterleavedNodes.clear();
6823+ Order = (Idx - Pos) / SegmentedLoadsDistance.value_or(1);
68526824 }
6853- // Cannot represent the loads as consecutive vectorizable nodes -
6854- // just exit.
6855- unsigned ConsecutiveNodesSize = 0;
6856- if (!LoadEntriesToVectorize.empty() &&
6857- (SegmentedLoadsDistance.value_or(0) == 0 ||
6858- CommonVF.value_or(UserMaxVF) == UserMaxVF) &&
6859- any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
6860- [&, Slice = Slice](const auto &P) {
6861- const auto *It = find_if(Slice, [&](Value *V) {
6862- return std::get<1>(P).contains(V);
6863- });
6864- if (It == Slice.end())
6865- return false;
6866- ArrayRef<Value *> VL = std::get<0>(P);
6867- ConsecutiveNodesSize += VL.size();
6868- unsigned Start = std::distance(Slice.begin(), It);
6869- unsigned Sz = Slice.size() - Start;
6870- return Sz < VL.size() ||
6871- Slice.slice(std::distance(Slice.begin(), It),
6872- VL.size()) != VL;
6825+ }
6826+ }
6827+ unsigned Limit = 2;
6828+ unsigned InterleaveFactor = 0;
6829+ // Check if the large load represents segmented load operation.
6830+ if (SegmentedLoadsDistance.value_or(0) > 1 &&
6831+ CommonVF.value_or(0) != 0) {
6832+ InterleaveFactor = PowerOf2Ceil(*SegmentedLoadsDistance);
6833+ unsigned VF = *CommonVF;
6834+ SmallVector<unsigned> Order;
6835+ SmallVector<Value *> PointerOps;
6836+ // Segmented load detected - vectorize at maximum vector factor.
6837+ if (TTI->isLegalInterleavedAccessType(
6838+ getWidenedType(Slice.front()->getType(), VF),
6839+ InterleaveFactor, cast<LoadInst>(Slice.front())->getAlign(),
6840+ cast<LoadInst>(Slice.front())->getPointerAddressSpace()) &&
6841+ canVectorizeLoads(Slice, Slice.front(), Order, PointerOps) ==
6842+ LoadsState::Vectorize) {
6843+ UserMaxVF = InterleaveFactor * VF;
6844+ Limit = UserMaxVF;
6845+ } else {
6846+ UserMaxVF = VF;
6847+ DeinterleavedNodes.clear();
6848+ }
6849+ } else {
6850+ DeinterleavedNodes.clear();
6851+ }
6852+ // Cannot represent the loads as consecutive vectorizable nodes -
6853+ // just exit.
6854+ unsigned ConsecutiveNodesSize = 0;
6855+ if (!LoadEntriesToVectorize.empty() &&
6856+ (SegmentedLoadsDistance.value_or(0) == 0 ||
6857+ CommonVF.value_or(UserMaxVF) == UserMaxVF) &&
6858+ any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
6859+ [&, Slice = Slice](const auto &P) {
6860+ const auto *It = find_if(Slice, [&](Value *V) {
6861+ return std::get<1>(P).contains(V);
6862+ });
6863+ if (It == Slice.end())
6864+ return false;
6865+ ArrayRef<Value *> VL = std::get<0>(P);
6866+ ConsecutiveNodesSize += VL.size();
6867+ unsigned Start = std::distance(Slice.begin(), It);
6868+ unsigned Sz = Slice.size() - Start;
6869+ return Sz < VL.size() ||
6870+ Slice.slice(std::distance(Slice.begin(), It),
6871+ VL.size()) != VL;
6872+ }))
6873+ continue;
6874+ if (Slice.size() != ConsecutiveNodesSize)
6875+ MaxVF = std::min<unsigned>(MaxVF, PowerOf2Ceil(UserMaxVF));
6876+ for (unsigned VF = MaxVF; VF >= Limit; VF /= 2) {
6877+ bool IsVectorized = true;
6878+ for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
6879+ ArrayRef<Value *> SubSlice = Slice.slice(I, VF);
6880+ if (getTreeEntry(SubSlice.front()))
6881+ continue;
6882+ // Check if the subslice is to be-vectorized entry, which is not
6883+ // equal to entry.
6884+ if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
6885+ [&](const auto &P) {
6886+ return !SubSlice.equals(std::get<0>(P)) &&
6887+ set_is_subset(SubSlice, std::get<1>(P));
68736888 }))
68746889 continue;
6875- if (Slice.size() != ConsecutiveNodesSize)
6876- MaxVF = std::min<unsigned>(MaxVF, PowerOf2Ceil(UserMaxVF));
6877- for (unsigned VF = MaxVF; VF >= Limit; VF /= 2) {
6878- bool IsVectorized = true;
6879- for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
6880- ArrayRef<Value *> SubSlice = Slice.slice(I, VF);
6881- if (getTreeEntry(SubSlice.front()))
6882- continue;
6883- // Check if the subslice is to be-vectorized entry, which is not
6884- // equal to entry.
6885- if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
6886- [&](const auto &P) {
6887- return !SubSlice.equals(std::get<0>(P)) &&
6888- set_is_subset(SubSlice, std::get<1>(P));
6889- }))
6890- continue;
6891- unsigned Sz = VectorizableTree.size();
6892- buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor,
6893- DeinterleavedNodes);
6894- if (Sz + 1 == VectorizableTree.size() &&
6895- VectorizableTree.back()->isGather()) {
6896- VectorizableTree.pop_back();
6897- IsVectorized = false;
6898- continue;
6899- }
6900- }
6901- if (IsVectorized)
6902- break;
6890+ unsigned Sz = VectorizableTree.size();
6891+ buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor,
6892+ DeinterleavedNodes);
6893+ if (Sz + 1 == VectorizableTree.size() &&
6894+ VectorizableTree.back()->isGather()) {
6895+ VectorizableTree.pop_back();
6896+ IsVectorized = false;
6897+ continue;
69036898 }
69046899 }
6905- NonVectorized.append(SortedNonVectorized);
6900+ if (IsVectorized)
6901+ break;
69066902 }
6907- return NonVectorized;
6908- };
6903+ }
6904+ NonVectorized.append(SortedNonVectorized);
6905+ }
6906+ return NonVectorized;
6907+ };
69096908 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(GatheredLoads);
69106909 SmallVector<SmallVector<std::pair<LoadInst *, int>>> FinalGatheredLoads;
69116910 for (LoadInst *LI : NonVectorized) {
0 commit comments