@@ -8304,35 +8304,57 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
83048304 });
83058305 // FIXME: this must be moved to TTI for better estimation.
83068306 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
8307- auto CheckPerRegistersShuffle =
8308- [&](MutableArrayRef<int> Mask,
8309- SmallVector<int> Indices) -> std::optional<TTI::ShuffleKind> {
8307+ auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
8308+ SmallVectorImpl<unsigned> &Indices)
8309+ -> std::optional<TTI::ShuffleKind> {
83108310 if (NumElts <= EltsPerVector)
83118311 return std::nullopt;
8312+ int OffsetReg0 =
8313+ alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
8314+ [](int S, int I) {
8315+ if (I == PoisonMaskElem)
8316+ return S;
8317+ return std::min(S, I);
8318+ }),
8319+ EltsPerVector);
8320+ int OffsetReg1 = OffsetReg0;
83128321 DenseSet<int> RegIndices;
83138322 // Check whether the mask permutes elements of only one or two input registers.
83148323 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
83158324 int FirstRegId = -1;
8316- Indices.assign(1, -1 );
8317- for (int &I : Mask) {
8325+ Indices.assign(1, OffsetReg0 );
8326+ for (auto [Pos, I] : enumerate( Mask) ) {
83188327 if (I == PoisonMaskElem)
83198328 continue;
8320- int RegId = (I / NumElts) * NumParts + (I % NumElts) / EltsPerVector;
8329+ int Idx = I - OffsetReg0;
8330+ int RegId =
8331+ (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
83218332 if (FirstRegId < 0)
83228333 FirstRegId = RegId;
83238334 RegIndices.insert(RegId);
83248335 if (RegIndices.size() > 2)
83258336 return std::nullopt;
83268337 if (RegIndices.size() == 2) {
83278338 ShuffleKind = TTI::SK_PermuteTwoSrc;
8328- if (Indices.size() == 1)
8329- Indices.push_back(-1);
8339+ if (Indices.size() == 1) {
8340+ OffsetReg1 = alignDown(
8341+ std::accumulate(
8342+ std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
8343+ [&](int S, int I) {
8344+ if (I == PoisonMaskElem)
8345+ return S;
8346+ int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
8347+ ((I - OffsetReg0) % NumElts) / EltsPerVector;
8348+ if (RegId == FirstRegId)
8349+ return S;
8350+ return std::min(S, I);
8351+ }),
8352+ EltsPerVector);
8353+ Indices.push_back(OffsetReg1 % NumElts);
8354+ }
8355+ Idx = I - OffsetReg1;
83308356 }
8331- if (RegId == FirstRegId)
8332- Indices.front() = I % NumElts;
8333- else
8334- Indices.back() = I % NumElts;
8335- I = (I % NumElts) % EltsPerVector +
8357+ I = (Idx % NumElts) % EltsPerVector +
83368358 (RegId == FirstRegId ? 0 : EltsPerVector);
83378359 }
83388360 return ShuffleKind;
@@ -8349,7 +8371,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
83498371 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
83508372 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
83518373 copy(MaskSlice, SubMask.begin());
8352- SmallVector<int > Indices;
8374+ SmallVector<unsigned, 2 > Indices;
83538375 std::optional<TTI::ShuffleKind> RegShuffleKind =
83548376 CheckPerRegistersShuffle(SubMask, Indices);
83558377 if (!RegShuffleKind) {
@@ -8367,12 +8389,24 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
83678389 FixedVectorType::get(ScalarTy, EltsPerVector),
83688390 SubMask);
83698391 }
8370- for (int Idx : Indices) {
8371- Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
8372- FixedVectorType::get(ScalarTy, NumElts),
8373- std::nullopt, CostKind, Idx,
8374- FixedVectorType::get(ScalarTy, EltsPerVector));
8375- }
8392+ for (unsigned Idx : Indices) {
8393+ assert((Idx + EltsPerVector) <= alignTo(NumElts, EltsPerVector) &&
8394+ "SK_ExtractSubvector index out of range");
8395+ Cost += ::getShuffleCost(
8396+ TTI, TTI::SK_ExtractSubvector,
8397+ FixedVectorType::get(ScalarTy, alignTo(NumElts, EltsPerVector)),
8398+ std::nullopt, CostKind, Idx,
8399+ FixedVectorType::get(ScalarTy, EltsPerVector));
8400+ }
8401+ // Second attempt: check whether a plain permute of the whole vector is
8402+ // estimated to be cheaper than the subvector extracts plus per-register shuffle.
8403+ SubMask.assign(NumElts, PoisonMaskElem);
8404+ copy(MaskSlice, SubMask.begin());
8405+ InstructionCost OriginalCost =
8406+ ::getShuffleCost(TTI, *ShuffleKinds[Part],
8407+ FixedVectorType::get(ScalarTy, NumElts), SubMask);
8408+ if (OriginalCost < Cost)
8409+ Cost = OriginalCost;
83768410 }
83778411 return Cost;
83788412 }
0 commit comments