@@ -29769,24 +29769,27 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
2976929769// Given a vector of values, find a permutation such that every adjacent even-
2977029770// odd pair has the same value. ~0 is reserved as a special value for wildcard,
2977129771// which can be paired with any value. Returns true if a permutation is found.
29772+ // If output Permutation is not empty, permutation index starts at its previous
29773+ // size, so that this function can concatenate the result of multiple calls.
29774+ // UnpairedInputs contains values yet to be paired, mapping an unpaired value to
29775+ // its current neighbor's value and index.
29776+ // Do not use llvm::DenseMap as ~0 is a reserved key.
2977229777template <typename InputTy,
2977329778 typename PermutationTy,
29774- typename MapTy = std::unordered_map <typename InputTy::value_type,
29775- std::pair<typename InputTy::value_type, typename PermutationTy::value_type>>>
29779+ typename MapTy = SmallMapVector <typename InputTy::value_type,
29780+ std::pair<typename InputTy::value_type, typename PermutationTy::value_type>, 8 >>
2977629781static bool PermuteAndPairVector(const InputTy& Inputs,
29777- PermutationTy &Permutation) {
29782+ PermutationTy &Permutation,
29783+ MapTy UnpairedInputs = SmallMapVector<typename InputTy::value_type,
29784+ std::pair<typename InputTy::value_type, typename PermutationTy::value_type>, 8>()) {
2977829785 const auto Wildcard = ~typename InputTy::value_type();
29779-
29780- // List of values to be paired, mapping an unpaired value to its current
29781- // neighbor's value and index.
29782- MapTy UnpairedInputs;
2978329786 SmallVector<typename PermutationTy::value_type, 16> WildcardPairs;
2978429787
29785- Permutation.clear ();
29788+ size_t OutputOffset = Permutation.size ();
2978629789 typename PermutationTy::value_type I = 0;
2978729790 for (auto InputIt = Inputs.begin(), InputEnd = Inputs.end(); InputIt != InputEnd;) {
29788- Permutation.push_back(I);
29789- Permutation.push_back(I + 1);
29791+ Permutation.push_back(OutputOffset + I);
29792+ Permutation.push_back(OutputOffset + I + 1);
2979029793
2979129794 auto Even = *InputIt++;
2979229795 assert(InputIt != InputEnd && "Expected even number of elements");
@@ -29806,7 +29809,7 @@ static bool PermuteAndPairVector(const InputTy& Inputs,
2980629809 // value's neighbor, otherwise the current value is added to the map.
2980729810 if (auto [MapIt, Inserted] = UnpairedInputs.try_emplace(This, std::make_pair(Other, OtherIndex)); !Inserted) {
2980829811 auto [SwapValue, SwapIndex] = MapIt->second;
29809- std::swap(Permutation[SwapIndex], Permutation[ThisIndex]);
29812+ std::swap(Permutation[OutputOffset + SwapIndex], Permutation[OutputOffset + ThisIndex]);
2981029813 This = SwapValue;
2981129814 UnpairedInputs.erase(MapIt);
2981229815
@@ -29850,13 +29853,12 @@ static bool PermuteAndPairVector(const InputTy& Inputs,
2985029853 if (Neighbor != Wildcard) {
2985129854 assert(UnpairedInputs.count(Neighbor));
2985229855 if (WildcardPairs.size()) {
29853- std::swap(Permutation[WildcardPairs.back()], Permutation[NeighborIndex]);
29856+ std::swap(Permutation[OutputOffset + WildcardPairs.back()], Permutation[OutputOffset + NeighborIndex]);
2985429857 WildcardPairs.pop_back();
2985529858 // Mark the neighbor as processed.
2985629859 UnpairedInputs[Neighbor].first = Wildcard;
29857- } else {
29860+ } else
2985829861 return false;
29859- }
2986029862 }
2986129863 }
2986229864 return true;
@@ -30140,87 +30142,107 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
3014030142 }
3014130143 }
3014230144
30143- // ISD::SRA/SRL/SHL on vXi8 can be widened to vYi16 (Y = X/2) if the constant
30144- // amounts can be shuffled such that every pair of adjacent elements has the
30145- // same value. This introduces an extra shuffle before and after the shift,
30146- // and it is profitable if the operand is aready a shuffle so that both can
30147- // be merged, or if the extra shuffle is fast (can use VPSHUFB).
30145+ // SHL/SRL/SRA on vXi8 can be widened to vYi16 or vYi32 if the constant
30146+ // amounts can be shuffled such that every pair or quad of adjacent elements
30147+ // has the same value. This introduces an extra shuffle before and after the
30148+ // shift, and it is profitable if the operand is already a shuffle so that both
30149+ // can be merged and the extra shuffle is fast. This is not profitable on
30150+ // AVX512 because it has 16-bit vector variable shift instruction VPS**VW.
3014830151 // (shift (shuffle X P1) S1) ->
3014930152 // (shuffle (shift (shuffle X (shuffle P2 P1)) S2) P2^-1) where S2 can be
3015030153 // widened, and P2^-1 is the inverse shuffle of P2.
30151- if (ConstantAmt && (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) && R.hasOneUse() && Subtarget.hasSSE3()) {
30152- bool Profitable = true;
30153- // VPAND ymm only available on AVX2.
30154- if (VT == MVT::v32i8 || VT == MVT::v64i8) {
30155- Profitable = Subtarget.hasAVX2();
30156- }
30154+ if (ConstantAmt && (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) && R.hasOneUse()
30155+ && Subtarget.hasSSE3() && !Subtarget.hasAVX512()) {
30156+ constexpr size_t LaneBytes = 16;
30157+ const size_t NumLanes = VT.getVectorNumElements() / LaneBytes;
3015730158
3015830159 SmallVector<int, 64> Permutation;
30159- SmallVector<uint16_t , 64> ShiftAmt;
30160+ SmallVector<uint8_t , 64> ShiftAmt;
3016030161 for (size_t I = 0; I < Amt.getNumOperands(); ++I) {
3016130162 if (Amt.getOperand(I).isUndef())
3016230163 ShiftAmt.push_back(~0);
3016330164 else
3016430165 ShiftAmt.push_back(Amt.getConstantOperandVal(I));
3016530166 }
3016630167
30167- if (Profitable && (VT == MVT::v32i8 || VT == MVT::v64i8)) {
30168- Profitable = false;
30169- constexpr size_t LaneBytes = 16;
30170- const size_t NumLanes = VT.getVectorNumElements() / LaneBytes;
30171-
30172- // For v32i8 or v64i8, we should check if we can generate a shuffle that
30173- // may be lowered to VPSHUFB, because it is faster than VPERMB. This is
30174- // possible if we can apply the same shuffle mask to each v16i8 lane.
30175- // For example (assuming a lane has 4 elements for simplicity),
30176- // <1, 2, 2, 1, 4, 3, 3, 4> is handled as <14, 23, 23, 14>, which can
30177- // be shuffled to adjacent pairs <14, 14, 23, 23> with the VPSHUFB mask
30178- // <0, 3, 2, 1> (or high level mask <0, 3, 2, 1, 4, 7, 6, 5>).
30179- // Limitation: if there are some undef in shift amounts, this algorithm
30180- // may not find a solution even if one exists, as here we only treat a
30181- // VPSHUFB index as undef if all shuffle amounts of the same index modulo
30182- // lane size are all undef.
30183- // Since a byte can only be shifted by 7 bits without being UB, 4 bits are
30184- // enough to represent the shift amount or undef (0xF).
30185- std::array<uint16_t, LaneBytes> VPSHUFBShiftAmt = {};
30186- for (size_t I = 0; I < LaneBytes; ++I)
30187- for (size_t J = 0; J < NumLanes; ++J)
30188- VPSHUFBShiftAmt[I] |= (ShiftAmt[I + J * LaneBytes] & 0xF) << (J * 4);
30189- if (VT == MVT::v32i8) {
30190- for (size_t I = 0; I < LaneBytes; ++I)
30191- VPSHUFBShiftAmt[I] |= 0xFF00;
30192- }
30193- if (PermuteAndPairVector(VPSHUFBShiftAmt, Permutation)) {
30194- // Found a VPSHUFB solution, offset the shuffle amount to other lanes.
30195- Permutation.resize(VT.getVectorNumElements());
30196- for (size_t I = 0; I < LaneBytes; ++I)
30197- for (size_t J = 1; J < NumLanes; ++J)
30198- Permutation[I + J * LaneBytes] = Permutation[I] + J * LaneBytes;
30199- Profitable = true;
30200- } else if (R.getOpcode() == ISD::VECTOR_SHUFFLE) {
30201- // A slower shuffle is profitable if the operand is also a slow shuffle,
30202- // such that they can be merged.
30203- // TODO: Use TargetTransformInfo to systematically determine whether
30204- // inner shuffle is slow. Currently we only check if it contains
30205- // cross-lane shuffle.
30206- if (ShuffleVectorSDNode *InnerShuffle = dyn_cast<ShuffleVectorSDNode>(R.getNode())) {
30207- if (InnerShuffle->getMask().size() == VT.getVectorNumElements() &&
30208- is128BitLaneCrossingShuffleMask(VT, InnerShuffle->getMask()))
30209- Profitable = true;
30168+ // Check if we can find an in-lane shuffle to rearrange the shift amounts,
30169+ // if so, this transformation may be profitable.
30170+ bool Profitable;
30171+ for (size_t I = 0; I < NumLanes; ++I) {
30172+ if (!(Profitable = PermuteAndPairVector(ArrayRef(&ShiftAmt[I * LaneBytes], LaneBytes), Permutation)))
30173+ break;
30174+ }
30175+
30176+ // For AVX2, check if we can further rearrange shift amounts into adjacent
30177+ // quads, so that it can use VPS*LVD instead of VPMUL*W as it is 2 cycles
30178+ // faster.
30179+ bool IsAdjacentQuads = false;
30180+ if (Profitable && Subtarget.hasAVX2()) {
30181+ SmallVector<uint8_t, 64> EveryOtherShiftAmt;
30182+ for (size_t I = 0; I < Permutation.size(); I += 2) {
30183+ uint8_t Shift1 = ShiftAmt[Permutation[I]];
30184+ uint8_t Shift2 = ShiftAmt[Permutation[I + 1]];
30185+ assert(Shift1 == Shift2 || ~Shift1 == 0 || ~Shift2 == 0);
30186+ EveryOtherShiftAmt.push_back(~Shift1 ? Shift1 : Shift2);
30187+ }
30188+ SmallVector<int, 32> Permutation2;
30189+ for (size_t I = 0; I < NumLanes; ++I) {
30190+ if (!(IsAdjacentQuads = PermuteAndPairVector(ArrayRef(&EveryOtherShiftAmt[I * LaneBytes / 2], LaneBytes / 2), Permutation2)))
30191+ break;
30192+ }
30193+ if (IsAdjacentQuads) {
30194+ SmallVector<int, 64> CombinedPermutation;
30195+ for (int Index : Permutation2) {
30196+ CombinedPermutation.push_back(Permutation[Index * 2]);
30197+ CombinedPermutation.push_back(Permutation[Index * 2 + 1]);
3021030198 }
30199+ std::swap(Permutation, CombinedPermutation);
3021130200 }
3021230201 }
3021330202
30214- // If it is still profitable at this point, and has not found a permutation
30215- // yet, try again with any shuffle index.
30216- if (Profitable && Permutation.empty()) {
30217- PermuteAndPairVector<decltype(ShiftAmt), decltype(Permutation),
30218- SmallMapVector<uint16_t, std::pair<uint16_t, int>, 8>>(ShiftAmt, Permutation);
30203+ // For right shifts, (V)PMULHUW needs an extra instruction to handle an
30204+ // amount of 0, disabling the transformation here to be cautious.
30205+ if (!IsAdjacentQuads && (Opc == ISD::SRL || Opc == ISD::SRA) &&
30206+ any_of(ShiftAmt, [](auto x) { return x == 0; }))
30207+ Profitable = false;
30208+
30209+ bool IsOperandShuffle = R.getOpcode() == ISD::VECTOR_SHUFFLE;
30210+ // If operand R is not a shuffle by itself, the transformation here adds two
30211+ // shuffles, adding a non-trivial cost. Here we take out a few cases where
30212+ // the benefit is questionable according to llvm-mca's modeling.
30213+ //
30214+ // Each cell shows latency before/after transform. Here R is not a shuffle.
30215+ // SSE3
30216+ // | v16i8 | v32i8 | v64i8
30217+ // ----------------------------
30218+ // SLL | 17/17 | 20/20 | 26/26
30219+ // SRL | 18/17 | 22/20 | 35/26
30220+ // SRA | 21/19 | 26/22 | 39/30
30221+ // AVX2 using VPMUL*W
30222+ // | v16i8 | v32i8 | v64i8
30223+ // ----------------------------
30224+ // SLL | 20/18 | 18/18 | 21/21
30225+ // SRL | 20/18 | 22/18 | 26/21
30226+ // SRA | 20/20 | 22/20 | 25/23
30227+ // AVX2 using VPS*LVD
30228+ // | v16i8 | v32i8 | v64i8
30229+ // ----------------------------
30230+ // SLL | 20/16 | 18/16 | 21/20
30231+ // SRL | 20/16 | 22/16 | 26/20
30232+ // SRA | 20/18 | 22/18 | 25/22
30233+ if (!IsOperandShuffle) {
30234+ if (Subtarget.hasAVX2()) {
30235+ if (!IsAdjacentQuads || (VT == MVT::v64i8 && Opc == ISD::SHL))
30236+ Profitable = false;
30237+ } else {
30238+ if (Opc == ISD::SHL || ((VT == MVT::v16i8 || VT == MVT::v32i8) && Opc == ISD::SRL))
30239+ Profitable = false;
30240+ }
3021930241 }
3022030242
3022130243 // Found a permutation P that can rearrange the shift amounts into adjacent
30222- // pair of same values. Rewrite the shift S1(x) into P^-1(S2(P(x))).
30223- if (!Permutation.empty() ) {
30244+ // pair or quad of same values. Rewrite the shift S1(x) into P^-1(S2(P(x))).
30245+ if (Profitable ) {
3022430246 SDValue InnerShuffle = DAG.getVectorShuffle(VT, dl, R, DAG.getUNDEF(VT), Permutation);
3022530247 SmallVector<SDValue, 64> NewShiftAmt;
3022630248 for (int Index : Permutation) {
0 commit comments