2828#include "llvm/ADT/StringSwitch.h"
2929#include "llvm/Analysis/BlockFrequencyInfo.h"
3030#include "llvm/Analysis/ObjCARCUtil.h"
31- #include "llvm/Analysis/ProfileSummaryInfo.h"
3231#include "llvm/Analysis/VectorUtils.h"
3332#include "llvm/CodeGen/IntrinsicLowering.h"
3433#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -29782,8 +29781,8 @@ template <typename InputTy, typename PermutationTy,
2978229781 8>>
2978329782static bool PermuteAndPairVector(
2978429783 const InputTy &Inputs, PermutationTy &Permutation,
29785- MapTy UnpairedInputs = MapTy()) {
29786- const auto Wildcard = ~ typename InputTy::value_type() ;
29784+ MapTy UnpairedInputs = MapTy()) {static_assert(std::is_same<typename InputTy::value_type, uint8_t>::value);
29785+ const typename InputTy::value_type Wildcard = ~0 ;
2978729786 SmallVector<typename PermutationTy::value_type, 16> WildcardPairs;
2978829787
2978929788 size_t OutputOffset = Permutation.size();
@@ -30155,14 +30154,16 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
3015530154 // amounts can be shuffled such that every pair or quad of adjacent elements
3015630155 // has the same value. This introduces an extra shuffle before and after the
3015730156 // shift, and it is profitable if the operand is already a shuffle so that both
30158- // can be merged and the extra shuffle is fast. This is not profitable on
30159- // AVX512 becasue it has 16-bit vector variable shift instruction VPS**VW.
30157+ // can be merged or the extra shuffle is fast.
3016030158 // (shift (shuffle X P1) S1) ->
3016130159 // (shuffle (shift (shuffle X (shuffle P2 P1)) S2) P2^-1) where S2 can be
3016230160 // widened, and P2^-1 is the inverse shuffle of P2.
30161+ // This is not profitable on XOP or AVX512 because they have 8/16-bit vector
30162+ // variable shift instructions.
3016330163 if (ConstantAmt &&
3016430164 (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) &&
30165- R.hasOneUse() && Subtarget.hasSSE3() && !Subtarget.hasAVX512()) {
30165+ R.hasOneUse() && Subtarget.hasSSSE3() && !Subtarget.hasAVX512() &&
30166+ !Subtarget.hasXOP()) {
3016630167 constexpr size_t LaneBytes = 16;
3016730168 const size_t NumLanes = VT.getVectorNumElements() / LaneBytes;
3016830169
@@ -30176,7 +30177,9 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
3017630177 }
3017730178
3017830179 // Check if we can find an in-lane shuffle to rearrange the shift amounts,
30179- // if so, this transformation may be profitable.
30180+ // if so, this transformation may be profitable. Cross-lane shuffle is
30181+ // almost never profitable because there is no general 1-instruction
30182+ // solution.
3018030183 bool Profitable;
3018130184 for (size_t I = 0; I < NumLanes; ++I) {
3018230185 if (!(Profitable = PermuteAndPairVector(
@@ -30193,8 +30196,9 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
3019330196 for (size_t I = 0; I < Permutation.size(); I += 2) {
3019430197 uint8_t Shift1 = ShiftAmt[Permutation[I]];
3019530198 uint8_t Shift2 = ShiftAmt[Permutation[I + 1]];
30196- assert(Shift1 == Shift2 || ~Shift1 == 0 || ~Shift2 == 0);
30197- EveryOtherShiftAmt.push_back(~Shift1 ? Shift1 : Shift2);
30199+ assert(Shift1 == Shift2 || Shift1 == (uint8_t) ~0 ||
30200+ Shift2 == (uint8_t) ~0);
30201+ EveryOtherShiftAmt.push_back(Shift1 != (uint8_t) ~0 ? Shift1 : Shift2);
3019830202 }
3019930203 SmallVector<int, 32> Permutation2;
3020030204 for (size_t I = 0; I < NumLanes; ++I) {
@@ -30214,51 +30218,36 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
3021430218 }
3021530219 }
3021630220
30217- // For right shifts, (V)PMULHUW needs an extra instruction to handle an
30218- // amount of 0, disabling the transformation here to be cautious .
30221+ // For right shifts, (V)PMULHUW needs 2 extra instructions to handle an
30222+ // amount of 0, making it unprofitable .
3021930223 if (!IsAdjacentQuads && (Opc == ISD::SRL || Opc == ISD::SRA) &&
3022030224 any_of(ShiftAmt, [](auto x) { return x == 0; }))
3022130225 Profitable = false;
3022230226
3022330227 bool IsOperandShuffle = R.getOpcode() == ISD::VECTOR_SHUFFLE;
30224- // If operand R is not a shuffle by itself, the transformation here adds two
30225- // shuffles, adding a non-trivial cost. Here we take out a few cases where
30226- // the benefit is questionable according to llvm-mca's modeling.
30227- //
30228- // Each cell shows latency before/after transform. Here R is not a shuffle.
30229- // SSE3
30230- // | v16i8 | v32i8 | v64i8
30231- // ----------------------------
30232- // SLL | 17/17 | 20/20 | 26/26
30233- // SRL | 18/17 | 22/20 | 35/26
30234- // SRA | 21/19 | 26/22 | 39/30
30235- // AVX2 using VPMUL*W
30236- // | v16i8 | v32i8 | v64i8
30237- // ----------------------------
30238- // SLL | 20/18 | 18/18 | 21/21
30239- // SRL | 20/18 | 22/18 | 26/21
30240- // SRA | 20/20 | 22/20 | 25/23
30241- // AVX2 using VPS*LVD
30242- // | v16i8 | v32i8 | v64i8
30243- // ----------------------------
30244- // SLL | 20/16 | 18/16 | 21/20
30245- // SRL | 20/16 | 22/16 | 26/20
30246- // SRA | 20/18 | 22/18 | 25/22
30228+ // If operand R is a shuffle, one of the two shuffles introduced by this
30229+ // transformation can be merged with it, and the extra shuffle is 1 cycle.
30230+ // This is generally profitable because it eliminates one (or both) vector
30231+ // multiplication, which has to be scheduled at least 1 cycle apart.
30232+ // If operand R is not a shuffle, several cases are not profitable based on
30233+ // pipeline modeling, so we are excluding them here.
3024730234 if (!IsOperandShuffle) {
30248- if (Subtarget.hasAVX2()) {
30249- if (!IsAdjacentQuads || (VT == MVT::v64i8 && Opc == ISD::SHL))
30235+ // A hack to detect AMD CPU.
30236+ if (Subtarget.hasSSE4A() && Opc == ISD::SRA) {
30237+ if (Opc == ISD::SRA)
3025030238 Profitable = false;
3025130239 } else {
30252- if (Opc == ISD::SHL ||
30253- ((VT == MVT::v16i8 || VT == MVT::v32i8 ) && Opc == ISD::SRL ))
30240+ if ((Subtarget.hasAVX() && !Subtarget.hasAVX2()) ||
30241+ (Subtarget.hasAVX2( ) && !IsAdjacentQuads ))
3025430242 Profitable = false;
3025530243 }
3025630244 }
3025730245
3025830246 // Found a permutation P that can rearrange the shift amounts into adjacent
3025930247 // pair or quad of same values. Rewrite the shift S1(x) into P^-1(S2(P(x))).
3026030248 if (Profitable) {
30261- SDValue InnerShuffle = DAG.getVectorShuffle(VT, dl, R, DAG.getUNDEF(VT), Permutation);
30249+ SDValue InnerShuffle =
30250+ DAG.getVectorShuffle(VT, dl, R, DAG.getUNDEF(VT), Permutation);
3026230251 SmallVector<SDValue, 64> NewShiftAmt;
3026330252 for (int Index : Permutation) {
3026430253 NewShiftAmt.push_back(Amt.getOperand(Index));
@@ -30267,7 +30256,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
3026730256 for (size_t I = 0; I < NewShiftAmt.size(); I += 2) {
3026830257 SDValue Even = NewShiftAmt[I];
3026930258 SDValue Odd = NewShiftAmt[I + 1];
30270- assert(Even.isUndef() || Odd.isUndef() || Even->getAsZExtVal() == Odd->getAsZExtVal());
30259+ assert(Even.isUndef() || Odd.isUndef() ||
30260+ Even->getAsZExtVal() == Odd->getAsZExtVal());
3027130261 }
3027230262#endif
3027330263 SDValue NewShiftVector = DAG.getBuildVector(VT, dl, NewShiftAmt);
@@ -30276,7 +30266,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
3027630266 for (size_t I = 0; I < Permutation.size(); ++I) {
3027730267 InversePermutation[Permutation[I]] = I;
3027830268 }
30279- SDValue OuterShuffle = DAG.getVectorShuffle(VT, dl, NewShift, DAG.getUNDEF(VT), InversePermutation);
30269+ SDValue OuterShuffle = DAG.getVectorShuffle(
30270+ VT, dl, NewShift, DAG.getUNDEF(VT), InversePermutation);
3028030271 return OuterShuffle;
3028130272 }
3028230273 }
0 commit comments