Second version: more CPU latency measurements with llvm-mca

huangjd · huangjd · commit 0a0f4805c0bd · 2024-11-28T01:47:21.000-05:00
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29769,24 +29769,27 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
 // Given a vector of values, find a permutation such that every adjacent even-
 // odd pair has the same value. ~0 is reserved as a special value for wildcard,
 // which can be paired with any value. Returns true if a permutation is found.
+// If output Permutation is not empty, permutation index starts at its previous
+// size, so that this function can concatenate the result of multiple calls.
+// UnpairedInputs contains values yet to be paired, mapping an unpaired value to
+// its current neighbor's value and index.
+// Do not use llvm::DenseMap, as ~0 is a reserved key there.
 template <typename InputTy,
          typename PermutationTy,
-         typename MapTy = std::unordered_map<typename InputTy::value_type,
-                                         std::pair<typename InputTy::value_type, typename PermutationTy::value_type>>>
+         typename MapTy = SmallMapVector<typename InputTy::value_type,
+                                         std::pair<typename InputTy::value_type, typename PermutationTy::value_type>, 8>>
 static bool PermuteAndPairVector(const InputTy& Inputs,
-                                 PermutationTy &Permutation) {
+                                 PermutationTy &Permutation,
+                                 MapTy UnpairedInputs = SmallMapVector<typename InputTy::value_type,
+                                                                       std::pair<typename InputTy::value_type, typename PermutationTy::value_type>, 8>()) {
   const auto Wildcard = ~typename InputTy::value_type();
-
-  // List of values to be paired, mapping an unpaired value to its current
-  // neighbor's value and index.
-  MapTy UnpairedInputs;
   SmallVector<typename PermutationTy::value_type, 16> WildcardPairs;
 
-  Permutation.clear();
+  size_t OutputOffset = Permutation.size();
   typename PermutationTy::value_type I = 0;
   for (auto InputIt = Inputs.begin(), InputEnd = Inputs.end(); InputIt != InputEnd;) {
-    Permutation.push_back(I);
-    Permutation.push_back(I + 1);
+    Permutation.push_back(OutputOffset + I);
+    Permutation.push_back(OutputOffset + I + 1);
 
     auto Even = *InputIt++;
     assert(InputIt != InputEnd && "Expected even number of elements");
@@ -29806,7 +29809,7 @@ static bool PermuteAndPairVector(const InputTy& Inputs,
           // value's neighbor, otherwise the current value is added to the map.
           if (auto [MapIt, Inserted] = UnpairedInputs.try_emplace(This, std::make_pair(Other, OtherIndex)); !Inserted) {
             auto [SwapValue, SwapIndex] = MapIt->second;
-            std::swap(Permutation[SwapIndex], Permutation[ThisIndex]);
+            std::swap(Permutation[OutputOffset + SwapIndex], Permutation[OutputOffset + ThisIndex]);
             This = SwapValue;
             UnpairedInputs.erase(MapIt);
 
@@ -29850,13 +29853,12 @@ static bool PermuteAndPairVector(const InputTy& Inputs,
     if (Neighbor != Wildcard) {
       assert(UnpairedInputs.count(Neighbor));
       if (WildcardPairs.size()) {
-        std::swap(Permutation[WildcardPairs.back()], Permutation[NeighborIndex]);
+        std::swap(Permutation[OutputOffset + WildcardPairs.back()], Permutation[OutputOffset + NeighborIndex]);
         WildcardPairs.pop_back();
         // Mark the neighbor as processed.
         UnpairedInputs[Neighbor].first = Wildcard;
-      } else {
+      } else
         return false;
-      }
     }
   }
   return true;
@@ -30140,87 +30142,107 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
     }
   }
 
-  // ISD::SRA/SRL/SHL on vXi8 can be widened to vYi16 (Y = X/2) if the constant
-  // amounts can be shuffled such that every pair of adjacent elements has the
-  // same value. This introduces an extra shuffle before and after the shift,
-  // and it is profitable if the operand is aready a shuffle so that both can
-  // be merged, or if the extra shuffle is fast (can use VPSHUFB).
+  // SHL/SRL/SRA on vXi8 can be widened to vYi16 or vYi32 if the constant
+  // amounts can be shuffled such that every pair or quad of adjacent elements
+  // has the same value. This introduces an extra shuffle before and after the
+  // shift, and it is profitable if the operand is already a shuffle so that both
+  // can be merged and the extra shuffle is fast. This is not profitable on
+  // AVX512 because it has 16-bit vector variable shift instructions VPS**VW.
   // (shift (shuffle X P1) S1) ->
   // (shuffle (shift (shuffle X (shuffle P2 P1)) S2) P2^-1) where S2 can be
   // widened, and P2^-1 is the inverse shuffle of P2.
-  if (ConstantAmt && (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) && R.hasOneUse() && Subtarget.hasSSE3()) {
-    bool Profitable = true;
-    // VPAND ymm only available on AVX2.
-    if (VT == MVT::v32i8 || VT == MVT::v64i8) {
-      Profitable = Subtarget.hasAVX2();
-    }
+  if (ConstantAmt && (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) && R.hasOneUse()
+      && Subtarget.hasSSE3() && !Subtarget.hasAVX512()) {
+    constexpr size_t LaneBytes = 16;
+    const size_t NumLanes = VT.getVectorNumElements() / LaneBytes;
 
     SmallVector<int, 64> Permutation;
-    SmallVector<uint16_t, 64> ShiftAmt;
+    SmallVector<uint8_t, 64> ShiftAmt;
     for (size_t I = 0; I < Amt.getNumOperands(); ++I) {
       if (Amt.getOperand(I).isUndef())
         ShiftAmt.push_back(~0);
       else
         ShiftAmt.push_back(Amt.getConstantOperandVal(I));
     }
 
-    if (Profitable && (VT == MVT::v32i8 || VT == MVT::v64i8)) {
-      Profitable = false;
-      constexpr size_t LaneBytes = 16;
-      const size_t NumLanes = VT.getVectorNumElements() / LaneBytes;
-
-      // For v32i8 or v64i8, we should check if we can generate a shuffle that
-      // may be lowered to VPSHUFB, because it is faster than VPERMB. This is
-      // possible if we can apply the same shuffle mask to each v16i8 lane.
-      // For example (assuming a lane has 4 elements for simplicity),
-      // <1, 2, 2, 1, 4, 3, 3, 4> is handled as <14, 23, 23, 14>, which can
-      // be shuffled to adjacent pairs <14, 14, 23, 23> with the VPSHUFB mask
-      // <0, 3, 2, 1> (or high level mask <0, 3, 2, 1, 4, 7, 6, 5>).
-      // Limitation: if there are some undef in shift amounts, this algorithm
-      // may not find a solution even if one exists, as here we only treat a
-      // VPSHUFB index as undef if all shuffle amounts of the same index modulo
-      // lane size are all undef.
-      // Since a byte can only be shifted by 7 bits without being UB, 4 bits are
-      // enough to represent the shift amount or undef (0xF).
-      std::array<uint16_t, LaneBytes> VPSHUFBShiftAmt = {};
-      for (size_t I = 0; I < LaneBytes; ++I)
-        for (size_t J = 0; J < NumLanes; ++J)
-          VPSHUFBShiftAmt[I] |= (ShiftAmt[I + J * LaneBytes] & 0xF) << (J * 4);
-      if (VT == MVT::v32i8) {
-        for (size_t I = 0; I < LaneBytes; ++I)
-          VPSHUFBShiftAmt[I] |= 0xFF00;
-      }
-      if (PermuteAndPairVector(VPSHUFBShiftAmt, Permutation)) {
-        // Found a VPSHUFB solution, offset the shuffle amount to other lanes.
-        Permutation.resize(VT.getVectorNumElements());
-        for (size_t I = 0; I < LaneBytes; ++I)
-          for (size_t J = 1; J < NumLanes; ++J)
-            Permutation[I + J * LaneBytes] = Permutation[I] + J * LaneBytes;
-        Profitable = true;
-      } else if (R.getOpcode() == ISD::VECTOR_SHUFFLE) {
-        // A slower shuffle is profitable if the operand is also a slow shuffle,
-        // such that they can be merged.
-        // TODO: Use TargetTransformInfo to systematically determine whether
-        // inner shuffle is slow. Currently we only check if it contains
-        // cross-lane shuffle.
-        if (ShuffleVectorSDNode *InnerShuffle = dyn_cast<ShuffleVectorSDNode>(R.getNode())) {
-          if (InnerShuffle->getMask().size() == VT.getVectorNumElements() &&
-              is128BitLaneCrossingShuffleMask(VT, InnerShuffle->getMask()))
-            Profitable = true;
+    // Check if we can find an in-lane shuffle to rearrange the shift amounts,
+    // if so, this transformation may be profitable.
+    bool Profitable;
+    for (size_t I = 0; I < NumLanes; ++I) {
+      if (!(Profitable = PermuteAndPairVector(ArrayRef(&ShiftAmt[I * LaneBytes], LaneBytes), Permutation)))
+        break;
+    }
+
+    // For AVX2, check if we can further rearrange shift amounts into adjacent
+    // quads, so that the shift can use VPS*LVD, which is 2 cycles faster than
+    // VPMUL*W.
+    bool IsAdjacentQuads = false;
+    if (Profitable && Subtarget.hasAVX2()) {
+      SmallVector<uint8_t, 64> EveryOtherShiftAmt;
+      for (size_t I = 0; I < Permutation.size(); I += 2) {
+        uint8_t Shift1 = ShiftAmt[Permutation[I]];
+        uint8_t Shift2 = ShiftAmt[Permutation[I + 1]];
+        assert(Shift1 == Shift2 || ~Shift1 == 0 || ~Shift2 == 0);
+        EveryOtherShiftAmt.push_back(~Shift1 ? Shift1 : Shift2);
+      }
+      SmallVector<int, 32> Permutation2;
+      for (size_t I = 0; I < NumLanes; ++I) {
+        if (!(IsAdjacentQuads = PermuteAndPairVector(ArrayRef(&EveryOtherShiftAmt[I * LaneBytes / 2], LaneBytes / 2), Permutation2)))
+          break;
+      }
+      if (IsAdjacentQuads) {
+        SmallVector<int, 64> CombinedPermutation;
+        for (int Index : Permutation2) {
+          CombinedPermutation.push_back(Permutation[Index * 2]);
+          CombinedPermutation.push_back(Permutation[Index * 2 + 1]);
         }
+        std::swap(Permutation, CombinedPermutation);
       }
     }
 
-    // If it is still profitable at this point, and has not found a permutation
-    // yet, try again with any shuffle index.
-    if (Profitable && Permutation.empty()) {
-      PermuteAndPairVector<decltype(ShiftAmt), decltype(Permutation),
-                           SmallMapVector<uint16_t, std::pair<uint16_t, int>, 8>>(ShiftAmt, Permutation);
+    // For right shifts, (V)PMULHUW needs an extra instruction to handle an
+    // amount of 0, so disable the transformation here to be cautious.
+    if (!IsAdjacentQuads && (Opc == ISD::SRL || Opc == ISD::SRA) &&
+        any_of(ShiftAmt, [](auto x) { return x == 0; }))
+      Profitable = false;
+
+    bool IsOperandShuffle = R.getOpcode() == ISD::VECTOR_SHUFFLE;
+    // If operand R is not a shuffle by itself, the transformation here adds two
+    // shuffles, adding a non-trivial cost. Here we take out a few cases where
+    // the benefit is questionable according to llvm-mca's modeling.
+    //
+    // Each cell shows latency before/after transform. Here R is not a shuffle.
+    // SSE3
+    //      | v16i8 | v32i8 | v64i8
+    // ----------------------------
+    // SLL  | 17/17 | 20/20 | 26/26
+    // SRL  | 18/17 | 22/20 | 35/26
+    // SRA  | 21/19 | 26/22 | 39/30
+    // AVX2 using VPMUL*W
+    //      | v16i8 | v32i8 | v64i8
+    // ----------------------------
+    // SLL  | 20/18 | 18/18 | 21/21
+    // SRL  | 20/18 | 22/18 | 26/21
+    // SRA  | 20/20 | 22/20 | 25/23
+    // AVX2 using VPS*LVD
+    //      | v16i8 | v32i8 | v64i8
+    // ----------------------------
+    // SLL  | 20/16 | 18/16 | 21/20
+    // SRL  | 20/16 | 22/16 | 26/20
+    // SRA  | 20/18 | 22/18 | 25/22
+    if (!IsOperandShuffle) {
+      if (Subtarget.hasAVX2()) {
+        if (!IsAdjacentQuads || (VT == MVT::v64i8 && Opc == ISD::SHL))
+          Profitable = false;
+      } else {
+        if (Opc == ISD::SHL || ((VT == MVT::v16i8 || VT == MVT::v32i8) && Opc == ISD::SRL))
+          Profitable = false;
+      }
     }
 
     // Found a permutation P that can rearrange the shift amouts into adjacent
-    // pair of same values. Rewrite the shift S1(x) into P^-1(S2(P(x))).
-    if (!Permutation.empty()) {
+    // pair or quad of same values. Rewrite the shift S1(x) into P^-1(S2(P(x))).
+    if (Profitable) {
       SDValue InnerShuffle = DAG.getVectorShuffle(VT, dl, R, DAG.getUNDEF(VT), Permutation);
       SmallVector<SDValue, 64> NewShiftAmt;
       for (int Index : Permutation) {