intel
diff --git a/‎include/triton/Analysis/Utility.h‎
Lines changed: 12 additions & 3 deletions b/‎include/triton/Analysis/Utility.h‎
Lines changed: 12 additions & 3 deletions
diff --git a/‎include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h‎
Lines changed: 3 additions & 0 deletions b/‎include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎lib/Analysis/Utility.cpp‎
Lines changed: 237 additions & 12 deletions b/‎lib/Analysis/Utility.cpp‎
Lines changed: 237 additions & 12 deletions
@@ -181,10 +181,19 @@ class GatherLoweringHelper {
 // `pReg` and `pLane` are square layouts each with only one input and output
 // dimension. `mixedTranspositions` holds pairs of integers (i, j)
 // corresponding to the transposition (r_i l_j) of the i-th register basis
-// vector with the j-th lane basis vector.
+// vector with the j-th lane basis vector along with 16-bit selectors for byte
+// permute instructions (where each of the four nybbles is in the range [0, 7]).
 struct DecomposedWarpConversion {
+  struct TranspositionInfo {
+    std::pair<int, int> transposition; // (i, j) for the transposition (r_i l_j)
+    uint16_t topPreSel = 0x3210;  // pre-shuffle selector; default is unmodified
+    uint16_t botPreSel = 0x7654;  // pre-shuffle selector; default is unmodified
+    uint16_t topPostSel = 0x3210; // post-shuffle selector; default is unmodified
+    uint16_t botPostSel = 0x7654; // post-shuffle selector; default is unmodified
+  };
+
   triton::LinearLayout pReg, pLane;
-  SmallVector<std::pair<int, int>> mixedTranspositions;
+  SmallVector<TranspositionInfo> mixedTranspositions; // one per transposition
 };
 
 // Produces a decomposition of a permutation describing a warp-local layout
@@ -196,7 +205,7 @@ struct DecomposedWarpConversion {
 // represented as a permutation.
 DecomposedWarpConversion
 getWarpLayoutConvertDecomposition(RankedTensorType srcTy,
-                                  RankedTensorType dstTy);
+                                  RankedTensorType dstTy, int bitwidth);
 
 // Decomposes a reshape into simpler pieces.
 //
 
@@ -48,6 +48,9 @@ class TargetInfoBase {
   virtual Value shuffleIdx(RewriterBase &rewriter, Location loc, Value val,
                            Value i) const = 0;
 
+  virtual Value permute(RewriterBase &rewriter, Location loc, Value a, Value b,
+                        Value selector) const = 0;
+
   virtual Value programId(RewriterBase &rewriter, Location loc,
                           ModuleOp moduleOp, ProgramIDDim axis) const = 0;
 
 
@@ -16,6 +16,7 @@
 #include "triton/Tools/LayoutUtils.h"
 #include "triton/Tools/LinearLayout.h"
 #include "triton/Tools/Sys/GetEnv.hpp"
+#include "llvm/ADT/SmallSet.h"
 
 namespace mlir {
 
@@ -247,9 +248,14 @@ unsigned ScanLoweringHelper::getScratchSizeInBytes() {
   return elementSizeInBytes * getScratchSizeInElems();
 }
 
+static SmallVector<DecomposedWarpConversion::TranspositionInfo>
+getTranspositionSelectors(SmallVector<std::pair<int, int>> &mixedTranspositions,
+                          std::vector<std::vector<int32_t>> &regBases,
+                          int bitwidth);
+
 DecomposedWarpConversion
 getWarpLayoutConvertDecomposition(RankedTensorType srcTy,
-                                  RankedTensorType dstTy) {
+                                  RankedTensorType dstTy, int bitwidth) {
   // Two layouts, ll_src and ll_dst, representing the same tensor can be
   // viewed as surjections of GF(2) vector spaces:
   //
@@ -278,11 +284,12 @@ getWarpLayoutConvertDecomposition(RankedTensorType srcTy,
   // subsequences of consecutive lane bits from cycles involving both bit types.
   // Further explanation of this method is below.
   //
-  // The decomposition is performed in two stages. First, we compute the
+  // The decomposition is performed in three stages. First, we compute the
   // permutation matrix `P` by using `invertAndCompose` to generate a skeleton
   // and then fill in any zero columns. Second, we walk the cycles of `P` to
   // factor out mixed transpositions to build `mixedTranspositions`, `pReg`, and
-  // `pLane`.
+  // `pLane`. Finally, we determine any selectors needed for byte permute
+  // instructions in place of `selp` instructions when packing registers.
 
   // We remove any broadcasting in the register dimensions of the layouts before
   // forming the permutation `P` as the components of the decomposition directly
@@ -336,19 +343,14 @@ getWarpLayoutConvertDecomposition(RankedTensorType srcTy,
     T = padWithZeros(T);
   }
 
-  // Flatten outs for ease of building `P`, and reorder outs as flattening
-  // depends on output dimension order.
-  if (outDimNames != llvm::to_vector(T.getOutDimNames()))
-    T = T.transposeOuts(outDimNames);
-  S = S.flattenOuts();
-  T = T.flattenOuts();
-
   // We compute T^transpose \circ S, which serves as a skeleton for `P`, then
   // fill in zero columns, prioritizing producing fixed points. As we only need
   // the basis vectors of `P`, we never actually produce the LinearLayout.
   auto pBases = S.invertAndCompose(T).getBases();
 
   // Find the common and uncommon zeros of S and T
+  S = S.flattenOuts();
+  T = T.flattenOuts();
   SmallVector<std::pair<int32_t, int32_t>> srcFreeZeros;
   SmallVector<std::pair<int32_t, int32_t>> dstFreeZeros;
   for (auto [dimIdx, dim] : llvm::enumerate(inDimNames)) {
@@ -461,11 +463,234 @@ getWarpLayoutConvertDecomposition(RankedTensorType srcTy,
   }
   assert(visited.all() && "Cycle walk incomplete");
 
+  auto processedTranspos =
+      getTranspositionSelectors(mixedTranspositions, regBases, bitwidth);
+
   auto pReg = LinearLayout(std::move(pRegBases), {{kReg, 1 << nRegBases}},
                            /*requireSurjective=*/true);
   auto pLane = LinearLayout(std::move(pLaneBases), {{kLane, 1 << nLaneBases}},
                             /*requireSurjective=*/true);
-  return {std::move(pReg), std::move(pLane), std::move(mixedTranspositions)};
+  return {std::move(pReg), std::move(pLane), std::move(processedTranspos)};
+}
+
+static SmallVector<DecomposedWarpConversion::TranspositionInfo>
+getTranspositionSelectors(SmallVector<std::pair<int, int>> &mixedTranspositions,
+                          std::vector<std::vector<int32_t>> &regBases,
+                          int bitwidth) {
+  // When possible, we fuse permutations of 'low' register bits (those below
+  // `nPack`) with a mixed transposition, resulting in byte permute instructions
+  // instead of `select` instructions. After processing, no low register bits
+  // appear in the returned list. Both reference arguments are modified.
+  int m = mixedTranspositions.size(); // number of mixed transpositions
+  int nRegBases = regBases.size();
+  int nPackPrelim = llvm::Log2_32(std::clamp(32 / bitwidth, 1, 4));
+  int nPack = std::min(nPackPrelim, nRegBases - m); // count of low reg bits
+
+  SmallVector<DecomposedWarpConversion::TranspositionInfo> ret;
+  ret.reserve(mixedTranspositions.size());
+  if (nPack == 0) { // nothing to fuse: every entry keeps default selectors
+    for (auto &t : mixedTranspositions)
+      ret.push_back(DecomposedWarpConversion::TranspositionInfo{t});
+    return ret;
+  }
+  // Consider for example the cycle
+  //
+  //        (r2 r1 l0 r0 r3) = (r0 l0) * (r2 r1 r0 r3)
+  //                         = (r3 r0) * (r3 l0) * (r3 r1) * (r3 r2)
+  //
+  // with `nPack` = 2 so that r0 and r1 are considered low bits. We want to
+  // factor out any low bits from `pReg` and to incorporate them into the data
+  // of the mixed transposition. After processing, the contribution to `pReg`
+  // is reduced to (r3 r2) and the mixed transposition recorded is (r3 l0), with
+  // the effects of (r3 r0) and (r3 r1) encoded in the returned selectors.
+  // In general, low bits occurring immediately before l_j modify the selectors
+  // of the `prmt` before the shuffle, while low bits occurring immediately
+  // after l_k modify the selectors of the `prmt` after the shuffle. Unmodified
+  // selectors correspond to `select` instructions.
+  // Cases like (l0 r0 r1) must be handled by selecting a 'partner' bit that is
+  // not used in another mixed transposition and conjugating out a low bit:
+  //
+  //           (l0 r0 r1) = (r2 r1) * (l0 r0 r2) * (r2 r1)
+  //                      = (r2 r1) * (r2 r0) * (r2 l0) * (r2 r1).
+  //
+  // Conjugation does not affect `pReg`. However, the set of fused mixed and
+  // low-bit transpositions is noncommutative in cases where there are no
+  // intervening high bits in between distinct sequences of lane bits as the
+  // paired low bit is used in modifying the selectors of both factors:
+  //
+  //    (l0 r0 r1 l1 r2) = (r3 r0)(r3 l0)(r3 r0) * (r2 l1)(r2 r1)(r2 r0).
+  //
+  // The `*` is standard composition of permutations. The groupings correspond
+  // to different `TranspositionInfo` objects. For example, the permutation
+  // `(r3 r0)(r3 l0)(r3 r0) = (r0 l0)` has mixed transposition `(r3 l0)` with
+  // pre- and post-shuffle selectors determined by the `r0` bit.
+  // Processing of mixed transpositions is performed by determining the `head`
+  // and `tail` of an excision of bits in cycles of `pReg` and building lists
+  // of low bits acting as selector modifiers. In the noncommutative cases, we
+  // opt to restrict the number of post-shuffle modifiers to one.
+
+  auto permuteSelector = [nPack](uint16_t sel, int bitIdx) {
+    int lo = bitIdx + (2 - nPack); // nybble bit that is swapped with bit 2
+    uint16_t maskHi = 0x4444;        // bit 2 of each of the four nybbles
+    uint16_t maskLo = 0x1111 << lo;  // bit `lo` of each of the four nybbles
+    uint16_t fixed = sel & ~maskHi & ~maskLo; // bits that stay in place
+    int shift = 2 - lo;              // distance between the two swapped bits
+    return fixed | ((maskHi & sel) >> shift) | ((maskLo & sel) << shift);
+  };
+  auto generateSelectors = [&](int head, int tail, auto &&lowBits) {
+    uint16_t topSel = 0x3210;
+    uint16_t botSel = 0x7654;
+    for (auto lowBit : lowBits) {
+      topSel = permuteSelector(topSel, lowBit);
+      botSel = permuteSelector(botSel, lowBit);
+      if (lowBit != head && lowBit != tail)
+        regBases[lowBit][0] = 1 << lowBit; // lowBit becomes a fixed point
+    }
+    return std::pair{topSel, botSel};
+  };
+
+  llvm::SmallSet<int32_t, 6> pairedRegBits;
+  for (auto [rBit, lBit] : mixedTranspositions)
+    pairedRegBits.insert(rBit);
+
+  // A low bit in a mixed transposition must be replaced by a high bit. The
+  // choice of high bit can affect instruction count. If the first high bit
+  // found when walking along `pReg` is unpaired, then that bit is the best
+  // choice. We reorder the transpositions to guarantee this during processing.
+  auto next = [&](int b) { return llvm::Log2_32(regBases[b][0]); };
+  auto nextHighFree = [&](auto p) {
+    int curr = p.first;
+    do {
+      if (curr >= nPack)
+        return curr == p.first || !pairedRegBits.contains(curr);
+      curr = next(curr);
+    } while (curr != p.first);
+    return false;
+  };
+  std::stable_partition(mixedTranspositions.begin(), mixedTranspositions.end(),
+                        nextHighFree); // reorders the caller's list in place
+  // If `P` has an isolated low-bit mixed transposition, and `pReg` maps a low
+  // bit to an open high bit, then the high bit should be used as the partner.
+  auto prev = [&](int b) { // predecessor of b, found by walking its cycle
+    int tail = b;
+    int curr = next(b);
+    while (curr != b) {
+      tail = curr;
+      curr = next(curr);
+    }
+    return tail;
+  };
+  auto findPartner = [&](int lowBit, auto &preShufLoBits) {
+    if (nPack == 2) {
+      int otherLow = 1 - lowBit;
+      int b = next(otherLow);
+      if (next(lowBit) == lowBit && b >= nPack && !pairedRegBits.contains(b) &&
+          !pairedRegBits.contains(otherLow)) {
+        preShufLoBits.push_back(otherLow);
+        regBases[prev(otherLow)][0] = 1 << b; // predecessor now skips otherLow
+        pairedRegBits.insert(b);
+        return b;
+      }
+    }
+    int potentialPartner = nPack; // fall back to the lowest unpaired high bit
+    while (pairedRegBits.contains(potentialPartner))
+      ++potentialPartner;
+    pairedRegBits.insert(potentialPartner);
+    return potentialPartner;
+  };
+
+  for (auto p : mixedTranspositions) {
+    int rBit = p.first;
+    int lBit = p.second;
+    SmallVector<int> cycle; // register bits reached from rBit via `next`
+    int currBit = rBit;
+    do {
+      cycle.push_back(currBit);
+      currBit = next(currBit);
+    } while (currBit != rBit);
+
+    // Find any low register bits adjacent to the excised lane bits which aren't
+    // used in other mixed transpositions.
+    auto isBoundary = [&](int bit) {
+      return bit >= nPack || (pairedRegBits.contains(bit) && bit != rBit);
+    };
+    auto forwardEnd = llvm::find_if(cycle, isBoundary);
+    auto backwardEnd = std::find_if(cycle.rbegin(), cycle.rend(), isBoundary);
+    SmallVector<int> postShufLoBits(cycle.begin(), forwardEnd);
+    SmallVector<int> preShufLoBits(cycle.rbegin(), backwardEnd);
+    int head;
+    int tail;
+    int partnerBit = -1; // -1 means no partner chosen yet (see findPartner)
+
+    // Case work to determine what to conjugate out.
+    if (forwardEnd != cycle.end()) {
+      if (*forwardEnd == rBit || !pairedRegBits.contains(*forwardEnd)) {
+        // End at original or unpaired high bit. E.g. (l0 r0 r2) or (l0 r2)
+        // No conjugation needed.
+        head = partnerBit = *forwardEnd;
+      } else {
+        // End at different paired bit. E.g. (l0 r0 r1 l1 r2)
+        // Non-leading factor in a noncommutative case.
+        // Conjugate by first low bit in forward walk.
+        head = postShufLoBits.front();
+        preShufLoBits.push_back(head);
+        postShufLoBits.resize(1);
+        pairedRegBits.erase(head);
+      }
+      tail = *backwardEnd;
+      if (tail < nPack && pairedRegBits.contains(tail)) {
+        // Non-terminal factor in a noncommutative case.
+        preShufLoBits.insert(preShufLoBits.begin(), tail);
+      }
+    } else {
+      if (next(rBit) != rBit && pairedRegBits.contains(next(rBit))) {
+        // Symmetric noncommutative case. E.g. (l0 r0 l1 r1)
+        preShufLoBits.erase(preShufLoBits.begin());
+        postShufLoBits.pop_back();
+        pairedRegBits.erase(postShufLoBits.front());
+        head = rBit;
+        tail = next(rBit);
+      } else {
+        // Isolated low bits with single mixed transposition. E.g. (l0 r0 r1)
+        if (postShufLoBits.size() == 2)
+          postShufLoBits.pop_back();
+        head = tail = preShufLoBits.front();
+      }
+    }
+
+    if (partnerBit < 0)
+      partnerBit = findPartner(head, preShufLoBits);
+    auto [topPostSel, botPostSel] =
+        generateSelectors(head, tail, llvm::reverse(postShufLoBits));
+    auto [topPreSel, botPreSel] = generateSelectors(head, tail, preShufLoBits);
+    regBases[tail][0] = 1 << head; // tail now maps directly to head
+
+    DecomposedWarpConversion::TranspositionInfo info;
+    info.transposition = {partnerBit, lBit};
+    info.topPreSel = topPreSel;
+    info.botPreSel = botPreSel;
+    info.topPostSel = topPostSel;
+    info.botPostSel = botPostSel;
+
+    // In noncommutative cases, post-shuffle selectors of non-leading terms come
+    // from a single low bit by design, so a non-terminal factor goes before the
+    // first entry whose `topPostSel` is 0x3210 permuted by its last pre bit.
+    if (!preShufLoBits.empty()) {
+      uint16_t sel = (nPack - preShufLoBits.back()) == 2 ? 0x6240 : 0x5410;
+      auto it =
+          llvm::find_if(ret, [&](auto &t) { return t.topPostSel == sel; });
+      ret.insert(it, info); // appends when no entry matches
+    } else {
+      ret.push_back(info);
+    }
+  }
+  if (nPack == 2 && regBases[0][0] == 2 && regBases[1][0] == 1 && ret.size()) {
+    // If (r0 r1) was originally in `P`, fold it into a mixed transposition.
+    auto &t = ret.back();
+    t.topPostSel = 0x3120;
+    t.botPostSel = 0x7564;
+  }
+  return ret;
 }
 
 SmallVector<std::pair<SmallVector<int64_t>, SmallVector<int64_t>>>
@@ -763,7 +988,7 @@ bool cvtNeedsWarpShuffle(RankedTensorType srcTy, RankedTensorType dstTy) {
   auto kLane = StringAttr::get(ctx, "lane");
   if (to_vector(layout.getOutDimNames()) ==
       SmallVector<StringAttr, 2>{kRegister, kLane}) {
-    auto factors = getWarpLayoutConvertDecomposition(srcTy, dstTy);
+    auto factors = getWarpLayoutConvertDecomposition(srcTy, dstTy, 32);
     return (factors.mixedTranspositions.size() < 2);
   }
   return false;