intel
diff --git a/‎include/triton/Analysis/Utility.h‎
Lines changed: 3 additions & 12 deletions b/‎include/triton/Analysis/Utility.h‎
Lines changed: 3 additions & 12 deletions
diff --git a/‎lib/Analysis/Utility.cpp‎
Lines changed: 12 additions & 237 deletions b/‎lib/Analysis/Utility.cpp‎
Lines changed: 12 additions & 237 deletions
@@ -181,19 +181,10 @@ class GatherLoweringHelper {
 // `pReg` and `pLane` are square layouts each with only one input and output
 // dimension. `mixedTranspositions` holds pairs of integers (i, j)
 // corresponding to the transposition (r_i l_j) of the i-th register basis
-// vector with the j-th lane basis vector along with 16-bit selectors for byte
-// permute instructions (where each of the four nybbles is in the range [0, 7]).
+// vector with the j-th lane basis vector.
 struct DecomposedWarpConversion {
-  struct TranspositionInfo {
-    std::pair<int, int> transposition;
-    uint16_t topPreSel = 0x3210;
-    uint16_t botPreSel = 0x7654;
-    uint16_t topPostSel = 0x3210;
-    uint16_t botPostSel = 0x7654;
-  };
-
   triton::LinearLayout pReg, pLane;
-  SmallVector<TranspositionInfo> mixedTranspositions;
+  SmallVector<std::pair<int, int>> mixedTranspositions;
 };
 
 // Produces a decomposition of a permutation describing a warp-local layout
@@ -205,7 +196,7 @@ struct DecomposedWarpConversion {
 // represented as a permutation.
 DecomposedWarpConversion
 getWarpLayoutConvertDecomposition(RankedTensorType srcTy,
-                                  RankedTensorType dstTy, int bitwidth);
+                                  RankedTensorType dstTy);
 
 // Decomposes a reshape into simpler pieces.
 //
 
@@ -17,7 +17,6 @@
 #include "triton/Tools/LayoutUtils.h"
 #include "triton/Tools/LinearLayout.h"
 #include "triton/Tools/Sys/GetEnv.hpp"
-#include "llvm/ADT/SmallSet.h"
 
 namespace mlir {
 
@@ -254,14 +253,9 @@ unsigned ScanLoweringHelper::getScratchSizeInBytes() {
   return elementSizeInBytes * getScratchSizeInElems();
 }
 
-static SmallVector<DecomposedWarpConversion::TranspositionInfo>
-getTranspositionSelectors(SmallVector<std::pair<int, int>> &mixedTranspositions,
-                          std::vector<std::vector<int32_t>> &regBases,
-                          int bitwidth);
-
 DecomposedWarpConversion
 getWarpLayoutConvertDecomposition(RankedTensorType srcTy,
-                                  RankedTensorType dstTy, int bitwidth) {
+                                  RankedTensorType dstTy) {
   // Two layouts, ll_src and ll_dst, representing the same tensor can be
   // viewed as surjections of GF(2) vector spaces:
   //
@@ -290,12 +284,11 @@ getWarpLayoutConvertDecomposition(RankedTensorType srcTy,
   // subsequences of consecutive lane bits from cycles involving both bit types.
   // Further explanation of this method is below.
   //
-  // The decomposition is performed in three stages. First, we compute the
+  // The decomposition is performed in two stages. First, we compute the
   // permutation matrix `P` by using `invertAndCompose` to generate a skeleton
   // and then fill in any zero columns. Second, we walk the cycles of `P` to
   // factor out mixed transpositions to build `mixedTranspositions`, `pReg`, and
-  // `pLane`. Finally, we determine any selectors needed for byte permute
-  // instructions in place of `selp` instructions when packing registers.
+  // `pLane`.
 
   // We remove any broadcasting in the register dimensions of the layouts before
   // forming the permutation `P` as the components of the decomposition directly
@@ -349,14 +342,19 @@ getWarpLayoutConvertDecomposition(RankedTensorType srcTy,
     T = padWithZeros(T);
   }
 
+  // Flatten outs for ease of building `P`, and reorder outs as flattening
+  // depends on output dimension order.
+  if (outDimNames != llvm::to_vector(T.getOutDimNames()))
+    T = T.transposeOuts(outDimNames);
+  S = S.flattenOuts();
+  T = T.flattenOuts();
+
   // We compute T^transpose \circ S, which serves as a skeleton for `P`, then
   // fill in zero columns, prioritizing producing fixed points. As we only need
   // the basis vectors of `P`, we never actually produce the LinearLayout.
   auto pBases = S.invertAndCompose(T).getBases();
 
   // Find the common and uncommon zeros of S and T
-  S = S.flattenOuts();
-  T = T.flattenOuts();
   SmallVector<std::pair<int32_t, int32_t>> srcFreeZeros;
   SmallVector<std::pair<int32_t, int32_t>> dstFreeZeros;
   for (auto [dimIdx, dim] : llvm::enumerate(inDimNames)) {
@@ -469,234 +467,11 @@ getWarpLayoutConvertDecomposition(RankedTensorType srcTy,
   }
   assert(visited.all() && "Cycle walk incomplete");
 
-  auto processedTranspos =
-      getTranspositionSelectors(mixedTranspositions, regBases, bitwidth);
-
   auto pReg = LinearLayout(std::move(pRegBases), {{kReg, 1 << nRegBases}},
                            /*requireSurjective=*/true);
   auto pLane = LinearLayout(std::move(pLaneBases), {{kLane, 1 << nLaneBases}},
                             /*requireSurjective=*/true);
-  return {std::move(pReg), std::move(pLane), std::move(processedTranspos)};
-}
-
-static SmallVector<DecomposedWarpConversion::TranspositionInfo>
-getTranspositionSelectors(SmallVector<std::pair<int, int>> &mixedTranspositions,
-                          std::vector<std::vector<int32_t>> &regBases,
-                          int bitwidth) {
-  // When possible, we fuse permutations of 'low' register bits together
-  // with a mixed transposition, resulting in byte permute instructions instead
-  // of `select` instructions. After processing, no low register bits appear in
-  // the returned list of mixed transpositions.
-  int m = mixedTranspositions.size();
-  int nRegBases = regBases.size();
-  int nPackPrelim = llvm::Log2_32(std::clamp(32 / bitwidth, 1, 4));
-  int nPack = std::min(nPackPrelim, nRegBases - m);
-
-  SmallVector<DecomposedWarpConversion::TranspositionInfo> ret;
-  ret.reserve(mixedTranspositions.size());
-  if (nPack == 0) {
-    for (auto &t : mixedTranspositions)
-      ret.push_back(DecomposedWarpConversion::TranspositionInfo{t});
-    return ret;
-  }
-  // Consider for example the cycle
-  //
-  //        (r2 r1 l0 r0 r3) = (r0 l0) * (r2 r1 r0 r3)
-  //                         = (r3 r0) * (r3 l0) * (r3 r1) * (r3 r2)
-  //
-  // with `nPack` = 2 so that r0 and r1 are considered low bits. We want to
-  // factor out any low bits from `pReg` and to incorporate them into the data
-  // of the mixed transposition. After processing, the contribution to `pReg`
-  // is reduced to (r3 r2) and the mixed transposition recorded is (r3 l0), with
-  // the effects of (r3 r0) and (r3 r1) encoded in the returned selectors.
-  // In general, low bits occurring immediately before l_j modify the selectors
-  // of the `prmt` before the shuffle, while low bits occurring immediately
-  // after l_k modify the selectors of the `prmt` after the shuffle. Unmodified
-  // selectors correspond to `select` instructions.
-  // Cases like (l0 r0 r1) must be handled by selecting a 'partner' bit that is
-  // not used in another mixed transposition and conjugating out a low bit:
-  //
-  //           (l0 r0 r1) = (r2 r1) * (l0 r0 r2) * (r2 r1)
-  //                      = (r2 r1) * (r2 r0) * (r2 l0) * (r2 r1).
-  //
-  // Conjugation does not affect `pReg`. However, the set of fused mixed and
-  // low-bit transpositions is noncommutative in cases where there are no
-  // intervening high bits in between distinct sequences of lane bits as the
-  // paired low bit is used in modifying the selectors of both factors:
-  //
-  //    (l0 r0 r1 l1 r2) = (r3 r0)(r3 l0)(r3 r0) * (r2 l1)(r2 r1)(r2 r0).
-  //
-  // The `*` is standard composition of permutations. The groupings correspond
-  // to different `TranspositionInfo` objects. For example, the permutation
-  // `(r3 r0)(r3 l0)(r3 r0) = (r0 l0)` has mixed transposition `(r3 l0)` with
-  // pre- and post-shuffle selectors determined by the `r0` bit.
-  // Processing of mixed transpositions is performed by determining the `head`
-  // and `tail` of an excision of bits in cycles of `pReg` and building lists
-  // of low bits acting as selector modifiers. In the noncommutative cases, we
-  // opt to restrict the number of post-shuffle modifiers to one.
-
-  auto permuteSelector = [nPack](uint16_t sel, int bitIdx) {
-    int lo = bitIdx + (2 - nPack);
-    uint16_t maskHi = 0x4444;
-    uint16_t maskLo = 0x1111 << lo;
-    uint16_t fixed = sel & ~maskHi & ~maskLo;
-    int shift = 2 - lo;
-    return fixed | ((maskHi & sel) >> shift) | ((maskLo & sel) << shift);
-  };
-  auto generateSelectors = [&](int head, int tail, auto &&lowBits) {
-    uint16_t topSel = 0x3210;
-    uint16_t botSel = 0x7654;
-    for (auto lowBit : lowBits) {
-      topSel = permuteSelector(topSel, lowBit);
-      botSel = permuteSelector(botSel, lowBit);
-      if (lowBit != head && lowBit != tail)
-        regBases[lowBit][0] = 1 << lowBit;
-    }
-    return std::pair{topSel, botSel};
-  };
-
-  llvm::SmallSet<int32_t, 6> pairedRegBits;
-  for (auto [rBit, lBit] : mixedTranspositions)
-    pairedRegBits.insert(rBit);
-
-  // A low bit in a mixed transposition must be replaced by a high bit. The
-  // choice of high bit can affect instruction count. If the first high bit
-  // found when walking along `pReg` is unpaired, then that bit is the best
-  // choice. We reorder the transpositions to guarantee this during processing.
-  auto next = [&](int b) { return llvm::Log2_32(regBases[b][0]); };
-  auto nextHighFree = [&](auto p) {
-    int curr = p.first;
-    do {
-      if (curr >= nPack)
-        return curr == p.first || !pairedRegBits.contains(curr);
-      curr = next(curr);
-    } while (curr != p.first);
-    return false;
-  };
-  std::stable_partition(mixedTranspositions.begin(), mixedTranspositions.end(),
-                        nextHighFree);
-  // If `P` has an isolated low-bit mixed transposition, and `pReg` maps a low
-  // bit to an open high bit, then the high bit should be used as the partner.
-  auto prev = [&](int b) {
-    int tail = b;
-    int curr = next(b);
-    while (curr != b) {
-      tail = curr;
-      curr = next(curr);
-    }
-    return tail;
-  };
-  auto findPartner = [&](int lowBit, auto &preShufLoBits) {
-    if (nPack == 2) {
-      int otherLow = 1 - lowBit;
-      int b = next(otherLow);
-      if (next(lowBit) == lowBit && b >= nPack && !pairedRegBits.contains(b) &&
-          !pairedRegBits.contains(otherLow)) {
-        preShufLoBits.push_back(otherLow);
-        regBases[prev(otherLow)][0] = 1 << b;
-        pairedRegBits.insert(b);
-        return b;
-      }
-    }
-    int potentialPartner = nPack;
-    while (pairedRegBits.contains(potentialPartner))
-      ++potentialPartner;
-    pairedRegBits.insert(potentialPartner);
-    return potentialPartner;
-  };
-
-  for (auto p : mixedTranspositions) {
-    int rBit = p.first;
-    int lBit = p.second;
-    SmallVector<int> cycle;
-    int currBit = rBit;
-    do {
-      cycle.push_back(currBit);
-      currBit = next(currBit);
-    } while (currBit != rBit);
-
-    // Find any low register bits adjacent to the excised lane bits which aren't
-    // used in other mixed transpositions.
-    auto isBoundary = [&](int bit) {
-      return bit >= nPack || (pairedRegBits.contains(bit) && bit != rBit);
-    };
-    auto forwardEnd = llvm::find_if(cycle, isBoundary);
-    auto backwardEnd = std::find_if(cycle.rbegin(), cycle.rend(), isBoundary);
-    SmallVector<int> postShufLoBits(cycle.begin(), forwardEnd);
-    SmallVector<int> preShufLoBits(cycle.rbegin(), backwardEnd);
-    int head;
-    int tail;
-    int partnerBit = -1;
-
-    // Case work to determine what to conjugate out.
-    if (forwardEnd != cycle.end()) {
-      if (*forwardEnd == rBit || !pairedRegBits.contains(*forwardEnd)) {
-        // End at original or unpaired high bit. E.g. (l0 r0 r2) or (l0 r2)
-        // No conjugation needed.
-        head = partnerBit = *forwardEnd;
-      } else {
-        // End at different paired bit. E.g. (l0 r0 r1 l1 r2)
-        // Non-leading factor in a noncommutative case.
-        // Conjugate by first low bit in forward walk.
-        head = postShufLoBits.front();
-        preShufLoBits.push_back(head);
-        postShufLoBits.resize(1);
-        pairedRegBits.erase(head);
-      }
-      tail = *backwardEnd;
-      if (tail < nPack && pairedRegBits.contains(tail)) {
-        // Non-terminal factor in a noncommutative case.
-        preShufLoBits.insert(preShufLoBits.begin(), tail);
-      }
-    } else {
-      if (next(rBit) != rBit && pairedRegBits.contains(next(rBit))) {
-        // Symmetric noncommutative case. E.g. (l0 r0 l1 r1)
-        preShufLoBits.erase(preShufLoBits.begin());
-        postShufLoBits.pop_back();
-        pairedRegBits.erase(postShufLoBits.front());
-        head = rBit;
-        tail = next(rBit);
-      } else {
-        // Isolated low bits with single mixed transposition. E.g. (l0 r0 r1)
-        if (postShufLoBits.size() == 2)
-          postShufLoBits.pop_back();
-        head = tail = preShufLoBits.front();
-      }
-    }
-
-    if (partnerBit < 0)
-      partnerBit = findPartner(head, preShufLoBits);
-    auto [topPostSel, botPostSel] =
-        generateSelectors(head, tail, llvm::reverse(postShufLoBits));
-    auto [topPreSel, botPreSel] = generateSelectors(head, tail, preShufLoBits);
-    regBases[tail][0] = 1 << head;
-
-    DecomposedWarpConversion::TranspositionInfo info;
-    info.transposition = {partnerBit, lBit};
-    info.topPreSel = topPreSel;
-    info.botPreSel = botPreSel;
-    info.topPostSel = topPostSel;
-    info.botPostSel = botPostSel;
-
-    // In noncommutative cases, post-shuffle selectors of non-leading terms come
-    // from a single low bit by design, so we can determine where to insert a
-    // non-terminal factor by examining processed selectors.
-    if (!preShufLoBits.empty()) {
-      uint16_t sel = (nPack - preShufLoBits.back()) == 2 ? 0x6240 : 0x5410;
-      auto it =
-          llvm::find_if(ret, [&](auto &t) { return t.topPostSel == sel; });
-      ret.insert(it, info);
-    } else {
-      ret.push_back(info);
-    }
-  }
-  if (nPack == 2 && regBases[0][0] == 2 && regBases[1][0] == 1 && ret.size()) {
-    // If (r0 r1) was originally in `P`, fold it into a mixed transposition.
-    auto &t = ret.back();
-    t.topPostSel = 0x3120;
-    t.botPostSel = 0x7564;
-  }
-  return ret;
+  return {std::move(pReg), std::move(pLane), std::move(mixedTranspositions)};
 }
 
 SmallVector<std::pair<SmallVector<int64_t>, SmallVector<int64_t>>>
@@ -994,7 +769,7 @@ bool cvtNeedsWarpShuffle(RankedTensorType srcTy, RankedTensorType dstTy) {
   auto kLane = StringAttr::get(ctx, "lane");
   if (to_vector(layout.getOutDimNames()) ==
       SmallVector<StringAttr, 2>{kRegister, kLane}) {
-    auto factors = getWarpLayoutConvertDecomposition(srcTy, dstTy, 32);
+    auto factors = getWarpLayoutConvertDecomposition(srcTy, dstTy);
     return (factors.mixedTranspositions.size() < 2);
   }
   return false;