[Backend] Follow-up refactor of getWarpLayoutConvertDecomposition (#7571)

FrederickVu · web-flow · commit db7170e18a35 · 2025-07-20T16:41:11.000+01:00
This PR is a follow-up to #7558 to move the logic of `basisPermutationLayout` inside `getWarpLayoutConvertDecomposition` and to remove the associated unit tests. We also restore the `convert_layout_blocked_blocked_multi_rep` LIT test with changes to the tensor shape and encodings.

# New contributor declaration

- [x] I am not making a trivial change, such as fixing a typo in a comment.
- [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how).
- [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`.
- Select one of the following.
  - [x] I have added tests.
    - `/test` for `lit` tests
    - `/unittest` for C++ tests
    - `/python/test` for end-to-end tests
  - [ ] This PR does not need a test because `FILL THIS IN`.
- Select one of the following.
  - [ ] I have not added any `lit` tests.
  - [x] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.)
diff --git a/include/triton/Tools/LayoutUtils.h b/include/triton/Tools/LayoutUtils.h
@@ -148,22 +148,6 @@ LinearLayout reshapeLayout(MLIRContext *ctx, LinearLayout layout,
 // order.
 LinearLayout transposeLinearLayout(LinearLayout layout, ArrayRef<int> order);
 
-// Reorders the in and out dimensions to match another layout.
-LinearLayout reorder_like(const LinearLayout &x, const LinearLayout &y);
-
-// For two layouts, `src` and `dst`, that differ only by a permutation of
-// their basis vectors, return a permutation layout `P` which satisfies
-// `dst` \circ `P` = `src`.
-//
-// The returned layout has the following properties:
-// - The orders of the input and output dimensions of `P` match the order of the
-//   input dimensions of `src`.
-// - Prioritizes making zero (broadcasting) vectors fixed-points of the
-//   permutation. I.e., if a vector is zero in both `src` and `dst` for the same
-//   input coordinate, it maps to itself under `P`.
-LinearLayout basisPermutationLayout(const LinearLayout &src,
-                                    const LinearLayout &dst);
-
 } // namespace mlir::triton
 
 #endif // TRITON_TOOLS_LAYOUTUTILS_H
diff --git a/lib/Analysis/Utility.cpp b/lib/Analysis/Utility.cpp
@@ -278,10 +278,11 @@ getWarpLayoutConvertDecomposition(RankedTensorType srcTy,
   // subsequences of consecutive lane bits from cycles involving both bit types.
   // Further explanation of this method is below.
   //
-  // The decomposition is implemented by building bases for the layouts `pReg`
-  // and `pLane` by walking the cycles of `P`, a permutation layout returned by
-  // `basisPermutationLayout(S, T)` which accepts two layouts `S` and `T` which
-  // differ only by a permutation of their basis vectors.
+  // The decomposition is performed in two stages. First, we compute the
+  // permutation matrix `P` by using `invertAndCompose` to generate a skeleton
+  // and then fill in any zero columns. Second, we walk the cycles of `P` to
+  // factor out mixed transpositions to build `mixedTranspositions`, `pReg`, and
+  // `pLane`.
 
   // We remove any broadcasting in the register dimensions of the layouts before
   // forming the permutation `P` as the components of the decomposition directly
@@ -310,9 +311,10 @@ getWarpLayoutConvertDecomposition(RankedTensorType srcTy,
   int nRegBases = std::max(nSrcRegBases, nDstRegBases);
   int nLaneBases = std::max(nSrcLaneBases, nDstLaneBases);
   // Restrict attention to the input dimensions which matter.
+  SmallVector<StringAttr> inDimNames{kReg, kLane};
   auto outDimNames = llvm::to_vector(srcLayout.getOutDimNames());
-  auto S = srcLayout.sublayout({kReg, kLane}, outDimNames);
-  auto T = dstLayout.sublayout({kReg, kLane}, outDimNames);
+  auto S = srcLayout.sublayout(inDimNames, outDimNames);
+  auto T = dstLayout.sublayout(inDimNames, outDimNames);
   // Conditionally pad.
   if (nSrcRegBases != nDstRegBases || nSrcLaneBases != nDstLaneBases) {
     auto padWithZeros = [&](const LinearLayout &ll) {
@@ -334,10 +336,41 @@ getWarpLayoutConvertDecomposition(RankedTensorType srcTy,
     T = padWithZeros(T);
   }
 
-  // Now that `S` and `T` have the same basis vectors, we compute the
-  // permutation `P` which transforms `S` into `T`.
-  auto P = basisPermutationLayout(S, T);
-  auto &pBases = P.getBases();
+  // Flatten outs for ease of building `P`, and reorder outs as flattening
+  // depends on output dimension order.
+  if (outDimNames != llvm::to_vector(T.getOutDimNames()))
+    T = T.transposeOuts(outDimNames);
+  S = S.flattenOuts();
+  T = T.flattenOuts();
+
+  // We compute T^transpose \circ S, which serves as a skeleton for `P`, then
+  // fill in zero columns, prioritizing producing fixed points. As we only need
+  // the basis vectors of `P`, we never actually produce the LinearLayout.
+  auto pBases = S.invertAndCompose(T).getBases();
+
+  // Find the zero basis vectors common to `S` and `T` and those in only one.
+  SmallVector<std::pair<int32_t, int32_t>> srcFreeZeros;
+  SmallVector<std::pair<int32_t, int32_t>> dstFreeZeros;
+  for (auto [dimIdx, dim] : llvm::enumerate(inDimNames)) {
+    for (int inIdx = 0; inIdx < S.getInDimSizeLog2(dim); ++inIdx) {
+      int sVal = S.getBasis(dim, inIdx)[0];
+      int tVal = T.getBasis(dim, inIdx)[0];
+      if (sVal == 0 && tVal == 0) {
+        pBases[dim][inIdx][dimIdx] = 1 << inIdx;
+      } else if (sVal == 0) {
+        srcFreeZeros.emplace_back(dimIdx, inIdx);
+      } else if (tVal == 0) {
+        dstFreeZeros.emplace_back(dimIdx, inIdx);
+      }
+    }
+  }
+  // Pair leftover zero vectors of `S` and `T` in order (counts assumed equal).
+  for (auto [srcZeroLoc, dstZeroLoc] : llvm::zip(srcFreeZeros, dstFreeZeros)) {
+    auto [srcDimIdx, srcIdx] = srcZeroLoc;
+    auto [dstDimIdx, dstIdx] = dstZeroLoc;
+    auto inDim = inDimNames[srcDimIdx];
+    pBases[inDim][srcIdx][dstDimIdx] = 1 << dstIdx;
+  }
 
   // We walk the cycles of `P` to build the bases for `pReg` and `pLane` while
   // factoring out mixed transpositions from cycles that include both register
@@ -355,9 +388,8 @@ getWarpLayoutConvertDecomposition(RankedTensorType srcTy,
     return (dim == kReg) ? index : nRegBases + index;
   };
 
-  auto dimNames = llvm::to_vector(P.getInDimNames());
-  for (auto dim : dimNames) {
-    int inDimSize = P.getInDimSizeLog2(dim);
+  for (auto dim : inDimNames) {
+    int inDimSize = S.getInDimSizeLog2(dim);
     for (int i = 0; i < inDimSize; ++i) {
       if (visited.test(flatIdx(dim, i)))
         continue;
@@ -393,7 +425,7 @@ getWarpLayoutConvertDecomposition(RankedTensorType srcTy,
         int32_t nextIdx;
         for (auto [nextDimIdx, nextVal] : llvm::enumerate(nextVec)) {
           if (nextVal != 0) {
-            nextDim = dimNames[nextDimIdx];
+            nextDim = inDimNames[nextDimIdx];
             nextIdx = llvm::Log2_32(nextVal);
           }
         }
diff --git a/lib/Tools/LayoutUtils.cpp b/lib/Tools/LayoutUtils.cpp
@@ -1,6 +1,5 @@
 #include "triton/Tools/LayoutUtils.h"
 #include "triton/Tools/GenericSwizzling.h"
-#include "llvm/ADT/SmallSet.h"
 
 namespace mlir::triton {
 
@@ -447,137 +446,4 @@ LinearLayout transposeLinearLayout(LinearLayout layout, ArrayRef<int> order) {
                       to_vector(layout.getOutDimNames()));
 }
 
-LinearLayout reorder_like(const LinearLayout &x, const LinearLayout &y) {
-  // This will check that the names are the same up to permutation, and
-  // apply the necessary permutation:
-  auto x2 = x.transposeOuts(llvm::to_vector(y.getOutDimNames()));
-  auto x3 = x2.transposeIns(llvm::to_vector(y.getInDimNames()));
-  return x3;
-}
-
-LinearLayout basisPermutationLayout(const LinearLayout &src,
-                                    const LinearLayout &dst) {
-  // This function computes a permutation layout `P` which satisfies the
-  // property `src = dst \circ P`. It requires that the multiset of basis
-  // vectors for each of `src` and `dst` agree and that the nonzero values in
-  // each of the multisets are unique. I.e., broadcasting is allowed in either
-  // layout so long as the degree of broadcasting (the number of zero basis
-  // vectors) is the same between the two layouts.
-  //
-  // The orders of the input and output dimensions of `P` are set to be the
-  // order of the input dimensions of `src`.
-  //
-  // The mapping of broadcasting basis vectors prioritizes keeping such vectors
-  // as fixed points of the permutation. I.e., if `src[inDim][i]` and
-  // `dst[inDim][i]` are zero vectors, then `P[inDim][i][inDimIdx] == 1 << i`,
-  // where `inDimIdx` is the index of `inDim` in `src`. Otherwise, they are
-  // paired according to their order of appearance in the two layouts, again
-  // following the order of the input dimensions of `src`.
-  //
-  // The algorithm first performs a linear scan over the columns of `dst` and
-  // `src` to build a map from ('flattened') basis vectors to the input
-  // vectors of `dst` while tracking the fixed-point zero vectors and 'free'
-  // zero vectors. It then performs a second linear scan over `src` to build
-  // the basis of `P`.
-
-  // Check that the input and output dimensions are equal up to ordering.
-  auto srcInDims = src.getInDimNames();
-  assert(std::is_permutation(srcInDims.begin(), srcInDims.end(),
-                             dst.getInDimNames().begin()) &&
-         "Layouts must have same input dimensions");
-  for (auto inDim : srcInDims) {
-    assert(src.getInDimSize(inDim) == dst.getInDimSize(inDim) &&
-           "Layouts must have same input dimension sizes");
-  }
-  auto srcOutDims = src.getOutDims();
-  assert(std::is_permutation(srcOutDims.begin(), srcOutDims.end(),
-                             dst.getOutDims().begin()) &&
-         "Layouts must have same output dimensions and dimension sizes");
-
-  auto srcFlat = src.flattenOuts();
-  // Reorder the output dimensions of `dst` if necessary before flattening, as
-  // flattening depends on the order.
-  LinearLayout dstFlat;
-  if (!llvm::equal(src.getOutDims(), dst.getOutDims())) {
-    auto temp = dst.transposeOuts(llvm::to_vector(src.getOutDimNames()));
-    dstFlat = temp.flattenOuts();
-  } else {
-    dstFlat = dst.flattenOuts();
-  }
-
-  // Populate the map of flattened values to dst inputs and track zero vectors.
-  // The `commonZeros` become fixed-points of `P`, while the 'free' zeros are
-  // later paired with one another.
-  DenseMap<int32_t, std::pair<StringAttr, int32_t>> valToDstInput;
-  llvm::SmallDenseMap<StringAttr, llvm::SmallSet<int32_t, 4>> commonZeros;
-  SmallVector<std::pair<StringAttr, int32_t>> dstFreeZeros;
-  size_t srcFreeZerosCount = 0;
-
-  // We traverse the input dimensions according to their order in `src` so that
-  // 'free' zero vectors for a given input dimension in `src` prefer to map to
-  // 'free' zero vectors in the same dimension in `dst.
-  for (auto inDim : srcInDims) {
-    int inDimSize = dstFlat.getInDimSizeLog2(inDim);
-    for (int i = 0; i < inDimSize; ++i) {
-      int32_t dstVal = dstFlat.getBasis(inDim, i)[0];
-      int32_t srcVal = srcFlat.getBasis(inDim, i)[0];
-      if (dstVal == 0 && srcVal == 0) {
-        commonZeros[inDim].insert(i);
-      } else if (dstVal == 0) {
-        dstFreeZeros.emplace_back(inDim, i);
-      } else {
-        auto [it, success] = valToDstInput.try_emplace(dstVal, inDim, i);
-        assert(success && "Found duplicate nonzero vectors in dst layout");
-        if (srcVal == 0)
-          ++srcFreeZerosCount;
-      }
-    }
-  }
-  assert(srcFreeZerosCount == dstFreeZeros.size() &&
-         "src and dst layouts have differing number of zero bases");
-
-  // Build the basis vectors for the permutation layout `P`.
-  // For each basis vector in `src`, determine its target in `dst`:
-  // - If the vector is nonzero, find the corresponding vector in `dst`.
-  // - If it is a zero vector common to both layouts, set it as a fixed-point.
-  // - Otherwise, pair it with the next available free zero of `dst`.
-  LinearLayout::BasesT pBases;
-  size_t numDims = llvm::size(srcInDims);
-  size_t freeZeroIdx = 0;
-  for (auto inDim : srcInDims) {
-    int inDimSize = srcFlat.getInDimSizeLog2(inDim);
-    auto &inDimBases = pBases[inDim];
-    inDimBases.reserve(inDimSize);
-    for (int i = 0; i < inDimSize; ++i)
-      inDimBases.emplace_back(numDims, 0);
-
-    for (int inIdx = 0; inIdx < inDimSize; ++inIdx) {
-      int32_t val = srcFlat.getBasis(inDim, inIdx)[0];
-      std::pair<StringAttr, int32_t> dstTarget;
-
-      if (val != 0) {
-        auto it = valToDstInput.find(val);
-        assert(it != valToDstInput.end() && "src basis not found in dst");
-        dstTarget = it->second;
-      } else if (commonZeros.lookup(inDim).count(inIdx)) {
-        dstTarget = {inDim, inIdx};
-      } else {
-        dstTarget = dstFreeZeros[freeZeroIdx++];
-      }
-
-      // Build the basis vector for `P` using the ordering on output dimensions
-      // induced by the ordering on the input dimensions of `src`.
-      auto it = llvm::find(srcInDims, dstTarget.first);
-      int outDimIdx = std::distance(srcInDims.begin(), it);
-      inDimBases[inIdx][outDimIdx] = 1 << dstTarget.second;
-    }
-  }
-  // Declare the ordering on the `outDims` of `P` to be that of `srcInDims`.
-  SmallVector<std::pair<StringAttr, int32_t>> outDims;
-  for (auto outDim : srcInDims)
-    outDims.emplace_back(outDim, srcFlat.getInDimSize(outDim));
-
-  return LinearLayout(std::move(pBases), outDims, /*requireSurjective=*/true);
-}
-
 } // namespace mlir::triton
diff --git a/test/Conversion/tritongpu_to_llvm.mlir b/test/Conversion/tritongpu_to_llvm.mlir
@@ -874,6 +874,27 @@ tt.func @convert_layout_ptr_element(%arg0: tensor<16x16x!tt.ptr<i32>, #blocked0>
 
 // -----
 
+#blocked0 = #ttg.blocked<{sizePerThread = [1, 32], threadsPerWarp = [32, 1], warpsPerCTA = [1, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [4, 8], threadsPerWarp = [8, 4], warpsPerCTA = [1, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
+  // CHECK: llvm.mlir.global external @global_smem
+  // CHECK-LABEL: convert_layout_blocked_blocked_multi_rep
+  tt.func @convert_layout_blocked_blocked_multi_rep(%arg0: tensor<32x32xf32, #blocked0>) {
+    // CHECK: llvm.mlir.addressof @global_smem
+    // CHECK-COUNT-4: llvm.store
+    // CHECK: nvvm.barrier0
+    // CHECK-COUNT-4: llvm.load
+    // CHECK: nvvm.barrier0
+    // CHECK-COUNT-4: llvm.store
+    // CHECK: nvvm.barrier0
+    // CHECK-COUNT-4: llvm.load
+    %0 = ttg.convert_layout %arg0 : tensor<32x32xf32, #blocked0> -> tensor<32x32xf32, #blocked1>
+    tt.return
+  }
+}
+
+// -----
+
 #blocked0 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [1, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 #shared0 = #ttg.swizzled_shared<{vec = 1, perPhase=1, maxPhase=1, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 #mma0 = #ttg.nvidia_mma<{versionMajor = 2, warpsPerCTA = [1, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1], instrShape = [16, 8]}>
diff --git a/unittest/Tools/LayoutUtilsTest.cpp b/unittest/Tools/LayoutUtilsTest.cpp
@@ -45,35 +45,5 @@ TEST_F(LayoutUtilsTest, SquareSublayoutIsIdentity) {
   EXPECT_TRUE(squareSublayoutIsIdentity(l3, {S("in1"), S("in2")}));
 }
 
-TEST_F(LayoutUtilsTest, BasisPermutationLayout) {
-  LinearLayout src1(
-      {{S("in1"), {{1, 0}, {0, 0}, {0, 2}}}, {S("in2"), {{2, 0}, {0, 1}}}},
-      {S("out1"), S("out2")});
-  LinearLayout dst1(
-      {{S("in2"), {{1, 0}, {0, 0}}}, {S("in1"), {{2, 0}, {0, 1}, {0, 2}}}},
-      {S("out2"), S("out1")});
-  LinearLayout P1(
-      {{S("in1"), {{2, 0}, {0, 2}, {1, 0}}}, {S("in2"), {{4, 0}, {0, 1}}}},
-      {S("in1"), S("in2")});
-  EXPECT_EQ(P1, basisPermutationLayout(src1, dst1));
-  EXPECT_EQ(src1, reorder_like(P1.compose(dst1), src1));
-  LinearLayout src2({{S("in3"), {{2, 0}, {4, 0}, {8, 0}, {0, 0}}},
-                     {S("in2"), {{0, 0}, {16, 0}, {0, 0}, {0, 1}}},
-                     {S("in1"), {{0, 2}, {0, 0}, {0, 4}}}},
-                    {{S("out1"), 32}, {S("out2"), 8}},
-                    /*requireSurjective=*/false);
-  LinearLayout dst2({{S("in1"), {{0, 0}, {0, 16}, {2, 0}}},
-                     {S("in2"), {{0, 4}, {0, 8}, {0, 0}, {4, 0}}},
-                     {S("in3"), {{0, 0}, {0, 0}, {0, 2}, {1, 0}}}},
-                    {{S("out2"), 8}, {S("out1"), 32}},
-                    /*requireSurjective=*/false);
-  LinearLayout P2({{S("in3"), {{4, 0, 0}, {0, 1, 0}, {0, 2, 0}, {1, 0, 0}}},
-                   {S("in2"), {{2, 0, 0}, {0, 0, 2}, {0, 4, 0}, {8, 0, 0}}},
-                   {S("in1"), {{0, 0, 4}, {0, 0, 1}, {0, 8, 0}}}},
-                  {S("in3"), S("in2"), S("in1")});
-  EXPECT_EQ(P2, basisPermutationLayout(src2, dst2));
-  EXPECT_EQ(src2, reorder_like(P2.compose(dst2), src2));
-}
-
 } // namespace
 } // namespace mlir::triton