[LAYOUTS] Improve the swizzling algorithm when we don't have enough vectorisation (#7524)

lezcano · apgoucher · web-flow · commit 1031dc78060f · 2025-07-16T07:20:01.000Z
Add a few heuristics that improve vectorisation and reduce bank
conflicts when the default vectorisation does not cover a whole
32-bit bank.

---------

Co-authored-by: apgoucher <apgoucher@openai.com>
diff --git a/lib/Tools/GenericSwizzling.cpp b/lib/Tools/GenericSwizzling.cpp
@@ -47,6 +47,16 @@ SmallVector<int32_t> flatten(const LinearLayout &ll, StringAttr dim) {
   return vec;
 };
 
+SmallVector<int32_t> removeZeros(ArrayRef<int32_t> vec) { // Drop zero entries
+  SmallVector<int32_t> result; // Non-zero entries of vec, in original order
+  for (int32_t r : vec) {
+    if (r != 0) {
+      result.push_back(r);
+    }
+  }
+  return result;
+}
+
 // [1, 2, 4, 8] -> [[1], [2], [4], [8]]
 std::vector<std::vector<int32_t>> unflatten(ArrayRef<int32_t> basis) {
   std::vector<std::vector<int32_t>> unflattened;
@@ -279,6 +289,7 @@ LinearLayout optimalSwizzling(const LinearLayout &src, const LinearLayout &dst,
   auto *ctx = src.getInDimNames().begin()->getContext();
   auto kReg = StringAttr::get(ctx, "register");
   auto kLane = StringAttr::get(ctx, "lane");
+  auto kWarp = StringAttr::get(ctx, "warp");
 
   // We work on the flattened tensors as the tensor dimensions are not relevant
   const LinearLayout srcFlat = src.flattenOuts();
@@ -307,6 +318,65 @@ LinearLayout optimalSwizzling(const LinearLayout &src, const LinearLayout &dst,
   if (vbasis.size() > maxVecBases) {
     vbasis.resize(maxVecBases);
   }
+  // We fill up vbasis until it covers 32 bits, as best we can
+  auto vecFillsBank = (1 << vbasis.size()) * bitwidth >= 32;
+  if (!vecFillsBank) {
+    auto warpSrc = removeZeros(flatten(srcFlat, kWarp));
+    auto warpDst = removeZeros(flatten(dstFlat, kWarp));
+    auto removeVec = [&vbasis](ArrayRef<int32_t> vec) {
+      SmallVector<int32_t> result;
+      for (int32_t r : vec) {
+        if (!llvm::is_contained(vbasis, r)) {
+          result.push_back(r);
+        }
+      }
+      return result;
+    };
+    auto regSrcWarp = intersectionBasis(removeVec(regSrc), warpDst, dim);
+    auto regDstWarp = intersectionBasis(removeVec(regDst), warpSrc, dim);
+    // Maximise vectorisation in the load or the store without creating
+    // conflicts
+    SmallVector<int32_t> largest;
+    if (regSrcWarp.size() == regDstWarp.size() && regSrcWarp.size() > 0) {
+      // We choose the one with the lowest basis in the hope that it will
+      // avoid PRMTs. The comparison of the mins will be strict as the sets
+      // removeVec(regSrc) and removeVec(regDst) don't intersect
+      if (*llvm::min_element(regSrcWarp) < *llvm::min_element(regDstWarp)) {
+        largest = regSrcWarp;
+      } else {
+        largest = regDstWarp;
+      }
+    } else {
+      largest = regSrcWarp.size() > regDstWarp.size() ? regSrcWarp : regDstWarp;
+    }
+    vbasis.append(largest.begin(), largest.end());
+    if (vbasis.size() > maxVecBases) {
+      vbasis.resize(maxVecBases);
+    }
+    // We allow vbasis.size() > Log2_32(32 / bitwidth) at this point, as it is
+    // in general good; we only pad when it is still below that size.
+    if (vbasis.size() < llvm::Log2_32(32 / bitwidth)) {
+      // Pad the vectorisation to 32 bits with warp bases
+      auto warpSrcWarp = intersectionBasis(warpSrc, warpDst, dim);
+      vbasis.append(warpSrcWarp.begin(), warpSrcWarp.end());
+    }
+
+    int i = 0;
+    while (vbasis.size() < llvm::Log2_32(32 / bitwidth) &&
+           (i < warpSrc.size() || i < warpDst.size())) {
+      // If we have not filled up a whole bank, we add more warp bases
+      // until we have 32 bits. They will at least avoid bank conflicts in one
+      // direction
+      if (i < warpSrc.size() && !llvm::is_contained(vbasis, warpSrc[i])) {
+        vbasis.push_back(warpSrc[i]);
+      }
+      if (vbasis.size() < llvm::Log2_32(32 / bitwidth) && i < warpDst.size() &&
+          !llvm::is_contained(vbasis, warpDst[i])) {
+        vbasis.push_back(warpDst[i]);
+      }
+      ++i;
+    }
+  }
 
   // Bits in a bank segment: 32 banks x 32 bits
   constexpr int32_t bankBits = 32 * 32;
@@ -321,8 +391,11 @@ LinearLayout optimalSwizzling(const LinearLayout &src, const LinearLayout &dst,
   auto bankDst = llvm::to_vector(llvm::concat<int32_t>(vbasis, laneDst));
 
   // Whether we'll use b32.v1 / b32.v2 / b32.v4
-  auto b32Vec =
-      llvm::Log2_32(std::max<int32_t>((1 << vbasis.size()) * bitwidth / 32, 1));
+  // FIXME: With !vecFillsBank we may use b32.v2 or b32.v4 for the load or
+  // store, but we pessimistically assume we don't.
+  auto b32Vec = !vecFillsBank ? 0
+                              : llvm::Log2_32(std::max<int32_t>(
+                                    (1 << vbasis.size()) * bitwidth / 32, 1));
   // Drop the last vec bases of the banks
   bankSrc.resize(bankSrc.size() - b32Vec);
   bankDst.resize(bankDst.size() - b32Vec);
diff --git a/test/Conversion/tritongpu_to_llvm.mlir b/test/Conversion/tritongpu_to_llvm.mlir
@@ -1011,6 +1011,22 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
 
 // -----
 
+#blocked = #ttg.blocked<{sizePerThread = [1, 16], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> // source layout of the conversion below
+#blocked1 = #ttg.blocked<{sizePerThread = [16, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> // destination layout of the conversion below
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
+  // CHECK: llvm.mlir.global external @global_smem
+  // CHECK-LABEL: convert_layout_transpose
+  tt.func @convert_layout_transpose(%arg0: tensor<128x128xf8E5M2, #blocked>) {
+    // CHECK-COUNT-128: llvm.store {{.*}} vector<1xi8>
+    // CHECK: nvvm.barrier0
+    // CHECK-COUNT-32: llvm.load {{.*}} vector<4xi8>
+    %0 = ttg.convert_layout %arg0 : tensor<128x128xf8E5M2, #blocked> -> tensor<128x128xf8E5M2, #blocked1>
+    tt.return
+  }
+}
+
+// -----
+
 #blocked0 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 #mma = #ttg.nvidia_mma<{versionMajor = 2, warpsPerCTA = [2, 2], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1], instrShape = [16, 8]}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {