
Commit a5b948c
[BACKEND] Fix vectorisation for convert_layout with ldmatrix and stmatrix (#8655)
The previous code was a bit too eager when adding `reps` in this case, so much so that afterwards there were not enough register bases left to fully vectorise the ldmatrix/stmatrix instructions. Fixes the regression reported in triton-lang/triton#8328.
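For context, an illustrative sketch (not code from this commit): ldmatrix/stmatrix come in .x1/.x2/.x4 variants that move one, two, or four 8x8 tiles per instruction, and every basis split off into `reps` is a register basis the instruction no longer owns. All names below are hypothetical.

#include <algorithm>

// Toy model of the trade-off: how wide an ldmatrix/stmatrix variant can be,
// given how many register bases were split off into `reps`.
int matrixWidth(int regBases, int repsBases) {
  int left = regBases - repsBases;    // bases the instruction still owns
  return 1 << std::clamp(left, 0, 2); // 1 -> .x1, 2 -> .x2, 4 -> .x4
}

// matrixWidth(2, 0) == 4: keeping both bases allows the .x4 variant.
// matrixWidth(2, 2) == 1: the over-eager reps split forced .x1 (the bug).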
1 parent 698bc5f commit a5b948c

File tree: 3 files changed, +70 -14 lines

include/triton/Tools/LinearLayout.h

Lines changed: 9 additions & 0 deletions
@@ -459,6 +459,15 @@ class LinearLayout {
   auto getOutDimSizes() const { return llvm::make_second_range(outDims); }
 
   // Relevant for reshaping
+
+  SmallVector<std::pair<StringAttr, int32_t>> getInDims() const {
+    SmallVector<std::pair<StringAttr, int32_t>> inDims;
+    inDims.reserve(bases.size());
+    for (auto [inDim, inDimBases] : bases) {
+      inDims.push_back({inDim, getInDimSize(inDim)});
+    }
+    return inDims;
+  }
   SmallVector<std::pair<StringAttr, int32_t>> getOutDims() const {
     return to_vector(outDims);
   }
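A minimal usage sketch for the new helper, assuming an already-constructed `LinearLayout ll` (the variable is hypothetical; `getInDims` mirrors the existing `getOutDims`):

// Iterate (input dimension, size) pairs, e.g. to rebuild or reshape a layout.
for (auto [inDim, size] : ll.getInDims()) {
  llvm::errs() << inDim.str() << " -> " << size << "\n";
}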

lib/Tools/GenericSwizzling.cpp

Lines changed: 33 additions & 14 deletions
@@ -100,7 +100,8 @@ SmallVector<int32_t> nullspaceBasis(ArrayRef<int32_t> vectors, int32_t dim) {
 // without sacrificing vectorisation and split it into its own
 // `reps` dimension
 LinearLayout buildReps(MLIRContext *ctx, const LinearLayout &src,
-                       const LinearLayout &dst, const LinearLayout &smem) {
+                       const LinearLayout &dst, const LinearLayout &smem,
+                       int32_t leaveReps) {
   auto kVec = StringAttr::get(ctx, "vector");
   auto kBank = StringAttr::get(ctx, "bank");
   auto kSegment = StringAttr::get(ctx, "segment");
@@ -116,8 +117,16 @@ LinearLayout buildReps(MLIRContext *ctx, const LinearLayout &src,
   SetVector<int32_t> segment;
   SetVector<int32_t> reps;
   for (auto s : smemSegment) {
+    // Do not move the first leaveReps bases from segment to reps,
+    // as we need them to vectorise the instructions (think .x2 and .x4 in
+    // ldmatrix)
     if (srcRegs.contains(s) && dstRegs.contains(s)) {
-      reps.insert(s);
+      if (leaveReps > 0) {
+        leaveReps--;
+        segment.insert(s);
+      } else {
+        reps.insert(s);
+      }
     } else {
       segment.insert(s);
     }
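The partition above in isolation, as a self-contained sketch (the surrounding buildReps plumbing is omitted; `smemSegment`, `srcRegs`, and `dstRegs` stand in for values the real function computes):

#include "llvm/ADT/SetVector.h"
#include <cstdint>

// Bases shared by src and dst registers are pure repetitions and normally go
// to `reps`; the first `leaveReps` of them stay in `segment` so that
// ldmatrix/stmatrix keep enough register bases to emit .x2/.x4 variants.
static void splitSegment(const llvm::SetVector<int32_t> &smemSegment,
                         const llvm::SetVector<int32_t> &srcRegs,
                         const llvm::SetVector<int32_t> &dstRegs,
                         int32_t leaveReps, llvm::SetVector<int32_t> &reps,
                         llvm::SetVector<int32_t> &segment) {
  for (int32_t s : smemSegment) {
    if (srcRegs.contains(s) && dstRegs.contains(s)) {
      if (leaveReps > 0) {
        leaveReps--;          // reserve this basis for vectorisation
        segment.insert(s);
      } else {
        reps.insert(s);       // a pure repetition: hoist it out
      }
    } else {
      segment.insert(s);      // used by only one side: stays in the segment
    }
  }
}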
@@ -376,11 +385,12 @@ std::optional<SmallVector<int32_t>> optimalSwizzlingTile(
   return vbasis;
 }
 
-LinearLayout
-optimalSwizzling(const LinearLayout &src, const LinearLayout &dst,
-                 int32_t bitwidth, ArrayRef<int32_t> vbasis,
-                 ArrayRef<int32_t> tileSrc, ArrayRef<int32_t> tileDst,
-                 ArrayRef<std::pair<StringAttr, int32_t>> outDims) {
+LinearLayout optimalSwizzling(const LinearLayout &src, const LinearLayout &dst,
+                              int32_t bitwidth, ArrayRef<int32_t> vbasis,
+                              ArrayRef<int32_t> tileSrc,
+                              ArrayRef<int32_t> tileDst,
+                              ArrayRef<std::pair<StringAttr, int32_t>> outDims,
+                              int32_t leaveReps = 0) {
   // We work on the flattened tensors as the tensor dimensions are not relevant
   assert(src.getNumOutDims() == 1 && dst.getNumOutDims() == 1 &&
          "src and dst must have a single output dimension");

@@ -439,7 +449,7 @@ optimalSwizzling(const LinearLayout &src, const LinearLayout &dst,
                             {bankAttr, unflatten(bbasis)},
                             {segAttr, unflatten(sbasis)}},
                            src.getOutDims(), /*requireSurjective=*/true);
-  basis1D = buildReps(ctx, src, dst, basis1D);
+  basis1D = buildReps(ctx, src, dst, basis1D, leaveReps);
 
   return basis1D.reshapeOuts(outDims);
 }
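Design note: the new parameter is defaulted (`leaveReps = 0`), so existing callers of optimalSwizzling keep their behaviour; only the ldmatrix/stmatrix path below passes a non-zero value. A trivial illustration of the pattern (toy names, not the real signatures):

#include <cstdint>

// Defaulted trailing parameters extend an API without touching old call sites.
static int32_t swizzleToy(int32_t bitwidth, int32_t leaveReps = 0) {
  return bitwidth + leaveReps; // stand-in for the real swizzling computation
}

static void callers() {
  swizzleToy(16);    // pre-existing callers: leaveReps == 0, unchanged
  swizzleToy(16, 2); // new ldmatrix/stmatrix path reserves two reps bases
}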
@@ -649,7 +659,7 @@ optimalSwizzling(const LinearLayout &src, const LinearLayout &dst,
 
   // Get the associated src/dst tiles for each instruction if they exist
   SmallVector<std::tuple<std::pair<int32_t, int32_t>, SmallVector<int32_t>,
-                         SmallVector<int32_t>, SmallVector<int32_t>>>
+                         SmallVector<int32_t>, SmallVector<int32_t>, int32_t>>
       tiles;
   for (auto [instrs, vbasis] : instr) {
     auto maybeTileSrc =

@@ -659,22 +669,31 @@ optimalSwizzling(const LinearLayout &src, const LinearLayout &dst,
     if (!maybeTileSrc.has_value() || !maybeTileDst.has_value()) {
       continue;
     }
+    // Register bases missing to get full vectorisation
+    auto regsMissing = [](const LocalMemOpTile &instr) {
+      return instr.laneContig.size() + instr.laneAddr.size() - 3;
+    };
+    // We leave 2 reps for combinations of ldmatrix/stmatrix instructions
+    // to be able to fully vectorise them
+    int32_t leaveReps = std::min(regsMissing(srcTiles[instrs.first]),
+                                 regsMissing(dstTiles[instrs.second]));
+    assert((leaveReps == 0 || leaveReps == 2) && "leaveReps must be 0 or 2");
     tiles.push_back({instrs, std::move(vbasis), std::move(*maybeTileSrc),
-                     std::move(*maybeTileDst)});
+                     std::move(*maybeTileDst), leaveReps});
   }
 
   if (tiles.empty()) {
     // We lower to an ld / st, but can't use LDS128/STS128
     auto smem = optimalSwizzlingLdSt(src, dst, bitwidth);
     return {smem, {0, 0}};
   } else {
-    // We choose the pair of instructions that minimises the total bank
-    // conflicts
     SmallVector<std::tuple<int, LinearLayout, std::pair<int32_t, int32_t>>>
         smems;
-    for (auto [instrs, vbasis, tileSrc, tileDst] : tiles) {
+    // We choose the pair of instructions that minimises the total bank
+    // conflicts
+    for (auto [instrs, vbasis, tileSrc, tileDst, leaveReps] : tiles) {
       auto smem = optimalSwizzling(srcFlat, dstFlat, bitwidth, vbasis, tileSrc,
-                                   tileDst, src.getOutDims());
+                                   tileDst, src.getOutDims(), leaveReps);
       auto [read, write] = bankConflicts(tileSrc, tileDst, smem);
       smems.push_back({read + write, smem, {instrs.first, instrs.second}});
     }
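Why the assert can insist on 0 or 2, as a back-of-the-envelope sketch (the tile sizes below are assumptions inferred from the lambda above, not verified against LocalMemOpTile's definition):

#include <algorithm>
#include <cstdint>

// Mirror of the regsMissing lambda: laneContig + laneAddr bases, minus the 3
// that a plain ld/st tile is assumed to already cover.
static int32_t regsMissingToy(int32_t laneContig, int32_t laneAddr) {
  return laneContig + laneAddr - 3;
}

// Assumed: an ld/st tile uses 3 lane bases (missing 0), an ldmatrix/stmatrix
// tile uses 5 (missing 2). leaveReps is the min over the src/dst pair, so
// mixing ld/st with ldmatrix yields 0, and ldmatrix with stmatrix yields 2.
static int32_t leaveRepsToy(int32_t srcMissing, int32_t dstMissing) {
  return std::min(srcMissing, dstMissing);
}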

test/Conversion/tritongpu_to_llvm_hopper.mlir

Lines changed: 28 additions & 0 deletions
@@ -248,6 +248,34 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 
 // -----
 
+#blocked = #ttg.blocked<{sizePerThread = [1, 32], threadsPerWarp = [32, 1], warpsPerCTA = [4, 2], order = [0, 1]}>
+#linear = #ttg.linear<{register = [[0, 1], [0, 2], [0, 4], [0, 8], [0, 16]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [0, 32]], warp = [[32, 0], [64, 0], [16, 0]], block = []}>
+module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 32 : i32} {
+  tt.func @convert_mma_to_blocked(%a: tensor<128x64xbf16, #linear>) {
+    // CHECK: llvm.store {{.*}} : vector<4xi32>
+    // CHECK: nvvm.barrier0
+    // CHECK: llvm.load {{.*}} -> vector<4xi32>
+    // CHECK: nvvm.barrier0
+    // CHECK: llvm.store {{.*}} : vector<4xi32>
+    // CHECK: nvvm.barrier0
+    // CHECK: llvm.load {{.*}} -> vector<4xi32>
+    // CHECK: nvvm.barrier0
+    // CHECK: llvm.store {{.*}} : vector<4xi32>
+    // CHECK: nvvm.barrier0
+    // CHECK: llvm.load {{.*}} -> vector<4xi32>
+    // CHECK: nvvm.barrier0
+    // CHECK: llvm.store {{.*}} : vector<4xi32>
+    // CHECK: nvvm.barrier0
+    // CHECK: llvm.load {{.*}} -> vector<4xi32>
+    // CHECK-NOT: llvm.store
+    // CHECK-NOT: llvm.load
+    %b = ttg.convert_layout %a: tensor<128x64xbf16, #linear> -> tensor<128x64xbf16, #blocked>
+    tt.return
+  }
+}
+
+// -----
+
 #blocked = #ttg.blocked<{sizePerThread = [1, 16], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [1, 0]}>
 #mma = #ttg.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
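As a sanity check on the CHECK pattern (plain arithmetic, not part of the test): four barrier-separated rounds of 128-bit accesses are exactly what a 128x64 bf16 tensor needs with 8 warps, assuming each thread issues one 16-byte access per round.

#include <cstdint>

constexpr int64_t tensorBytes = 128 * 64 * 2;  // bf16 is 2 bytes
constexpr int64_t threads = 8 * 32;            // 8 warps x 32 lanes
constexpr int64_t bytesPerAccess = 4 * 4;      // vector<4xi32> = 16 bytes
constexpr int64_t rounds = tensorBytes / (threads * bytesPerAccess);
static_assert(rounds == 4, "four store/load pairs, as the CHECK lines expect");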
