Skip to content

Commit 53fd113

Browse files
lezcano authored and zwu-2025 committed
[BACKEND] Support stmatrix.trans (triton-lang#6910)
With this we are able to lower pretty much anything that can be lowered to an stmatrix. We are just missing two niche cases: - Multi CTA - Lowering fp8 with stmatrix.trans (you need the first two bases of kReg to be `[[0, 1], [1, 0]]`). These can be supported in the future if necessary. Will use this to support `ldmatrix` in the next PR.
1 parent 5a14866 commit 53fd113

File tree

3 files changed

+156
-42
lines changed

3 files changed

+156
-42
lines changed

python/test/unit/language/test_core.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5958,10 +5958,6 @@ def compute_scratch_buffer_shape(src_layout, dst_layout, shape):
59585958
def test_convert2d(M, N, src_layout, interm_layout, dst_layout, dtype, device, tmp_path: pathlib.Path):
59595959
if str(src_layout) == str(dst_layout):
59605960
pytest.skip()
5961-
if (isinstance(src_layout, DotOperandLayout)
5962-
and isinstance(interm_layout, SharedLayout)) or (isinstance(dst_layout, DotOperandLayout)
5963-
and isinstance(interm_layout, SharedLayout)):
5964-
pytest.skip("DotOperandLayout <-> SharedLayout conversion is not completely supported")
59655961
if is_hip():
59665962
try:
59675963
scratch_shape = compute_scratch_buffer_shape(src_layout, dst_layout, (M, N))

test/Conversion/tritongpu_to_llvm_hopper.mlir

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,45 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
363363
}
364364
}
365365

366+
367+
// -----
368+
369+
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0, 1]}>
370+
#linear = #ttg.linear<{register = [[0, 1], [8, 0], [0, 8], [0, 16]], lane = [[0, 2], [0, 4], [1, 0], [2, 0], [4, 0]], warp = [[16, 0], [32, 0]], block = []}>
371+
#smem = #ttg.shared_memory
372+
// CHECK-LABEL: linear_to_swizzled_st_matrix_trans_local_store
373+
module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
374+
tt.func @linear_to_swizzled_st_matrix_trans_local_store(%a: tensor<64x32xf16, #linear>) {
375+
// CHECK-COUNT-2: nvgpu.stmatrix %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} {trans}
376+
// CHECK: llvm.return
377+
%b = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<64x32xf16, #shared, #smem, mutable>
378+
ttg.local_store %a, %b : tensor<64x32xf16, #linear> -> !ttg.memdesc<64x32xf16, #shared, #smem, mutable>
379+
tt.return
380+
}
381+
}
382+
383+
// -----
384+
385+
// Stretching a bit the lowering. Feel free to kill this test if we restrain
386+
// the lowering a bit later on.
387+
// These layouts will have plenty of bank conflicts, so it'd make sense not to
388+
// lower them via stmatrix.
389+
// It is of course possible to design a shared memory layout that makes the lowering
390+
// via stmatrix not have any bank conflicts, but yeah.
391+
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0, 1]}>
392+
#linear = #ttg.linear<{register = [[0, 2], [0, 8], [0, 0], [0, 16], [0, 1]], lane = [[0, 0], [0, 4], [1, 0], [2, 0], [4, 0]], warp = [[0, 0], [8, 0]], block = []}>
393+
#smem = #ttg.shared_memory
394+
// CHECK-LABEL: linear_to_swizzled_st_matrix_trans_local_store
395+
module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
396+
tt.func @linear_to_swizzled_st_matrix_trans_local_store(%a: tensor<16x32xf16, #linear>) {
397+
// CHECK-COUNT-2: nvgpu.stmatrix %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} {trans}
398+
// CHECK: llvm.return
399+
%b = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<16x32xf16, #shared, #smem, mutable>
400+
ttg.local_store %a, %b : tensor<16x32xf16, #linear> -> !ttg.memdesc<16x32xf16, #shared, #smem, mutable>
401+
tt.return
402+
}
403+
}
404+
366405
// -----
367406

368407
#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/MemoryOpToLLVM.cpp

Lines changed: 117 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ struct LocalLoadOpConversion
139139

140140
LogicalResult lowerDistributedToSharedStmatrix(
141141
Location loc, RankedTensorType tensorTy, MemDescType memDescType,
142-
Value adaptorSrc, Value smemBase, Type llvmElemTy,
142+
bool transpose, Value adaptorSrc, Value smemBase, Type llvmElemTy,
143143
ConversionPatternRewriter &rewriter, const TargetInfo &targetInfo,
144144
std::pair<size_t, Type> *const llvmOpCount = nullptr) {
145145
if (!targetInfo.supportLdStMatrix())
@@ -160,7 +160,11 @@ LogicalResult lowerDistributedToSharedStmatrix(
160160
auto kOffset = S("offset");
161161
auto smemPtrTy = ptr_ty(ctx, 3);
162162
auto bitwidth = tensorTy.getElementTypeBitWidth();
163-
if (bitwidth > 32)
163+
// In the transpose case, consecutive elements are not stored contiguously
164+
// so we cannot split an fp32
165+
// We could support bitwidth == 8, but it'd be a rather weird layout
166+
// so we don't do that for now
167+
if ((!transpose && bitwidth > 32) || (transpose && bitwidth != 16))
164168
return failure();
165169
// Inter block stmatrix is not supported
166170
if (cvt.hasInDim(kBlock))
@@ -173,31 +177,75 @@ LogicalResult lowerDistributedToSharedStmatrix(
173177
cvt = removeBroadcast.apply(cvt);
174178
srcVals = removeBroadcast.apply(srcVals);
175179

176-
auto tile = LinearLayout::identity1D(32 / bitwidth, kReg, kOffset) *
177-
LinearLayout::identity1D(4, kLane, kOffset);
178-
// Find if there is a register permutation that allows us to divideLeft
179-
auto maybeAction = regPermForDivideLeft(cvt, tile);
180-
if (!maybeAction.has_value()) {
181-
return failure();
180+
LinearLayout reps;
181+
if (!transpose) {
182+
auto tile = LinearLayout::identity1D(32 / bitwidth, kReg, kOffset) *
183+
LinearLayout::identity1D(4, kLane, kOffset);
184+
185+
// Find if there is a register permutation that allows us to divideLeft
186+
// We need to pass the map from regs to offsets, as is cvt
187+
auto maybeAction = regPermForDivideLeft(cvt, tile);
188+
if (!maybeAction.has_value()) {
189+
return failure();
190+
}
191+
auto action = maybeAction.value();
192+
// Check if the action indeed allows us to divideLeft
193+
cvt = action.apply(cvt);
194+
srcVals = action.apply(srcVals);
195+
196+
auto maybeQuot = divideLeft(cvt, tile);
197+
if (!maybeQuot.has_value()) {
198+
return failure();
199+
}
200+
reps = zerosLike(tile) * maybeQuot.value();
201+
} else {
202+
// Division does not quite work here. To define this properly, we would need
203+
// to define a different multiplication that does:
204+
// A *' B = [[0, A], [B, 0]] and define leftDivision for it
205+
// We do it ad-hoc for now, as I believe there's not much demand for this op
206+
// outside of this lowering
207+
208+
// Divisibility in the sense above is the same as regular divisibility
209+
// You need to see that the tile A is a sublayout of the matrix, and that
210+
// it has zeros above it and to its right.
211+
212+
// In particular, offsets lanes 4, 8, 16 map to offsets 1, 2, 4...
213+
const auto &laneBases = cvt.getBases().find(kLane)->second;
214+
for (int i = 0; i < 3; ++i) {
215+
if (laneBases[i + 2][0] != (1 << i))
216+
return failure();
217+
}
218+
// ... and no other basis should depend on 1, 2, 4
219+
// Note that this gives us the usual alignment condition, but we have
220+
// translated it to checking that the matrix to the left of A is all zeros
221+
for (auto dim : cvt.getInDimNames()) {
222+
const auto &bases = cvt.getBases().find(dim)->second;
223+
for (auto [i, basis] : llvm::enumerate(bases)) {
224+
if (dim == kLane && i >= 2)
225+
continue;
226+
if (basis[0] & 0b111)
227+
return failure();
228+
}
229+
}
230+
231+
// Hack: We are not going to use in the rest of the function reps[kLane][2:]
232+
// so we don't need to zero them out
233+
reps = cvt;
182234
}
183-
auto action = maybeAction.value();
184-
// Check if the action indeed allows us to divideLeft
185-
cvt = action.apply(cvt);
186-
auto maybeQuot = divideLeft(cvt, tile);
187-
if (!maybeQuot.has_value()) {
235+
236+
// We must have at least 2 register elements to use stmatrix.trans
237+
if (transpose && reps.getInDimSizeLog2(kReg) < llvm::Log2_32(32 / bitwidth)) {
188238
return failure();
189239
}
190-
auto quot = maybeQuot.value();
191-
srcVals = action.apply(srcVals);
192-
// Map from kReg, kLane, kWarp to beginning of each tile
193-
auto reps = zerosLike(tile) * quot;
194-
assert(reps.getOutDimSize(kOffset) == cvt.getOutDimSize(kOffset));
195240

196-
// Choose up to 4 packs of 32-bit elements indexed by the next to bases
197-
// as the vectorisation factor
198-
auto vec = std::min(2, quot.getInDimSizeLog2(kReg));
241+
// Choose up to 4 packs of 32-bit elements indexed by the next (at most) two
242+
// bases as the vectorisation factor. We don't consider the basis of the tile
243+
// for vectorisation so we subtract them
244+
auto vec = std::min<int32_t>(2, reps.getInDimSizeLog2(kReg) -
245+
llvm::Log2_32(32 / bitwidth));
199246

200-
// FIXME(Lezcano): Should we bail if any of the other 3 lane bases is zero?
247+
// Map from kReg, kLane, kWarp to beginning of each tile
248+
assert(reps.getOutDimSize(kOffset) == cvt.getOutDimSize(kOffset));
201249

202250
auto [laneId, warpId] = getLaneAndWarpId(rewriter, loc);
203251
// Compute the addresses for the 0th tile
@@ -212,12 +260,24 @@ LogicalResult lowerDistributedToSharedStmatrix(
212260
// given
213261
// by the first `vec` reg bases that are not part of the tile
214262
std::vector<std::vector<int32_t>> laneBases;
215-
assert(tile.getInDimSizeLog2(kLane) == 2);
216-
for (int i = 0; i < 3; ++i) {
217-
laneBases.push_back(reps.getBasis(kLane, tile.getInDimSizeLog2(kLane) + i));
218-
}
219-
for (int i = 0; i < vec; ++i) {
220-
laneBases.push_back(reps.getBasis(kReg, tile.getInDimSizeLog2(kReg) + i));
263+
if (!transpose) {
264+
auto tileDimSizeReg = llvm::Log2_32(32 / bitwidth);
265+
auto tileDimSizeLane = 2;
266+
for (int i = 0; i < 3; ++i) {
267+
laneBases.push_back(reps.getBasis(kLane, tileDimSizeLane + i));
268+
}
269+
for (int i = 0; i < vec; ++i) {
270+
laneBases.push_back(reps.getBasis(kReg, tileDimSizeReg + i));
271+
}
272+
} else {
273+
// We choose the first basis of the register. In the future we could choose
274+
// a basis that minimises the bank conflicts
275+
laneBases.push_back(reps.getBasis(kReg, 0));
276+
laneBases.push_back(reps.getBasis(kLane, 0));
277+
laneBases.push_back(reps.getBasis(kLane, 1));
278+
for (int i = 0; i < vec; ++i) {
279+
laneBases.push_back(reps.getBasis(kReg, i + 1));
280+
}
221281
}
222282

223283
LinearLayout addrLayout =
@@ -247,7 +307,8 @@ LogicalResult lowerDistributedToSharedStmatrix(
247307
}
248308
inputs.push_back(b.bitcast(input, i32_ty));
249309
}
250-
rewriter.create<triton::nvgpu::StoreMatrixOp>(loc, vecAddr, inputs);
310+
rewriter.create<triton::nvgpu::StoreMatrixOp>(loc, vecAddr, inputs,
311+
/*needTrans=*/transpose);
251312
}
252313
return success();
253314
}
@@ -271,10 +332,19 @@ struct LocalAllocOpConversion
271332
Value smemBase =
272333
LLVM::getSharedMemoryBase(op.getLoc(), rewriter, targetInfo, op);
273334

274-
if (lowerDistributedToSharedStmatrix(op.getLoc(), srcTy, memDescType,
275-
adaptor.getSrc(), smemBase, llvmElemTy,
276-
rewriter, targetInfo)
277-
.failed()) {
335+
// Try to lower transposed or not
336+
bool lowered = false;
337+
for (bool transpose : {false, true}) {
338+
lowered =
339+
lowerDistributedToSharedStmatrix(
340+
op.getLoc(), srcTy, memDescType, transpose, adaptor.getSrc(),
341+
smemBase, llvmElemTy, rewriter, targetInfo)
342+
.succeeded();
343+
if (lowered) {
344+
break;
345+
}
346+
}
347+
if (!lowered) {
278348
return failure();
279349
}
280350

@@ -306,11 +376,20 @@ struct LocalStoreOpConversion
306376
getTypeConverter()->convertType(op.getDst().getType().getElementType());
307377
SharedMemoryObject smemObj = LLVM::getSharedMemoryObjectFromStruct(
308378
op.getLoc(), adaptor.getDst(), llvmElemTy, rewriter);
309-
if (lowerDistributedToSharedStmatrix(op.getLoc(), op.getSrc().getType(),
310-
op.getDst().getType(),
311-
adaptor.getSrc(), smemObj.getBase(),
312-
llvmElemTy, rewriter, targetInfo)
313-
.failed()) {
379+
380+
// Try to lower transposed or not
381+
bool lowered = false;
382+
for (bool transpose : {false, true}) {
383+
lowered = lowerDistributedToSharedStmatrix(
384+
op.getLoc(), op.getSrc().getType(), op.getDst().getType(),
385+
transpose, adaptor.getSrc(), smemObj.getBase(), llvmElemTy,
386+
rewriter, targetInfo)
387+
.succeeded();
388+
if (lowered) {
389+
break;
390+
}
391+
}
392+
if (!lowered) {
314393
return failure();
315394
}
316395
rewriter.eraseOp(op);

0 commit comments

Comments
 (0)