Commit 7568a4d

[BACKEND] Support vectorisation and arbitrary bitwidth in stmatrix (#6899)
As per title. I'll add transpose + generic support for ldmatrix in a different PR.
1 parent f810652 commit 7568a4d

File tree

3 files changed: 53 additions & 20 deletions

.gitignore
test/Conversion/tritongpu_to_llvm_hopper.mlir
third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/MemoryOpToLLVM.cpp

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -10,6 +10,7 @@ llvm-project-*/
 python/build/
 python/dist/
 python/triton*.egg-info/
+python/triton_kernels/triton*.egg-info/
 
 python/triton/_C/*.pyd
 python/triton/_C/*.so

test/Conversion/tritongpu_to_llvm_hopper.mlir

Lines changed: 34 additions & 2 deletions

@@ -301,7 +301,7 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 // CHECK-LABEL: linear_to_swizzled_st_matrix_local_store
 module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
   tt.func @linear_to_swizzled_st_matrix_local_store(%a: tensor<64x32xf16, #linear>) {
-    // CHECK-COUNT-2: nvgpu.stmatrix
+    // CHECK-COUNT-2: nvgpu.stmatrix %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}
     // CHECK: llvm.return
     %b = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<64x32xf16, #shared, #smem, mutable>
     ttg.local_store %a, %b : tensor<64x32xf16, #linear> -> !ttg.memdesc<64x32xf16, #shared, #smem, mutable>
@@ -323,7 +323,7 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 // CHECK-LABEL: linear_to_swizzled_st_matrix_local_store
 module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
   tt.func @linear_to_swizzled_st_matrix_local_store(%a: tensor<32x32xf16, #linear>) {
-    // CHECK-COUNT-2: nvgpu.stmatrix
+    // CHECK-COUNT-2: nvgpu.stmatrix %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}
     // CHECK: llvm.return
     %b = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<32x32xf16, #shared, #smem, mutable>
     ttg.local_store %a, %b : tensor<32x32xf16, #linear> -> !ttg.memdesc<32x32xf16, #shared, #smem, mutable>
@@ -333,6 +333,38 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 
 // -----
 
+#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
+#linear = #ttg.linear<{register = [[0, 1], [0, 2], [8, 0]], lane = [[0, 4], [0, 8], [1, 0], [2, 0], [4, 0]], warp = [[16, 0], [32, 0]], block = []}>
+#smem = #ttg.shared_memory
+// CHECK-LABEL: linear_to_swizzled_st_matrix_x2_local_store_fp8
+module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
+  tt.func @linear_to_swizzled_st_matrix_x2_local_store_fp8(%a: tensor<64x16xf8E4M3FNUZ, #linear>) {
+    // CHECK-COUNT-1: nvgpu.stmatrix %{{.*}}, %{{.*}}, %{{.*}} :
+    // CHECK: llvm.return
+    %b = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<64x16xf8E4M3FNUZ, #shared, #smem, mutable>
+    ttg.local_store %a, %b : tensor<64x16xf8E4M3FNUZ, #linear> -> !ttg.memdesc<64x16xf8E4M3FNUZ, #shared, #smem, mutable>
+    tt.return
+  }
+}
+
+// -----
+
+#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
+#linear = #ttg.linear<{register = [[8, 0], [0, 4], [0, 8]], lane = [[0, 1], [0, 2], [1, 0], [2, 0], [4, 0]], warp = [[16, 0], [32, 0]], block = []}>
+#smem = #ttg.shared_memory
+// CHECK-LABEL: linear_to_swizzled_st_matrix_local_store_fp32
+module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
+  tt.func @linear_to_swizzled_st_matrix_local_store_fp32(%a: tensor<64x16xf32, #linear>) {
+    // CHECK-COUNT-2: nvgpu.stmatrix %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}
+    // CHECK: llvm.return
+    %b = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<64x16xf32, #shared, #smem, mutable>
+    ttg.local_store %a, %b : tensor<64x16xf32, #linear> -> !ttg.memdesc<64x16xf32, #shared, #smem, mutable>
+    tt.return
+  }
+}
+
+// -----
+
 #blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
 module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
   tt.func @fp8_const(%arg0: tensor<1024xi1, #blocked>, %arg1: tensor<1024xf8E4M3FNUZ, #blocked>) attributes {noinline = false} {
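
Note on the new CHECK lines: the operand count encodes the stmatrix variant, since the op takes one shared-memory address plus one i32 register per 8x8 matrix stored. Three operands therefore match the .x2 form and five the .x4 form. As a rough cross-check, here is a minimal standalone C++ sketch (not part of the commit; regsPerThread is derived from the register bases of each #linear layout above, and nVecs is read off the CHECK operand counts) that reproduces the expected op counts:

#include <cstdio>

// Standalone sketch: reproduces the arithmetic in MemoryOpToLLVM.cpp that
// determines how many stmatrix ops each test emits.
int main() {
  struct Case {
    const char *name;
    int bitwidth, regsPerThread, nVecs;
  };
  Case cases[] = {{"fp8 64x16 (.x2)", 8, 8, 2},
                  {"fp32 64x16 (.x4)", 32, 8, 4}};
  for (const auto &c : cases) {
    int elemsPerVec = 32 / c.bitwidth; // elements packed into one i32
    int step = c.nVecs * elemsPerVec;  // tensor elements per stmatrix op
    std::printf("%s: %d op(s)\n", c.name, c.regsPerThread / step);
  }
  return 0;
}

This prints 1 op for the fp8 case and 2 for the fp32 case, matching the CHECK-COUNT-1 and CHECK-COUNT-2 directives above.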

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/MemoryOpToLLVM.cpp

Lines changed: 18 additions & 18 deletions

@@ -159,14 +159,9 @@ LogicalResult lowerDistributedToSharedStmatrix(
   auto kBlock = S("block");
   auto kOffset = S("offset");
   auto smemPtrTy = ptr_ty(ctx, 3);
-
-  // Just stmatrix for now
-  // 1) NYI in the stmatrix lowering
-  // Pack everything into uint32_t to support bitwidths other than 16
   auto bitwidth = tensorTy.getElementTypeBitWidth();
-  if (bitwidth != 16)
+  if (bitwidth > 32)
     return failure();
-
   // Inter block stmatrix is not supported
   if (cvt.hasInDim(kBlock))
     return failure();
@@ -198,13 +193,9 @@ LogicalResult lowerDistributedToSharedStmatrix(
   auto reps = zerosLike(tile) * quot;
   assert(reps.getOutDimSize(kOffset) == cvt.getOutDimSize(kOffset));
 
-  // Choose the 4 elements indexed by the next to bases as the vectorisation
-  // factor
+  // Choose up to 4 packs of 32-bit elements indexed by the next to bases
+  // as the vectorisation factor
   auto vec = std::min(2, quot.getInDimSizeLog2(kReg));
-  // 2) NYI stmatrix.x1 and stmatrix.x2
-  if (vec != 2) {
-    return failure();
-  }
 
   // FIXME(Lezcano): Should we bail if any of the other 3 lane bases is zero?
 
@@ -237,17 +228,26 @@ LogicalResult lowerDistributedToSharedStmatrix(
           .second;
 
   // Elements per op
-  auto step = (1 << vec) * (32 / bitwidth);
+  auto nVecs = 1 << vec;
+  auto elemsPerVec = 32 / bitwidth;
+  auto step = nVecs * elemsPerVec;
   for (int i = 0; i < srcVals.size(); i += step) {
     auto regIdx = reps.apply({{kReg, i}, {kLane, 0}, {kWarp, 0}})[0].second;
     Value offset = b.xor_(regBase, b.i32_val(regIdx));
     auto vecAddr = b.gep(smemPtrTy, llvmElemTy, smemBase, offset,
                          LLVM::GEPNoWrapFlags::inbounds);
-    SmallVector<Value> inValsVec;
-    for (int j = 0; j < step; j++)
-      inValsVec.push_back(srcVals[i + j]);
-    Value valsVec = packLLVector(loc, inValsVec, rewriter);
-    targetInfo.storeMatrixShared(rewriter, loc, vecAddr, valsVec);
+    // Pack into vector of i32
+    SmallVector<Value> inputs;
+    Type packedTy = vec_ty(llvmElemTy, 32 / bitwidth);
+    for (int j = 0; j < nVecs; j++) {
+      Value input = b.undef(packedTy);
+      for (int k = 0; k < elemsPerVec; k++) {
+        input = b.insert_element(
+            packedTy, input, srcVals[i + j * elemsPerVec + k], b.i32_val(k));
+      }
+      inputs.push_back(b.bitcast(input, i32_ty));
+    }
+    rewriter.create<triton::nvgpu::StoreMatrixOp>(loc, vecAddr, inputs);
   }
   return success();
 }
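
For intuition, the insert_element + bitcast loop above amounts to plain bit packing on a little-endian target such as NVPTX: element 0 of the <elemsPerVec x elemTy> vector lands in the low bits of the resulting i32. A minimal scalar sketch of the 8-bit case (packFp8x4 is a hypothetical helper, not Triton code):

#include <cstdint>

// Scalar equivalent of the insert_element + bitcast sequence for
// bitwidth == 8: four fp8 payload bytes become one 32-bit stmatrix
// register, with element 0 in the least-significant byte.
uint32_t packFp8x4(uint8_t e0, uint8_t e1, uint8_t e2, uint8_t e3) {
  return uint32_t(e0) | (uint32_t(e1) << 8) | (uint32_t(e2) << 16) |
         (uint32_t(e3) << 24);
}

With bitwidth == 32 the packed vector has a single element, so the bitcast is a no-op and each value already occupies a full stmatrix register, which is why fp32 needs no special casing in the loop above.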
