
Commit 629fd50 (parent 76ed95b)
[WARNINGS] Emit warning for WGMMA fp8 dot when transposition prevents pipelining (#6875)
**TL;DR**: For fp8 WGMMA matmuls, if the input tensors are not in a specific transposed layout in global memory (row-major A, col-major B), pipelining will be disabled. Emit a warning for these cases.

If you run an fp8 matmul (e.g. 03-matrix-multiplication) with the B matrix in row-major format (e.g. https://gist.github.com/davidberard98/21fcee4a46192a1a756a458dfc3669fe) and use MLIR_ENABLE_DIAGNOSTICS=warnings, a warning like this one will be emitted:

```
/home/dberard/fbcode/scripts/dberard/triton/fp8_mm.py:171:35: warning: Warning: Forcing a different order [0, 1] on SMEM than the register order for the operand 1. Registers will be transposed before SMEM store and the pipelined load for this operand will be disabled, so poor performance is expected.
    accumulator = tl.dot(a, b, accumulator)
```

Since this is a user-facing restriction that has significant implications for the performance of fp8 matmuls, I think it makes sense to make this a warning.

Note: this warning already exists for MMAv5; this PR just plumbs the required info into the getSharedMemoryMMAOperand function so that the diagnostic can be emitted:
https://github.com/triton-lang/triton/blob/7dc549208aa3ce30612fe884bc4723f95f4b40b1/lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp#L188-L195
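For illustration (not part of this commit), here is a minimal host-side sketch of the layouts involved. The tensor names, shapes, and the commented-out `matmul_kernel` launch are hypothetical stand-ins for a tutorial-style fp8 matmul; the point is that the B operand's strides in global memory decide whether the warning fires.

```python
# Sketch only: global-memory layouts that do / do not trigger the new warning for
# fp8 WGMMA dots. The Triton matmul kernel itself (e.g. the 03-matrix-multiplication
# tutorial kernel) is assumed and not shown here.
import os
import torch

# Surface compiler warnings; must be set before the kernel is first compiled.
os.environ["MLIR_ENABLE_DIAGNOSTICS"] = "warnings"

M, K, N = 128, 128, 256
a = torch.randn(M, K, device="cuda").to(torch.float8_e5m2)  # row-major A: fine

# Row-major B: strides (N, 1). The pass forces a different SMEM order, registers are
# transposed before the SMEM store, and the pipelined load for operand 1 is disabled.
b_row_major = torch.randn(K, N, device="cuda").to(torch.float8_e5m2)
print(b_row_major.stride())  # (256, 1)

# Column-major B: same logical (K, N) shape, but strides (1, K). One way to get this
# layout is to allocate the transposed tensor and view it back.
b_col_major = torch.randn(N, K, device="cuda").to(torch.float8_e5m2).t()
print(b_col_major.stride())  # (1, 128)

# A tutorial-style launch would then pass the strides through to the kernel, e.g.:
# matmul_kernel[grid](a, b_col_major, c, M, N, K,
#                     a.stride(0), a.stride(1),
#                     b_col_major.stride(0), b_col_major.stride(1), ...)
```

With `b_row_major`, compiling under MLIR_ENABLE_DIAGNOSTICS=warnings should produce the "Forcing a different order" diagnostic shown above; with `b_col_major` it should not, and the pipelined load stays enabled.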

2 files changed (+30, -5 lines)

lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp

Lines changed: 11 additions & 4 deletions
@@ -188,10 +188,12 @@ getSharedMemoryMMAOperand(Value v, mlir::PatternRewriter &rewriter, int opIdx,
   if (newOrder != order && op) {
     op->emitWarning("Warning: Forcing a different order [")
         << newOrder[0] << ", " << newOrder[1]
-        << "] on SMEM than the register order for the opreand " << opIdx
+        << "] on SMEM than the register order for the operand " << opIdx
         << ". Registers will be transposed before SMEM store and the pipelined "
            "load for this operand will be disabled, so poor performance is "
-           "expected.";
+           "expected. Recommendation: consider transposing the operand in "
+           "global "
+           "memory to remove the need to transpose the tensor in registers.";
   }
 
   Attribute SharedMemorySpace =
@@ -391,9 +393,14 @@ class BlockedToMMA : public mlir::OpRewritePattern<DotOp> {
       int bitwidth = getElementTypeOrSelf(a).getIntOrFloatBitWidth();
       a = getDotOperand(a, 0, bitwidth);
     } else {
-      a = getSharedMemoryMMAOperand(a, rewriter, 0, allowTranspose);
+      a = getSharedMemoryMMAOperand(a, rewriter, 0, allowTranspose,
+                                    /*isMMAv5Fp4Padded=*/false,
+                                    /*forceTranspose=*/false, dotOp);
     }
-    b = getSharedMemoryMMAOperand(b, rewriter, 1, allowTranspose);
+    b = getSharedMemoryMMAOperand(b, rewriter, 1, allowTranspose,
+                                  /*isMMAv5Fp4Padded=*/false,
+                                  /*forceTranspose=*/false, dotOp);
+
     newDot = rewriter.create<triton::nvidia_gpu::WarpGroupDotOp>(
         dotOp.getLoc(), newRetType, a, b, newAcc, nullptr,
         dotOp.getInputPrecision(), dotOp.getMaxNumImpreciseAcc(), false);

test/TritonGPU/accelerate-matmul.mlir

Lines changed: 19 additions & 1 deletion
@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file --tritongpu-accelerate-matmul | FileCheck %s
+// RUN: triton-opt %s -split-input-file --tritongpu-accelerate-matmul -verify-diagnostics=only-expected | FileCheck %s
 
 // CHECK: #[[MMA:.+]] = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16, 16]}>
 // CHECK: #[[MMA1:.+]] = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 64, 16]}>
@@ -526,3 +526,21 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
     tt.return %0 : tensor<128x256xf32, #blocked>
   }
 }
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 2], order = [1, 0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
+  // CHECK-LABEL: hopper_fp8_non_transposed_b
+  tt.func public @hopper_fp8_non_transposed_b(
+      %operand0: tensor<128x128xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>,
+      %operand1: tensor<128x256xf8E5M2, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>,
+      %out_ptrs: tensor<128x256x!tt.ptr<f32>, #blocked>) {
+    %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #blocked>
+    // CHECK: ttng.warp_group_dot
+    // expected-warning @below {{Forcing a different order}}
+    %64 = tt.dot %operand0, %operand1, %cst, inputPrecision = tf32 {maxNumImpreciseAcc = 1073741824 : i32} : tensor<128x128xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<128x256xf8E5M2, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x256xf32, #blocked>
+    tt.store %out_ptrs, %64 : tensor<128x256x!tt.ptr<f32>, #blocked>
+    tt.return
+  }
+}
