
Commit 299b3bb

[BACKEND] Don't promote fp8 MMAv2 dot inputs for sm120 (#7409)
Fixes #7188

This speeds up fp8 matmuls on consumer Blackwell (RTX 50xx series) by ~1.9x on large matmuls. sm >= 89 supports MMAv2 with fp8 operands, but prior to this PR, Triton was only using this on sm == 89; on other architectures, fp8 inputs would be promoted to fp16 and the mma would be executed in fp16. This PR causes the fp8->fp16 promotion step to be skipped on any architecture >= 89. It also adds mma variants for fp8 operands with f16 results, which were previously handled (after promotion) via the `FP16_FP16_FP16_FP16` variant.

Evidence that we should be able to use fp8 operands to mmav2 on any architecture >= 89: in the PTX docs (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma), under the "Target ISA Notes" section, e4m3 and e5m2 are listed as supported on sm_89 or higher, and they don't require the "a" suffix, which would indicate that the support is non-backward-compatible.

Perf improvement verified on a 5070 Ti using 03-matrix-multiplication.py (below are flops measurements on large MNK sizes):

Before:

```
matmul-performance-fp8:
         M       N       K      Triton
...
26  3584.0  3584.0  3584.0  101.256071
27  3712.0  3712.0  3712.0   99.947313
28  3840.0  3840.0  3840.0  101.182062
29  3968.0  3968.0  3968.0  101.771419
30  4096.0  4096.0  4096.0  101.206889
```

After:

```
matmul-performance-fp8:
         M       N       K      Triton
...
26  3584.0  3584.0  3584.0  191.309345
27  3712.0  3712.0  3712.0  190.280662
28  3840.0  3840.0  3840.0  195.316740
29  3968.0  3968.0  3968.0  194.305628
30  4096.0  4096.0  4096.0  193.258070
```
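For orientation, here is a small, self-contained C++ sketch of the dispatch rule described above and implemented by the `mmav2SupportsFp8Operands` helper in the diff below. The function names, the `usesMmaV3` flag, and the `main` driver are illustrative only, not Triton code.

```cpp
#include <cstdio>

// Illustrative sketch only: mirrors the rule described in this commit.
// PTX accepts e4m3/e5m2 operands to mma.sync (MMAv2) on sm_89 and newer,
// but on sm_90/sm_100 the instruction is emulated in SASS via fp16
// upcasts, so promotion is only skipped where fp8 MMAv2 runs in hardware.
static bool mmav2HasNativeFp8(int computeCapability) {
  // sm_89 (Ada) and sm_120 (consumer Blackwell) run fp8 MMAv2 natively.
  return computeCapability == 89 || computeCapability == 120;
}

static bool shouldPromoteFp8ToFp16(int computeCapability, bool usesMmaV3) {
  // Hopper's MMAv3 (wgmma) takes fp8 operands directly, so no promotion
  // there either; everywhere else, fp8 dot inputs are upcast to fp16 first.
  return !(mmav2HasNativeFp8(computeCapability) || usesMmaV3);
}

int main() {
  for (int cc : {80, 89, 90, 100, 120})
    std::printf("sm_%d: promote fp8 -> fp16? %s\n", cc,
                shouldPromoteFp8ToFp16(cc, /*usesMmaV3=*/cc == 90) ? "yes"
                                                                   : "no");
}
```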
1 parent 5949ee8 commit 299b3bb

File tree

4 files changed: +97 -3 lines


lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp

Lines changed: 11 additions & 3 deletions

```diff
@@ -775,6 +775,14 @@ static Value promoteOperand(OpBuilder &builder, Location loc, Value operand,
   return builder.create<arith::ExtFOp>(loc, tensorPromotedType, operand);
 }
 
+static bool mmav2SupportsFp8Operands(int computeCapability) {
+  // promote operands for sm < 89 since fp8 mma is not natively supported
+  // although PTX instructions for mma v2 w/ fp8 operands exist for sm90 and
+  // sm100, they are emulated as fp16 upcasts + fp16 HMMA in SASS. sm120 has
+  // hardware support for fp8 operands w/ mmav2.
+  return computeCapability == 89 || computeCapability == 120;
+}
+
 // promote operands of dot op if the existing combination is not natively
 // supported.
 static void decomposeMixedModeDotOp(ModuleOp mod, int computeCapability) {
@@ -787,10 +795,10 @@ static void decomposeMixedModeDotOp(ModuleOp mod, int computeCapability) {
         dyn_cast<NvidiaMmaEncodingAttr>(D.getType().getEncoding());
     if (mmaLayout) {
       bool isNativeFP8 = llvm::isa<Float8E5M2Type, Float8E4M3FNType>(AElType);
-      // promote operands for sm < 89 since fp8 mma is not natively supported
-      // promote operands for sm >= 90 when mma is not v3
+      // promote to f16 unless there's hardware support for fp8 operands
       if (!isNativeFP8 ||
-          (isNativeFP8 && (computeCapability == 89 || mmaLayout.isHopper())))
+          (isNativeFP8 && (mmav2SupportsFp8Operands(computeCapability) ||
+                           mmaLayout.isHopper())))
         return;
       promoteType = builder.getF16Type();
     } else {
```
test/Conversion/tritongpu_to_llvm.mlir

Lines changed: 33 additions & 0 deletions

```diff
@@ -2257,6 +2257,39 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
 
 // -----
 
+#mma = #ttg.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 4], instrShape = [16, 8]}>
+module attributes {"ttg.num-warps" = 8 : i32, ttg.target = "cuda:120"} {
+  // CHECK-LABEL: mmav2_e5m2_e5m2_fp16
+  tt.func public @mmav2_e5m2_e5m2_fp16(%arg0: tensor<32x32xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>, %arg1: tensor<32x32xf8E5M2, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>>, %arg2: tensor<32x32xf16, #mma>) {
+    // CHECK: mma.{{.*}}.col.f16.e5m2.e5m2.f16
+    %0 = tt.dot %arg0, %arg1, %arg2 {maxNumImpreciseAcc = 1073741824 : i32} : tensor<32x32xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> * tensor<32x32xf8E5M2, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>> -> tensor<32x32xf16, #mma>
+    tt.return
+  }
+
+  // CHECK-LABEL: mmav2_e5m2_e4m3_fp16
+  tt.func public @mmav2_e5m2_e4m3_fp16(%arg0: tensor<32x32xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>, %arg1: tensor<32x32xf8E4M3FN, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>>, %arg2: tensor<32x32xf16, #mma>) {
+    // CHECK: mma.{{.*}}.col.f16.e5m2.e4m3.f16
+    %0 = tt.dot %arg0, %arg1, %arg2 {maxNumImpreciseAcc = 1073741824 : i32} : tensor<32x32xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> * tensor<32x32xf8E4M3FN, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>> -> tensor<32x32xf16, #mma>
+    tt.return
+  }
+
+  // CHECK-LABEL: mmav2_e4m3_e5m2_fp16
+  tt.func public @mmav2_e4m3_e5m2_fp16(%arg0: tensor<32x32xf8E4M3FN, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>, %arg1: tensor<32x32xf8E5M2, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>>, %arg2: tensor<32x32xf16, #mma>) {
+    // CHECK: mma.{{.*}}.col.f16.e4m3.e5m2.f16
+    %0 = tt.dot %arg0, %arg1, %arg2 {maxNumImpreciseAcc = 1073741824 : i32} : tensor<32x32xf8E4M3FN, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> * tensor<32x32xf8E5M2, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>> -> tensor<32x32xf16, #mma>
+    tt.return
+  }
+
+  // CHECK-LABEL: mmav2_e4m3_e4m3_fp16
+  tt.func public @mmav2_e4m3_e4m3_fp16(%arg0: tensor<32x32xf8E4M3FN, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>, %arg1: tensor<32x32xf8E4M3FN, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>>, %arg2: tensor<32x32xf16, #mma>) {
+    // CHECK: mma.{{.*}}.col.f16.e4m3.e4m3.f16
+    %0 = tt.dot %arg0, %arg1, %arg2 {maxNumImpreciseAcc = 1073741824 : i32} : tensor<32x32xf8E4M3FN, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> * tensor<32x32xf8E4M3FN, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>> -> tensor<32x32xf16, #mma>
+    tt.return
+  }
+}
+
+// -----
+
 #blocked = #ttg.blocked<{sizePerThread = [1, 1, 16], threadsPerWarp = [4, 4, 2], warpsPerCTA = [8, 1, 1], order = [2, 1, 0]}>
 #linear = #ttg.linear<{register = [[0, 0], [0, 0], [0, 0], [0, 0]], lane = [[0, 0], [0, 1], [0, 2], [1, 0], [2, 0]], warp = [[4, 0], [8, 0], [16, 0]], block = []}>
```

test/TritonGPU/accelerate-matmul.mlir

Lines changed: 21 additions & 0 deletions

```diff
@@ -562,6 +562,27 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
   }
 }
 
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 2], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 16], threadsPerWarp = [4, 8], warpsPerCTA = [8, 1], order = [1, 0]}>
+#blocked2 = #ttg.blocked<{sizePerThread = [16, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 8], order = [0, 1]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:120", "ttg.threads-per-warp" = 32 : i32} {
+  // CHECK-LABEL: sm120_fp8_dot
+  tt.func public @sm120_fp8_dot(%arg0: tensor<128x256xf32, #blocked>, %arg1: tensor<128x128x!tt.ptr<f8E4M3FN>, #blocked1>, %arg2: tensor<128x256x!tt.ptr<f8E4M3FN>, #blocked2>, %arg3: tensor<128x128xi1, #blocked1>, %arg4: tensor<128x256xi1, #blocked2>) -> tensor<128x256xf32, #blocked> {
+    %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf8E4M3FN, #blocked2>
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x128xf8E4M3FN, #blocked1>
+    %0 = tt.load %arg1, %arg3, %cst_0 : tensor<128x128x!tt.ptr<f8E4M3FN>, #blocked1>
+    %1 = tt.load %arg2, %arg4, %cst : tensor<128x256x!tt.ptr<f8E4M3FN>, #blocked2>
+    %2 = ttg.convert_layout %0 : tensor<128x128xf8E4M3FN, #blocked1> -> tensor<128x128xf8E4M3FN, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>
+    %3 = ttg.convert_layout %1 : tensor<128x256xf8E4M3FN, #blocked2> -> tensor<128x256xf8E4M3FN, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>
+    // CHECK: {{.*}} = tt.dot {{.*}} tensor<128x128xf8E4M3FN
+    %4 = tt.dot %2, %3, %arg0, inputPrecision = tf32 : tensor<128x128xf8E4M3FN, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<128x256xf8E4M3FN, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x256xf32, #blocked>
+    tt.return %4 : tensor<128x256xf32, #blocked>
+  }
+}
+
+
 // -----
 
 #blocked = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 2], order = [1, 0]}>
```

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DotOpToLLVM/MMAv2.cpp

Lines changed: 32 additions & 0 deletions

```diff
@@ -257,10 +257,16 @@ enum class TensorCoreType : uint8_t {
   FP32_BF16_BF16_FP32,
   FP32_TF32_TF32_FP32,
   FP16_FP16_FP16_FP16,
+  // fp32 accumulator, fp8 operand
   FP32_FP8E5M2_FP8E5M2_FP32,
   FP32_FP8E5M2_FP8E4M3FN_FP32,
   FP32_FP8E4M3FN_FP8E5M2_FP32,
   FP32_FP8E4M3FN_FP8E4M3FN_FP32,
+  // fp16 accumulator, fp8 operand
+  FP16_FP8E5M2_FP8E5M2_FP16,
+  FP16_FP8E5M2_FP8E4M3FN_FP16,
+  FP16_FP8E4M3FN_FP8E5M2_FP16,
+  FP16_FP8E4M3FN_FP8E4M3FN_FP16,
   // integer tensor core instr
   INT32_INT1_INT1_INT32, // Not implemented
   INT32_INT4_INT4_INT32, // Not implemented
@@ -298,6 +304,11 @@ static Type getMmaRetType(TensorCoreType mmaType, MLIRContext *ctx) {
   case TensorCoreType::FP32_FP8E4M3FN_FP8E5M2_FP32:
   case TensorCoreType::FP32_FP8E4M3FN_FP8E4M3FN_FP32:
     return fp32x4Ty;
+  case TensorCoreType::FP16_FP8E5M2_FP8E5M2_FP16:
+  case TensorCoreType::FP16_FP8E5M2_FP8E4M3FN_FP16:
+  case TensorCoreType::FP16_FP8E4M3FN_FP8E5M2_FP16:
+  case TensorCoreType::FP16_FP8E4M3FN_FP8E4M3FN_FP16:
+    return fp16x2Pack2Ty;
   case TensorCoreType::INT32_INT8_INT8_INT32:
     return i32x4Ty;
   case TensorCoreType::FP64_FP64_FP64_FP64:
@@ -341,6 +352,18 @@ static TensorCoreType getMmaType(triton::DotOp op) {
   } else if (dTy.getElementType().isF16()) {
     if (aTy.getElementType().isF16() && bTy.getElementType().isF16())
       return TensorCoreType::FP16_FP16_FP16_FP16;
+    if (llvm::isa<Float8E5M2Type>(aTy.getElementType()) &&
+        llvm::isa<Float8E5M2Type>(bTy.getElementType()))
+      return TensorCoreType::FP16_FP8E5M2_FP8E5M2_FP16;
+    if (llvm::isa<Float8E5M2Type>(aTy.getElementType()) &&
+        llvm::isa<Float8E4M3FNType>(bTy.getElementType()))
+      return TensorCoreType::FP16_FP8E5M2_FP8E4M3FN_FP16;
+    if (llvm::isa<Float8E4M3FNType>(aTy.getElementType()) &&
+        llvm::isa<Float8E5M2Type>(bTy.getElementType()))
+      return TensorCoreType::FP16_FP8E4M3FN_FP8E5M2_FP16;
+    if (llvm::isa<Float8E4M3FNType>(aTy.getElementType()) &&
+        llvm::isa<Float8E4M3FNType>(bTy.getElementType()))
+      return TensorCoreType::FP16_FP8E4M3FN_FP8E4M3FN_FP16;
   } else if (dTy.getElementType().isF64()) {
     if (aTy.getElementType().isF64() && bTy.getElementType().isF64())
       return TensorCoreType::FP64_FP64_FP64_FP64;
@@ -387,6 +410,15 @@ inline static const std::map<TensorCoreType, std::string> mmaInstrPtxAmpere = {
     {TensorCoreType::FP32_FP8E4M3FN_FP8E4M3FN_FP32,
      "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32"},
 
+    {TensorCoreType::FP16_FP8E5M2_FP8E5M2_FP16,
+     "mma.sync.aligned.m16n8k32.row.col.f16.e5m2.e5m2.f16"},
+    {TensorCoreType::FP16_FP8E5M2_FP8E4M3FN_FP16,
+     "mma.sync.aligned.m16n8k32.row.col.f16.e5m2.e4m3.f16"},
+    {TensorCoreType::FP16_FP8E4M3FN_FP8E5M2_FP16,
+     "mma.sync.aligned.m16n8k32.row.col.f16.e4m3.e5m2.f16"},
+    {TensorCoreType::FP16_FP8E4M3FN_FP8E4M3FN_FP16,
+     "mma.sync.aligned.m16n8k32.row.col.f16.e4m3.e4m3.f16"},
+
     {TensorCoreType::FP64_FP64_FP64_FP64,
      "mma.sync.aligned.m8n8k4.row.col.f64.f64.f64.f64"},
 };
```
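The instruction strings added to `mmaInstrPtxAmpere` all follow the same PTX naming scheme: shape `m16n8k32`, A row-major / B column-major, then the D, A, B, C element types in that order. Below is a small illustrative C++ sketch of that convention; `mmaSyncName` is a made-up helper, not code from the patch.

```cpp
#include <iostream>
#include <string>

// Illustrative only: assembles an MMAv2 PTX mnemonic from its parts, matching
// the entries added to mmaInstrPtxAmpere above. The type suffix order is
// <D type>.<A type>.<B type>.<C type>.
static std::string mmaSyncName(const std::string &dType,
                               const std::string &aType,
                               const std::string &bType,
                               const std::string &cType) {
  return "mma.sync.aligned.m16n8k32.row.col." + dType + "." + aType + "." +
         bType + "." + cType;
}

int main() {
  // fp16 accumulator with mixed fp8 operands, as in the new table entries.
  std::cout << mmaSyncName("f16", "e5m2", "e4m3", "f16") << "\n";
  // Prints: mma.sync.aligned.m16n8k32.row.col.f16.e5m2.e4m3.f16
}
```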
