
Commit 4b9efc5

[AMD][gfx12] Enable dot for f8 operands (#6814)
- Enable f8, bf8 dtype for gfx12
- Add related tests
- Fix matmul dtype matcher

Signed-off-by: Ilya Veselov <[email protected]>
1 parent: 9aa2c86 · commit: 4b9efc5
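Before the per-file diffs, a minimal usage sketch of what this enables: on a gfx12 GPU, tl.dot can now consume fp8 operands directly instead of requiring an upcast to f16. The kernel below is illustrative only; the kernel name, single-tile shape, and the use of torch.float8_e5m2 inputs are editorial assumptions, not part of this commit, and K-loop tiling and bounds masking are omitted.

import torch
import triton
import triton.language as tl


@triton.jit
def fp8_dot_kernel(a_ptr, b_ptr, c_ptr, M, N, K,
                   BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr):
    # Single-tile sketch: load one BLOCK_M x BLOCK_K tile of A and one
    # BLOCK_K x BLOCK_N tile of B, both stored as fp8.
    offs_m = tl.arange(0, BLOCK_M)
    offs_n = tl.arange(0, BLOCK_N)
    offs_k = tl.arange(0, BLOCK_K)
    a = tl.load(a_ptr + offs_m[:, None] * K + offs_k[None, :])
    b = tl.load(b_ptr + offs_k[:, None] * N + offs_n[None, :])
    # With this change, the fp8 operands feed the WMMA unit directly on gfx12;
    # accumulation happens in f32.
    acc = tl.dot(a, b)
    tl.store(c_ptr + offs_m[:, None] * N + offs_n[None, :], acc)


# Host side (assumed mapping: torch.float8_e5m2 corresponds to Triton's float8e5).
M = N = K = 64
a = torch.randn(M, K, device="cuda", dtype=torch.float16).to(torch.float8_e5m2)
b = torch.randn(K, N, device="cuda", dtype=torch.float16).to(torch.float8_e5m2)
c = torch.empty((M, N), device="cuda", dtype=torch.float32)
fp8_dot_kernel[(1,)](a, b, c, M, N, K, BLOCK_M=M, BLOCK_N=N, BLOCK_K=K)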

File tree (6 files changed, +30 −28 lines):

- python/test/unit/language/test_core.py
- python/triton/_internal_testing.py
- test/TritonGPU/amd/accelerate-amd-matmul-wmma-gen2.mlir
- third_party/amd/backend/compiler.py
- third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/WMMA.cpp
- third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp


python/test/unit/language/test_core.py

Lines changed: 3 additions & 2 deletions
@@ -35,6 +35,7 @@
     is_hip_cdna2,
     is_hip_cdna3,
     is_hip_cdna4,
+    is_hip_gfx12,
     is_xpu,
     get_arch,
     torch_float8_dtypes,
@@ -3722,8 +3723,8 @@ def test_dot(M, N, K, num_warps, col_a, col_b, epilogue, input_precision, in_dty
             pytest.skip("float8e4nv not supported on sm <= 80")

     if is_hip():
-        if in_dtype in ("float8e5", "float8e4nv") and not is_hip_cdna4():
-            pytest.skip(f"{in_dtype} only supported on CDNA4")
+        if in_dtype in ("float8e5", "float8e4nv") and not (is_hip_cdna4() or is_hip_gfx12()):
+            pytest.skip(f"{in_dtype} only supported on CDNA4 and gfx12")
         if in_dtype in ("float8e5b16", "float8e4b8") and not is_hip_cdna3():
             pytest.skip(f"{in_dtype} only supported on CDNA3")
         if not ((input_precision == "ieee") or (input_precision == "tf32" and is_hip_cdna3())):

python/triton/_internal_testing.py

Lines changed: 6 additions & 0 deletions
@@ -62,6 +62,12 @@ def is_hip_cdna4():
     return target is not None and target.backend == 'hip' and target.arch == 'gfx950'


+def is_hip_gfx12():
+    target = get_current_target()
+    print(target.arch)
+    return target is not None and target.backend == 'hip' and 'gfx12' in target.arch
+
+
 def is_hip_cdna():
     return is_hip_cdna2() or is_hip_cdna3() or is_hip_cdna4()

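Note: as committed, is_hip_gfx12 prints target.arch on every call and reads target.arch before the `target is not None` guard, so it would raise if no target is active. A hedged, side-effect-free sketch of the same check, reusing get_current_target from this module:

def is_hip_gfx12():
    # Guard against a missing target before touching its fields; no debug print.
    target = get_current_target()
    return target is not None and target.backend == 'hip' and 'gfx12' in target.arch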

test/TritonGPU/amd/accelerate-amd-matmul-wmma-gen2.mlir

Lines changed: 3 additions & 7 deletions
@@ -80,15 +80,11 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
     // CHECK: %[[DOT2_OP_C_EXT:.+]] = arith.extf %[[DOT2_OP_C]]
     // CHECK-SAME: to tensor<32x64xf32, #[[WMMA_0]]>
     %3 = arith.constant dense<0.000000e+00> : tensor<32x64xf16, #blocked>
-    // CHECK: %[[DOT2_OP_A_F8:.+]] = ttg.convert_layout %[[DOT2_ARG_A]]
+    // CHECK: %[[DOT2_OP_A:.+]] = ttg.convert_layout %[[DOT2_ARG_A]]
     // CHECK-SAME: -> tensor<32x128xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #[[WMMA_0]]
-    // CHECK: %[[DOT2_OP_A_F16:.+]] = tt.fp_to_fp %[[DOT2_OP_A_F8]]
-    // CHECK-SAME: -> tensor<32x128xf16, #ttg.dot_op<{opIdx = 0, parent = #[[WMMA_0]], kWidth = 8}>>
-    // CHECK: %[[DOT2_OP_B_F8:.+]] = ttg.convert_layout %[[DOT2_ARG_B]]
+    // CHECK: %[[DOT2_OP_B:.+]] = ttg.convert_layout %[[DOT2_ARG_B]]
     // CHECK-SAME: -> tensor<128x64xf8E5M2, #ttg.dot_op<{opIdx = 1, parent = #[[WMMA_0]]
-    // CHECK: %[[DOT2_OP_B_F16:.+]] = tt.fp_to_fp %[[DOT2_OP_B_F8]]
-    // CHECK-SAME: -> tensor<128x64xf16, #ttg.dot_op<{opIdx = 1, parent = #[[WMMA_0]], kWidth = 8}>>
-    // CHECK: %[[DOT2_WMMA_RES:.+]] = tt.dot %[[DOT2_OP_A_F16]], %[[DOT2_OP_B_F16]], %[[DOT2_OP_C_EXT]]
+    // CHECK: %[[DOT2_WMMA_RES:.+]] = tt.dot %[[DOT2_OP_A]], %[[DOT2_OP_B]], %[[DOT2_OP_C_EXT]]
     // CHECK-SAME: -> tensor<32x64xf32, #[[WMMA_0]]
     %4 = tt.dot %0, %1, %3 : tensor<32x128xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<128x64xf8E5M2, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<32x64xf16, #blocked>
     // CHECK: %[[CONVERTED_RES:.+]] = ttg.convert_layout %[[DOT2_WMMA_RES]]

third_party/amd/backend/compiler.py

Lines changed: 2 additions & 0 deletions
@@ -113,6 +113,8 @@ def parse_options(self, opts) -> Any:
             supported_fp8_dtypes.update({'fp8e4nv', 'fp8e4b8', 'fp8e5b16'})
         elif self.target.arch == 'gfx950':
             supported_fp8_dtypes.update({'fp8e4nv', 'fp8e5'})
+        elif 'gfx12' in self.target.arch:
+            supported_fp8_dtypes.update({'fp8e4nv', 'fp8e5'})
         args["supported_fp8_dtypes"] = tuple(sorted(supported_fp8_dtypes))

         if "enable_fp_fusion" not in opts:

third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/WMMA.cpp

Lines changed: 4 additions & 0 deletions
@@ -172,6 +172,10 @@ std::string getTypeStr(Type ty) {
     scalarName = "iu8";
   } else if (ty.isInteger(4)) {
     scalarName = "iu4";
+  } else if (llvm::isa<Float8E4M3FNType>(ty)) {
+    scalarName = "fp8";
+  } else if (llvm::isa<Float8E5M2Type>(ty)) {
+    scalarName = "bf8";
   } else if (auto vecTy = dyn_cast<VectorType>(ty)) {
     auto elemType = vecTy.getElementType();
     auto numElems = vecTy.getNumElements();

third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp

Lines changed: 12 additions & 19 deletions
@@ -299,20 +299,18 @@ OperandTypesVector getOperandTypesForWmmaOp(PatternRewriter &rewriter,
       // by WMMA instruction, but not supported by triton
       // clang-format on
   };
-  // TODO: support fp8 configurations for WMMAv2. The code should be as
-  // following:
-  // if (version == 2) {
-  //   Type fp8 = rewriter.getFp8Type();
-  //   Type bf8 = rewriter.getBF8Type();
-  //   applicableTypes.append({
-  //       // clang-format off
-  //       {fp8, fp8, f32, f32},
-  //       {fp8, bf8, f32, f32},
-  //       {bf8, fp8, f32, f32},
-  //       {bf8, bf8, f32, f32},
-  //       // clang-format on
-  //   });
-  // }
+  if (version == 2) {
+    Type fp8e4nv = rewriter.getType<Float8E4M3FNType>();
+    Type fp8e5 = rewriter.getType<Float8E5M2Type>();
+    applicableTypes.append({
+        // clang-format off
+        {fp8e4nv, fp8e4nv, f32, f32},
+        {fp8e4nv, fp8e5, f32, f32},
+        {fp8e5, fp8e4nv, f32, f32},
+        {fp8e5, fp8e5, f32, f32},
+        // clang-format on
+    });
+  }
   return selectMatrixCoreOperandTypes(dot, applicableTypes);
 }

@@ -1002,11 +1000,6 @@ class BlockedToWMMA : public OpRewritePattern<tt::DotOp> {
         aShape[rank - 1] % mnkDim[2] != 0) // k
       return failure();

-    if (wmmaVersion == 2 && llvm::isa<FloatType>(oldAType) &&
-        oldAType.getIntOrFloatBitWidth() == 8) {
-      return rewriter.notifyMatchFailure(dotOp, "not supported yet");
-    }
-
     // get operand types
     auto operandTypes = getOperandTypesForWmmaOp(rewriter, dotOp, wmmaVersion);
     if (operandTypes.empty())
