Commit 5389ed7

[AMD] Emulate Float8E4M3FN with Float16 on CDNA3 and below (#7186)
The fact that gfx942 has its own FP8 variants, rather than the OCP ones, is a common pitfall. Starting with gfx950 we switch to the OCP FP8 variants, so gfx942 is a one-generation special case. This commit enables emulating Float8E4M3FN with FP16, as we already do for Float8E5M2, for better portability, and emits a performance remark when the emulation kicks in.
1 parent 19c842c commit 5389ed7
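For context, below is a minimal user-level sketch (not part of this commit; the kernel name, shapes, and block sizes are illustrative assumptions) of the kind of Triton kernel this path serves: a tl.dot on OCP float8_e4m3fn operands, which on gfx942 (CDNA3) and older is now lowered by converting both operands to FP16 with a performance remark instead of failing to select an MFMA intrinsic.

# Hypothetical Triton kernel exercising the fp8e4m3fn dot path (sketch only).
import torch
import triton
import triton.language as tl


@triton.jit
def fp8_dot_kernel(a_ptr, b_ptr, c_ptr, M, N, K,
                   BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr,
                   BLOCK_K: tl.constexpr):
    pid_m = tl.program_id(0)
    pid_n = tl.program_id(1)
    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    offs_k = tl.arange(0, BLOCK_K)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
    for k in range(0, K, BLOCK_K):
        # Both operands are Float8E4M3FN; on CDNA3 and below this dot is now
        # emulated with FP16 MFMA instructions rather than rejected.
        a = tl.load(a_ptr + offs_m[:, None] * K + (k + offs_k)[None, :])
        b = tl.load(b_ptr + (k + offs_k)[:, None] * N + offs_n[None, :])
        acc = tl.dot(a, b, acc)
    tl.store(c_ptr + offs_m[:, None] * N + offs_n[None, :], acc)


# Hypothetical launch: on a gfx942 GPU (ROCm exposes it via the "cuda" device
# in PyTorch) this now compiles and runs, emitting the fp16-emulation remark.
if torch.cuda.is_available():
    M = N = K = 256
    a = torch.randn(M, K, device="cuda").to(torch.float8_e4m3fn)
    b = torch.randn(K, N, device="cuda").to(torch.float8_e4m3fn)
    c = torch.empty(M, N, device="cuda", dtype=torch.float32)
    grid = (M // 64, N // 64)
    fp8_dot_kernel[grid](a, b, c, M, N, K, BLOCK_M=64, BLOCK_N=64, BLOCK_K=64)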

6 files changed (+57, -40 lines)


test/TritonGPU/amd/accelerate-amd-matmul-mfma.mlir

Lines changed: 31 additions & 6 deletions
@@ -1,20 +1,45 @@
-// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul="arch-generation-name=gfx942 matrix-instruction-size=0" | FileCheck %s --check-prefixes MFMA0,CHECK
-// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul="arch-generation-name=gfx942 matrix-instruction-size=16" | FileCheck %s --check-prefixes MFMA16,CHECK
+// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul="arch-generation-name=gfx942 matrix-instruction-size=0" --verify-diagnostics | FileCheck %s --check-prefixes MFMA0,CHECK
+// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul="arch-generation-name=gfx942 matrix-instruction-size=16" --verify-diagnostics | FileCheck %s --check-prefixes MFMA16,CHECK
 
 #blocked = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [8, 8], warpsPerCTA = [2, 4], order = [1, 0]}>
-// CHECK-LABEL: mfma_dot_fp8e5m2
+// CHECK-LABEL: mfma_dot_fp8e5m2_fp8e4m3fn
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
-tt.func public @mfma_dot_fp8e5m2(
+tt.func public @mfma_dot_fp8e5m2_fp8e4m3fn(
 %arg0: tensor<128x64xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>,
-%arg1: tensor<64x256xf8E5M2, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>,
+%arg1: tensor<64x256xf8E4M3FN, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>,
 %arg2: tensor<128x256x!tt.ptr<f32>, #blocked>) {
 %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #blocked>
 // CHECK: %[[A0:.+]] = ttg.convert_layout %arg0 : {{.*}} -> tensor<128x64xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>
 // CHECK: %[[A1:.+]] = tt.fp_to_fp %[[A0]] : {{.*}} -> tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>
+// CHECK: %[[B0:.+]] = ttg.convert_layout %arg1 : {{.*}} -> tensor<64x256xf8E4M3FN, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>>
+// CHECK: %[[B1:.+]] = tt.fp_to_fp %[[B0]] : tensor<64x256xf8E4M3FN, {{.*}} -> tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>>
+// CHECK: tt.dot %[[A1]], %[[B1]]
+// expected-remark @+2 {{missing native support for fp8 variant on current architecture; emulated with fp16 so low performance}}
+// expected-remark @+1 {{for gfx942 please use native supported fp8 variants}}
+%1 = tt.dot %arg0, %arg1, %cst : tensor<128x64xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<64x256xf8E4M3FN, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x256xf32, #blocked>
+tt.store %arg2, %1 : tensor<128x256x!tt.ptr<f32>, #blocked>
+tt.return
+}
+}
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [8, 8], warpsPerCTA = [2, 4], order = [1, 0]}>
+// CHECK-LABEL: mfma_dot_fp8e4m3fn_fp8e5m2
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+tt.func public @mfma_dot_fp8e4m3fn_fp8e5m2(
+%arg0: tensor<128x64xf8E4M3FN, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>,
+%arg1: tensor<64x256xf8E5M2, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>,
+%arg2: tensor<128x256x!tt.ptr<f32>, #blocked>) {
+%cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #blocked>
+// CHECK: %[[A0:.+]] = ttg.convert_layout %arg0 : {{.*}} -> tensor<128x64xf8E4M3FN, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>
+// CHECK: %[[A1:.+]] = tt.fp_to_fp %[[A0]] : {{.*}} -> tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>
 // CHECK: %[[B0:.+]] = ttg.convert_layout %arg1 : {{.*}} -> tensor<64x256xf8E5M2, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>>
 // CHECK: %[[B1:.+]] = tt.fp_to_fp %[[B0]] : tensor<64x256xf8E5M2, {{.*}} -> tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>>
 // CHECK: tt.dot %[[A1]], %[[B1]]
-%1 = tt.dot %arg0, %arg1, %cst : tensor<128x64xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<64x256xf8E5M2, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x256xf32, #blocked>
+// expected-remark @+2 {{missing native support for fp8 variant on current architecture; emulated with fp16 so low performance}}
+// expected-remark @+1 {{for gfx942 please use native supported fp8 variants}}
+%1 = tt.dot %arg0, %arg1, %cst : tensor<128x64xf8E4M3FN, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<64x256xf8E5M2, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x256xf32, #blocked>
 tt.store %arg2, %1 : tensor<128x256x!tt.ptr<f32>, #blocked>
 tt.return
 }

test/TritonGPU/amd/accelerate-amd-matmul-unsupported.mlir

Lines changed: 0 additions & 15 deletions
This file was deleted.

third_party/amd/include/TritonAMDGPUTransforms/MfmaGroup.h

Lines changed: 5 additions & 4 deletions
@@ -14,10 +14,11 @@ inline bool isF8F6F4(mlir::Type type) {
 
 struct MfmaIntrinsic {
 // Chooses a suitable mfma instrinsic for the given input case.
-static FailureOr<MfmaIntrinsic> selectFor(int version, unsigned mDim,
-unsigned nDim, unsigned inputKDim,
-Type aElemType, Type bElemType,
-bool withScale, bool useTF32);
+static FailureOr<MfmaIntrinsic> selectFor(Location loc, int version,
+unsigned mDim, unsigned nDim,
+unsigned inputKDim, Type aElemType,
+Type bElemType, bool withScale,
+bool useTF32);
 
 MfmaIntrinsic(StringRef symbol, unsigned m, unsigned n, unsigned k,
 unsigned kB, Type aET, Type bET)

third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/MFMA.cpp

Lines changed: 2 additions & 2 deletions
@@ -272,7 +272,7 @@ struct DotOpMFMAConversionHelper {
 op.getInputPrecision() == InputPrecision::TF32 && mfmaVersion == 3;
 StringRef intrinsicName;
 FailureOr<MfmaIntrinsic> maybeMfmaIntrinsic = MfmaIntrinsic::selectFor(
-mfmaVersion, mDim, nDim, kDimOperandSize, elemTyA, elemTyB,
+op.getLoc(), mfmaVersion, mDim, nDim, kDimOperandSize, elemTyA, elemTyB,
 /*withScale=*/false, allowXF32);
 if (failed(maybeMfmaIntrinsic))
 return op.emitError(

@@ -584,7 +584,7 @@ struct ScaledDotOpMFMAConversionHelper : DotOpMFMAConversionHelper {
 auto ctx = op.getContext();
 constexpr bool allowXF32 = false;
 FailureOr<MfmaIntrinsic> maybeMfmaIntrinsic = MfmaIntrinsic::selectFor(
-mfmaVersion, mDim, nDim,
+op.getLoc(), mfmaVersion, mDim, nDim,
 aElemType == ScaleDotElemType::E2M1 ? kDimOperandSize * 2
 : kDimOperandSize,
 scaleDotElemTypeToMLIRType(ctx, aElemType),

third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp

Lines changed: 2 additions & 2 deletions
@@ -166,8 +166,8 @@ chooseMfmaInstruction(Location loc, int mfmaVersion, RankedTensorType cType,
 return failure();
 
 FailureOr<MfmaIntrinsic> maybeMfmaIntrinsic =
-MfmaIntrinsic::selectFor(mfmaVersion, mDim, nDim, inputKSize, aElemType,
-bElemType, withScale, allowXF32);
+MfmaIntrinsic::selectFor(loc, mfmaVersion, mDim, nDim, inputKSize,
+aElemType, bElemType, withScale, allowXF32);
 if (failed(maybeMfmaIntrinsic))
 return emitError(loc, "no matching matrix core intrinsic due to "
 "unsupported element type");

third_party/amd/lib/TritonAMDGPUTransforms/MfmaGroup.cpp

Lines changed: 17 additions & 11 deletions
@@ -1,6 +1,7 @@
 #include "TritonAMDGPUTransforms/MfmaGroup.h"
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
 #include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Diagnostics.h"
 #include "llvm/ADT/DenseMap.h"
 #include <tuple>
 

@@ -23,9 +24,9 @@ using MfmaKey =
 //
 // This function adapts certain parameters so we can be flexible when trying
 // to query with "mismatches".
-MfmaKey composeMfmaKeyFor(unsigned version, unsigned mDim, unsigned nDim,
-Type &aElemType, Type &bElemType, bool withScale,
-bool useTF32) {
+MfmaKey composeMfmaKeyFor(Location loc, unsigned version, unsigned mDim,
+unsigned nDim, Type &aElemType, Type &bElemType,
+bool withScale, bool useTF32) {
 Type aET = aElemType, bET = bElemType;
 Builder b(aElemType.getContext());
 if (withScale) {

@@ -38,9 +39,14 @@ MfmaKey composeMfmaKeyFor(unsigned version, unsigned mDim, unsigned nDim,
 // In the MFMA map we use the proper TF32 type. So "fix" it here.
 assert(version == 3);
 aET = bET = b.getType<FloatTF32Type>();
-} else if (version <= 3 && isa<Float8E5M2Type>(aET) &&
-isa<Float8E5M2Type>(bET)) {
-// For the OCP FP8 E5M2 type, we can emulate the support for it with FP16.
+} else if (version <= 3 && isa<Float8E5M2Type, Float8E4M3FNType>(aET) &&
+isa<Float8E5M2Type, Float8E4M3FNType>(bET)) {
+emitRemark(loc, "missing native support for fp8 variant on current "
+"architecture; emulated with fp16 so low performance");
+if (version == 3)
+emitRemark(loc, "for gfx942 please use native supported fp8 variants");
+// For the OCP FP8 E5M2/E4M3FN type, we don't have native support until
+// CDNA4. So emulate with FP16.
 aElemType = bElemType = aET = bET = b.getF16Type();
 }
 return {version, mDim, nDim, aET.getTypeID(), bET.getTypeID()};

@@ -270,12 +276,12 @@ MfmaDatabase::MfmaDatabase(MLIRContext *context) {
 //===----------------------------------------------------------------------===//
 
 FailureOr<MfmaIntrinsic>
-MfmaIntrinsic::selectFor(int version, unsigned mDim, unsigned nDim,
-unsigned inputKDim, Type aElemType, Type bElemType,
-bool withScale, bool useTF32) {
+MfmaIntrinsic::selectFor(Location loc, int version, unsigned mDim,
+unsigned nDim, unsigned inputKDim, Type aElemType,
+Type bElemType, bool withScale, bool useTF32) {
 const MfmaMap &mfmaMap = MfmaDatabase::get(aElemType.getContext());
-MfmaKey key = composeMfmaKeyFor(version, mDim, nDim, aElemType, bElemType,
-withScale, useTF32);
+MfmaKey key = composeMfmaKeyFor(loc, version, mDim, nDim, aElemType,
+bElemType, withScale, useTF32);
 
 auto it = mfmaMap.find(key);
 if (it == mfmaMap.end())
