
Commit 8a16d88

[AMD][Backend] Enable XF32 (TF32) Support for CDNA3 GPUs (#5637)
# Overview

AMD MI300-series GPUs support XF32 (TF32) mfma instructions in hardware, so we should use them when they are available. TF32 provides a ~1.4x speedup over FP32 for matmuls in some examples.

# BC breaking: changing the default input precision behavior

According to the [Triton docs](https://github.com/triton-lang/triton/blob/6556ec6050649e1fc42feb05a62ab9cc6908a722/python/triton/language/core.py#L1714), "For devices that do have tensor cores, the default precision is tf32". Enabling XF32 (TF32) on MI300 is therefore backward-compatibility (BC) breaking: f32 dot ops that previously executed with full FP32 precision will now execute with TF32 by default.

# Testing

I've added lit tests and enabled TF32 for MI300 in the Python unit tests.
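As a usage sketch (not part of this commit; the kernel, tile shapes, and launch below are illustrative assumptions), this is where `input_precision` enters a Triton program. On MI300, an f32 `tl.dot` now takes the xf32 path by default; passing `"ieee"` keeps the previous full-FP32 behavior.

```python
# Hypothetical example, not from this commit: a single-tile f32 dot kernel
# showing where input_precision is chosen.
import torch
import triton
import triton.language as tl


@triton.jit
def dot_kernel(a_ptr, b_ptr, c_ptr,
               BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr):
    offs_m = tl.arange(0, BLOCK_M)
    offs_n = tl.arange(0, BLOCK_N)
    offs_k = tl.arange(0, BLOCK_K)
    # Row-major tiles: a is (BLOCK_M, BLOCK_K), b is (BLOCK_K, BLOCK_N).
    a = tl.load(a_ptr + offs_m[:, None] * BLOCK_K + offs_k[None, :])
    b = tl.load(b_ptr + offs_k[:, None] * BLOCK_N + offs_n[None, :])
    # On CDNA3 (MI300) "tf32" lowers this f32 dot to xf32 mfma instructions;
    # input_precision="ieee" keeps the pre-change full-FP32 behavior.
    c = tl.dot(a, b, input_precision="tf32")
    tl.store(c_ptr + offs_m[:, None] * BLOCK_N + offs_n[None, :], c)


a = torch.randn((32, 64), device="cuda", dtype=torch.float32)
b = torch.randn((64, 32), device="cuda", dtype=torch.float32)
c = torch.empty((32, 32), device="cuda", dtype=torch.float32)
dot_kernel[(1, )](a, b, c, BLOCK_M=32, BLOCK_N=32, BLOCK_K=64)
```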
1 parent: 4a80233

File tree

8 files changed: +96, -22 lines


python/test/unit/language/test_core.py

Lines changed: 2 additions & 1 deletion
@@ -3407,9 +3407,10 @@ def test_dot(M, N, K, num_warps, col_a, col_b, epilogue, input_precision, in_dty
         pytest.skip("Only test out_dtype=float16 on devices with sm >=80")
     if capability[0] < 9 and in_dtype == 'float8e4nv':
         pytest.skip("float8e4nv not supported on sm <= 80")
+
     if is_hip() and (in_dtype == 'float8e4nv' or in_dtype == 'float8e5'):
         pytest.skip("float8e4nv and float8e5 not supported on HIP")
-    if is_hip() and (input_precision != "ieee"):
+    if is_hip() and not ((input_precision == "ieee") or (input_precision == "tf32" and is_hip_mi300())):
         pytest.skip(f"{input_precision} not supported on HIP")
     if is_hip() and (kpack == 2 and in_dtype == 'int8' and K < 64):
         pytest.skip("kpack too large for K")

python/triton/language/core.py

Lines changed: 1 addition & 1 deletion
@@ -1715,7 +1715,7 @@ def dot(input, other, acc=None, input_precision=None, allow_tf32=None, max_num_i
         the device does not have Tensor Cores or the inputs are not of dtype f32,
         this option is ignored. For devices that do have tensor cores, the
         default precision is tf32.
-    :type input_precision: string. Available options for nvidia: :code:`"tf32"`, :code:`"tf32x3"`, :code:`"ieee"`. Default: :code:`"tf32"`. Available options for amd: :code:`"ieee"`.
+    :type input_precision: string. Available options for nvidia: :code:`"tf32"`, :code:`"tf32x3"`, :code:`"ieee"`. Default: :code:`"tf32"`. Available options for amd: :code:`"ieee"`, (CDNA3 only) :code:`"tf32"`.
     :param allow_tf32: *Deprecated.* If true, input_precision is set to "tf32".
         Only one of :code:`input_precision` and :code:`allow_tf32` can be
         specified (i.e. at least one must be :code:`None`).
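Since the documented default stays `"tf32"`, kernels that rely on full FP32 input precision for f32 operands on MI300 now need to request `"ieee"` explicitly; a minimal sketch (operand names are illustrative):

```python
# Inside a @triton.jit kernel body, where a and b are f32 tiles:
c_exact = tl.dot(a, b, input_precision="ieee")  # full FP32 inputs, the pre-change behavior on MI300
c_fast = tl.dot(a, b, input_precision="tf32")   # xf32 mfma path on CDNA3, now the default for f32
```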

test/TritonGPU/amd/mfma-xf32.mlir

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+// RUN: triton-opt %s -split-input-file --convert-triton-amdgpu-to-llvm='arch=gfx942' | FileCheck %s
+
+// CHECK-LABEL:mfma_xf32
+
+#blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 8], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [8, 1], order = [1, 0]}>
+#mma = #ttg.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [2, 4], instrShape = [16, 16], isTransposed = true}>
+module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @mfma_xf32(
+    %arg0: tensor<64x128xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>,
+    %arg1: tensor<128x64xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>>) {
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #mma>
+    // Check that we generate xf32 instructions
+    // CHECK: rocdl.mfma.f32.16x16x8.xf32
+    %dot = tt.dot %arg0, %arg1, %cst_0, inputPrecision = tf32 :
+      tensor<64x128xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<128x64xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<64x64xf32, #mma>
+    tt.return
+  }
+}
+
+// -----
+
+// CHECK-LABEL:mfma_not_xf32
+
+#blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 8], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [8, 1], order = [1, 0]}>
+#mma = #ttg.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [2, 4], instrShape = [16, 16], isTransposed = true}>
+module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @mfma_not_xf32(
+    %arg0: tensor<64x128xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>,
+    %arg1: tensor<128x64xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>>) {
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #mma>
+    // Check that we don't generate xf32 instructions if the input precision is "ieee"
+    // CHECK: rocdl.mfma.f32.16x16x4f32
+    %dot = tt.dot %arg0, %arg1, %cst_0, inputPrecision = ieee :
+      tensor<64x128xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<128x64xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<64x64xf32, #mma>
+    tt.return
+  }
+}

third_party/amd/backend/compiler.py

Lines changed: 6 additions & 0 deletions
@@ -112,6 +112,12 @@ def __init__(self, target: GPUTarget) -> None:
     def parse_options(self, opts) -> Any:
         args = {'arch': os.getenv("TRITON_OVERRIDE_ARCH", self.target.arch)}

+        # Enable XF32 (TF32) for CDNA3 GPUs
+        if self.target.arch in ('gfx940', 'gfx941', 'gfx942'):
+            allowed_dot_input_precisions = set(HIPOptions.allowed_dot_input_precisions)
+            allowed_dot_input_precisions.update({'tf32'})
+            args["allowed_dot_input_precisions"] = tuple(sorted(allowed_dot_input_precisions))
+
         if "supported_fp8_dtypes" not in opts:
             supported_fp8_dtypes = set(HIPOptions.supported_fp8_dtypes)
             if self.target.arch in ('gfx940', 'gfx941', 'gfx942'):
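For code that wants to choose a precision at run time, a hypothetical helper below mirrors the same arch gate. The `triton.runtime.driver.active.get_current_target()` call and its `backend`/`arch` fields are assumptions about Triton's runtime API and may differ between versions.

```python
# Hypothetical helper, not part of this commit: mirror the compiler's CDNA3 gate
# when deciding which input_precision to request from tl.dot.
import triton


def pick_dot_input_precision() -> str:
    target = triton.runtime.driver.active.get_current_target()
    # Only CDNA3 (gfx940/941/942, i.e. the MI300 series) has xf32 mfma instructions.
    if target.backend == "hip" and target.arch in ("gfx940", "gfx941", "gfx942"):
        return "tf32"
    return "ieee"
```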

third_party/amd/include/TritonAMDGPUTransforms/MfmaGroup.h

Lines changed: 2 additions & 1 deletion
@@ -13,6 +13,7 @@ namespace mlir {

 enum class MfmaTypeId : uint32_t {
   Fp32TyId = 0,
+  Xf32TyId,
   Fp16TyId,
   Bf16TyId,
   I8TyId,
@@ -79,7 +80,7 @@ class MfmaInsn {
 public:
   static FailureOr<MfmaInsn> selectMfma(unsigned mDim, unsigned nDim,
                                         Type elementTypeA, Type elementTypeB,
-                                        int mfmaVersion);
+                                        int mfmaVersion, bool allowXF32);
   MfmaInsn(Type elementTypeA, Type elementTypeB, const MfmaInsnAttr &attr);
   unsigned getKDim();
   unsigned getMDim();

third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/MFMA.cpp

Lines changed: 18 additions & 8 deletions
@@ -183,9 +183,11 @@ struct DotOpMFMAConversionHelper {
     auto elemTyA = aTensorTy.getElementType();
     auto elemTyB = bTensorTy.getElementType();

+    bool allowXF32 =
+        op.getInputPrecision() == InputPrecision::TF32 && mfmaVersion == 3;
     StringRef mfmaInsnName;
-    auto maybeMfmaInsn =
-        MfmaInsn::selectMfma(mDim, nDim, elemTyA, elemTyB, mfmaVersion);
+    auto maybeMfmaInsn = MfmaInsn::selectMfma(mDim, nDim, elemTyA, elemTyB,
+                                              mfmaVersion, allowXF32);
     if (failed(maybeMfmaInsn))
       llvm::report_fatal_error("No match found in MFMA database\n");

@@ -195,6 +197,11 @@ struct DotOpMFMAConversionHelper {
     auto aEncoding = cast<DotOperandEncodingAttr>(aTensorTy.getEncoding());
     auto bEncoding = cast<DotOperandEncodingAttr>(bTensorTy.getEncoding());
     int kWidth = aEncoding.getKWidth();
+
+    // If we are using XF32, the kWidth (and kBase) is double that of F32.
+    if (aTensorTy.getElementType().isF32() && allowXF32)
+      kWidth *= 2;
+
     auto rank = aTensorTy.getShape().size();
     const auto kDimOperandSize = aTensorTy.getShape()[rank - 1];
     const auto kDimInstrSize = mfmaLayout.getInstrShapeForOperand(kWidth, 0)[1];
@@ -216,17 +223,17 @@ struct DotOpMFMAConversionHelper {

     auto operandA = getValuesFromDotOperandLayoutStruct(
         loadedA, numRepB, numRepM, numRepK, kWidth, kBase,
-        aTensorTy.getElementType());
+        aTensorTy.getElementType(), allowXF32);
     auto operandB = getValuesFromDotOperandLayoutStruct(
         loadedB, numRepB, numRepN, numRepK, kWidth, kBase,
-        aTensorTy.getElementType());
+        aTensorTy.getElementType(), allowXF32);

     auto dstElemTy = dTensorTy.getElementType();
     auto fc = unpackLLElements(loc, loadedC, rewriter);

     unsigned warpSize = triton::gpu::getWarpSize(mfmaLayout);
     // compute number of output elements that each thread holds for one MFMA
-    // instruction. subBlocks
+    // instruction.
     const int subBlocks =
         getNumSubmatrices(aTensorTy.getElementType(), mDim, nDim);
     auto elemsPerVec = mDim * nDim * subBlocks / warpSize;
@@ -370,7 +377,8 @@ struct DotOpMFMAConversionHelper {
   /// appropriate for mfma instructions
   SmallVector<ValueTable>
   getValuesFromDotOperandLayoutStruct(Value value, int batch, int n0, int n1,
-                                      int kWidth, int kBase, Type type) const {
+                                      int kWidth, int kBase, Type type,
+                                      bool allowXF32) const {
     auto elems = unpackLLElements(loc, value, rewriter);
     int kpack = kWidth / kBase;
     SmallVector<ValueTable> dotOpVals(kpack);
@@ -388,13 +396,15 @@ struct DotOpMFMAConversionHelper {
         }

         Value convertedElems;
-        if (type.isF32()) {
+        if (type.isF32() && !allowXF32) {
          for (int k = 0; k < kpack; ++k)
            dotOpVals[k][{b, i, j}] =
                extract_element(type, rawElems, i32_val(k));
         } else {
          SmallVector<Value> vals;
-          if (type.getIntOrFloatBitWidth() == 8) {
+          if (type.isF32() && allowXF32) {
+            vals = extractOperands(rawElems, kWidth, kBase, f32_ty);
+          } else if (type.getIntOrFloatBitWidth() == 8) {
            vals = extractOperands(rawElems, kWidth, kBase, i8_ty);
          } else if (type.isBF16()) {
            vals = extractOperands(rawElems, kWidth, kBase, bf16_ty);
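A rough sanity check on the "kWidth (and kBase) is double that of F32" comment above, using the instruction shapes added to the MfmaGroup table later in this commit (the per-lane operand count is an inference, not something stated in the diff):

$$\frac{M \cdot K}{\text{warp size}} = \frac{32 \cdot 2}{64} = 1 \ \text{A element per lane for mfma\_f32\_32x32x2f32}, \qquad \frac{32 \cdot 4}{64} = 2 \ \text{for mfma\_f32\_32x32x4\_xf32}$$

Each lane therefore supplies twice as many f32 elements per xf32 instruction, which is why kBase, and with it kWidth = kpack * kBase, doubles on the xf32 path.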

third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp

Lines changed: 11 additions & 7 deletions
@@ -101,7 +101,7 @@ warpsPerTileWMMA(Operation *dotOp, ArrayRef<int64_t> shape, int numWarps) {
 FailureOr<MfmaInsn> chooseMfmaInstruction(RankedTensorType cType,
                                           Type aElemType, Type bElemType,
                                           int inputKSize, int mfmaVersion,
-                                          int enforcedNonKDim) {
+                                          bool allowXF32, int enforcedNonKDim) {
   // number of matrix elements along k dim per one MFMA intruction
   unsigned kDim = 0;

@@ -128,8 +128,8 @@ FailureOr<MfmaInsn> chooseMfmaInstruction(RankedTensorType cType,
   if (mDim == 0 || nDim == 0)
     return failure();

-  auto maybeMfmaInsn =
-      MfmaInsn::selectMfma(mDim, nDim, aElemType, bElemType, mfmaVersion);
+  auto maybeMfmaInsn = MfmaInsn::selectMfma(mDim, nDim, aElemType, bElemType,
+                                            mfmaVersion, allowXF32);
   if (failed(maybeMfmaInsn))
     llvm::report_fatal_error("No match found in MFMA database\n");

@@ -146,19 +146,23 @@ FailureOr<MfmaInsn> chooseMfmaInstruction(RankedTensorType cType,
 FailureOr<MfmaInsn> chooseMfmaInstruction(tt::DotOp dot, int mfmaVersion,
                                           int nonKDim) {
   RankedTensorType aType = dot.getA().getType();
+  bool allowXF32 =
+      dot.getInputPrecision() == InputPrecision::TF32 && mfmaVersion == 3;
   return chooseMfmaInstruction(dot.getC().getType(), aType.getElementType(),
                                dot.getB().getType().getElementType(),
-                               aType.getShape().back(), mfmaVersion, nonKDim);
+                               aType.getShape().back(), mfmaVersion, allowXF32,
+                               nonKDim);
 }

 FailureOr<MfmaInsn> chooseMfmaInstruction(tt::DotScaledOp dot, int mfmaVersion,
                                           int nonKDim, bool useFp16) {
   // For scaled dot, we handle it with fp16 or bf16 emulation for now.
   Builder b(dot.getContext());
   Type elemType = useFp16 ? b.getF16Type() : b.getBF16Type();
-  return chooseMfmaInstruction(
-      dot.getC().getType(), /*aElemType=*/elemType, /*bElemType=*/elemType,
-      dot.getLhs().getType().getShape().back(), mfmaVersion, nonKDim);
+  return chooseMfmaInstruction(dot.getC().getType(), /*aElemType=*/elemType,
+                               /*bElemType=*/elemType,
+                               dot.getLhs().getType().getShape().back(),
+                               mfmaVersion, /*allowXF32=*/false, nonKDim);
 }

 using OperandTypesVector = SmallVector<Type, 4>;

third_party/amd/lib/TritonAMDGPUTransforms/MfmaGroup.cpp

Lines changed: 17 additions & 4 deletions
@@ -3,9 +3,13 @@
 namespace mlir {

 static MfmaTypeId chooseAppropriateMfmaId(mlir::Type dataTypeA,
-                                          mlir::Type dataTypeB) {
+                                          mlir::Type dataTypeB,
+                                          bool allowXF32) {
   if (dataTypeA.isF32() && dataTypeB.isF32()) {
-    return MfmaTypeId::Fp32TyId;
+    if (allowXF32)
+      return MfmaTypeId::Xf32TyId;
+    else
+      return MfmaTypeId::Fp32TyId;
   }
   if (dataTypeA.isF16() && dataTypeB.isF16()) {
     return MfmaTypeId::Fp16TyId;
@@ -39,6 +43,13 @@ using MfmaInsnGroupMap = llvm::DenseMap<MfmaInsnGroupSelectKey, MfmaInsnAttr,

 auto getMfmaInsnGroupAttrMap = []() -> const MfmaInsnGroupMap & {
   static MfmaInsnGroupMap MfmaInsnMap{
+      // xf32
+      // mfma.xf32.16x16x8xf32
+      {{16, 16, MfmaTypeId::Xf32TyId, 3},
+       {16, 16, 8, 2, ROCDL::mfma_f32_16x16x8_xf32::getOperationName()}},
+      // mfma.xf32.32x32x4.xf32
+      {{32, 32, MfmaTypeId::Xf32TyId, 3},
+       {32, 32, 4, 2, ROCDL::mfma_f32_32x32x4_xf32::getOperationName()}},
       // f32
       // mfma_f32_32x32x2f32
       {{32, 32, MfmaTypeId::Fp32TyId, 1},
@@ -219,6 +230,7 @@ std::pair<mlir::Type, mlir::Type> TypesFromMfmaId(mlir::MLIRContext *ctx,
   auto f32 = Float32Type::get(ctx);
   auto i8 = IntegerType::get(ctx, 8, IntegerType::Signed);
   switch (id) {
+  case MfmaTypeId::Xf32TyId:
   case MfmaTypeId::Fp32TyId:
     return {f32, f32};
   case MfmaTypeId::Fp16TyId:
@@ -242,9 +254,10 @@ std::pair<mlir::Type, mlir::Type> TypesFromMfmaId(mlir::MLIRContext *ctx,

 FailureOr<MfmaInsn> MfmaInsn::selectMfma(unsigned mDim, unsigned nDim,
                                          Type elementTypeA, Type elementTypeB,
-                                         int mfmaVersion) {
+                                         int mfmaVersion, bool allowXF32) {
   auto mfmaInsnAttrMap = getMfmaInsnGroupAttrMap();
-  MfmaTypeId mfmaId = chooseAppropriateMfmaId(elementTypeA, elementTypeB);
+  MfmaTypeId mfmaId =
+      chooseAppropriateMfmaId(elementTypeA, elementTypeB, allowXF32);
   MfmaInsnGroupSelectKey key = {mDim, nDim, mfmaId, mfmaVersion};
   auto it = mfmaInsnAttrMap.find(key);
   if (it == mfmaInsnAttrMap.end())
