Commit d664a09

[AMD] Support gfx950 double rate mfma ops (#5831)

This patch adds support for the new double-rate MFMA operations on gfx950:

- Double-rate ops are preferred on gfx950 when the input K size is large enough.
- The double-rate bf16 MFMA ops must keep their operands in bf16 rather than bitcasting them to i16, since the LLVM backend expects bf16 inputs.
- kpack is always 1 on gfx950.
1 parent 196a08f commit d664a09
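
For illustration only (not part of this diff): on a gfx950 device, a tl.dot whose operand tile matches the new instruction shapes, for example fp16/bf16 with a 16x16 output tile and K of at least 32, or a 32x32 tile and K of at least 16, is the kind of dot that can now be lowered to the double-rate MFMAs exercised by the lit tests below. The kernel and helper names in this sketch are hypothetical, and it assumes a ROCm PyTorch build where the GPU is exposed through the "cuda" device string.

# Hypothetical usage sketch (not from this commit). With fp16 operands, a
# 16x16 output tile, and K = 32, this dot is eligible for the double-rate
# rocdl.mfma.f32.16x16x32.f16 lowering added below.
import torch
import triton
import triton.language as tl


@triton.jit
def single_tile_dot(a_ptr, b_ptr, c_ptr,
                    BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr):
    offs_m = tl.arange(0, BLOCK_M)
    offs_n = tl.arange(0, BLOCK_N)
    offs_k = tl.arange(0, BLOCK_K)
    a = tl.load(a_ptr + offs_m[:, None] * BLOCK_K + offs_k[None, :])
    b = tl.load(b_ptr + offs_k[:, None] * BLOCK_N + offs_n[None, :])
    c = tl.dot(a, b)  # fp16 x fp16, accumulated in fp32
    tl.store(c_ptr + offs_m[:, None] * BLOCK_N + offs_n[None, :], c)


def run_single_tile(M=16, N=16, K=32):
    a = torch.randn((M, K), dtype=torch.float16, device="cuda")
    b = torch.randn((K, N), dtype=torch.float16, device="cuda")
    c = torch.empty((M, N), dtype=torch.float32, device="cuda")
    single_tile_dot[(1,)](a, b, c, BLOCK_M=M, BLOCK_N=N, BLOCK_K=K, num_warps=4)
    torch.testing.assert_close(c, a.float() @ b.float(), rtol=1e-2, atol=1e-2)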

9 files changed: +272, -116 lines

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 1 addition & 0 deletions
@@ -1409,6 +1409,7 @@ Attribute AMDMfmaEncodingAttr::parse(AsmParser &parser, Type type) {
       if (parseIntArrayAttr(parser, attr, instrShape, "instrShape").failed())
         return {};
     }
+
     if (attr.getName() == "isTransposed") {
       if (parseBool(parser, attr, isTransposed, "isTransposed").failed())
         return {};

python/test/unit/language/test_core.py

Lines changed: 11 additions & 0 deletions
@@ -3420,9 +3420,20 @@ def get_test_dot_small_mn_fma_cases():
             for in_dtype, out_dtype in [('float16', 'float16'), ('float32', 'float32')]]


+def get_test_dot_double_rate_cases():
+    if not is_hip_cdna():
+        return []
+    return [(32, 32, 16, 4, False, False, 'None', 'ieee', 'float16', 'float32', 1, None),
+            (32, 32, 16, 4, False, False, 'None', 'ieee', 'bfloat16', 'float32', 1, None),
+            (16, 16, 32, 4, False, False, 'None', 'ieee', 'float16', 'float32', 1, None),
+            (16, 16, 32, 4, False, False, 'None', 'ieee', 'bfloat16', 'float32', 1, None)]
+
+
 @pytest.mark.interpreter
 @pytest.mark.parametrize(
     "M, N, K, num_warps, col_a, col_b, epilogue, input_precision, in_dtype, out_dtype, kpack, mma_nonk_size",
+    get_test_dot_double_rate_cases() + \
+    get_test_dot_base_cases() + \
     get_test_dot_base_cases() + \
     get_test_dot_mixed_sizes_cases() + \
     get_test_dot_transposed_op_base_cases() + \

python/test/unit/language/test_matmul.py

Lines changed: 3 additions & 3 deletions
@@ -6,7 +6,7 @@
 import triton.tools.experimental_descriptor
 from test_mxfp import MXFP4Tensor, MXScaleTensor
 import re
-from triton._internal_testing import is_cuda, is_hip, is_hip_mi200, is_hip_mi350, is_hip_cdna
+from triton._internal_testing import is_cuda, is_hip, is_hip_mi300, is_hip_mi350, is_hip_cdna


 def f8_to_f16(x, dtype):
@@ -84,8 +84,8 @@ def test_simple_matmul(dtype_src_str, dtype_dst_str, BLOCK_M, BLOCK_N, BLOCK_K,
     if is_hip() and ((BLOCK_K * BLOCK_M + BLOCK_K * BLOCK_N) * NUM_STAGES * get_src_element_ty_size(dtype_src_str)
                      > 65536):
         pytest.skip("HIP path requires less than 64KB of shared memory")
-    if is_hip_mi200() and dtype_src_str == "tensorfloat32":
-        pytest.skip("HIP MI200 does not support tensorfloat32")
+    if is_hip() and (not is_hip_mi300()) and dtype_src_str == "tensorfloat32":
+        pytest.skip("tensorfloat32 is only supported on HIP MI300")
     if dtype_src_str == "float8e5" and BLOCK_K == 16:
         pytest.skip("Skipping cases small K for float8")
     if dtype_src_str == "float8e5" and device == "cuda" and torch.cuda.get_device_capability()[0] < 9:

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
+// RUN: triton-opt %s -split-input-file --convert-triton-amdgpu-to-llvm='arch=gfx950' | FileCheck %s
+
+// CHECK-LABEL: mfma_16x16x32_f16
+
+#mma = #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16], isTransposed = false}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @mfma_16x16x32_f16(%arg0: tensor<16x32xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>,
+                                    %arg1: tensor<32x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>) {
+    %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma>
+    // CHECK: rocdl.mfma.f32.16x16x32.f16 {{.*}} : (vector<8xf16>, vector<8xf16>
+    %dot = tt.dot %arg0, %arg1, %cst : tensor<16x32xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> * tensor<32x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<16x16xf32, #mma>
+    tt.return
+  }
+}
+
+// -----
+
+// CHECK-LABEL: mfma_16x16x32_bf16
+
+#mma = #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16], isTransposed = false}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @mfma_16x16x32_bf16(%arg0: tensor<16x32xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>,
+                                     %arg1: tensor<32x16xbf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>) {
+    %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma>
+    // CHECK: rocdl.mfma.f32.16x16x32.bf16 {{.*}} : (vector<8xbf16>, vector<8xbf16>
+    %dot = tt.dot %arg0, %arg1, %cst : tensor<16x32xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> * tensor<32x16xbf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<16x16xf32, #mma>
+    tt.return
+  }
+}
+
+// -----
+
+// CHECK-LABEL: mfma_32x32x16_f16
+
+#mma = #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [32, 32], isTransposed = false}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @mfma_32x32x16_f16(%arg0: tensor<32x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>,
+                                    %arg1: tensor<16x32xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>) {
+    %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma>
+    // CHECK: rocdl.mfma.f32.32x32x16.f16 {{.*}} : (vector<8xf16>, vector<8xf16>
+    %dot = tt.dot %arg0, %arg1, %cst : tensor<32x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> * tensor<16x32xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<32x32xf32, #mma>
+    tt.return
+  }
+}
+
+
+// -----
+
+// CHECK-LABEL: mfma_32x32x16_bf16
+
+#mma = #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [32, 32], isTransposed = false}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @mfma_32x32x16_bf16(%arg0: tensor<32x16xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>,
+                                     %arg1: tensor<16x32xbf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>) {
+    %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma>
+    // CHECK: rocdl.mfma.f32.32x32x16.bf16 {{.*}} : (vector<8xbf16>, vector<8xbf16>
+    %dot = tt.dot %arg0, %arg1, %cst : tensor<32x16xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> * tensor<16x32xbf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<32x32xf32, #mma>
+    tt.return
+  }
+}

third_party/amd/backend/compiler.py

Lines changed: 3 additions & 0 deletions
@@ -65,6 +65,9 @@ def __post_init__(self):
         # Ignore user-defined warp size for gfx9
         warp_size = 32 if 'gfx10' in self.arch or 'gfx11' in self.arch or 'gfx12' in self.arch else 64
         object.__setattr__(self, 'warp_size', warp_size)
+        # Only kpack=1 is supported on gfx950
+        kpack = 1 if self.arch == 'gfx950' else self.kpack
+        object.__setattr__(self, 'kpack', kpack)
         libs = ["ocml", "ockl"]
         for lib in libs:
             extern_libs[lib] = str(default_libdir / f'{lib}.bc')
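
Side note on the pattern above: the use of object.__setattr__ suggests the options object is a frozen dataclass, so the gfx950 kpack override cannot be a plain field assignment. A minimal standalone sketch of that override pattern, with hypothetical class name and field defaults:

# Minimal sketch (hypothetical names): a frozen dataclass cannot assign fields
# directly in __post_init__, so the gfx950 kpack override goes through
# object.__setattr__, silently replacing whatever the user requested.
from dataclasses import dataclass


@dataclass(frozen=True)
class ExampleOptions:
    arch: str
    kpack: int = 2

    def __post_init__(self):
        # Only kpack=1 is supported on gfx950.
        kpack = 1 if self.arch == "gfx950" else self.kpack
        object.__setattr__(self, "kpack", kpack)


assert ExampleOptions("gfx950", kpack=2).kpack == 1
assert ExampleOptions("gfx942", kpack=2).kpack == 2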

third_party/amd/include/TritonAMDGPUTransforms/MfmaGroup.h

Lines changed: 9 additions & 6 deletions
@@ -25,7 +25,7 @@ enum class MfmaTypeId : uint32_t {
 };

 struct MfmaInsnGroupSelectKey {
-  unsigned mDim, nDim;
+  unsigned mDim, nDim, kDim;
   MfmaTypeId elemType;
   int mfmaVersion;
 };
@@ -51,21 +51,23 @@ constexpr typename std::underlying_type<T>::type cast_as_underlying(T t) {
 struct MfmaInsnGroupSelectKeyInfo
     : public llvm::DenseMapInfo<MfmaInsnGroupSelectKey> {
   static inline MfmaInsnGroupSelectKey getEmptyKey() {
-    return {32, 32, MfmaTypeId::Fp32TyId, 0};
+    return {32, 32, 0, MfmaTypeId::Fp32TyId, 0};
   }

   static inline MfmaInsnGroupSelectKey getTombstoneKey() {
-    return {32, 32, MfmaTypeId::Fp32TyId, -1};
+    return {32, 32, 0, MfmaTypeId::Fp32TyId, -1};
   }

   static inline bool isEqual(const MfmaInsnGroupSelectKey &lhs,
                              const MfmaInsnGroupSelectKey &rhs) {
     return lhs.mDim == rhs.mDim && lhs.nDim == rhs.nDim &&
-           lhs.elemType == rhs.elemType && lhs.mfmaVersion == rhs.mfmaVersion;
+           lhs.kDim == rhs.kDim && lhs.elemType == rhs.elemType &&
+           lhs.mfmaVersion == rhs.mfmaVersion;
   }

   static unsigned getHashValue(const MfmaInsnGroupSelectKey &key) {
     auto dimHash = llvm::detail::combineHashValue(key.mDim, key.nDim);
+    dimHash = llvm::detail::combineHashValue(dimHash, key.kDim);
     auto verHash = llvm::detail::combineHashValue(dimHash, key.mfmaVersion);
     auto elemHash = cast_as_underlying(key.elemType);
     return llvm::detail::combineHashValue(elemHash, verHash);
@@ -80,8 +82,9 @@ class MfmaInsn {

 public:
   static FailureOr<MfmaInsn> selectMfma(unsigned mDim, unsigned nDim,
-                                        Type elementTypeA, Type elementTypeB,
-                                        int mfmaVersion, bool allowXF32);
+                                        unsigned kDim, Type elementTypeA,
+                                        Type elementTypeB, int mfmaVersion,
+                                        bool allowXF32);
   MfmaInsn(Type elementTypeA, Type elementTypeB, const MfmaInsnAttr &attr);
   unsigned getKDim();
   unsigned getMDim();
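
Conceptually, folding kDim into the select key lets two instructions with the same M/N tile but different K coexist in the MFMA database. A rough Python analogue of the lookup (the table entries are illustrative examples taken from the lit tests above, not the real MfmaInsn database):

# Illustrative lookup keyed the same way as MfmaInsnGroupSelectKey after this
# change: (mDim, nDim, kDim, elemType, mfmaVersion).
MFMA_TABLE = {
    (16, 16, 32, "f16", 4): "rocdl.mfma.f32.16x16x32.f16",    # gfx950 double rate
    (16, 16, 32, "bf16", 4): "rocdl.mfma.f32.16x16x32.bf16",  # gfx950 double rate
    (32, 32, 16, "f16", 4): "rocdl.mfma.f32.32x32x16.f16",    # gfx950 double rate
    (32, 32, 16, "bf16", 4): "rocdl.mfma.f32.32x32x16.bf16",  # gfx950 double rate
}


def select_mfma(m_dim, n_dim, k_dim, elem_type, mfma_version):
    # Without kDim in the key, one (mDim, nDim, elemType, version) tuple could
    # only name a single instruction; with it, single-rate and double-rate
    # variants of the same tile can live side by side.
    return MFMA_TABLE.get((m_dim, n_dim, k_dim, elem_type, mfma_version))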

third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/MFMA.cpp

Lines changed: 26 additions & 21 deletions
@@ -271,11 +271,13 @@
     auto elemTyA = aTensorTy.getElementType();
     auto elemTyB = bTensorTy.getElementType();

+    const auto kDimOperandSize = aTensorTy.getShape().back();
+
     bool allowXF32 =
         op.getInputPrecision() == InputPrecision::TF32 && mfmaVersion == 3;
     StringRef mfmaInsnName;
-    auto maybeMfmaInsn = MfmaInsn::selectMfma(mDim, nDim, elemTyA, elemTyB,
-                                              mfmaVersion, allowXF32);
+    auto maybeMfmaInsn = MfmaInsn::selectMfma(
+        mDim, nDim, kDimOperandSize, elemTyA, elemTyB, mfmaVersion, allowXF32);
     if (failed(maybeMfmaInsn))
       llvm::report_fatal_error("No match found in MFMA database\n");

@@ -290,8 +292,6 @@
     if (aTensorTy.getElementType().isF32() && allowXF32)
       kWidth *= 2;

-    auto rank = aTensorTy.getShape().size();
-    const auto kDimOperandSize = aTensorTy.getShape()[rank - 1];
     const auto kDimInstrSize = mfmaLayout.getInstrShapeForOperand(kWidth, 0)[1];

     auto repA = mfmaLayout.getRepForOperand(aTensorTy.getShape(), kWidth, 0);
@@ -309,12 +309,13 @@
     auto numRepB = repA[0];
     assert(repA[0] == repB[0]);

+    bool preserveBF16 = mfmaInsnName.contains(".bf16") && mfmaVersion >= 4;
     auto operandA = getValuesFromDotOperandLayoutStruct(
         loadedA, numRepB, numRepM, numRepK, kWidth, kBase,
-        aTensorTy.getElementType(), allowXF32);
+        aTensorTy.getElementType(), allowXF32, preserveBF16);
     auto operandB = getValuesFromDotOperandLayoutStruct(
         loadedB, numRepB, numRepN, numRepK, kWidth, kBase,
-        aTensorTy.getElementType(), allowXF32);
+        aTensorTy.getElementType(), allowXF32, preserveBF16);

     auto dstElemTy = dTensorTy.getElementType();
     auto fc = unpackLLElements(loc, loadedC, rewriter);
@@ -379,19 +380,19 @@
   /// rawElems is a vector of kWidth elements. We need to prepare vector(s) of
   /// kBase elements for each mfma instruction
   SmallVector<Value> extractOperands(Value rawElems, int kWidth, int kBase,
-                                     Type type) const {
+                                     Type type, bool preserveBF16) const {
     auto b = TritonLLVMOpBuilder(loc, rewriter);
     int kpack = kWidth / kBase;
     SmallVector<Value> results;
     auto vecTy = vec_ty(type, kBase);
-    if (type.isBF16())
+    if (type.isBF16() && !preserveBF16)
      vecTy = vec_ty(i16_ty, kBase);
     for (int k = 0; k < kpack; ++k) {
       Value vec = b.undef(vecTy);
       for (int elemId = 0; elemId < kBase; ++elemId) {
         auto val =
             b.extract_element(type, rawElems, b.i32_val(elemId + k * kBase));
-        if (type.isBF16()) {
+        if (type.isBF16() && !preserveBF16) {
           // rocdl.mfma.f32.32x32x8bf16.1k calls for input of i16 type
           auto cast = b.bitcast(val, i16_ty);
           vec = b.insert_element(vecTy, vec, cast, b.i32_val(elemId));
@@ -423,7 +424,7 @@
   virtual SmallVector<ValueTable>
   getValuesFromDotOperandLayoutStruct(Value value, int batch, int n0, int n1,
                                       int kWidth, int kBase, Type type,
-                                      bool allowXF32) const {
+                                      bool allowXF32, bool preserveBF16) const {
     auto tb = TritonLLVMOpBuilder(loc, rewriter);
     auto elems = unpackLLElements(loc, value, rewriter);
     int kpack = kWidth / kBase;
@@ -449,14 +450,18 @@
       } else {
         SmallVector<Value> vals;
         if (type.isF32() && allowXF32) {
-          vals = extractOperands(rawElems, kWidth, kBase, f32_ty);
+          vals = extractOperands(rawElems, kWidth, kBase, f32_ty,
+                                 preserveBF16);
         } else if (type.getIntOrFloatBitWidth() == 8) {
-          vals = extractOperands(rawElems, kWidth, kBase, i8_ty);
+          vals =
+              extractOperands(rawElems, kWidth, kBase, i8_ty, preserveBF16);
         } else if (type.isBF16()) {
-          vals = extractOperands(rawElems, kWidth, kBase, bf16_ty);
+          vals = extractOperands(rawElems, kWidth, kBase, bf16_ty,
+                                 preserveBF16);
         } else {
           assert(type.isF16() && "Unsupported data type");
-          vals = extractOperands(rawElems, kWidth, kBase, f16_ty);
+          vals = extractOperands(rawElems, kWidth, kBase, f16_ty,
+                                 preserveBF16);
         }
         for (int k = 0; k < kpack; ++k) {
           dotOpVals[k][{b, i, j}] = vals[k];
@@ -518,6 +523,8 @@ struct ScaledDotOpMFMAConversionHelper : DotOpMFMAConversionHelper {
     ScaleDotElemType aElemType = op.getLhsType();
     ScaleDotElemType bElemType = op.getRhsType();

+    const auto kDimOperandSize = aTensorTy.getShape().back();
+
     auto supportsTypes = [](ScaleDotElemType elemType) {
       return elemType == ScaleDotElemType::E2M1;
     };
@@ -529,7 +536,7 @@ struct ScaledDotOpMFMAConversionHelper : DotOpMFMAConversionHelper {
     auto ctx = op.getContext();
     constexpr bool allowXF32 = false;
     auto maybeMfmaInsn = MfmaInsn::selectMfma(
-        mDim, nDim, scaleDotElemTypeToMLIRType(ctx, aElemType),
+        mDim, nDim, kDimOperandSize, scaleDotElemTypeToMLIRType(ctx, aElemType),
         scaleDotElemTypeToMLIRType(ctx, bElemType), mfmaVersion, allowXF32);
     if (failed(maybeMfmaInsn))
       llvm::report_fatal_error("No match found in MFMA database\n");
@@ -544,8 +551,6 @@ struct ScaledDotOpMFMAConversionHelper : DotOpMFMAConversionHelper {
     auto aEncoding = cast<DotOperandEncodingAttr>(aTensorTy.getEncoding());
     auto bEncoding = cast<DotOperandEncodingAttr>(bTensorTy.getEncoding());
     int kWidth = aEncoding.getKWidth();
-    auto rank = aTensorTy.getShape().size();
-    const auto kDimOperandSize = aTensorTy.getShape()[rank - 1];
     const auto kDimInstrSize = mfmaLayout.getInstrShapeForOperand(kWidth, 0)[1];

     auto repA = mfmaLayout.getRepForOperand(aTensorTy.getShape(), kWidth, 0);
@@ -575,19 +580,19 @@ struct ScaledDotOpMFMAConversionHelper : DotOpMFMAConversionHelper {

     auto operandA = getValuesFromDotOperandLayoutStruct(
         loadedA, numRepB, numRepM, numRepK, kWidth, kBase,
-        aTensorTy.getElementType(), allowXF32);
+        aTensorTy.getElementType(), allowXF32, /*preserveBF16=*/false);
     auto operandB = getValuesFromDotOperandLayoutStruct(
         loadedB, numRepB, numRepN, numRepK, kWidth, kBase,
-        bTensorTy.getElementType(), allowXF32);
+        bTensorTy.getElementType(), allowXF32, /*preserveBF16=*/false);

     // Scales have the same replica distributions as their corresponding
     // operands.
     auto operandAScale = getValuesFromDotOperandLayoutStruct(
         loadedAScale, numRepB, numRepM, numRepK, scaleKWidth, scaleKBase,
-        aScaleTensorTy.getElementType(), allowXF32);
+        aScaleTensorTy.getElementType(), allowXF32, /*preserveBF16=*/false);
     auto operandBScale = getValuesFromDotOperandLayoutStruct(
         loadedBScale, numRepB, numRepN, numRepK, scaleKWidth, scaleKBase,
-        bScaleTensorTy.getElementType(), allowXF32);
+        bScaleTensorTy.getElementType(), allowXF32, /*preserveBF16=*/false);

     auto dstElemTy = dTensorTy.getElementType();
     auto fc = unpackLLElements(loc, loadedC, rewriter);
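
In short, the bf16 handling above now depends on which intrinsic was selected: the older bf16 intrinsics (e.g. rocdl.mfma.f32.32x32x8bf16.1k) take i16 vectors, while the gfx950 double-rate bf16 intrinsics take real bf16 vectors. A small Python restatement of that decision (the function name is illustrative, the logic mirrors the C++ above):

def operand_vector_elem_type(elem_type: str, mfma_insn_name: str, mfma_version: int) -> str:
    """Element type of the packed operand vector fed to the MFMA intrinsic."""
    preserve_bf16 = ".bf16" in mfma_insn_name and mfma_version >= 4
    if elem_type == "bf16" and not preserve_bf16:
        # Pre-gfx950 bf16 intrinsics expect i16 lanes, so each bf16 lane is bitcast.
        return "i16"
    # gfx950 double-rate bf16 ops keep bf16 lanes, as the LLVM backend expects.
    return elem_type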

third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp

Lines changed: 2 additions & 3 deletions
@@ -136,8 +136,8 @@ FailureOr<MfmaInsn> chooseMfmaInstruction(RankedTensorType cType,
   if (mDim == 0 || nDim == 0)
     return failure();

-  auto maybeMfmaInsn = MfmaInsn::selectMfma(mDim, nDim, aElemType, bElemType,
-                                            mfmaVersion, allowXF32);
+  auto maybeMfmaInsn = MfmaInsn::selectMfma(mDim, nDim, inputKSize, aElemType,
+                                            bElemType, mfmaVersion, allowXF32);
   if (failed(maybeMfmaInsn))
     llvm::report_fatal_error("No match found in MFMA database\n");

@@ -511,7 +511,6 @@ class BlockedToMFMA : public OpRewritePattern<tt::DotOp> {
     Value dotOutput =
         convertAndCastTensor(rewriter, newDot, oldRetType.getEncoding(),
                              oldRetType.getElementType());
-
     rewriter.replaceOp(dotOp, dotOutput);

     return success();
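
The net effect of threading inputKSize through chooseMfmaInstruction is the preference stated in the commit message: when the dot's K dimension is at least as large as the double-rate instruction's K, gfx950 can pick the wider op, otherwise it falls back to the narrower shape. A rough sketch of that preference for 16-bit float operands; the double-rate K values come from the new lit tests, the single-rate K values are the pre-existing 16x16x16 / 32x32x8 shapes, and the exact selection policy should be treated as an assumption:

def preferred_instr_k(m_dim: int, n_dim: int, input_k: int, mfma_version: int) -> int:
    """Sketch of double- vs single-rate K selection for f16/bf16 MFMAs."""
    single_rate_k = {(16, 16): 16, (32, 32): 8}[(m_dim, n_dim)]
    double_rate_k = {(16, 16): 32, (32, 32): 16}[(m_dim, n_dim)]
    # gfx950 (mfmaVersion 4): prefer the double-rate op when K is large enough.
    if mfma_version >= 4 and input_k >= double_rate_k:
        return double_rate_k
    return single_rate_k


assert preferred_instr_k(16, 16, 32, 4) == 32   # double rate on gfx950
assert preferred_instr_k(16, 16, 16, 4) == 16   # K too small: single rate
assert preferred_instr_k(32, 32, 64, 3) == 8    # pre-gfx950: single rate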
