
Commit 14d7bcc

[AMD] Rework MFMA intrinsic mapping queries (#5937)
This commit reworks how we encode MFMA intrinsics and how we query them. The map is now keyed on (version, mDim, nDim, kDim, aElemType, bElemType), and the value is a vector containing only (symbol, kDim, kBase) tuples. This lets us drop the dummy kDim of 0 in the key for older generations and avoids duplicating data in the map. Along the way, it fixes the fp8 types for gfx942 and gfx950: gfx942 uses the AMD variants, while gfx950 uses the OCP ones.
1 parent: 4f30282
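
Sketched in isolation, the new encoding looks roughly like the following. This is a minimal sketch assuming string-valued element types and placeholder symbol names; the actual table keys on mlir::Type and real ROCDL intrinsic names (see MfmaGroup.h below).

#include <map>
#include <string>
#include <tuple>
#include <vector>

// Key: (version, mDim, nDim, kDim, aElemType, bElemType). Element types are
// strings here purely for illustration; the real code keys on mlir::Type.
struct MfmaKey {
  int version;
  unsigned mDim, nDim, kDim;
  std::string aElemType, bElemType;

  bool operator<(const MfmaKey &o) const {
    return std::tie(version, mDim, nDim, kDim, aElemType, bElemType) <
           std::tie(o.version, o.mDim, o.nDim, o.kDim, o.aElemType,
                    o.bElemType);
  }
};

// Value: only (symbol, kDim, kBase) tuples. Listing the per-variant kDim and
// kBase in the value is what removes the duplicated rows and the dummy
// kDim-of-0 keys the old scheme needed for older generations.
struct MfmaVariant {
  std::string symbol; // placeholder name, not a real ROCDL intrinsic symbol
  unsigned kDim;
  unsigned kBase;
};

using MfmaTable = std::map<MfmaKey, std::vector<MfmaVariant>>;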

6 files changed (+392, -527 lines)

python/test/unit/language/test_core.py

Lines changed: 4 additions & 3 deletions
@@ -6328,7 +6328,8 @@ def matmul_kernel( #
 @pytest.mark.interpreter
 @pytest.mark.parametrize("M, N, K", [(128, 256, 256)])
 @pytest.mark.parametrize("BLOCK_M, BLOCK_N, BLOCK_K", [(128, 256, 128), (64, 64, 64)])
-@pytest.mark.parametrize("in_type_str", ['float8e5', 'float8e4nv', 'float8e4b15'])
+@pytest.mark.parametrize(
+    "in_type_str", ['float8e5', 'float8e5b16', 'float8e4b8'] if is_hip() else ['float8e5', 'float8e4nv', 'float8e4b15'])
 @pytest.mark.parametrize("low_precision_acc", [0, 32, 64, 128])
 def test_dot_max_num_imprecise_acc(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, in_type_str, low_precision_acc, device):
     num_stages = 3
@@ -6338,8 +6339,8 @@ def test_dot_max_num_imprecise_acc(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, in_type_s
         pytest.skip("Dot op does not support fp8e4b15 on CUDA arch >= 90")
     elif is_hip():
         num_stages = 2
-        if in_type_str != 'float8e5':
-            pytest.skip('test_fp8_dot_acc for HIP currently broken in upstream.')
+        if in_type_str in ("float8e5b16", "float8e4b8") and not is_hip_mi300():
+            pytest.skip(f"{in_type_str} only supported on mi300")
 
     check_type_supported(in_type_str, device)
     A = numpy_random((M, K), dtype_str=in_type_str)

test/TritonGPU/amd/accelerate-amd-matmul-mfma.mlir

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
   tt.func public @mfma_dot_fp8e5m2(
       %arg0: tensor<128x64xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>,
       %arg1: tensor<64x256xf8E5M2, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>,
-      %arg2: tensor<128x256x!tt.ptr<f32>, #blocked> ) {
+      %arg2: tensor<128x256x!tt.ptr<f32>, #blocked>) {
     %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #blocked>
     // CHECK: %[[A0:.+]] = ttg.convert_layout %arg0 : {{.*}} -> tensor<128x64xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>
     // CHECK: %[[A1:.+]] = tt.fp_to_fp %[[A0]] : {{.*}} -> tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>

third_party/amd/include/TritonAMDGPUTransforms/MfmaGroup.h

Lines changed: 29 additions & 87 deletions
@@ -1,98 +1,40 @@
 #ifndef TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTRANSFORMS_MFMAGROUP_H_
 #define TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTRANSFORMS_MFMAGROUP_H_
 
-#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallString.h"
+#include "mlir/IR/Types.h"
+#include "llvm/ADT/StringRef.h"
 
 namespace mlir {
 
-//===----------------------------------------------------------------------===//
-// AMDGPU MFMA instruction selection utilities
-//===----------------------------------------------------------------------===//
-
-enum class MfmaTypeId : uint32_t {
-  Fp32TyId = 0,
-  Xf32TyId,
-  Fp16TyId,
-  Bf16TyId,
-  I8TyId,
-  Fp8Fp8TyId,
-  Fp8Bf8TyId,
-  Bf8Fp8TyId,
-  Bf8Bf8TyId,
-  F8F6F4TyId,
-};
-
-struct MfmaInsnGroupSelectKey {
-  unsigned mDim, nDim, kDim;
-  MfmaTypeId elemType;
-  int mfmaVersion;
-};
-
-struct MfmaInsnAttr {
-  // m,n,k refer to the shapes of the two operands of mfma instructions.
-  // Operand A has shape m x k. Operand B has shape k x n.
-  // For mfma32 and mfma16 instructions, they are the same as
-  // the dims in the instruction name, i.e. mfma_DType_mxnxkxABType
-  unsigned m;
-  unsigned n;
-  unsigned k;
-  // kBase refers to the number of elements per thread
+struct MfmaIntrinsic {
+  // Chooses a suitable mfma instrinsic for the given input case.
+  static FailureOr<MfmaIntrinsic> selectFor(int version, unsigned mDim,
+                                            unsigned nDim, unsigned inputKDim,
+                                            Type aElemType, Type bElemType,
+                                            bool withScale, bool useTF32);
+
+  MfmaIntrinsic(StringRef symbol, unsigned m, unsigned n, unsigned k,
+                unsigned kB, Type aET, Type bET)
+      : name(symbol), mDim(m), nDim(n), kDim(k), kBase(kB), aElementType(aET),
+        bElementType(bET) {}
+  MfmaIntrinsic(const MfmaIntrinsic &other) = default;
+  MfmaIntrinsic(MfmaIntrinsic &&other) = default;
+
+  llvm::StringRef name;
+
+  // m, n, and k refer to the shapes of the two operands of an mfma intrinsic:
+  // Operand A has shape [m]x[k]; operand B has shape [k]x[n].
+  // For mfma32 and mfma16 intrinsics, they are encoded in the instruction
+  // name, i.e. mfma_DType_[m]x[n]x[k]xABType.
+  unsigned mDim;
+  unsigned nDim;
+  unsigned kDim;
+
+  // kBase is the number of elements each thread holds.
   unsigned kBase;
-  llvm::StringRef insn;
-};
-
-template <typename T>
-constexpr typename std::underlying_type<T>::type cast_as_underlying(T t) {
-  return static_cast<typename std::underlying_type<T>::type>(t);
-}
-
-struct MfmaInsnGroupSelectKeyInfo
-    : public llvm::DenseMapInfo<MfmaInsnGroupSelectKey> {
-  static inline MfmaInsnGroupSelectKey getEmptyKey() {
-    return {32, 32, 0, MfmaTypeId::Fp32TyId, 0};
-  }
-
-  static inline MfmaInsnGroupSelectKey getTombstoneKey() {
-    return {32, 32, 0, MfmaTypeId::Fp32TyId, -1};
-  }
-
-  static inline bool isEqual(const MfmaInsnGroupSelectKey &lhs,
-                             const MfmaInsnGroupSelectKey &rhs) {
-    return lhs.mDim == rhs.mDim && lhs.nDim == rhs.nDim &&
-           lhs.kDim == rhs.kDim && lhs.elemType == rhs.elemType &&
-           lhs.mfmaVersion == rhs.mfmaVersion;
-  }
-
-  static unsigned getHashValue(const MfmaInsnGroupSelectKey &key) {
-    auto dimHash = llvm::detail::combineHashValue(key.mDim, key.nDim);
-    dimHash = llvm::detail::combineHashValue(dimHash, key.kDim);
-    auto verHash = llvm::detail::combineHashValue(dimHash, key.mfmaVersion);
-    auto elemHash = cast_as_underlying(key.elemType);
-    return llvm::detail::combineHashValue(elemHash, verHash);
-  }
-};
-
-class MfmaInsn {
-private:
-  Type elementTypeA;
-  Type elementTypeB;
-  MfmaInsnAttr attr;
 
-public:
-  static FailureOr<MfmaInsn> selectMfma(unsigned mDim, unsigned nDim,
-                                        unsigned kDim, Type elementTypeA,
-                                        Type elementTypeB, int mfmaVersion,
-                                        bool allowXF32);
-  MfmaInsn(Type elementTypeA, Type elementTypeB, const MfmaInsnAttr &attr);
-  unsigned getKDim();
-  unsigned getMDim();
-  unsigned getNDim();
-  StringRef getInsnName();
-  unsigned getKBase();
-  Type getElementTypeA();
-  Type getElementTypeB();
+
+  Type aElementType;
+  Type bElementType;
 };
 } // namespace mlir
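
For orientation, here is a minimal sketch of querying the new interface. The version number, tile shape, and element types are illustrative assumptions, and ctx is assumed to be an in-scope mlir::MLIRContext; compare the real call sites in the MFMA.cpp diff below.

// Hypothetical query: f16 operands, a 32x32 tile, and an input kDim of 64,
// on an assumed mfmaVersion of 3. All values here are illustrative.
Type f16Ty = Float16Type::get(&ctx);
FailureOr<MfmaIntrinsic> intrinsic = MfmaIntrinsic::selectFor(
    /*version=*/3, /*mDim=*/32, /*nDim=*/32, /*inputKDim=*/64,
    /*aElemType=*/f16Ty, /*bElemType=*/f16Ty,
    /*withScale=*/false, /*useTF32=*/false);
if (succeeded(intrinsic))
  llvm::errs() << intrinsic->name << ": kDim=" << intrinsic->kDim
               << ", kBase=" << intrinsic->kBase << "\n";

If no entry matches, selectFor returns failure, which the lowering below turns into a fatal "No match found in MFMA database" error.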

third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/MFMA.cpp

Lines changed: 41 additions & 35 deletions
@@ -37,7 +37,6 @@ using ::mlir::LLVM::AMD::shuffleXor;
 using ::mlir::triton::gpu::AMDMfmaEncodingAttr;
 using ::mlir::triton::gpu::DotOperandEncodingAttr;
 using ::mlir::triton::gpu::LinearEncodingAttr;
-using ::mlir::triton::gpu::SwizzledSharedEncodingAttr;
 
 using ValueTable = std::map<std::array<int, 3>, Value>;
 
@@ -75,12 +74,12 @@ struct DotOpMFMAConversionHelper {
       : mfmaLayout(mfmaLayout), rewriter(rewriter),
         typeConverter(typeConverter), loc(loc), ctx(mfmaLayout.getContext()) {}
 
-  Value generateMFMAOp(StringRef mfmaInsnName, Value valA, Value valB,
+  Value generateMFMAOp(StringRef intrinsicName, Value valA, Value valB,
                        Value valC) const {
     auto b = TritonLLVMOpBuilder(loc, rewriter);
     auto resType = valC.getType();
     Value zeroFlag = b.i32_val(0);
-    OperationState loweredOp(loc, mfmaInsnName);
+    OperationState loweredOp(loc, intrinsicName);
     loweredOp.addTypes(resType);
     loweredOp.addOperands({valA, valB, valC, zeroFlag, zeroFlag, zeroFlag});
     return rewriter.create(loweredOp)->getResult(0);
@@ -228,14 +227,15 @@ struct DotOpMFMAConversionHelper {
 
   template <typename T>
   void packAndReplaceResult(T &op, SmallVector<Value> &fc,
-                            FailureOr<MfmaInsn> maybeMfmaInsn, Type dstElemTy,
-                            Type elemtTy, size_t mmaCount) const {
+                            const FailureOr<MfmaIntrinsic> &maybeMfmaIntrinsic,
+                            Type dstElemTy, Type elemtTy,
+                            size_t mmaCount) const {
     Type structTy = LLVM::LLVMStructType::getLiteral(
        ctx, SmallVector<Type>(fc.size(), dstElemTy));
     Value res = packLLElements(loc, typeConverter, fc, rewriter, structTy);
 
-    setNumGeneratedMMAs(op, mmaCount, maybeMfmaInsn->getMDim(),
-                        maybeMfmaInsn->getNDim(), maybeMfmaInsn->getKDim(),
+    setNumGeneratedMMAs(op, mmaCount, maybeMfmaIntrinsic->mDim,
+                        maybeMfmaIntrinsic->nDim, maybeMfmaIntrinsic->kDim,
                         elemtTy);
 
     rewriter.replaceOp(op, res);
@@ -267,14 +267,15 @@ struct DotOpMFMAConversionHelper {
 
     bool allowXF32 =
         op.getInputPrecision() == InputPrecision::TF32 && mfmaVersion == 3;
-    StringRef mfmaInsnName;
-    auto maybeMfmaInsn = MfmaInsn::selectMfma(
-        mDim, nDim, kDimOperandSize, elemTyA, elemTyB, mfmaVersion, allowXF32);
-    if (failed(maybeMfmaInsn))
+    StringRef intrinsicName;
+    FailureOr<MfmaIntrinsic> maybeMfmaIntrinsic = MfmaIntrinsic::selectFor(
+        mfmaVersion, mDim, nDim, kDimOperandSize, elemTyA, elemTyB,
+        /*withScale=*/false, allowXF32);
+    if (failed(maybeMfmaIntrinsic))
       llvm::report_fatal_error("No match found in MFMA database\n");
 
-    mfmaInsnName = maybeMfmaInsn->getInsnName();
-    unsigned kBase = maybeMfmaInsn->getKBase();
+    intrinsicName = maybeMfmaIntrinsic->name;
+    unsigned kBase = maybeMfmaIntrinsic->kBase;
 
     auto aEncoding = cast<DotOperandEncodingAttr>(aTensorTy.getEncoding());
     auto bEncoding = cast<DotOperandEncodingAttr>(bTensorTy.getEncoding());
@@ -301,7 +302,7 @@ struct DotOpMFMAConversionHelper {
     auto numRepB = repA[0];
     assert(repA[0] == repB[0]);
 
-    bool preserveBF16 = mfmaInsnName.contains(".bf16") && mfmaVersion >= 4;
+    bool preserveBF16 = intrinsicName.contains(".bf16") && mfmaVersion >= 4;
     auto operandA = getValuesFromDotOperandLayoutStruct(
         loadedA, numRepB, numRepM, numRepK, kWidth, kBase,
         aTensorTy.getElementType(), allowXF32, preserveBF16);
@@ -335,12 +336,13 @@ struct DotOpMFMAConversionHelper {
         acc = zeroAuxiliarBlocks(subBlocks, acc);
         for (int k = 0; k < numRepK; k++) {
           for (int kPack = 0; kPack < kWidth / kBase; ++kPack) {
-            acc =
-                mfmaLayout.getIsTransposed()
-                    ? generateMFMAOp(mfmaInsnName, operandB[kPack][{b, n, k}],
-                                     operandA[kPack][{b, m, k}], acc)
-                    : generateMFMAOp(mfmaInsnName, operandA[kPack][{b, m, k}],
-                                     operandB[kPack][{b, n, k}], acc);
+            acc = mfmaLayout.getIsTransposed()
+                      ? generateMFMAOp(intrinsicName,
+                                       operandB[kPack][{b, n, k}],
+                                       operandA[kPack][{b, m, k}], acc)
+                      : generateMFMAOp(intrinsicName,
+                                       operandA[kPack][{b, m, k}],
+                                       operandB[kPack][{b, n, k}], acc);
             if (!firstMfma)
               firstMfma = acc;
           }
@@ -363,7 +365,8 @@ struct DotOpMFMAConversionHelper {
 
     const size_t mmaCount =
         numRepB * numRepM * numRepN * numRepK * kWidth / kBase;
-    packAndReplaceResult(op, fc, maybeMfmaInsn, dstElemTy, elemTyA, mmaCount);
+    packAndReplaceResult(op, fc, maybeMfmaIntrinsic, dstElemTy, elemTyA,
+                         mmaCount);
 
     return success();
   }
@@ -485,15 +488,15 @@ struct ScaledDotOpMFMAConversionHelper : DotOpMFMAConversionHelper {
                                   Location loc)
       : DotOpMFMAConversionHelper(mfmaLayout, rewriter, typeConverter, loc) {}
 
-  Value generateScaledMFMAOp(MfmaInsn &mfmaInsn, Value valA, Value valB,
-                             Value valC, Value valScaleA,
+  Value generateScaledMFMAOp(const MfmaIntrinsic &mfmaIntrinsic, Value valA,
+                             Value valB, Value valC, Value valScaleA,
                              Value valScaleB) const {
     auto b = TritonLLVMOpBuilder(loc, rewriter);
     auto resType = valC.getType();
     Value zeroFlag = b.i32_val(0);
-    OperationState loweredOp(loc, mfmaInsn.getInsnName());
-    int32_t cbsz = getMfmaF8F6F4MatrixFormat(mfmaInsn.getElementTypeA());
-    int32_t blgp = getMfmaF8F6F4MatrixFormat(mfmaInsn.getElementTypeB());
+    OperationState loweredOp(loc, mfmaIntrinsic.name);
+    int32_t cbsz = getMfmaF8F6F4MatrixFormat(mfmaIntrinsic.aElementType);
+    int32_t blgp = getMfmaF8F6F4MatrixFormat(mfmaIntrinsic.bElementType);
     assert((cbsz != -1) && (blgp != -1));
     loweredOp.addTypes(resType);
     loweredOp.addOperands({valA, valB, valC, b.i32_val(cbsz), b.i32_val(blgp),
@@ -540,14 +543,16 @@ struct ScaledDotOpMFMAConversionHelper : DotOpMFMAConversionHelper {
 
     auto ctx = op.getContext();
     constexpr bool allowXF32 = false;
-    auto maybeMfmaInsn = MfmaInsn::selectMfma(
-        mDim, nDim, kDimOperandSize, scaleDotElemTypeToMLIRType(ctx, aElemType),
-        scaleDotElemTypeToMLIRType(ctx, bElemType), mfmaVersion, allowXF32);
-    if (failed(maybeMfmaInsn))
+    FailureOr<MfmaIntrinsic> maybeMfmaIntrinsic =
+        MfmaIntrinsic::selectFor(mfmaVersion, mDim, nDim, kDimOperandSize,
+                                 scaleDotElemTypeToMLIRType(ctx, aElemType),
+                                 scaleDotElemTypeToMLIRType(ctx, bElemType),
+                                 /*withScale=*/false, allowXF32);
+    if (failed(maybeMfmaIntrinsic))
       llvm::report_fatal_error("No match found in MFMA database\n");
 
-    StringRef mfmaInsnName = maybeMfmaInsn->getInsnName();
-    unsigned kBase = maybeMfmaInsn->getKBase();
+    StringRef intrinsicName = maybeMfmaIntrinsic->name;
+    unsigned kBase = maybeMfmaIntrinsic->kBase;
     // Two fp4 are packed into an uint8.
     if (aElemType == ScaleDotElemType::E2M1) {
       kBase /= 2;
@@ -629,12 +634,12 @@ struct ScaledDotOpMFMAConversionHelper : DotOpMFMAConversionHelper {
       for (int k = 0; k < numRepK; k++) {
         for (int kPack = 0; kPack < kWidth / kBase; ++kPack) {
           acc = mfmaLayout.getIsTransposed()
-                    ? generateScaledMFMAOp(maybeMfmaInsn.value(),
+                    ? generateScaledMFMAOp(maybeMfmaIntrinsic.value(),
                                            operandB[kPack][{b, n, k}],
                                            operandA[kPack][{b, m, k}], acc,
                                            operandBScale[kPack][{b, n, k}],
                                            operandAScale[kPack][{b, m, k}])
-                    : generateScaledMFMAOp(maybeMfmaInsn.value(),
+                    : generateScaledMFMAOp(maybeMfmaIntrinsic.value(),
                                            operandA[kPack][{b, m, k}],
                                            operandB[kPack][{b, n, k}], acc,
                                            operandAScale[kPack][{b, m, k}],
@@ -661,7 +666,8 @@ struct ScaledDotOpMFMAConversionHelper : DotOpMFMAConversionHelper {
 
     const size_t mmaCount =
         numRepB * numRepM * numRepN * numRepK * kWidth / kBase;
-    packAndReplaceResult(op, fc, maybeMfmaInsn, dstElemTy, elemTyA, mmaCount);
+    packAndReplaceResult(op, fc, maybeMfmaIntrinsic, dstElemTy, elemTyA,
+                         mmaCount);
 
     return success();
   }
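
To make the kPack accounting above concrete: a dot-operand slice holds kWidth elements per thread while a single MFMA consumes kBase of them, so each (b, m, n, k) repetition issues kWidth / kBase intrinsics, which is exactly the mmaCount expression. A toy check with illustrative numbers (not taken from the commit):

#include <cassert>
#include <cstddef>

int main() {
  // Illustrative values: one batch rep, four reps along M, N, and K, and a
  // dot-operand layout holding kWidth = 16 elements per thread where each
  // MFMA intrinsic consumes kBase = 8 of them.
  std::size_t numRepB = 1, numRepM = 4, numRepN = 4, numRepK = 4;
  std::size_t kWidth = 16, kBase = 8;

  // Mirrors the mmaCount expression above: every (b, m, n, k) tile issues
  // kWidth / kBase MFMA ops, the kPack loop's trip count.
  std::size_t mmaCount = numRepB * numRepM * numRepN * numRepK * kWidth / kBase;
  assert(mmaCount == 4 * 4 * 4 * 2);
  return 0;
}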
