Commit 8a45291

[AMD] Support fp64 MFMA instructions (#7461)
This commit adds support for lowering fp64 dot to MFMA intrinsics in the AMD backend.
1 parent 6bdb64a commit 8a45291
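For illustration, here is the kind of kernel this unlocks — a minimal sketch of ours, not part of the commit (the kernel name, shapes, and row-major indexing are illustrative assumptions); an fp64 tl.dot on a CDNA GPU now lowers to mfma_f64 intrinsics:

import triton
import triton.language as tl

@triton.jit
def matmul_fp64_kernel(a_ptr, b_ptr, c_ptr, M, N, K,
                       BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr,
                       BLOCK_K: tl.constexpr):
    pid_m = tl.program_id(0)
    pid_n = tl.program_id(1)
    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    # fp64 accumulator: with this commit it maps onto MFMA on AMD hardware.
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float64)
    for k in range(0, K, BLOCK_K):
        offs_k = k + tl.arange(0, BLOCK_K)
        # Row-major A (M x K) and B (K x N); assumes shapes divide the blocks.
        a = tl.load(a_ptr + offs_m[:, None] * K + offs_k[None, :])
        b = tl.load(b_ptr + offs_k[:, None] * N + offs_n[None, :])
        acc += tl.dot(a, b)  # f64 x f64 -> f64
    tl.store(c_ptr + offs_m[:, None] * N + offs_n[None, :], acc)

# Example launch for float64 torch tensors a, b, c (M, N divisible by 64):
#   matmul_fp64_kernel[(M // 64, N // 64)](a, b, c, M, N, K,
#                                          BLOCK_M=64, BLOCK_N=64, BLOCK_K=32)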

11 files changed: +134 -44 lines changed

include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td

Lines changed: 6 additions & 3 deletions
@@ -1109,7 +1109,8 @@ w2 w2 w3 w3
     "unsigned":$MDim,
     "unsigned":$NDim,
     "bool":$isTransposed,
-    "CTALayoutAttr":$CTALayout
+    "CTALayoutAttr":$CTALayout,
+    DefaultValuedParameter<"std::optional<Type>", "FloatType::get($_ctxt, 32)">:$elementType
   );

   let builders = [
@@ -1118,9 +1119,11 @@ w2 w2 w3 w3
     "unsigned":$MDim,
     "unsigned":$NDim,
     "bool":$isTransposed,
-    "CTALayoutAttr":$CTALayout), [{
+    "CTALayoutAttr":$CTALayout,
+    "std::optional<Type>":$elementType), [{
       SmallVector<unsigned> tilesPerWarp(warpsPerCTA.size(), 1);
-      return $_get(context, version, warpsPerCTA, tilesPerWarp, MDim, NDim, isTransposed, CTALayout);
+      return $_get(context, version, warpsPerCTA, tilesPerWarp, MDim, NDim, isTransposed, CTALayout, elementType);
   }]>
 ];

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 32 additions & 2 deletions
@@ -561,6 +561,17 @@ static LogicalResult parseBool(AsmParser &parser, const NamedAttribute &attr,
   return parseBoolAttrValue(parser, attr.getValue(), value, desc);
 };

+static LogicalResult parseType(AsmParser &parser, const NamedAttribute &attr,
+                               std::optional<Type> &value, StringRef desc) {
+  auto typeAttr = mlir::dyn_cast<TypeAttr>(attr.getValue());
+  if (!typeAttr) {
+    parser.emitError(parser.getNameLoc(), "expected a Type in ") << desc;
+    return failure();
+  }
+  value = typeAttr.getValue();
+  return success();
+}
+
 // Print the CTALayout if it's not equal to the default.
 static void maybePrintCTALayout(mlir::MLIRContext *context,
                                 mlir::AsmPrinter &printer, CTALayoutAttr layout,
@@ -1327,6 +1338,7 @@ Attribute AMDMfmaEncodingAttr::parse(AsmParser &parser, Type type) {
   std::optional<SmallVector<unsigned>> CTAsPerCGA;
   std::optional<SmallVector<unsigned>> CTASplitNum;
   std::optional<SmallVector<unsigned>> CTAOrder;
+  std::optional<Type> elementType;

   for (const NamedAttribute &attr : dict) {
     if (attr.getName() == "version") {
@@ -1366,6 +1378,10 @@ Attribute AMDMfmaEncodingAttr::parse(AsmParser &parser, Type type) {
             .failed())
         return {};
     }
+    if (attr.getName() == "elementType") {
+      if (parseType(parser, attr, elementType, "elementType").failed())
+        return {};
+    }
   }

   if (tilesPerWarp.empty()) {
@@ -1379,7 +1395,7 @@ Attribute AMDMfmaEncodingAttr::parse(AsmParser &parser, Type type) {

   return parser.getChecked<AMDMfmaEncodingAttr>(
       parser.getContext(), version, warpsPerCTA, tilesPerWarp, instrShape[0],
-      instrShape[1], isTransposed, *CTALayout);
+      instrShape[1], isTransposed, *CTALayout, elementType);
 }

 void AMDMfmaEncodingAttr::print(AsmPrinter &printer) const {
@@ -1396,21 +1412,35 @@ void AMDMfmaEncodingAttr::print(AsmPrinter &printer) const {
           << ", isTransposed = " << getIsTransposed();
   maybePrintCTALayout(getContext(), printer, getCTALayout(),
                       /*rank=*/getRank());
+  if (getElementType() && !(getElementType()->isF32())) {
+    std::string typeStr;
+    llvm::raw_string_ostream rso(typeStr);
+    getElementType()->print(rso);
+    printer << ", elementType = " << rso.str();
+  }
   printer << "}>";
 }

 LogicalResult AMDMfmaEncodingAttr::verify(
     function_ref<mlir::InFlightDiagnostic()> emitError, unsigned version,
     llvm::ArrayRef<unsigned int> warpsPerCTA,
     llvm::ArrayRef<unsigned int> tilesPerWarp, unsigned mDim, unsigned nDim,
-    bool isTransposed, mlir::triton::gpu::CTALayoutAttr) {
+    bool isTransposed, mlir::triton::gpu::CTALayoutAttr,
+    std::optional<Type> elementType) {
   if (!(version >= 0 && version <= 4)) {
     return emitError() << "version must be in the [0, 4] range";
   }
   if (!((mDim == 32 && nDim == 32) || (mDim == 16 && nDim == 16))) {
     return emitError()
            << "(M, N) cases other than (32, 32) or (16, 16) unimplemented";
   }
+  if (elementType && !(elementType->isF64() || elementType->isF32() ||
+                       elementType->isInteger(32))) {
+    std::string typeStr;
+    llvm::raw_string_ostream rso(typeStr);
+    elementType->print(rso);
+    return emitError() << "element type must be f64, f32, i32, or none";
+  }

   return success();
 }
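With the parser and printer above, a non-default element type round-trips through the attribute's textual form. A hypothetical example (f32 is the default and is elided when printing), mirroring the syntax used in the .mlir tests below:

#ttg.amd_mfma<{version = 3, warpsPerCTA = [2, 2], instrShape = [16, 16], isTransposed = true, elementType = f64}>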

lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp

Lines changed: 48 additions & 17 deletions
@@ -438,25 +438,56 @@ AMDMfmaEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {
         {outDimNames[order[0]], outDimNames[order[1]]});
   } else {
     assert(getMDim() == 16);
-    // For mfma with 16x16 output, each of the 64 threads holds 4 elements.
-    //
-    // For the register (i.e., element) dimension, these 4 elements are along
-    // the matrix C's M dimension, with 4 consecutive elements spanning 4 rows.
-    //
-    // For the lane (i.e., thread) dimension, these threads are along the
-    // matrix C's N dimension, with 16 consecutive threads covering a whole
-    // row and the next 16 threads start after a gap spanning 4 rows.
-    tileLayout = LinearLayout(
-        {{kRegister, {{0, 1}, {0, 2}}},
-         {kLane, {{1, 0}, {2, 0}, {4, 0}, {8, 0}, /*gap*/ {0, 4}, {0, 8}}}},
-        {outDimNames[order[0]], outDimNames[order[1]]});
-    // For mfma.transposed layout, the element ownership among threads are
-    // "transposed" within each warp.
-    if (getIsTransposed())
-      tileLayout = LinearLayout(
-          {{kRegister, {{1, 0}, {2, 0}}},
-           {kLane, {{0, 1}, {0, 2}, {0, 4}, {0, 8}, /*gap*/ {4, 0}, {8, 0}}}},
-          {outDimNames[order[0]], outDimNames[order[1]]});
+    auto elementType = getElementType();
+    if (!(elementType && elementType->isF64())) {
+      // For mfma with 16x16 output (<= 32 bits), each of the 64 threads
+      // holds 4 elements.
+      //
+      // For the register (i.e., element) dimension, these 4 elements are
+      // along the matrix C's M dimension, with 4 consecutive elements
+      // spanning 4 rows.
+      //
+      // For the lane (i.e., thread) dimension, these threads are along the
+      // matrix C's N dimension, with 16 consecutive threads covering a whole
+      // row and the next 16 threads starting after a gap spanning 4 rows.
+      tileLayout = LinearLayout(
+          {{kRegister, {{0, 1}, {0, 2}}},
+           {kLane, {{1, 0}, {2, 0}, {4, 0}, {8, 0}, /*gap*/ {0, 4}, {0, 8}}}},
+          {outDimNames[order[0]], outDimNames[order[1]]});
+      // For the mfma.transposed layout, element ownership among threads is
+      // "transposed" within each warp.
+      if (getIsTransposed())
+        tileLayout = LinearLayout(
+            {{kRegister, {{1, 0}, {2, 0}}},
+             {kLane, {{0, 1}, {0, 2}, {0, 4}, {0, 8}, /*gap*/ {4, 0}, {8, 0}}}},
+            {outDimNames[order[0]], outDimNames[order[1]]});
+
+    } else {
+      // For 64-bit mfma with 16x16 output, each of the 64 threads holds 4
+      // elements across 8 VGPRs; each 64-bit element is split across a pair
+      // of VGPRs, with the first VGPR holding the first 32 bits and the
+      // second holding the last 32 bits.
+      //
+      // For the register (i.e., element) dimension, these 4 elements are
+      // along the matrix C's M dimension, with 4 consecutive elements
+      // spanning 4 rows.
+      //
+      // For the lane (i.e., thread) dimension, these threads are along the
+      // matrix C's N dimension, with each group of 16 consecutive threads
+      // covering a whole adjacent row. Unlike the <= 32-bit cases, there are
+      // no row gaps between the groups.
+      tileLayout = LinearLayout(
+          {{kRegister, {{0, 4}, {0, 8}}},
+           {kLane, {{1, 0}, {2, 0}, {4, 0}, {8, 0}, {0, 1}, {0, 2}}}},
+          {outDimNames[order[0]], outDimNames[order[1]]});
+      // For the mfma.transposed layout, element ownership among threads is
+      // "transposed" within each warp.
+      if (getIsTransposed())
+        tileLayout = LinearLayout(
+            {{kRegister, {{4, 0}, {8, 0}}},
+             {kLane, {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {1, 0}, {2, 0}}}},
+            {outDimNames[order[0]], outDimNames[order[1]]});
+    }
   }

   // Instead of defining the layout on a CTA tile and using the
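The new f64 bases are easy to sanity-check by expanding them. The following small Python sketch (ours, not from the commit) applies the usual linear-layout rule — an output coordinate is the XOR of the bases selected by the set bits of the register and lane indices — to the non-transposed f64 16x16 case above:

# Bases copied from the non-transposed f64 16x16 tileLayout above; each basis
# is a coordinate pair along (outDimNames[order[0]], outDimNames[order[1]]).
REG_BASES = [(0, 4), (0, 8)]                   # kRegister: 4 values -> 2 bits
LANE_BASES = [(1, 0), (2, 0), (4, 0), (8, 0),  # kLane: 64 lanes -> 6 bits
              (0, 1), (0, 2)]

def coords(reg, lane):
    # XOR together the bases picked out by the set bits of reg and lane.
    x = y = 0
    for bit, (bx, by) in enumerate(REG_BASES):
        if (reg >> bit) & 1:
            x ^= bx
            y ^= by
    for bit, (bx, by) in enumerate(LANE_BASES):
        if (lane >> bit) & 1:
            x ^= bx
            y ^= by
    return x, y

print([coords(r, 0) for r in range(4)])    # [(0, 0), (0, 4), (0, 8), (0, 12)]
print([coords(0, l) for l in range(16)])   # [(0, 0), (1, 0), ..., (15, 0)]

Lane 0's four registers sit 4 apart in the second output dimension, while lane bits 4-5 contribute the offsets 1-3; together registers and lanes tile 0..15 with no holes, matching the "no row gaps between the groups" comment in the diff.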

python/test/unit/language/test_matmul.py

Lines changed: 0 additions & 2 deletions
@@ -112,8 +112,6 @@ def test_simple_matmul(dtype_src_str, dtype_dst_str, BLOCK_M, BLOCK_N, BLOCK_K,
         pytest.skip("Float8 requires compute capability >= 9")
     if (dtype_src_str == "float64") != (dtype_dst_str == "float64"):
         pytest.skip("Skipping unsupported case")
-    if dtype_src_str == "float64" and not is_cuda():
-        pytest.skip("Float64 not supported on HIP yet")
     if "float32" in dtype_src_str and dtype_dst_str == "float16":
         pytest.skip("Skipping unsupported case")
     if "float32" == dtype_src_str and NUM_CTAS > 1:

test/TritonGPU/invalid-attributes.mlir

Lines changed: 5 additions & 0 deletions
@@ -74,6 +74,11 @@

 // -----

+// expected-error@+1 {{element type must be f64, f32, i32, or none}}
+#mfma = #ttg.amd_mfma<{version = 2, warpsPerCTA = [1, 1, 1], instrShape = [16, 16], isTransposed = false, elementType = f16}>
+
+// -----
+
 // expected-error@+1 {{interval values must all be power of two}}
 #shared = #ttg.padded_shared<[3:+2]>

third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/MFMA.cpp

Lines changed: 5 additions & 3 deletions
@@ -145,6 +145,8 @@ struct DotOpMFMAConversionHelper {
     Value zero;
     if (elemType.isInteger(32))
       zero = b.i32_val(0);
+    else if (elemType.isF64())
+      zero = b.f64_val(0.0);
     else
       zero = b.f32_val(0.0);
     auto cond = b.icmp_ult(laneId, b.i32_val(subBlockSize));
@@ -462,9 +464,9 @@ struct DotOpMFMAConversionHelper {
     }

     // Step 2: process rawElems based on element type
-    // Note that for f32 input and XF32 is not allowed, nothing needs to
-    // be done and rawElems is inserted into the ValueTable directly
-    if (type.isF32() && !allowXF32) {
+    // Note that for f32/f64 input when XF32 is not allowed, nothing needs
+    // to be done and rawElems is inserted into the ValueTable directly.
+    if ((type.isF32() || type.isF64()) && !allowXF32) {
       dotOpVals[{b, nonK, kBaseVec}] =
           tb.extract_element(type, rawElems, tb.i32_val(0));
     } else {
third_party/amd/lib/TritonAMDGPUToLLVM/OptimizeLDSUtility.cpp

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ createTmpLayout(triton::gpu::DistributedEncodingTrait layout,
   if (auto src = dyn_cast<triton::gpu::AMDMfmaEncodingAttr>(layout))
     return triton::gpu::AMDMfmaEncodingAttr::get(
         ctx, src.getVersion(), warpsPerCTA, src.getMDim(), src.getNDim(),
-        src.getIsTransposed(), src.getCTALayout());
+        src.getIsTransposed(), src.getCTALayout(), src.getElementType());
   if (auto src = dyn_cast<triton::gpu::AMDWmmaEncodingAttr>(layout))
     return triton::gpu::AMDWmmaEncodingAttr::get(
         ctx, src.getVersion(), src.getIsTransposed(), warpsPerCTA,

third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp

Lines changed: 22 additions & 11 deletions
@@ -155,8 +155,15 @@ chooseMfmaInstruction(Location loc, int mfmaVersion, RankedTensorType cType,
   } else {
     int minSize = std::min(M, N);
     if (minSize >= 32) {
-      mDim = 32;
-      nDim = 32;
+      // On CDNA2-4, if the element type is f64, we use the 16x16 intrinsic
+      // as there's no 32x32 intrinsic.
+      if (aElemType.isF64() || bElemType.isF64()) {
+        mDim = 16;
+        nDim = 16;
+      } else {
+        mDim = 32;
+        nDim = 32;
+      }
     }
     if (minSize >= 16 && minSize < 32) {
       mDim = 16;
@@ -450,19 +457,22 @@ class BlockedToMFMA : public OpRewritePattern<tt::DotOp> {
     auto warpsPerTile =
         warpsPerTileMFMA(dotOp, retShape, numWarps, {mDim, nDim});

+    Type mfmaAccType;
+    if (oldRetType.getElementType().isIntOrIndex())
+      mfmaAccType = rewriter.getIntegerType(32);
+    else if (oldRetType.getElementType().isF64())
+      mfmaAccType = rewriter.getF64Type();
+    else
+      mfmaAccType = rewriter.getF32Type();
+
     // Use transposed mfma layout to enable larger vectorization for global
     // store instructions.
     auto aElemTy = mfmaInstr->aElementType;
     ttg::AMDMfmaEncodingAttr mfmaEnc = ttg::AMDMfmaEncodingAttr::get(
         oldRetType.getContext(),
         /*version*/ mfmaVersion, warpsPerTile,
-        /*instrShape*/ mDim, nDim, /*isTransposed=*/true, CTALayout);
-
-    Type mfmaAccType;
-    if (oldRetType.getElementType().isIntOrIndex())
-      mfmaAccType = rewriter.getIntegerType(32);
-    else
-      mfmaAccType = rewriter.getF32Type();
+        /*instrShape*/ mDim, nDim, /*isTransposed=*/true, CTALayout,
+        mfmaAccType);

     // convert accumulator
     auto oldAcc = dotOp.getC();
@@ -657,7 +667,7 @@ class ScaledBlockedToMFMA final : public OpRewritePattern<triton::DotScaledOp> {
     // for global store instructions.
     auto mfmaEnc = ttg::AMDMfmaEncodingAttr::get(
         ctx, /*version=*/mfmaVersion, mfmaWarpsPerCTA, /*instrShape=*/mDim,
-        nDim, /*isTransposed=*/true, ctaLayout);
+        nDim, /*isTransposed=*/true, ctaLayout, oldRetType.getElementType());

     auto newRetType = RankedTensorType::get(
         oldRetType.getShape(), oldRetType.getElementType(), mfmaEnc);
@@ -815,7 +825,8 @@ class ScaledBlockedToScaledMFMAF8F6F4 final
     // for global store instructions.
     auto mfmaEnc = ttg::AMDMfmaEncodingAttr::get(
         ctx, /*version=*/mfmaVersion, warpsPerTile,
-        /*instrShape=*/mDim, nDim, /*isTransposed=*/true, ctaLayout);
+        /*instrShape=*/mDim, nDim, /*isTransposed=*/true, ctaLayout,
+        oldRetType.getElementType());

     auto newRetType =
         RankedTensorType::get(oldShape, oldRetType.getElementType(), mfmaEnc);
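Condensed, the tile-shape choice in chooseMfmaInstruction above behaves like this Python paraphrase (the function name is ours, and the diff only shows the >= 16 cases):

def choose_mfma_tile(M, N, has_f64):
    min_size = min(M, N)
    if min_size >= 32:
        # No 32x32 f64 MFMA intrinsic on CDNA2-4, so fall back to 16x16.
        return (16, 16) if has_f64 else (32, 32)
    if min_size >= 16:
        return (16, 16)
    raise NotImplementedError("smaller shapes are handled elsewhere")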

third_party/amd/lib/TritonAMDGPUTransforms/MfmaGroup.cpp

Lines changed: 8 additions & 0 deletions
@@ -127,6 +127,7 @@ MfmaDatabase::MfmaDatabase(MLIRContext *context) {
   TRITON_MFMA_v2to4(m, n, aET, bET, symbol, k, kBase)

   Builder b(context);
+  auto f64T = b.getF64Type();
   auto f32T = b.getF32Type();
   auto tf32T = b.getTF32Type();
   auto f16T = b.getF16Type();
@@ -139,6 +140,13 @@ MfmaDatabase::MfmaDatabase(MLIRContext *context) {
   auto fp4T = b.getType<Float4E2M1FNType>();

   mfmaMap = {
+      // f64 inputs
+      // mfma_f64_16x16x4f64
+      TRITON_MFMA_v2to4(16, 16, f64T, f64T, mfma_f64_16x16x4f64, 4, 1),
+      // mfma_f64_4x4x4f64
+      TRITON_MFMA_v2to4(4, 4, f64T, f64T, mfma_f64_4x4x4f64, 16, 1),
+      TRITON_MFMA_v2to4(4, 16, f64T, f64T, mfma_f64_4x4x4f64, 4, 1),
+      TRITON_MFMA_v2to4(16, 4, f64T, f64T, mfma_f64_4x4x4f64, 4, 1),
      // f32 inputs
      // mfma_f32_32x32x2f32
      TRITON_MFMA_v1to4(32, 32, f32T, f32T, mfma_f32_32x32x2f32, 2, 1),
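Reading the new entries, the k column is the K extent a single MFMA issue consumes (our reading; it matches the intrinsic name, e.g. mfma_f64_16x16x4f64 has k = 4), so for example:

# Our arithmetic, not code from the commit: a 16x16 output tile with
# BLOCK_K = 32 needs 32 // 4 = 8 issues of mfma_f64_16x16x4f64.
BLOCK_K, INSTR_K = 32, 4
assert BLOCK_K % INSTR_K == 0
issues_per_tile = BLOCK_K // INSTR_K  # 8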

unittest/Dialect/TritonGPU/DialectTest.cpp

Lines changed: 2 additions & 2 deletions
@@ -473,14 +473,14 @@ class AMDMfmaLayoutTest : public AMDLayoutTest {
                         ArrayRef<unsigned> warpsPerCTA) {
     return triton::gpu::AMDMfmaEncodingAttr::get(
         &ctx, /*version=*/2, warpsPerCTA, mDim, nDim,
-        /*isTransposed=*/false, ctaLayout);
+        /*isTransposed=*/false, ctaLayout, std::nullopt);
   }

   triton::gpu::AMDMfmaEncodingAttr
   createTransposedMFMA(int mDim, int nDim, ArrayRef<unsigned> warpsPerCTA) {
     return triton::gpu::AMDMfmaEncodingAttr::get(
         &ctx, /*version=*/2, warpsPerCTA, mDim, nDim,
-        /*isTransposed=*/true, ctaLayout);
+        /*isTransposed=*/true, ctaLayout, std::nullopt);
   }
 };
