[AMD] Use a proper error message for no matching MFMA intrinsic (#7185)

antiagainst · web-flow · commit 19c842cdc54e · 2025-06-14T05:49:46.000Z
If we cannot match a proper MFMA intrinsic, it is almost always because
the element type is not supported. So emit a more explanatory error
message on the op instead of raising a fatal error with a vague message.
diff --git a/test/TritonGPU/amd/accelerate-amd-matmul-unsupported.mlir b/test/TritonGPU/amd/accelerate-amd-matmul-unsupported.mlir
@@ -0,0 +1,15 @@
+// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul="arch-generation-name=gfx942 matrix-instruction-size=0" --verify-diagnostics
+
+#blocked = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [8, 8], warpsPerCTA = [2, 4], order = [1, 0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @mfma_dot_fp8e4m3fn(
+      %arg0: tensor<128x64xf8E4M3FN, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>,
+      %arg1: tensor<64x256xf8E4M3FN, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>,
+      %arg2: tensor<128x256x!tt.ptr<f32>, #blocked>) {
+    %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #blocked>
+    // expected-error @+1 {{no matching matrix core intrinsic due to unsupported element type}}
+    %1 = tt.dot %arg0, %arg1, %cst : tensor<128x64xf8E4M3FN, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<64x256xf8E4M3FN, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x256xf32, #blocked>
+    tt.store %arg2, %1 : tensor<128x256x!tt.ptr<f32>, #blocked>
+    tt.return
+  }
+}
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/MFMA.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/MFMA.cpp
@@ -275,7 +275,8 @@ struct DotOpMFMAConversionHelper {
         mfmaVersion, mDim, nDim, kDimOperandSize, elemTyA, elemTyB,
         /*withScale=*/false, allowXF32);
     if (failed(maybeMfmaIntrinsic))
-      llvm::report_fatal_error("No match found in MFMA database\n");
+      return op.emitError(
+          "no matching matrix core intrinsic due to unsupported element type");
 
     unsigned kBase = maybeMfmaIntrinsic->kBase;
 
@@ -590,7 +591,8 @@ struct ScaledDotOpMFMAConversionHelper : DotOpMFMAConversionHelper {
         scaleDotElemTypeToMLIRType(ctx, bElemType),
         /*withScale=*/true, allowXF32);
     if (failed(maybeMfmaIntrinsic))
-      llvm::report_fatal_error("No match found in MFMA database\n");
+      return op.emitError(
+          "no matching matrix core intrinsic due to unsupported element type");
 
     StringRef intrinsicName = maybeMfmaIntrinsic->name;
     unsigned kBase = maybeMfmaIntrinsic->kBase;
diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp
@@ -136,9 +136,9 @@ warpsPerTileWMMA(Operation *dotOp, ArrayRef<int64_t> shape, int numWarps) {
 // If enforcedNonKDim is not zero, it will be used to overwrite the default
 // logic to choose a MFMA with matching M/N dim.
 FailureOr<MfmaIntrinsic>
-chooseMfmaInstruction(int mfmaVersion, RankedTensorType cType, Type aElemType,
-                      Type bElemType, int inputKSize, int enforcedNonKDim,
-                      bool withScale, bool allowXF32) {
+chooseMfmaInstruction(Location loc, int mfmaVersion, RankedTensorType cType,
+                      Type aElemType, Type bElemType, int inputKSize,
+                      int enforcedNonKDim, bool withScale, bool allowXF32) {
   // number of matrix elements along k dim per one MFMA instruction
   unsigned kDim = 0;
 
@@ -169,7 +169,8 @@ chooseMfmaInstruction(int mfmaVersion, RankedTensorType cType, Type aElemType,
       MfmaIntrinsic::selectFor(mfmaVersion, mDim, nDim, inputKSize, aElemType,
                                bElemType, withScale, allowXF32);
   if (failed(maybeMfmaIntrinsic))
-    llvm::report_fatal_error("No match found in MFMA database\n");
+    return emitError(loc, "no matching matrix core intrinsic due to "
+                          "unsupported element type");
 
   kDim = maybeMfmaIntrinsic->kDim;
   assert(kDim != 0);
@@ -188,7 +189,7 @@ FailureOr<MfmaIntrinsic> chooseMfmaInstruction(tt::DotOp dot, int mfmaVersion,
   bool allowXF32 =
       dot.getInputPrecision() == InputPrecision::TF32 && mfmaVersion == 3;
   return chooseMfmaInstruction(
-      mfmaVersion, dot.getC().getType(), aType.getElementType(),
+      dot.getLoc(), mfmaVersion, dot.getC().getType(), aType.getElementType(),
       dot.getB().getType().getElementType(), aType.getShape().back(), nonKDim,
       withScale, allowXF32);
 }
@@ -204,8 +205,8 @@ FailureOr<MfmaIntrinsic> chooseMfmaInstruction(tt::DotScaledOp dot,
   }
   Type aElemType = scaleDotElemTypeToMLIRType(ctx, dot.getAElemType());
   Type bElemType = scaleDotElemTypeToMLIRType(ctx, dot.getBElemType());
-  return chooseMfmaInstruction(mfmaVersion, dot.getC().getType(), aElemType,
-                               bElemType, inputKDim, nonKDim,
+  return chooseMfmaInstruction(dot.getLoc(), mfmaVersion, dot.getC().getType(),
+                               aElemType, bElemType, inputKDim, nonKDim,
                                /*withScale=*/true, /*allowXF32=*/false);
 }
 
@@ -215,9 +216,9 @@ FailureOr<MfmaIntrinsic> chooseMfmaInstruction(tt::DotScaledOp dot,
   // For scaled dot, we handle it with fp16 or bf16 emulation for now.
   Builder b(dot.getContext());
   Type elemType = useFp16 ? b.getF16Type() : b.getBF16Type();
-  return chooseMfmaInstruction(mfmaVersion, dot.getC().getType(), elemType,
-                               elemType, dot.getA().getType().getShape().back(),
-                               nonKDim,
+  return chooseMfmaInstruction(dot.getLoc(), mfmaVersion, dot.getC().getType(),
+                               elemType, elemType,
+                               dot.getA().getType().getShape().back(), nonKDim,
                                /*withScale=*/false, /*allowXF32=*/false);
 }