Commit e40c213

yiqian1 and antiagainst authored
[AMD] Improve scaled dot with (b)f16 types on GFX950 (#7693)
For such cases we need to upcast the mxfp operand into (b)f16 to utilize the (b)f16 MFMA intrinsics. On GFX950 that upcasting now has native instruction support.

Co-authored-by: Lei Zhang <[email protected]>
1 parent 2e05786 commit e40c213
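For reference, the scale handling that the new lowering tests check for (zext of the i8 scale, shift left by 23, bitcast to f32) amounts to placing the E8M0 scale byte directly into the f32 exponent field. Below is a minimal Python sketch of that decoding under that assumption; the helper name is illustrative and not part of the change:

    import struct

    def e8m0_scale_to_f32(scale_byte):
        # Shift the 8-bit scale into the f32 exponent field (bit 23 onward) and
        # reinterpret the bits as a float; this mirrors the zext + shl-by-23 +
        # bitcast sequence checked in the upcast_mxfp.mlir lit test below.
        bits = (scale_byte & 0xFF) << 23
        return struct.unpack("<f", struct.pack("<I", bits))[0]

    # A scale byte of 127 encodes 2**0, i.e. a multiplier of 1.0.
    assert e8m0_scale_to_f32(127) == 1.0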

File tree: 6 files changed (+278 −56 lines)

python/test/unit/language/test_core.py

Lines changed: 3 additions & 0 deletions
@@ -4442,6 +4442,9 @@ def make_finite(x, dtype):
         assert 'st.global.v4' in ptx
         assert (re.search(r'(mma|wgmma.mma_async).sync.aligned.m\d+n\d+k16(?:.row.col)?.f32.(f|bf)16.(f|bf)16', ptx)
                 or "tcgen05.mma.cta_group::1.kind::f16" in ptx)
+    if is_hip_cdna4() and normal_type in ["bf16", "fp16"]:
+        amdgcn = pgm.asm['amdgcn']
+        assert (re.search(r"v_cvt_scalef32_pk_.*?(fp4|fp8|bf8).*?op_sel", amdgcn))
 
 
 @pytest.mark.interpreter

test/Conversion/amd/upcast_mxfp.mlir

Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
+// RUN: triton-opt %s --split-input-file --convert-triton-amdgpu-to-llvm=arch=gfx950 | FileCheck --check-prefixes=GFX950 %s
+
+// -----
+
+// GFX950-LABEL: upcast_mxfp4
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
+#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [1, 4], instrShape = [32, 32], isTransposed = true}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 4096 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @upcast_mxfp4(%arg0 : tensor<32x32xi8, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>>, %arg1 : tensor<32x2xi8, #blocked>) {
+    // GFX950-DAG: %[[CST:.*]] = llvm.mlir.constant(23 : i32) : i32
+    // GFX950-DAG: %[[ISCALE:.*]] = llvm.zext %{{.*}} : i8 to i32
+    // GFX950: %[[INTS:.*]] = llvm.shl %[[ISCALE]], %[[CST]] : i32
+    // GFX950: %[[SCALE:.*]] = llvm.bitcast %[[INTS]] : i32 to f32
+    // GFX950: rocdl.cvt.scalef32.pk.bf16.fp4 %[[REG:.*]][0], %[[SCALE]] : vector<2xbf16>
+    // GFX950: rocdl.cvt.scalef32.pk.bf16.fp4 %[[REG]][2], %[[SCALE]] : vector<2xbf16>
+    // GFX950: rocdl.cvt.scalef32.pk.bf16.fp4 %[[REG]][1], %[[SCALE]] : vector<2xbf16>
+    // GFX950: rocdl.cvt.scalef32.pk.bf16.fp4 %[[REG]][3], %[[SCALE]] : vector<2xbf16>
+    %1 = amdgpu.upcast_mxfp %arg0, %arg1 fp_type = e2m1 {fastMath = false} : tensor<32x32xi8, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>>, tensor<32x2xi8, #blocked> -> tensor<64x32xbf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>
+    tt.return
+  }
+}
+
+
+// -----
+
+// GFX950-LABEL: upcast_mxfp8
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
+#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [1, 4], instrShape = [32, 32], isTransposed = true}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 4096 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @upcast_mxfp8(%arg0 : tensor<64x32xf8E4M3FN, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>, %arg1 : tensor<32x2xi8, #blocked>) {
+    // GFX950-DAG: %[[CST:.*]] = llvm.mlir.constant(23 : i32) : i32
+    // GFX950-DAG: %[[ISCALE:.*]] = llvm.zext %{{.*}} : i8 to i32
+    // GFX950: %[[INTS:.*]] = llvm.shl %[[ISCALE]], %[[CST]] : i32
+    // GFX950: %[[SCALE:.*]] = llvm.bitcast %[[INTS]] : i32 to f32
+    // GFX950: rocdl.cvt.scalef32.pk.bf16.fp8 %[[REG:.*]][false], %[[SCALE]] : vector<2xbf16>
+    // GFX950: rocdl.cvt.scalef32.pk.bf16.fp8 %[[REG]][true], %[[SCALE]] : vector<2xbf16>
+    %1 = amdgpu.upcast_mxfp %arg0, %arg1 fp_type = e4m3 {fastMath = false} : tensor<64x32xf8E4M3FN, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>, tensor<32x2xi8, #blocked> -> tensor<64x32xbf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>
+    tt.return
+  }
+}
+
+// -----
+
+// GFX950-LABEL: upcast_mxbf8
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
+#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [1, 4], instrShape = [32, 32], isTransposed = true}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 4096 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @upcast_mxbf8(%arg0 : tensor<64x32xf8E5M2, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>, %arg1 : tensor<32x2xi8, #blocked>) {
+    // GFX950-DAG: %[[CST:.*]] = llvm.mlir.constant(23 : i32) : i32
+    // GFX950-DAG: %[[ISCALE:.*]] = llvm.zext %{{.*}} : i8 to i32
+    // GFX950: %[[INTS:.*]] = llvm.shl %[[ISCALE]], %[[CST]] : i32
+    // GFX950: %[[SCALE:.*]] = llvm.bitcast %[[INTS]] : i32 to f32
+    // GFX950: rocdl.cvt.scalef32.pk.f16.bf8 %[[REG:.*]][false], %[[SCALE]] : vector<2xf16>
+    // GFX950: rocdl.cvt.scalef32.pk.f16.bf8 %[[REG]][true], %[[SCALE]] : vector<2xf16>
+    %1 = amdgpu.upcast_mxfp %arg0, %arg1 fp_type = e5m2 {fastMath = false} : tensor<64x32xf8E5M2, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>, tensor<32x2xi8, #blocked> -> tensor<64x32xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>
+    tt.return
+  }
+}
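For context on the mxfp4 case: each i8 element packs two e2m1 values, which is why the 32x32 i8 operand upcasts to a 64x32 bf16 tensor and the kWidth doubles from 4 to 8. Below is a rough reference sketch of what one packed byte expands to, assuming the standard OCP e2m1 value table and low-nibble-first packing; both the helper name and the packing order are assumptions, not part of the change:

    # Positive e2m1 magnitudes, indexed by the low three bits (exponent, mantissa).
    _E2M1_MAGNITUDES = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]

    def decode_mxfp4_byte(byte, scale):
        # Expand one packed fp4 byte into two scaled values, roughly what the
        # rocdl.cvt.scalef32.pk.bf16.fp4 conversions above compute in hardware.
        def nibble_to_float(nib):
            sign = -1.0 if nib & 0x8 else 1.0
            return sign * _E2M1_MAGNITUDES[nib & 0x7]
        low, high = byte & 0xF, (byte >> 4) & 0xF
        return nibble_to_float(low) * scale, nibble_to_float(high) * scale

    # Example: 0xE2 packs +1.0 (low nibble) and -4.0 (high nibble).
    assert decode_mxfp4_byte(0xE2, 1.0) == (1.0, -4.0)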

test/TritonGPU/amd/accelerate-amd-matmul-mfma-gfx950.mlir

Lines changed: 28 additions & 0 deletions
@@ -224,6 +224,34 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
   }
 }
 
+// -----
+
+// CHECK-LABEL: mfma_dot_scaled_bf16_fp8e4
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
+#blocked2 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @mfma_dot_scaled_bf16_fp8e4(
+      %arg0: tensor<32x64xbf16, #blocked2>,
+      %arg1: tensor<64x32xf8E4M3FN, #blocked>,
+      %arg2: tensor<32x2xi8, #blocked1>,
+      %arg3: tensor<32x32x!tt.ptr<f32>, #blocked>
+  ) {
+    // CHECK-NOT: tt.fp_to_fp
+    // CHECK-NOT: tt.dot_scaled
+    // CHECK: %[[A:.*]] = ttg.convert_layout %{{.*}} : tensor<32x64xbf16, #blocked{{.*}}> -> tensor<32x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
+    // CHECK: %[[B:.+]] = ttg.convert_layout %{{.*}} : tensor<64x32xf8E4M3FN, #blocked{{.*}}> -> tensor<64x32xf8E4M3FN, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>
+    // CHECK: %[[S:.+]] = ttg.convert_layout %{{.*}} : tensor<32x2xi8, #blocked{{.*}}> -> tensor<32x2xi8, #blocked{{.*}}>
+    // CHECK: %[[UB:.+]] = amdgpu.upcast_mxfp %[[B]], %[[S]] fp_type = e4m3 {fastMath = false} : tensor<64x32xf8E4M3FN, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>, tensor<32x2xi8, #blocked{{.*}}> -> tensor<64x32xbf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>
+    // CHECK: %{{.*}} = tt.dot %[[A]], %[[UB]], %{{.*}} : tensor<32x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> * tensor<64x32xbf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<32x32xf32, #mma>
+    %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #blocked>
+    %1 = tt.dot_scaled %arg0, %arg1 scale %arg2, %cst lhs = bf16 rhs = e4m3 {fastMath = false} : tensor<32x64xbf16, #blocked2> * tensor<64x32xf8E4M3FN, #blocked>, tensor<32x2xi8, #blocked1> -> tensor<32x32xf32, #blocked>
+    tt.store %arg3, %1 : tensor<32x32x!tt.ptr<f32>, #blocked>
+    tt.return
+  }
+}
+
+
 // -----
 
 #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>

third_party/amd/lib/Dialect/TritonAMDGPU/IR/Dialect.cpp

Lines changed: 14 additions & 9 deletions
@@ -253,9 +253,11 @@ LogicalResult UpcastMXFPOp::verify() {
   Builder b(getContext());
   if (xTy.getElementType() != b.getBF16Type() &&
       xTy.getElementType() != b.getF16Type() &&
-      xTy.getElementType() != b.getI8Type()) {
-    return emitOpError(
-        "element type of the first operand must be bf16/fp16 or i8");
+      xTy.getElementType() != b.getI8Type() &&
+      xTy.getElementType() != b.getType<Float8E4M3FNType>() &&
+      xTy.getElementType() != b.getType<Float8E5M2Type>()) {
+    return emitOpError("element type of the first operand must be bf16/fp16, "
+                       "OCP fp8/bf8 or i8");
   }
 
   if (scaleTy.getElementType() != b.getI8Type()) {
@@ -328,27 +330,30 @@ UpcastMXFPOp::deduceOutputType(TypedValue<RankedTensorType> inputTensor,
                                Type outputElemType) {
   MLIRContext *ctx = inputTensor.getContext();
   auto xTy = inputTensor.getType();
-  if (inputElemType != ScaleDotElemType::E2M1)
+  if (!(inputElemType == ScaleDotElemType::E2M1 ||
+        inputElemType == ScaleDotElemType::E4M3 ||
+        inputElemType == ScaleDotElemType::E5M2))
     return xTy;
 
+  auto factor = inputElemType == ScaleDotElemType::E2M1 ? 2 : 1;
   auto xShape = xTy.getShape();
   auto newShape = llvm::to_vector(xShape);
   auto encoding = xTy.getEncoding();
   if (!encoding) {
-    newShape.back() *= 2;
+    newShape.back() *= factor;
     return RankedTensorType::get(xShape, outputElemType);
   }
 
   auto oldEncoding = cast<DotOperandEncodingAttr>(encoding);
-  auto newVEncoding = DotOperandEncodingAttr::get(ctx, oldEncoding.getOpIdx(),
-                                                  oldEncoding.getParent(),
-                                                  oldEncoding.getKWidth() * 2);
+  auto newVEncoding = DotOperandEncodingAttr::get(
+      ctx, oldEncoding.getOpIdx(), oldEncoding.getParent(),
+      oldEncoding.getKWidth() * factor);
   // Figure out the K dimension for the input A/B, given that the return
   // type is upcasted A/B type so we need to update the proper dim size.
   const int opIdx = oldEncoding.getOpIdx();
   const bool hasBatch = xShape.size() == 3;
   const int kIdx = (opIdx == 0 ? 1 : 0) + hasBatch;
-  newShape[kIdx] *= 2;
+  newShape[kIdx] *= factor;
   return RankedTensorType::get(newShape, outputElemType, newVEncoding);
 }
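The deduceOutputType change above generalizes the shape math from fp4-only to the OCP fp8/bf8 types: e2m1 packs two values per byte, so the K dimension and kWidth double, while e4m3/e5m2 hold one value per byte and keep their sizes. Below is a small Python sketch of the same shape rule, mirroring the diff; the function is illustrative, not the C++ API:

    def deduce_upcast_shape(shape, elem_type, op_idx, has_batch=False):
        # e2m1 packs two values per byte; fp8/bf8 are one value per byte.
        factor = 2 if elem_type == "e2m1" else 1
        # K is dim 1 for the A operand and dim 0 for B, shifted by a batch dim.
        k_idx = (1 if op_idx == 0 else 0) + (1 if has_batch else 0)
        new_shape = list(shape)
        new_shape[k_idx] *= factor
        return new_shape

    # Matches the upcast_mxfp4 test: a 32x32 i8 B operand becomes 64x32 bf16.
    assert deduce_upcast_shape([32, 32], "e2m1", op_idx=1) == [64, 32]
    # fp8/bf8 operands keep their shape, e.g. the 64x32 f8E4M3FN case.
    assert deduce_upcast_shape([64, 32], "e4m3", op_idx=1) == [64, 32]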