Commit f6626cd

[AMD] Support scaled dot for gfx12 (#7644)
Support emulation of scaled dot by decomposing it into a normal dot with upcast operands.

Signed-off-by: Ilya Veselov <[email protected]>
1 parent d016691 commit f6626cd
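For reference, the emulation this commit adds boils down to: upcast the mx operand to a regular float type, apply the per-block e8m0 scales, then issue an ordinary dot. A minimal NumPy sketch of those semantics, assuming only the left operand carries scales and one shared exponent per 32 elements along K (function and parameter names are mine, not part of the patch):

import numpy as np

def scaled_dot_ref(a_f16, a_scale_e8m0, b_f16):
    # a_f16: (M, K) already-upcast payload, a_scale_e8m0: (M, K // 32) uint8,
    # b_f16: (K, N); each e8m0 byte encodes the power-of-two scale 2**(e - 127).
    scale = np.float32(2.0) ** (a_scale_e8m0.astype(np.int32) - 127)
    a = a_f16.astype(np.float32) * np.repeat(scale, 32, axis=1)  # apply block scales
    return a @ b_f16.astype(np.float32)                          # then a normal dot

The changes below implement this idea at the IR level: packed fp4 operands are upcast to fp16/bf16 (see the new Fp4ToFpOpToLLVM.cpp), and the scaled dot is rewritten as a normal dot.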

File tree: 12 files changed, +251 -155 lines

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 4 additions & 2 deletions
@@ -2137,9 +2137,11 @@ LogicalResult DotOperandEncodingAttr::verify(
 
   if (auto parentAttr = mlir::dyn_cast<AMDWmmaEncodingAttr>(parent)) {
     if (kWidth != 16 && parentAttr.getVersion() == 1 ||
-        kWidth != 8 && kWidth != 16 && parentAttr.getVersion() == 2)
+        kWidth != 4 && kWidth != 8 && kWidth != 16 &&
+            parentAttr.getVersion() == 2)
       return emitError() << "ttg.dot_op kWidth parameter must be 16 for "
-                            "gfx11 and 8/16 for gfx12";
+                            "gfx11 and 4/8/16 for gfx12 (including packed "
+                            "cases for `scaled_dot`)";
     return success();
   }

lib/Dialect/TritonGPU/Transforms/DecomposeScaledBlocked.cpp

Lines changed: 4 additions & 0 deletions
@@ -30,6 +30,10 @@ class DecomposeScaledBlocked : public OpRewritePattern<DotScaledOp> {
 
   LogicalResult matchAndRewrite(DotScaledOp scaledDotOp,
                                 PatternRewriter &rewriter) const override {
+    if (isa_and_nonnull<MmaEncodingTrait>(
+            scaledDotOp.getResult().getType().getEncoding()))
+      return failure();
+
     // TODO: add support for m/n packed formats.
     if (!scaledDotOp.getLhsKPack() || !scaledDotOp.getRhsKPack())
       return failure();

python/test/unit/language/test_core.py

Lines changed: 7 additions & 5 deletions
@@ -80,6 +80,8 @@ def promotion_numpy_2_0():
     # 0 is a special value for automatic heuristic
     if is_hip_cdna():
         mma_nonk_sizes = [0, 16, 32]
+    elif is_hip_gfx12():
+        mma_nonk_sizes = [16]
 else:
     THREADS_PER_WARP = 32
 

@@ -4196,12 +4198,12 @@ def test_scaled_dot(M, N, K, col_a, col_b, rhs_scale, mxfp_type, normal_type, nu
     if cc < (8, 9):
         pytest.skip("float8e4nv not supported on CUDA < 8.9")
     if is_hip():
-        if not is_hip_cdna():
-            pytest.skip("scaled_dot only implemented for HIP CDNA")
+        if not (is_hip_cdna() or is_hip_gfx12()):
+            pytest.skip("scaled_dot only implemented for HIP CDNA and gfx12")
         if "e4m3" in (mxfp_type, normal_type):
-            if not (is_hip_cdna3() or is_hip_cdna4()):
-                pytest.skip(f"scaled_dot({mxfp_type}, {normal_type}) only implemented for CDNA3 and CDNA4")
-        if mma == 16 and K == 64:
+            if not (is_hip_cdna3() or is_hip_cdna4() or is_hip_gfx12()):
+                pytest.skip(f"scaled_dot({mxfp_type}, {normal_type}) only implemented for CDNA3, CDNA4, gfx12")
+        if mma == 16 and K == 64 and not is_hip_gfx12():
             pytest.skip(f"K == {K} too small for mfma {mma} in scaled_dot")
 
 @triton.jit

test/TritonGPU/invalid-attributes.mlir

Lines changed: 3 additions & 8 deletions
@@ -42,26 +42,21 @@
 
 // -----
 
-// expected-error@+2 {{ttg.dot_op kWidth parameter must be 16 for gfx11 and 8/16 for gfx12}}
+// expected-error@+2 {{ttg.dot_op kWidth parameter must be 16 for gfx11 and 4/8/16 for gfx12 (including packed cases for `scaled_dot`)}}
 #wmma = #ttg.amd_wmma<{version = 1, warpsPerCTA = [1, 4]}>
 #dot_op = #ttg.dot_op<{opIdx = 1, parent = #wmma}>
 
 // -----
 
-// expected-error@+2 {{ttg.dot_op kWidth parameter must be 16 for gfx11 and 8/16 for gfx12}}
+// expected-error@+2 {{ttg.dot_op kWidth parameter must be 16 for gfx11 and 4/8/16 for gfx12 (including packed cases for `scaled_dot`)}}
 #wmma = #ttg.amd_wmma<{version = 1, warpsPerCTA = [1, 4]}>
 #dot_op = #ttg.dot_op<{opIdx = 1, parent = #wmma, kWidth = 8}>
 
 // -----
-// expected-error@+2 {{ttg.dot_op kWidth parameter must be 16 for gfx11 and 8/16 for gfx12}}
+// expected-error@+2 {{ttg.dot_op kWidth parameter must be 16 for gfx11 and 4/8/16 for gfx12 (including packed cases for `scaled_dot`)}}
 #wmma = #ttg.amd_wmma<{version = 2, warpsPerCTA = [1, 4]}>
 #dot_op = #ttg.dot_op<{opIdx = 1, parent = #wmma, kWidth = 32}>
 
-// -----
-// expected-error@+2 {{ttg.dot_op kWidth parameter must be 16 for gfx11 and 8/16 for gfx12}}
-#wmma = #ttg.amd_wmma<{version = 2, warpsPerCTA = [1, 4]}>
-#dot_op = #ttg.dot_op<{opIdx = 1, parent = #wmma, kWidth = 4}>
-
 // -----
 
 // expected-error@+1 {{version must be in the [0, 4] range}}

third_party/amd/lib/TritonAMDGPUToLLVM/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@ add_triton_library(TritonAMDGPUToLLVM
   SPMDOpToLLVM.cpp
   SchedInstructions.cpp
   UpcastMXFPToLLVM.cpp
+  Fp4ToFpOpToLLVM.cpp
   MembarUtility.cpp
   ScalarizePackedFOps.cpp

third_party/amd/lib/TritonAMDGPUToLLVM/Fp4ToFpOpToLLVM.cpp

Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
+#include "PatternTritonGPUOpToLLVM.h"
+
+#include "Utility.h"
+#include "mlir/Conversion/LLVMCommon/Pattern.h"
+#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/TypeUtilities.h"
+#include "mlir/IR/ValueRange.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "triton/Conversion/TritonGPUToLLVM/Utility.h"
+#include "triton/Dialect/Triton/IR/Dialect.h"
+#include "triton/Dialect/TritonGPU/IR/Attributes.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Debug.h"
+#include <array>
+
+using namespace mlir;
+using namespace mlir::triton;
+using namespace mlir::triton::gpu;
+using ::mlir::LLVM::AMD::upcast8xMxfp4_SW;
+
+namespace {
+
+class Fp4ToFpOpPattern : public ConvertOpToLLVMPattern<Fp4ToFpOp> {
+public:
+  Fp4ToFpOpPattern(LLVMTypeConverter &typeConverter, PatternBenefit benefit)
+      : ConvertOpToLLVMPattern<Fp4ToFpOp>(typeConverter, benefit) {}
+
+  LogicalResult
+  matchAndRewrite(Fp4ToFpOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+
+    auto loc = op.getLoc();
+    auto elemType = op.getType().getElementType();
+    assert(elemType == f16_ty || elemType == bf16_ty);
+    bool toFp16 = elemType == f16_ty;
+
+    auto xVals = unpackLLElements(loc, adaptor.getSrc(), rewriter);
+
+    SmallVector<Value> results;
+    results.reserve(xVals.size() * 2);
+    assert(xVals.size() % 4 == 0);
+    auto b = TritonLLVMOpBuilder(loc, rewriter);
+    for (int i = 0; i < xVals.size(); i += 4) {
+      Value packedVec = b.undef(vec_ty(i8_ty, 4));
+      for (int j : llvm::seq(4)) {
+        Value v = xVals[i + j];
+        packedVec = b.insert_element(packedVec, v, b.i32_val(j));
+      }
+      SmallVector<Value, 4> v4i32 =
+          upcast8xMxfp4_SW(rewriter, op, toFp16, packedVec);
+      for (int j = 0; j < 4; j++) {
+        Value elements = b.bitcast(v4i32[j], vec_ty(elemType, 2));
+        results.push_back(b.extract_element(elements, b.i32_val(0)));
+        results.push_back(b.extract_element(elements, b.i32_val(1)));
+      }
+    }
+
+    Value result = packLLElements(loc, getTypeConverter(), results, rewriter,
+                                  op.getType());
+    rewriter.replaceOp(op, result);
+    return success();
+  }
+};
+} // anonymous namespace
+
+void mlir::triton::AMD::populateFp4ToFpToLLVMPatterns(
+    LLVMTypeConverter &typeConverter, RewritePatternSet &patterns,
+    PatternBenefit benefit) {
+  patterns.add<Fp4ToFpOpPattern>(typeConverter, benefit);
+}

third_party/amd/lib/TritonAMDGPUToLLVM/PatternTritonGPUOpToLLVM.h

Lines changed: 4 additions & 0 deletions
@@ -43,6 +43,10 @@ void populateUpcastMXFPToLLVMPatterns(LLVMTypeConverter &typeConverter,
                                       const TargetInfo &targetInfo,
                                       PatternBenefit benefit);
 
+void populateFp4ToFpToLLVMPatterns(LLVMTypeConverter &typeConverter,
+                                   RewritePatternSet &patterns,
+                                   PatternBenefit benefit);
+
 } // namespace mlir::triton::AMD
 
 #endif // TRITON_THIRD_PARTY_AMD_LIB_TRITONAMDGPUTOLLVM_PATTERNTRITONGPUOPTOLLVM_H_

third_party/amd/lib/TritonAMDGPUToLLVM/TritonGPUToLLVM.cpp

Lines changed: 2 additions & 0 deletions
@@ -203,6 +203,8 @@ struct ConvertTritonAMDGPUToLLVM
                                                       patterns, AMDBenefit);
     mlir::triton::AMD::populateUpcastMXFPToLLVMPatterns(typeConverter, patterns,
                                                         targetInfo, AMDBenefit);
+    mlir::triton::AMD::populateFp4ToFpToLLVMPatterns(typeConverter, patterns,
+                                                     AMDBenefit);
 
     // TODO(thomas): this should probably be done in a separate step to not
     // interfere with our own lowering of arith ops. Add arith/math's patterns

third_party/amd/lib/TritonAMDGPUToLLVM/UpcastMXFPToLLVM.cpp

Lines changed: 1 addition & 140 deletions
@@ -19,149 +19,10 @@
 using namespace mlir;
 using namespace mlir::triton;
 using namespace mlir::triton::gpu;
+using ::mlir::LLVM::AMD::upcast8xMxfp4_SW;
 
 namespace {
 
-SmallVector<Value, 4> upcast8xMxfp4_SW(RewriterBase &rewriter,
-                                       amdgpu::UpcastMXFPOp upcastOp,
-                                       bool tofp16, Value packedVec) {
-  Location loc = upcastOp.getLoc();
-  auto b = TritonLLVMOpBuilder(loc, rewriter);
-
-  // MXFP4 has 4 bits, S.EE.M, for Sign, Exponent, and Mantissa respectively.
-  // For a specific S, we have a total of 8 bit patterns. We can encode all
-  // these 8 resultant bf16/fp16 bit patterns in a lookup table (LUT). It
-  // happens that llvm.amdgcn.perm supports selecting 4 bytes from 8 input bytes
-  // using a 4-byte selector. So the overall idea is to use llvm.amdgcn.perm to
-  // implement such a LUT; though we need to select the two bytes for the
-  // resultant bf16/fp16 bit patterns separately. For the byte containing S, we
-  // also need to handle the S and E bits separately.
-
-  // FP4 has 4 bits: S.EE.M. Bf16/fp16 bit patterns for positive values:
-  //
-  // FP4    | BF16   | FP16   | Value
-  // ------ | ------ | ------ | -----
-  // 0.00.0 | 0x0000 | 0x0000 | + 0.0
-  // 0.00.1 | 0x3f00 | 0x3800 | + 0.5
-  // 0.01.0 | 0x3f80 | 0x3c00 | + 1.0
-  // 0.01.1 | 0x3fc0 | 0x3e00 | + 1.5
-  // 0.10.0 | 0x4000 | 0x4000 | + 2.0
-  // 0.10.1 | 0x4040 | 0x4200 | + 3.0
-  // 0.11.0 | 0x4080 | 0x4400 | + 4.0
-  // 0.11.1 | 0x40c0 | 0x4600 | + 6.0
-  //
-  // Encode Byte #0 (M) for BF16/FP16 in a LUT.
-  Value resB0LutLo = tofp16 ? b.i32_val(0) : b.i32_val(0xc0800000);
-  Value resB0LutHi = tofp16 ? b.i32_val(0) : b.i32_val(0xc0804000);
-  // Encode Byte #1 (EM, non-S part) for BF16/FP16 in a LUT.
-  Value resB1LutLoNoS = tofp16 ? b.i32_val(0x3e3c3800) : b.i32_val(0x3f3f3f00);
-  Value resB1LutHiNoS = tofp16 ? b.i32_val(0x46444240) : b.i32_val(0x40404040);
-
-  Type i32Ty = rewriter.getI32Type();
-  auto permU32FnTy = LLVM::LLVMFunctionType::get(i32Ty, {i32Ty, i32Ty, i32Ty});
-  LLVM::LLVMFuncOp funcOp = appendOrGetExternFuncOp(
-      rewriter, upcastOp, "llvm.amdgcn.perm", permU32FnTy);
-
-  // Start with 8 mxfp4 elements in a single i32 register
-  // | e7e6 | e5e4 | e3e2 | e1e0 |
-  Value input = b.bitcast(packedVec, i32Ty);
-
-  // Step 1: extract EM bits for elements 0,2,4,6 and 1,3,5,7 respectively.
-  // e2m1_6420_idx = | 0[0e6EM] | 0[0e4EM] | 0[0e2EM] | 0[0e0EM] |
-  Value e2m1_6420_idx = b.and_(input, b.i32_val(0x07070707));
-  // e2m1_7531_idx = | [0e7EM]0 | [0e5EM]0 | [0e3EM]0 | [0e1EM]0 |
-  Value e2m1_7531_idx = b.and_(input, b.i32_val(0x70707070));
-  // e2m1_7531_idx = | 0[0e7EM] | 0[0e5EM] | 0[0e3EM] | 0[0e1EM] |
-  e2m1_7531_idx = b.lshr(e2m1_7531_idx, b.i32_val(4));
-
-  // Step 2: extract S bit for elements 0,2,4,6 and 1,3,5,7
-  // s_6420 = | 0[e6S000] | 0[e4S000] | 0[e2S000] | 0[e0S000] |
-  Value s_6420 = b.and_(input, b.i32_val(0x08080808));
-  // s_6420 = | [e6S000]0 | [e4S000]0 | [e2S000]0 | [e0S000]0 |
-  s_6420 = b.shl(s_6420, b.i32_val(4));
-  // s_7531 = | [e7S000]0 | [e5S000]0 | [e3S000]0 | [e1S000]0 |
-  Value s_7531 = b.and_(input, b.i32_val(0x80808080));
-
-  // Step 3: Upcast elements 0,2,4,6 to 4 16-bit elements
-  // Select Byte #0. It's always 0 if upcasting to fp16.
-  // resB0_6420 = | e6B0 | e4B0 | e2B0 | e0B0 |
-  Value resB0_6420 = b.i32_val(0);
-  if (!tofp16) {
-    resB0_6420 = LLVM::createLLVMCallOp(rewriter, loc, funcOp,
-                                        {resB0LutHi, resB0LutLo, e2m1_6420_idx})
-                     .getResult();
-  }
-  // Select Byte #1
-  Value resB1NoS_6420 =
-      LLVM::createLLVMCallOp(rewriter, loc, funcOp,
-                             {resB1LutHiNoS, resB1LutLoNoS, e2m1_6420_idx})
-          .getResult();
-  // resB1_6420 = | e6B1 | e4B1 | e2B1 | e0B1 |
-  Value resB1_6420 = b.or_(resB1NoS_6420, s_6420);
-  // Construct 16-bit values of e0 and e2
-  // res_20 = | e2B1 | e2B0 | e0B1 | e0B0 | = | e2_f16 | e0_f16 |
-  Value res_20 =
-      LLVM::createLLVMCallOp(rewriter, loc, funcOp,
-                             {resB1_6420, resB0_6420, b.i32_val(0x05010400)})
-          .getResult();
-  // Construct 16-bit values of e4 and e6
-  // res_64 = | e6B1 | e6B0 | e4B1 | e4B0 | = | e6_f16 | e4_f16 |
-  Value res_64 =
-      LLVM::createLLVMCallOp(rewriter, loc, funcOp,
-                             {resB1_6420, resB0_6420, b.i32_val(0x07030602)})
-          .getResult();
-
-  // Step 4: Upcast elements 1,3,5,7 to 4 16-bit elements
-  // This is a copy of step 3 on different group of elements
-  // Select Byte #0. It's always 0 if upcasting to fp16.
-  // resB0_7531 = | e7B0 | e5B0 | e3B0 | e1B0 |
-  Value resB0_7531 = b.i32_val(0);
-  if (!tofp16) {
-    resB0_7531 = LLVM::createLLVMCallOp(rewriter, loc, funcOp,
-                                        {resB0LutHi, resB0LutLo, e2m1_7531_idx})
-                     .getResult();
-  }
-  // Select Byte #1
-  Value resB1NoS_7531 =
-      LLVM::createLLVMCallOp(rewriter, loc, funcOp,
-                             {resB1LutHiNoS, resB1LutLoNoS, e2m1_7531_idx})
-          .getResult();
-  // resB1_7531 = | e7B1 | e5B1 | e3B1 | e1B1 |
-  Value resB1_7531 = b.or_(resB1NoS_7531, s_7531);
-  // Construct 16-bit values of e1 and e3
-  // res_31 = | e3B1 | e3B0 | e1B1 | e1B0 | = | e3_f16 | e1_f16 |
-  Value res_31 =
-      LLVM::createLLVMCallOp(rewriter, loc, funcOp,
-                             {resB1_7531, resB0_7531, b.i32_val(0x05010400)})
-          .getResult();
-  // Construct 16-bit values of e5 and e7
-  // res_75 = | e7B1 | e7B0 | e5B1 | e5B0 | = | e7_f16 | e5_f16 |
-  Value res_75 =
-      LLVM::createLLVMCallOp(rewriter, loc, funcOp,
-                             {resB1_7531, resB0_7531, b.i32_val(0x07030602)})
-          .getResult();
-
-  // Step 5: Reorder 16-bit elements to be 0,1,2,3,4,5,6,7
-  // res_10 = | e1_f16 | e0_f16 |
-  Value res_10 = LLVM::createLLVMCallOp(rewriter, loc, funcOp,
-                                        {res_31, res_20, b.i32_val(0x05040100)})
-                     .getResult();
-  // res_32 = | e3_f16 | e2_f16 |
-  Value res_32 = LLVM::createLLVMCallOp(rewriter, loc, funcOp,
-                                        {res_31, res_20, b.i32_val(0x07060302)})
-                     .getResult();
-  // res_54 = | e5_f16 | e4_f16 |
-  Value res_54 = LLVM::createLLVMCallOp(rewriter, loc, funcOp,
-                                        {res_75, res_64, b.i32_val(0x05040100)})
-                     .getResult();
-  // res_76 = | e7_f16 | e6_f16 |
-  Value res_76 = LLVM::createLLVMCallOp(rewriter, loc, funcOp,
-                                        {res_75, res_64, b.i32_val(0x07060302)})
-                     .getResult();
-
-  return {res_10, res_32, res_54, res_76};
-}
-
 SmallVector<Value, 8> upcastMxfp4_SW(RewriterBase &rewriter,
                                      amdgpu::UpcastMXFPOp upcastOp, bool toFp16,
                                      ArrayRef<Value> values, int idx) {