Commit cf73993

[AMD] Support gfx950 scaled mfma instructions for mxfp4 (triton-lang#5845)
Support the new `rocdl.mfma.scale.f32.16x16x128.f8f6f4` and `rocdl.mfma.scale.f32.32x32x64.f8f6f4` instructions for gfx950. These instructions take scales for both the lhs and rhs operands. This PR adds initial support, starting with mxfp4.
1 parent 089b8bc commit cf73993
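For orientation, here is a minimal Triton-level sketch of the kind of block-scaled mxfp4 dot these instructions implement. It is illustrative only, not code from this commit: the `tl.dot_scaled` call, the "e2m1" format string, and the operand packing (two fp4 values per byte, one e8m0 scale per 32 values along K) are assumptions modeled on the `block_scale_fp4_matmul` test updated below.

```python
# Hedged sketch, not part of this commit: a single-tile kernel built around
# tl.dot_scaled, the Triton-level op that the new gfx950 scaled mfma
# instructions (16x16x128 and 32x32x64 f8f6f4 variants) are used to lower.
import triton
import triton.language as tl


@triton.jit
def mxfp4_tile_kernel(a_ptr, b_ptr, a_scale_ptr, b_scale_ptr, out_ptr,  #
                      BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr,
                      BLOCK_K: tl.constexpr, VEC_SIZE: tl.constexpr):
    # a_ptr/b_ptr point to uint8 data holding two packed fp4 values per byte;
    # a_scale_ptr/b_scale_ptr point to uint8 e8m0 scales, one per VEC_SIZE
    # (32 for mxfp4) values along K. Row-major layouts are assumed.
    offs_m = tl.arange(0, BLOCK_M)
    offs_n = tl.arange(0, BLOCK_N)
    offs_k = tl.arange(0, BLOCK_K // 2)
    a = tl.load(a_ptr + offs_m[:, None] * (BLOCK_K // 2) + offs_k[None, :])
    b = tl.load(b_ptr + offs_k[:, None] * BLOCK_N + offs_n[None, :])
    offs_s = tl.arange(0, BLOCK_K // VEC_SIZE)
    a_scale = tl.load(a_scale_ptr + offs_m[:, None] * (BLOCK_K // VEC_SIZE) + offs_s[None, :])
    b_scale = tl.load(b_scale_ptr + offs_n[:, None] * (BLOCK_K // VEC_SIZE) + offs_s[None, :])
    # Both operands carry a scale, matching the lhs/rhs scales the new
    # instructions accept; "e2m1" selects the fp4 element format.
    acc = tl.dot_scaled(a, a_scale, "e2m1", b, b_scale, "e2m1")
    tl.store(out_ptr + offs_m[:, None] * BLOCK_N + offs_n[None, :], acc)
```

On gfx950 the AMD backend can lower such a dot to either scaled mfma variant; the test change below steers that choice through the matrix_instr_nonkdim launch keyword.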

File tree: 16 files changed, +627, -30 lines


include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td

Lines changed: 2 additions & 1 deletion

@@ -869,6 +869,7 @@ It is characterized by the following parameters:
   - 1.0: gfx908, i.e. MI100
   - 2.0: gfx90a: i.e. MI200, MI210, MI250
   - 3.0: gfx940, gfx941, gfx942: MI300
+  - 4.0: gfx950: MI350
 - `warpsPerCTA` indicates the warp layout in the block.
 - `MDim` and `NDim` indicate the dimension of the output of the mfma instruction.
 - `isTransposed` indicates the result tensor is transposed so that it can be converted to dotOperand layout
@@ -938,7 +939,7 @@ The data will be distributed between threads as follows:

 Example 3:
 Suppose we have a tensor with a shape of [8, 8], warpsPerCTA set to [2, 2] and nonKDim set to 4.
-The data will be distributed between threads as follows(note that each element is duploicated in 16 threads):
+The data will be distributed between threads as follows(note that each element is duplicated in 16 threads):
 Suppose we have a tensor with a shape of [8, 8], warpsPerCTA set to [2, 2] and MDim=NDim=4.
 The data will be distributed between threads as follows(note that each element is duplicated in 16 threads):
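For quick reference, the versionMajor-to-architecture mapping documented above, including the new entry, written out as a small Python table; the dict name is illustrative and not part of the commit:

```python
# AMDMfmaEncodingAttr versionMajor -> GPU architectures, per the docs above.
MFMA_VERSION_TO_ARCH = {
    1: ("gfx908",),                      # MI100
    2: ("gfx90a",),                      # MI200, MI210, MI250
    3: ("gfx940", "gfx941", "gfx942"),   # MI300
    4: ("gfx950",),                      # MI350 (added by this commit)
}
```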

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 2 additions & 2 deletions

@@ -1458,8 +1458,8 @@ AMDMfmaEncodingAttr::verify(function_ref<mlir::InFlightDiagnostic()> emitError,
                             llvm::ArrayRef<unsigned int> warpsPerCTA,
                             unsigned mDim, unsigned nDim, bool isTransposed,
                             mlir::triton::gpu::CTALayoutAttr) {
-  if (!(versionMajor >= 0 && versionMajor <= 3)) {
-    return emitError() << "major version must be in the [0, 3] range";
+  if (!(versionMajor >= 0 && versionMajor <= 4)) {
+    return emitError() << "major version must be in the [0, 4] range";
   }
   if (versionMinor != 0) {
     return emitError() << "minor version must be 0";

python/test/unit/language/test_matmul.py

Lines changed: 17 additions & 4 deletions

@@ -6,7 +6,7 @@
 import triton.tools.experimental_descriptor
 from test_mxfp import MXFP4Tensor, MXScaleTensor
 import re
-from triton._internal_testing import is_cuda, is_hip, is_hip_mi200
+from triton._internal_testing import is_cuda, is_hip, is_hip_mi200, is_hip_mi350, is_hip_cdna


 def f8_to_f16(x, dtype):
@@ -711,8 +711,18 @@ def block_scale_fp4_matmul( #
                          (128, 256, 256), (128, 128, 64), (128, 64, 128)])
 @pytest.mark.parametrize(("scale_type", "VEC_SIZE"), [("float8_e8m0fnu", 32), ("float8_e4m3fn", 16)],
                          ids=["mxfp4", "nvfp4"])
-@pytest.mark.skipif(torch.cuda.get_device_capability()[0] < 10, reason="Requires compute capability >= 10")
-def test_block_scale_fp4(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, VEC_SIZE, scale_type, device):
+@pytest.mark.parametrize("nonKDim", ([0, 16, 32] if is_hip_cdna() else []))
+def test_block_scale_fp4(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, VEC_SIZE, scale_type, nonKDim, device):
+    if is_cuda() and torch.cuda.get_device_capability()[0] < 10:
+        pytest.skip("Requires compute capability >= 10")
+    elif is_hip():
+        if not is_hip_mi350():
+            pytest.skip("Scaled fp4 matmul is only natively supported on MI350")
+        if scale_type != 'float8_e8m0fnu':
+            pytest.skip("MI350 only supports E8M0 scale")
+        if (nonKDim == 16 and BLOCK_K < 128) or (nonKDim == 32 and BLOCK_K < 64):
+            pytest.skip(f"MI350 does not support {BLOCK_K=} for scaled mfma {nonKDim=} variants")
+
     NUM_STAGES = 1
     torch.manual_seed(42)
     a_mxfp4 = MXFP4Tensor(size=(M, K), device=device).random()
@@ -744,9 +754,12 @@ def test_block_scale_fp4(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, VEC_SIZE, scale_typ

     output = a.new_empty((M, N), dtype=torch.float32)
     grid = (triton.cdiv(M, BLOCK_M) * triton.cdiv(N, BLOCK_N), 1)
+    kernel_kwargs = {}
+    if is_hip():
+        kernel_kwargs["matrix_instr_nonkdim"] = nonKDim
     block_scale_fp4_matmul[grid](a, b, output, a_scale, b_scale, M, N, K, a_scale.stride(0), a.stride(0), a.stride(1),
                                  b.stride(0), b.stride(1), output.stride(0), output.stride(1), VEC_SIZE, BLOCK_M,
-                                 BLOCK_N, BLOCK_K, NUM_STAGES=NUM_STAGES)
+                                 BLOCK_N, BLOCK_K, NUM_STAGES=NUM_STAGES, **kernel_kwargs)

     torch.testing.assert_close(ref_out, output, atol=1e-2, rtol=1e-2)
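The BLOCK_K skip condition above mirrors the K width of the two scaled mfma variants named in the commit message (16x16x128 and 32x32x64). A hedged helper sketch, illustrative only and not part of the commit, that spells out the constraint:

```python
# Minimum BLOCK_K so one tile covers at least one gfx950 scaled mfma along K.
#   nonKDim 16 -> mfma.scale.f32.16x16x128.f8f6f4 -> K per instruction = 128
#   nonKDim 32 -> mfma.scale.f32.32x32x64.f8f6f4  -> K per instruction = 64
SCALED_MFMA_K = {16: 128, 32: 64}


def min_block_k(non_k_dim: int) -> int:
    # nonKDim == 0 leaves the instruction choice to the compiler, as in the
    # test's default parametrization.
    if non_k_dim == 0:
        return min(SCALED_MFMA_K.values())
    return SCALED_MFMA_K[non_k_dim]
```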

python/triton/_internal_testing.py

Lines changed: 8 additions & 1 deletion

@@ -60,8 +60,15 @@ def is_hip_mi300():
     return target.arch in ('gfx940', 'gfx941', 'gfx942')


+def is_hip_mi350():
+    target = get_current_target()
+    if target is None or target.backend != 'hip':
+        return False
+    return target.arch in ('gfx950')
+
+
 def is_hip_cdna():
-    return is_hip_mi200() or is_hip_mi300()
+    return is_hip_mi200() or is_hip_mi300() or is_hip_mi350()


 def is_xpu():
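A hedged usage sketch, not part of the commit, showing the new helper gating a test with a decorator-style mark, as an alternative to the imperative pytest.skip calls in test_matmul.py above:

```python
# Illustrative only: skip a test at collection time unless running on gfx950.
import pytest
from triton._internal_testing import is_hip_mi350

requires_mi350 = pytest.mark.skipif(not is_hip_mi350(),
                                    reason="requires gfx950 scaled mfma support")


@requires_mi350
def test_mxfp4_scaled_dot_smoke():
    ...  # hypothetical MI350-only test body goes here
```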

test/TritonGPU/invalid-attributes.mlir

Lines changed: 1 addition & 1 deletion

@@ -64,7 +64,7 @@

 // -----

-// expected-error@+1 {{major version must be in the [0, 3] range}}
+// expected-error@+1 {{major version must be in the [0, 4] range}}
 #mfma = #ttg.amd_mfma<{versionMajor = 10, versionMinor = 0, warpsPerCTA = [1, 1, 1], instrShape = [32, 32], isTransposed = false}>

 // -----

third_party/amd/backend/compiler.py

Lines changed: 1 addition & 1 deletion

@@ -99,7 +99,7 @@ def parse_options(self, opts) -> Any:

         if "supported_fp8_dtypes" not in opts:
             supported_fp8_dtypes = set(HIPOptions.supported_fp8_dtypes)
-            if self.target.arch in ('gfx940', 'gfx941', 'gfx942'):
+            if self.target.arch in ('gfx940', 'gfx941', 'gfx942', 'gfx950'):
                 supported_fp8_dtypes.update({'fp8e4nv', 'fp8e4b8', 'fp8e5b16'})
             args["supported_fp8_dtypes"] = tuple(sorted(supported_fp8_dtypes))

third_party/amd/include/TritonAMDGPUTransforms/MfmaGroup.h

Lines changed: 2 additions & 1 deletion

@@ -20,7 +20,8 @@ enum class MfmaTypeId : uint32_t {
   Fp8Fp8TyId,
   Fp8Bf8TyId,
   Bf8Fp8TyId,
-  Bf8Bf8TyId
+  Bf8Bf8TyId,
+  F8F6F4TyId,
 };

 struct MfmaInsnGroupSelectKey {

third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM.cpp

Lines changed: 26 additions & 1 deletion

@@ -15,14 +15,19 @@ LogicalResult convertMFMA(triton::DotOp op, triton::DotOp::Adaptor adaptor,
                           const LLVMTypeConverter *typeConverter,
                           ConversionPatternRewriter &rewriter);

+LogicalResult convertScaledMFMA(triton::DotScaledOp op,
+                                triton::DotScaledOp::Adaptor adaptor,
+                                const LLVMTypeConverter *typeConverter,
+                                ConversionPatternRewriter &rewriter);
+
 LogicalResult convertWMMA(triton::DotOp op, triton::DotOp::Adaptor adaptor,
                           const LLVMTypeConverter *typeConverter,
                           ConversionPatternRewriter &rewriter);
 } // namespace mlir::triton::AMD

 namespace {
 struct DotOpConversion : public ConvertOpToLLVMPattern<triton::DotOp> {
-  using ConvertOpToLLVMPattern<triton::DotOp>::ConvertOpToLLVMPattern;
+  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;

   LogicalResult
   matchAndRewrite(triton::DotOp op, OpAdaptor adaptor,
@@ -47,6 +52,25 @@ struct DotOpConversion : public ConvertOpToLLVMPattern<triton::DotOp> {
         "Unsupported DotOp found when converting TritonGPU to LLVM.");
   }
 };
+
+struct ScaledDotOpConversion
+    : public ConvertOpToLLVMPattern<triton::DotScaledOp> {
+  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+  int mfmaVersion;
+  int nonKDim;
+  int kPack;
+
+  ScaledDotOpConversion(LLVMTypeConverter &typeConverter, int mfmaVersion,
+                        int nonKDim, int kPack, PatternBenefit benefit = 1)
+      : ConvertOpToLLVMPattern(typeConverter, benefit),
+        mfmaVersion(mfmaVersion), nonKDim(nonKDim), kPack(kPack) {}
+
+  LogicalResult
+  matchAndRewrite(triton::DotScaledOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    return AMD::convertScaledMFMA(op, adaptor, getTypeConverter(), rewriter);
+  }
+};
 } // namespace

 namespace mlir::triton::AMD {
@@ -55,5 +79,6 @@ void populateDotOpToLLVMPatterns(LLVMTypeConverter &typeConverter,
                                  ModuleAxisInfoAnalysis &axisInfoAnalysis,
                                  PatternBenefit benefit) {
   patterns.add<DotOpConversion>(typeConverter, benefit);
+  patterns.add<ScaledDotOpConversion>(typeConverter, benefit);
 }
 } // namespace mlir::triton::AMD
