
Commit d17e222

[release/2.7] Enable mx fp8 support on ROCm (#2199)
Ported the mx fp8 part from #2046.

Current test stats (counting only the blockwise-scale tests):

PYTORCH_TEST_WITH_ROCM=1 python test/test_matmul_cuda.py -k test_blockwise -v
Ran 225 tests in 8.256s
FAILED (failures=1, skipped=150)
74 tests passed.

Sample fp8 mx data type test case:

test_blockwise_mxfp8_numerics_test_case_name_data_random_scales_one_fast_accum_True_512_128_256_cuda (__main__.TestFP8MatmulCudaCUDA)

hipblaslt-bench --api_method c -m 256 -n 512 -k 128 --lda 128 --ldb 128 --ldc 256 --ldd 256 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1 --beta 0 --transA T --transB N --batch_count 1 --scaleA 3 --scaleB 3 --a_type f8_r --b_type f8_r --c_type bf16_r --d_type bf16_r --compute_type f32_r --algo_method index --solution_index -2146957310 --rotating 0 --cold_iters 0 --iters 0

---------

Signed-off-by: Jagadish Krishnamoorthy <[email protected]>
1 parent ba48d6f commit d17e222
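For context, a minimal usage sketch of what this change enables on the Python side (assumptions: the private torch._scaled_mm entry point with positional scale arguments and the torch.float8_e8m0fnu scale dtype used by the tests in this commit; the exact internal signature may differ between releases):

import torch

# Requires a ROCm gfx950 build (or CUDA SM100+); all dims must be multiples of 32.
M, K, N = 512, 128, 256
BLOCK_SIZE = 32
device = "cuda"

A = torch.randn(M, K, device=device).to(torch.float8_e4m3fn)
B = torch.randn(N, K, device=device).to(torch.float8_e4m3fn)

# One e8m0 scale per 32-element block along K (all ones, as in the
# "data_random_scales_one" test case). On CUDA these would additionally
# need the to_blocked swizzle shown in the test diff below; on ROCm they
# are passed as-is.
A_scale = torch.full((M, K // BLOCK_SIZE), 1.0, device=device, dtype=torch.float8_e8m0fnu)
B_scale = torch.full((N, K // BLOCK_SIZE), 1.0, device=device, dtype=torch.float8_e8m0fnu)

out = torch._scaled_mm(A, B.t(), A_scale, B_scale, out_dtype=torch.bfloat16)
print(out.shape)  # torch.Size([512, 256])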

File tree: 5 files changed (+67 lines, -33 lines)

aten/src/ATen/cuda/CUDABlas.cpp

Lines changed: 12 additions & 2 deletions
@@ -1532,6 +1532,16 @@ void scaled_gemm(
       matmulDescA = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT;
       matmulDescB = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT;
     }
+    else if(mat1_scale_dtype == kFloat8_e8m0fnu && mat2_scale_dtype == kFloat8_e8m0fnu) {
+#if ROCM_VERSION >= 70000
+      if (at::detail::getCUDAHooks().isGPUArch(0, {"gfx950"})) {
+        // Validate matrix dimensions for MX format
+        TORCH_CHECK((m % 32 == 0) && (n % 32 == 0) && (k % 32 == 0),
+            "Matrix dimensions must be multiples of 32 for MX format. ",
+            "Got m=", m, ", n=", n, ", k=", k);
+      }
+#endif
+    }
 #else
     // rowwise isn't supported using older hipblaslt
     TORCH_INTERNAL_ASSERT(use_rowwise == false, "rowwise scaled_gemm not supported with older hipblaslt");
@@ -1570,11 +1580,11 @@ void scaled_gemm(
   }
 
   if (mat1_scale_dtype == kFloat8_e8m0fnu && mat2_scale_dtype == kFloat8_e8m0fnu) {
-#if CUDA_VERSION >= 12080
+#if (!defined(USE_ROCM) && CUDA_VERSION >= 12080) || (defined(USE_ROCM) && ROCM_VERSION >= 70000)
     computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0);
     computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0);
 #else
-    TORCH_CHECK(false, "scaled_gemm with `torch.float8_e8m0fnu` scales is only supported for CUDA 12.8 and above");
+    TORCH_CHECK(false, "scaled_gemm with `torch.float8_e8m0fnu` scales is only supported for CUDA 12.8 or ROCm 7.0 (with gfx950) and above");
 #endif // CUDA_VERSION >= 12080
   } else if (mat1_scale_dtype == kFloat && mat2_scale_dtype == kFloat && use_rowwise) {
 #if CUDA_VERSION >= 12090 || (defined(USE_ROCM) && defined(HIPBLASLT_OUTER_VEC))

aten/src/ATen/native/cuda/Blas.cpp

Lines changed: 33 additions & 3 deletions
@@ -1020,12 +1020,15 @@ ScalingType get_scaling_type(
     auto expected_b_size =
         BLOCK_SIZE_MN * ceil_div(dim_n, BLOCK_SIZE_MN) * padded_num_k_blocks;
 
+    //TODO: enable the checks for ROCm
+#ifndef USE_ROCM
     TORCH_CHECK(scale_a.numel() == expected_a_size,
         "For BlockWise scaling: Expected scale_a size to be ",
         expected_a_size, " but got ", scale_a.numel());
     TORCH_CHECK(scale_b.numel() == expected_b_size,
         "For BlockWise scaling: Expected scale_b size to be ",
         expected_b_size, " but got ", scale_b.numel());
+#endif
 
     TORCH_CHECK(
         scale_a.is_contiguous() && scale_b.is_contiguous(),
@@ -1092,6 +1095,7 @@ ScalingType get_scaling_type(
 
 } // namespace
 
+
 // Computes matrix multiply + bias while applying scaling to input and output matrices
 // Scales are only applicable when matrices are of Float8 type and assumed to be equal to 1.0 by default.
 // If output matrix type is 16 or 32-bit type, scale_result is not applied.
@@ -1155,6 +1159,14 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
   // Type restrictions imposed by CuBLASLt as of CUDA-12.1
   TORCH_CHECK(mat1.scalar_type() != ScalarType::Float8_e5m2 || mat2.scalar_type() != ScalarType::Float8_e5m2,
         "Multiplication of two Float8_e5m2 matrices is not supported");
+#endif
+#ifdef USE_ROCM
+  if (mat1.scalar_type() == ScalarType::Float8_e5m2 || mat2.scalar_type() == ScalarType::Float8_e5m2) {
+    TORCH_CHECK(ROCM_VERSION >= 60000, "Float8_e5m2 is only supported for ROCm 6.0 and above");
+  }
+  if (mat1.scalar_type() == ScalarType::Float8_e4m3fn || mat2.scalar_type() == ScalarType::Float8_e4m3fn) {
+    TORCH_CHECK(ROCM_VERSION >= 60000, "Float8_e4m3fn is only supported for ROCm 6.0 and above");
+  }
 #endif
   if (bias) {
     TORCH_CHECK(out.scalar_type() != kFloat, "Bias is not supported when out_dtype is set to Float32");
@@ -1211,17 +1223,33 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
   }
 #else
   if (scaling_choice == ScalingType::RowWise) {
-    // For ROCm, match behavior of f8f8bf16_rowwise type checking, for unit test purposes.
+    // For ROCm, match behavior of f8f8bf16_rowwise type checking
     Tensor b = mat2;
     if (_scaled_mm_is_fnuz()) {
       TORCH_CHECK(b.dtype() == at::kFloat8_e4m3fnuz);
     }
     else {
      TORCH_CHECK(b.dtype() == at::kFloat8_e4m3fn);
     }
-    // Until more than bf16 is supported.
+    // Until more than bf16 is supported
     TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16,
-        "hipblaslt rowwise _scaled_mm only supports BFloat16 output but got ", out.scalar_type());
+        "hipblaslt rowwise _scaled_mm only supports BFloat16 output");
+  }
+  else if (scaling_choice == ScalingType::BlockWise) {
+#if ROCM_VERSION >= 70000
+    TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch(0, {"gfx950"}),
+        "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");
+
+    TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 &&
+        mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0,
+        "Matrix dimensions must be multiples of 32 for block-wise scaling");
+
+    TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 ||
+        out.scalar_type() == ScalarType::Half,
+        "Block-wise scaling only supports BFloat16 or Half output types");
+#else
+    TORCH_CHECK(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later");
+#endif
   }
 #endif
 
@@ -1300,10 +1328,12 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
   params.k = args.k;
   params.a = args.mata->data_ptr();
   params.a_scale_ptr = args.scale_mata_ptr;
+  params.a_scale_dtype = scale_a.scalar_type();
   params.lda = args.lda;
   params.a_dtype = args.mata->scalar_type();
   params.b = args.matb->data_ptr();
   params.b_scale_ptr = args.scale_matb_ptr;
+  params.b_scale_dtype = scale_b.scalar_type();
   params.ldb = args.ldb;
   params.b_dtype = args.matb->scalar_type();
   params.bias_ptr = bias ? bias->data_ptr(): nullptr;
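The new ROCm branch in _scaled_mm_out_cuda only accepts block-wise (Float8_e8m0fnu) scaling on gfx950, with every matrix dimension a multiple of 32 and a BFloat16 or Half output. A caller-side pre-check mirroring those TORCH_CHECKs could look like the sketch below (the helper name is illustrative and not part of this commit; the gcnArchName probe is only meaningful on ROCm builds):

import torch

def rocm_mx_gemm_inputs_ok(a: torch.Tensor, b: torch.Tensor, out_dtype: torch.dtype) -> bool:
    # Mirrors the ROCm >= 7.0 checks added in Blas.cpp: gfx950 only,
    # every dimension a multiple of 32, BFloat16/Half output.
    if torch.version.hip is None:
        return True  # the CUDA path has its own restrictions
    if "gfx950" not in torch.cuda.get_device_properties(0).gcnArchName:
        return False
    if any(d % 32 != 0 for d in (*a.shape, *b.shape)):
        return False
    return out_dtype in (torch.bfloat16, torch.float16)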

test/test_matmul_cuda.py

Lines changed: 8 additions & 27 deletions
@@ -929,12 +929,16 @@ def test_blockwise_mxfp8_numerics(self, test_case_name, fast_accum, mkn) -> None
         BLOCK_SIZE = 32
         require_exact_match = True
 
+        if torch.version.hip:
+            if not (M % 32 == 0 and K % 32 == 0 and N % 32 == 0):
+                raise unittest.SkipTest("Matrix dimensions must be multiples of 32 on ROCm, skipping")
+
         def ceil_div(a, b):
             return (a + b - 1) // b
 
         if test_case_name == "a_eye_b_eye":
             if not ((M == K) and (M == N)):
-                return unittest.skip("this test is only defined for M == K == N, skipping")
+                raise unittest.SkipTest("this test is only defined for M == K == N, skipping")
             A_ref = torch.eye(M, device=device, dtype=torch.bfloat16)
             B_ref = torch.eye(M, device=device, dtype=torch.bfloat16)
 
@@ -943,9 +947,6 @@ def ceil_div(a, b):
 
             A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu)
             B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu)
-            # convert to swizzled format
-            A_scale = to_blocked(A_scale)
-            B_scale = to_blocked(B_scale)
 
         elif test_case_name == "a_ones_b_ones":
             A_ref = torch.ones(M, K, device=device, dtype=torch.bfloat16)
@@ -956,9 +957,6 @@ def ceil_div(a, b):
 
             A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu)
             B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu)
-            # convert to swizzled format
-            A_scale = to_blocked(A_scale)
-            B_scale = to_blocked(B_scale)
 
         elif test_case_name == "a_ones_modified_b_ones":
             A_ref = torch.ones(M, K, device=device, dtype=torch.bfloat16)
@@ -972,9 +970,6 @@ def ceil_div(a, b):
 
             A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu)
             B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu)
-            # convert to swizzled format
-            A_scale = to_blocked(A_scale)
-            B_scale = to_blocked(B_scale)
 
         elif test_case_name == "a_ones_b_ones_modified":
             A_ref = torch.ones(M, K, device=device, dtype=torch.bfloat16)
@@ -988,9 +983,6 @@ def ceil_div(a, b):
 
             A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu)
             B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu)
-            # convert to swizzled format
-            A_scale = to_blocked(A_scale)
-            B_scale = to_blocked(B_scale)
 
         elif test_case_name == "a_scale_modified_b_ones":
             A_ref = torch.ones(M, K, device=device, dtype=torch.bfloat16)
@@ -1006,10 +998,6 @@ def ceil_div(a, b):
             A[1][0:BLOCK_SIZE] = 2
             A_scale[1][0] = 2
 
-            # convert to swizzled format
-            A_scale = to_blocked(A_scale)
-            B_scale = to_blocked(B_scale)
-
         elif test_case_name == "a_ones_b_scale_modified":
             A_ref = torch.ones(M, K, device=device, dtype=torch.bfloat16)
             B_ref = torch.ones(N, K, device=device, dtype=torch.bfloat16)
@@ -1024,10 +1012,6 @@ def ceil_div(a, b):
             B[1][0:BLOCK_SIZE] = 2
             B_scale[1][0] = 2
 
-            # convert to swizzled format
-            A_scale = to_blocked(A_scale)
-            B_scale = to_blocked(B_scale)
-
         elif test_case_name == "data_random_scales_one":
             require_exact_match = False
             # scales all-ones, element data random while being exactly representable in float8_e4m3fn
@@ -1045,13 +1029,9 @@ def ceil_div(a, b):
             A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu)
             B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu)
 
-            # convert to swizzled format
-            A_scale = to_blocked(A_scale)
-            B_scale = to_blocked(B_scale)
-
         elif test_case_name == "data_random_scales_from_data":
             if not K % BLOCK_SIZE == 0:
-                return unittest.skip(f"this test is only defined for K a multiple of {BLOCK_SIZE}, skipping")
+                raise unittest.SkipTest(f"this test is only defined for K a multiple of {BLOCK_SIZE}, skipping")
             require_exact_match = False
             # random data, scales from data
             A_ref = torch.randn((M, K), device=device, dtype=torch.bfloat16) * 1000
@@ -1069,7 +1049,8 @@ def ceil_div(a, b):
             B = (B_ref.reshape(-1, BLOCK_SIZE) / B_scale.reshape(N * ceil_div(K, BLOCK_SIZE), 1).float()).reshape(N, K)
             B = B.clamp(min=min_val, max=max_val).to(torch.float8_e4m3fn)
 
-            # convert to swizzled format
+        # convert to swizzled format
+        if not torch.version.hip:
             A_scale = to_blocked(A_scale)
             B_scale = to_blocked(B_scale)
 
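Net effect of the test change: scale tensors keep their natural (rows, K // BLOCK_SIZE) layout on ROCm, and the to_blocked swizzle now runs once, only on the CUDA path. Condensed, the relevant branch is (a sketch assuming the test's to_blocked helper and the A/B tensors are in scope):

# A_scale, B_scale: torch.float8_e8m0fnu, shapes (M, K // 32) and (N, K // 32)
if not torch.version.hip:
    # cuBLASLt consumes the swizzled ("blocked") scale layout
    A_scale = to_blocked(A_scale)
    B_scale = to_blocked(B_scale)
# hipBLASLt takes the unswizzled layout directly
out = torch._scaled_mm(A, B.t(), A_scale, B_scale, out_dtype=torch.bfloat16)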

torch/testing/_internal/common_cuda.py

Lines changed: 9 additions & 1 deletion
@@ -100,7 +100,15 @@ def evaluate_platform_supports_fp8():
 
 PLATFORM_SUPPORTS_FP8: bool = LazyVal(lambda: evaluate_platform_supports_fp8())
 
-PLATFORM_SUPPORTS_MX_GEMM: bool = LazyVal(lambda: TEST_CUDA and SM100OrLater)
+def _platform_supports_mx_gemm():
+    if torch.cuda.is_available():
+        if torch.version.hip:
+            return 'gfx95' in torch.cuda.get_device_properties(0).gcnArchName
+        else:
+            return SM100OrLater
+    return False
+
+PLATFORM_SUPPORTS_MX_GEMM: bool = LazyVal(lambda: _platform_supports_mx_gemm())
 
 if TEST_NUMBA:
     try:
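With this change PLATFORM_SUPPORTS_MX_GEMM is true on SM100+ CUDA devices and on ROCm gfx95* devices, so tests can gate on it in the usual way (a sketch; the test class, test name, and skip message are illustrative):

import unittest
from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_MX_GEMM

class TestMXGemm(unittest.TestCase):
    @unittest.skipIf(not PLATFORM_SUPPORTS_MX_GEMM, "MX gemm needs SM100+ or ROCm gfx950")
    def test_mx_gemm_smoke(self):
        ...  # exercise an mxfp8 matmul here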

torch/utils/hipify/cuda_to_hip_mappings.py

Lines changed: 5 additions & 0 deletions
@@ -3865,6 +3865,7 @@
     ("CUDA_C_64U", ("HIP_C_64U", CONV_TYPE, API_RUNTIME)),
     ("CUDA_R_8F_E4M3", ("HIP_R_8F_E4M3", CONV_TYPE, API_RUNTIME)),
     ("CUDA_R_8F_E5M2", ("HIP_R_8F_E5M2", CONV_TYPE, API_RUNTIME)),
+    ("CUDA_R_4F_E2M1", ("HIP_R_4F_E2M1", CONV_TYPE, API_RUNTIME)),
     (
         "MAJOR_VERSION",
         ("hipLibraryMajorVersion", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED),
@@ -7325,6 +7326,10 @@
     ("CUBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F", ("HIPBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F", CONV_MATH_FUNC, API_BLAS)),
     ("CUBLASLT_MATMUL_DESC_AMAX_D_POINTER", ("HIPBLASLT_MATMUL_DESC_AMAX_D_POINTER", CONV_MATH_FUNC, API_BLAS)),
     ("CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE", ("HIPBLASLT_MATMUL_DESC_BIAS_DATA_TYPE", CONV_MATH_FUNC, API_BLAS)),
+    ("CUBLASLT_MATMUL_DESC_A_SCALE_MODE", ("HIPBLASLT_MATMUL_DESC_A_SCALE_MODE", CONV_MATH_FUNC, API_BLAS)),
+    ("CUBLASLT_MATMUL_DESC_B_SCALE_MODE", ("HIPBLASLT_MATMUL_DESC_B_SCALE_MODE", CONV_MATH_FUNC, API_BLAS)),
+    ("CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0", ("HIPBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0", CONV_MATH_FUNC, API_BLAS)),
+    ("CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3", ("HIPBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3", CONV_MATH_FUNC, API_BLAS)),
     ("cublasLtMatrixLayout_t", ("hipblasLtMatrixLayout_t", CONV_MATH_FUNC, API_BLAS)),
     ("cublasLtMatrixLayoutOpaque_t", ("hipblasLtMatrixLayoutOpaque_t", CONV_MATH_FUNC, API_BLAS)),
     ("cublasLtMatrixLayoutAttribute_t", ("hipblasLtMatrixLayoutAttribute_t", CONV_MATH_FUNC, API_BLAS)),
