Commit c82c141

Revert "torch._scaled_mm with MXFP8 (pytorch#147548)"
This reverts commit e34c15a. Reverted pytorch#147548 on behalf of https://github.com/wdvr due to a failing internal build; discussed with the author (see comment on pytorch#147548).
1 parent 0633f63 commit c82c141

7 files changed: 16 additions, 463 deletions

aten/src/ATen/cuda/CUDABlas.cpp

Lines changed: 4 additions & 14 deletions
@@ -14,7 +14,6 @@
 #include <c10/macros/Export.h>
 #include <c10/util/env.h>
 #include <c10/util/irange.h>
-#include <c10/core/ScalarType.h>
 
 #ifdef USE_ROCM
 #include <hipblaslt/hipblaslt-ext.hpp>
@@ -1504,12 +1503,10 @@ void scaled_gemm(
     const void* mat1_scale_ptr,
     int64_t mat1_ld,
     ScalarType mat1_dtype,
-    ScalarType mat1_scale_dtype,
     const void* mat2_ptr,
     const void* mat2_scale_ptr,
     int64_t mat2_ld,
     ScalarType mat2_dtype,
-    ScalarType mat2_scale_dtype,
     const void* bias_ptr,
     ScalarType bias_dtype,
     void* result_ptr,
@@ -1537,8 +1534,10 @@ void scaled_gemm(
   // rowwise isn't supported using cublaslt or older hipblaslt
   TORCH_INTERNAL_ASSERT(use_rowwise == false, "rowwise scaled_gemm not supported with blaslt");
 #endif
-  computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, mat1_scale_ptr);
-  computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, mat2_scale_ptr);
+  {
+    computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, mat1_scale_ptr);
+    computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, mat2_scale_ptr);
+  }
   if (result_scale_ptr != nullptr) {
     computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr);
   }
@@ -1561,15 +1560,6 @@ void scaled_gemm(
     computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE, ScalarTypeToCudaDataType(bias_dtype));
   }
 
-  if (mat1_scale_dtype == kFloat8_e8m0fnu && mat2_scale_dtype == kFloat8_e8m0fnu) {
-#if CUDA_VERSION >= 12080
-    computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0);
-    computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0);
-#else
-    TORCH_CHECK(false, "scaled_gemm with `torch.float8_e8m0fnu` scales is only supported for CUDA 12.8 and above");
-#endif // CUDA_VERSION >= 12080
-  }
-
   auto stream = c10::cuda::getCurrentCUDAStream();
   size_t workspaceSize = 0;
   auto workspace_ptr = _getWorkspace(workspaceSize);
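
For readers unfamiliar with the feature being reverted: the block removed above is what switched cuBLASLt into MXFP8 block-scaling mode. Below is a minimal, self-contained sketch of the same idea against the raw cuBLASLt C API rather than PyTorch's computeDesc wrapper; the helper name set_mxfp8_scales is made up for illustration, while the attribute and enum names come from the removed lines and require CUDA 12.8+.

// Illustrative sketch only: raw cuBLASLt calls mirroring the reverted
// MXFP8 block-scale setup. Assumes CUDA >= 12.8 and an FP8 matmul
// descriptor that is already otherwise configured; error codes are ignored.
#include <cstdint>
#include <cuda.h>
#include <cublasLt.h>

void set_mxfp8_scales(cublasLtMatmulDesc_t desc,
                      const void* a_scale_ptr,
                      const void* b_scale_ptr) {
  // Scale pointers are set the same way in the retained code path.
  cublasLtMatmulDescSetAttribute(
      desc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &a_scale_ptr, sizeof(a_scale_ptr));
  cublasLtMatmulDescSetAttribute(
      desc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &b_scale_ptr, sizeof(b_scale_ptr));
#if CUDA_VERSION >= 12080
  // The reverted change additionally selected the UE8M0 "vec32" scale mode:
  // one float8_e8m0fnu scale factor per 32 contiguous elements along K.
  int32_t scale_mode = CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0;
  cublasLtMatmulDescSetAttribute(
      desc, CUBLASLT_MATMUL_DESC_A_SCALE_MODE, &scale_mode, sizeof(scale_mode));
  cublasLtMatmulDescSetAttribute(
      desc, CUBLASLT_MATMUL_DESC_B_SCALE_MODE, &scale_mode, sizeof(scale_mode));
#endif
}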

aten/src/ATen/cuda/CUDABlas.h

Lines changed: 0 additions & 2 deletions
@@ -130,12 +130,10 @@ void scaled_gemm(
     const void* mat1_scale_ptr,
     int64_t mat1_ld,
     ScalarType mat1_dtype,
-    ScalarType mat1_scale_dtype,
     const void* mat2_ptr,
     const void* mat2_scale_ptr,
     int64_t mat2_ld,
     ScalarType mat2_dtype,
-    ScalarType mat2_scale_dtype,
     const void* bias_ptr,
     ScalarType bias_dtype,
     void* result_ptr,

aten/src/ATen/cuda/tunable/GemmCommon.h

Lines changed: 0 additions & 3 deletions
@@ -10,7 +10,6 @@
 #pragma once
 
 #include <string>
-#include <c10/core/ScalarType.h>
 
 #include <ATen/cuda/tunable/TunableOp.h>
 #include <ATen/cuda/CUDABlas.h>
@@ -425,12 +424,10 @@ struct ScaledGemmParams : OpParams {
   const void* a_scale_ptr{};
   int64_t lda{};
   ScalarType a_dtype{};
-  ScalarType a_scale_dtype{};
   const void* b{};
   const void* b_scale_ptr{};
   int64_t ldb{};
   ScalarType b_dtype{};
-  ScalarType b_scale_dtype{};
   const void* bias_ptr{};
   ScalarType bias_dtype{};
   void* c{};

aten/src/ATen/cuda/tunable/TunableGemm.h

Lines changed: 0 additions & 2 deletions
@@ -95,12 +95,10 @@ class DefaultScaledGemmOp : public Callable<ScaledGemmParams<T>> {
         params->a_scale_ptr,
         params->lda,
         params->a_dtype,
-        params->a_scale_dtype,
         params->b,
         params->b_scale_ptr,
         params->ldb,
         params->b_dtype,
-        params->b_scale_dtype,
         params->bias_ptr,
         params->bias_dtype,
         params->c,

aten/src/ATen/native/cuda/Blas.cpp

Lines changed: 11 additions & 83 deletions
@@ -1,5 +1,4 @@
 #include <cstdint>
-#include <c10/util/typeid.h>
 #include <c10/util/Exception.h>
 #include <c10/core/Scalar.h>
 #include <c10/core/ScalarType.h>
@@ -96,33 +95,11 @@ c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, b
 }
 
 struct cublasCommonArgs {
-  cublasCommonArgs(
-      const Tensor& mat1,
-      const Tensor& mat2,
-      Tensor& c,
-      const c10::optional<Tensor>& scale_a = c10::nullopt,
-      const c10::optional<Tensor>& scale_b = c10::nullopt,
-      const c10::optional<Tensor>& scale_result = c10::nullopt) {
+  cublasCommonArgs(const Tensor& mat1, const Tensor& mat2, Tensor& c) {
     bool transpose_result = false, transpose_mat1 = false, transpose_mat2 = false;
     result = prepare_matrix_for_cublas(c, transpose_result);
     mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_mat1, transpose_result);
     matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_mat2, transpose_result);
-
-    // Handle scale tensors if provided
-    if (scale_a && scale_b) {
-      // By default since we return in row-major we run the gemm
-      // as B.T @ A.T, check transpose_result to determine if we flip the scales
-      scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr();
-      scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type();
-      scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr();
-      scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type();
-    }
-
-    if (scale_result) {
-      scale_result_ptr = scale_result->data_ptr();
-      scale_result_dtype = scale_result->scalar_type();
-    }
-
     auto mat1_sizes = mat1.sizes();
     auto mat2_sizes = mat2.sizes();
     if (transpose_result) {
@@ -138,23 +115,13 @@ struct cublasCommonArgs {
     lda = mata->stride((transpose_mat1 == transpose_result) ? 1 : 0);
     ldb = matb->stride((transpose_mat2 == transpose_result) ? 1 : 0);
     result_ld = result->stride(transpose_result ? 0 : 1);
-    transa = transpose_mat1 ? mata->is_conj() ? 'c' : 't' : 'n';
-    transb = transpose_mat2 ? matb->is_conj() ? 'c' : 't' : 'n';
+    transa = transpose_mat1 ? mata->is_conj() ? 'c' : 't' : 'n';
+    transb = transpose_mat2 ? matb->is_conj() ? 'c' : 't' : 'n';
   }
-
-  // Matrix members
   char transa, transb;
   int64_t m, n, k;
   int64_t lda, ldb, result_ld;
   c10::MaybeOwned<Tensor> mata, matb, result;
-
-  // Scale members
-  void* scale_mata_ptr = nullptr;
-  void* scale_matb_ptr = nullptr;
-  void* scale_result_ptr = nullptr;
-  c10::optional<c10::ScalarType> scale_mata_dtype;
-  c10::optional<c10::ScalarType> scale_matb_dtype;
-  c10::optional<c10::ScalarType> scale_result_dtype;
 };
 } // namespace
 
@@ -936,24 +903,20 @@ static bool _scaled_mm_is_fnuz() {
 
 namespace{
 
-enum class ScalingType : std::uint8_t {
+enum class ScalingType {
   TensorWise,
   RowWise,
-  BlockWise,
   Error
 };
 /*
  * Scaling Type Determination:
  * ---------------------------
  * Conditions and corresponding Scaling Types:
  *
- * - If scale tensors are Float8_e8m0fnu:
- *   - Returns BlockWise (with additional size checks).
- *
  * - If scale_a.numel() == 1 && scale_b.numel() == 1:
  *   - Returns TensorWise.
 *
- * - Else if scale_a.dim() == 2 && scale_a.size(0) == dim_m && scale_b.size(0) == dim_n:
+ * - Else if scale_a.dim() == 1 && scale_a.size(0) == dim_m && scale_b.size(0) == dim_n:
  *   - Returns RowWise.
 *
  * - Otherwise:
@@ -966,40 +929,7 @@ ScalingType get_scaling_type(
     const at::Tensor& scale_a,
     const at::Tensor& scale_b,
     int64_t dim_m,
-    int64_t dim_k,
     int64_t dim_n) {
-  // Check for BlockWise scaling (FP8_E8M0 types)
-  if (scale_a.scalar_type() == scale_b.scalar_type() &&
-      scale_a.scalar_type() == at::kFloat8_e8m0fnu) {
-    constexpr int64_t BLOCK_SIZE_K = 32;
-    constexpr int64_t BLOCK_SIZE_MN = 128;
-
-    auto ceil_div = [](auto a, auto b) { return (a + b - 1) / b; };
-    auto num_k_blocks = ceil_div(dim_k, BLOCK_SIZE_K);
-    auto padded_num_k_blocks = ceil_div(num_k_blocks, 4) * 4;
-
-    // TODO: We might want to enforce some structure on the shapes of the scale
-    // tensors
-
-    // Check expected sizes for block-wise scaling
-    auto expected_a_size =
-        BLOCK_SIZE_MN * ceil_div(dim_m, BLOCK_SIZE_MN) * padded_num_k_blocks;
-    auto expected_b_size =
-        BLOCK_SIZE_MN * ceil_div(dim_n, BLOCK_SIZE_MN) * padded_num_k_blocks;
-
-    TORCH_CHECK(scale_a.numel() == expected_a_size,
-        "For BlockWise scaling: Expected scale_a size to be ",
-        expected_a_size, " but got ", scale_a.numel());
-    TORCH_CHECK(scale_b.numel() == expected_b_size,
-        "For BlockWise scaling: Expected scale_b size to be ",
-        expected_b_size, " but got ", scale_b.numel());
-
-    TORCH_CHECK(
-        scale_a.is_contiguous() && scale_b.is_contiguous(),
-        "For BlockWise scaling: Both scale_a and scale_b must be contiguous");
-
-    return ScalingType::BlockWise;
-  }
   // Both Per-Tensor and Row-wise scaling expect fp32 tensors
   TORCH_CHECK(
       scale_a.scalar_type() == kFloat && scale_b.scalar_type() == kFloat,
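
The BlockWise branch deleted above derives the required number of scale elements from the GEMM shape: one float8_e8m0fnu scale per 1x32 block along K, with the K-block count padded up to a multiple of 4 and the M/N extent rounded up to a multiple of 128. A standalone sketch of that arithmetic follows (expected_scale_size is a hypothetical helper written for this note, not PyTorch code), with a worked 512x512 example.

// Sketch of the expected-scale-size math from the removed BlockWise check.
// expected_scale_size(M, K) returns how many e8m0 scale elements the
// reverted code required for an M x K operand scaled in 1x32 blocks.
#include <cstdint>
#include <iostream>

constexpr int64_t ceil_div(int64_t a, int64_t b) { return (a + b - 1) / b; }

int64_t expected_scale_size(int64_t dim_mn, int64_t dim_k) {
  constexpr int64_t BLOCK_SIZE_K = 32;   // one scale per 32 elements along K
  constexpr int64_t BLOCK_SIZE_MN = 128; // M/N rounded up to multiples of 128
  int64_t num_k_blocks = ceil_div(dim_k, BLOCK_SIZE_K);
  int64_t padded_num_k_blocks = ceil_div(num_k_blocks, 4) * 4; // pad to 4 blocks
  return BLOCK_SIZE_MN * ceil_div(dim_mn, BLOCK_SIZE_MN) * padded_num_k_blocks;
}

int main() {
  // Example: a 512 x 512 operand needs 512 * 16 = 8192 block scales.
  std::cout << expected_scale_size(/*dim_mn=*/512, /*dim_k=*/512) << "\n"; // 8192
  return 0;
}
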
@@ -1097,7 +1027,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
     mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")");
 
   // Check what type of scaling we are doing based on inputs
-  ScalingType scaling_choice = get_scaling_type(scale_a, scale_b, mat1.size(0), mat1.size(1), mat2.size(1));
+  ScalingType scaling_choice = get_scaling_type(scale_a, scale_b, mat1.size(0), mat2.size(1));
   TORCH_INTERNAL_ASSERT(scaling_choice != ScalingType::Error, "Scaling type not supported");
 
   TORCH_CHECK(!scale_result || (scale_result->numel() == 1 && scale_result->scalar_type() == kFloat),
@@ -1190,7 +1120,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
   }
 #endif
 
-  cublasCommonArgs args(mat1, mat2, out, scale_a, scale_b, scale_result);
+  cublasCommonArgs args(mat1, mat2, out);
   const auto out_dtype_ = args.result->scalar_type();
   TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt");
 
@@ -1300,27 +1230,25 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
   }
   else
 #endif
-  {
+  {
     at::cuda::blas::scaled_gemm(
         args.transa,
         args.transb,
         args.m,
         args.n,
         args.k,
         args.mata->data_ptr(),
-        args.scale_mata_ptr,
+        scale_a.data_ptr(),
         args.lda,
         args.mata->scalar_type(),
-        args.scale_mata_dtype.value(),
         args.matb->data_ptr(),
-        args.scale_matb_ptr,
+        scale_b.data_ptr(),
         args.ldb,
         args.matb->scalar_type(),
-        args.scale_matb_dtype.value(),
         bias ? bias->data_ptr(): nullptr,
         bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? at::ScalarType::Half : out_dtype_,
         args.result->data_ptr(),
-        args.scale_result_ptr,
+        scale_result ? scale_result->data_ptr() : nullptr,
         args.result_ld,
         out_dtype_,
         use_fast_accum,
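
One behavioral detail restored by this hunk: the call site once again passes the scale_a/scale_b/scale_result pointers directly, whereas the reverted code routed them through cublasCommonArgs, which swapped the two operand scales whenever the row-major output is obtained by running the GEMM as B.T @ A.T (the transpose_result case described in the removed constructor comment). A minimal sketch of that swap rule follows; the struct and helper names are illustrative only, not PyTorch API.

// Illustrative sketch of the scale-routing rule from the removed
// cublasCommonArgs constructor: because a row-major result is produced by
// computing B^T * A^T, the roles of the two operand scales flip whenever
// transpose_result is true.
struct GemmScales {
  const void* a_scale; // scale fed to the GEMM's "A" operand
  const void* b_scale; // scale fed to the GEMM's "B" operand
};

GemmScales pick_scales(bool transpose_result,
                       const void* scale_a_ptr,
                       const void* scale_b_ptr) {
  return transpose_result ? GemmScales{scale_b_ptr, scale_a_ptr}
                          : GemmScales{scale_a_ptr, scale_b_ptr};
}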
