Commit a250418

Fix 2:4 sparsify meta registrations (#2366)
* fix 2:4 meta registrations

  Summary: We need to register in Python for symbolic shape support, which is needed for vLLM.

* add meta for sparse gemm
1 parent: 82bc17e
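
The change in one sentence: the meta ("fake") kernels for the 2:4 sparsify and sparse GEMM ops move from templated C++ registrations into Python functions in torchao/ops.py. Under torch.compile / export with dynamic shapes (which the vLLM integration relies on), tensor sizes are SymInts; a Python fake kernel that builds its output with torch.empty(...) from input.shape propagates those symbolic sizes, which the precompiled C++ meta path could not. Below is a minimal sketch of the general pattern, not torchao code: it assumes PyTorch >= 2.4's torch.library.custom_op API, "mylib::demo_gemm" is a hypothetical op, and torchao registers its metas through its own register_custom_op helper.

import torch

# Hypothetical op, for illustration only: a custom GEMM with a Python fake kernel.
@torch.library.custom_op("mylib::demo_gemm", mutates_args=())
def demo_gemm(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    return a @ b  # the "real" implementation

@demo_gemm.register_fake
def _(a, b):
    # Runs during tracing: a.shape[0] / b.shape[1] may be SymInts, and
    # torch.empty keeps them symbolic, so dynamic shapes flow through the graph.
    return torch.empty((a.shape[0], b.shape[1]), dtype=a.dtype, device=a.device)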

File tree: 4 files changed, +54 −43 lines


test/sparsity/test_activation24.py

Lines changed: 1 addition & 1 deletion
@@ -171,7 +171,7 @@ def test_sparse24_fp8_sm90_cutlass_gemm_eye(
     # Check MM with scale
     b_scale = torch.randn([1, A.shape[1]], device=eye.device, dtype=torch.float32)
     a_scale = torch.randn([A.shape[0], 1], device=eye.device, dtype=torch.float32)
-    A_reconstructed = torch.ops.torchao._sparse24_fp8_sm90_cutlass_gemm(
+    A_reconstructed = torch.ops.torchao.sparse24_fp8_sm90_cutlass_gemm(
         A_packed, A_mdata, eye, a_scale=a_scale, b_scale=b_scale
     )
     assert torch.allclose(

torchao/csrc/cuda/activation24/sparse_gemm.cu

Lines changed: 7 additions & 20 deletions
@@ -132,9 +132,6 @@ struct SparseRowwiseKernel<cutlass::float_e4m3_t> {
 
 template <>
 struct SparseRowwiseKernel<cutlass::bfloat16_t> {
-  static constexpr auto kElementOutAt = at::ScalarType::BFloat16;
-  static constexpr auto kElementAAt = at::ScalarType::BFloat16;
-
   using ElementA = cutlass::bfloat16_t;
   using ElementB = cutlass::bfloat16_t;
   using ElementOut = cutlass::bfloat16_t;
@@ -209,7 +206,6 @@ struct SparseRowwiseKernel<cutlass::bfloat16_t> {
   using ElementE = CollectiveMainloop::ElementE;
 };
 
-template <bool kIsMeta>
 Tensor _sparse24_fp8_sm90_cutlass_gemm(
     const Tensor& tensor_a,
     const Tensor& tensor_e, // metadata for `A`
@@ -221,20 +217,16 @@ Tensor _sparse24_fp8_sm90_cutlass_gemm(
     std::string swizzle_axis,
     int64_t sm_count) {
   std::optional<at::cuda::CUDAGuard> device_guard;
-  if (!kIsMeta) {
-    device_guard.emplace(tensor_a.device());
-  }
+  device_guard.emplace(tensor_a.device());
 
   using K = SparseRowwiseKernel<cutlass::float_e4m3_t>;
 
   // For now, only CC 9.x devices are supported.
-  if (!kIsMeta) {
-    const auto dprops = at::cuda::getCurrentDeviceProperties();
-    TORCH_CHECK(
-        dprops && dprops->major == 9,
-        "_sparse24_gemm_fp8_sm90: Supported only on GPUs with "
-        "compute capability 9.x");
-  }
+  const auto dprops = at::cuda::getCurrentDeviceProperties();
+  TORCH_CHECK(
+      dprops && dprops->major == 9,
+      "_sparse24_gemm_fp8_sm90: Supported only on GPUs with "
+      "compute capability 9.x");
 
   // Validate layouts of input tensors.
   TORCH_CHECK(tensor_a.device() == tensor_b.device());
@@ -340,12 +332,7 @@ Tensor _sparse24_fp8_sm90_cutlass_gemm(
 TORCH_LIBRARY_IMPL(torchao, CUDA, m) {
   m.impl(
       TORCH_SELECTIVE_NAME("torchao::sparse24_fp8_sm90_cutlass_gemm"),
-      TORCH_FN(_sparse24_fp8_sm90_cutlass_gemm<false>));
+      TORCH_FN(_sparse24_fp8_sm90_cutlass_gemm));
 }
 
-TORCH_LIBRARY_IMPL(torchao, Meta, m) {
-  m.impl(
-      TORCH_SELECTIVE_NAME("torchao::sparse24_fp8_sm90_cutlass_gemm"),
-      TORCH_FN(_sparse24_fp8_sm90_cutlass_gemm<true>));
-}
 #endif
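
With the kIsMeta template gone, this file now registers only the real CUDA kernel; the Meta registration that used to live here is replaced by the Python meta function added to torchao/ops.py (last file below). A hedged sketch of the effect from the caller's side — it assumes a torchao build with the CUTLASS kernels and that the new Python registration handles meta-device dispatch, and the packed/metadata shapes are placeholders rather than the real 2:4 layout:

import torch

M, K, N = 128, 256, 512
# Placeholder shapes for the packed operand and its sparsity metadata.
a_packed = torch.empty(M, K // 2, dtype=torch.float8_e4m3fn, device="meta")
a_meta = torch.empty(M, K // 8, dtype=torch.uint8, device="meta")
b = torch.empty(K, N, dtype=torch.float8_e4m3fn, device="meta")

# No CUDA code runs here: the Python meta function supplies the output shape.
out = torch.ops.torchao.sparse24_fp8_sm90_cutlass_gemm(
    a_packed, a_meta, b, a_scale=None, b_scale=None
)
print(out.shape, out.dtype)  # torch.Size([128, 512]) torch.bfloat16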

torchao/csrc/cuda/activation24/sparsify24.cu

Lines changed: 3 additions & 17 deletions
@@ -263,7 +263,6 @@ struct SparsifyKernelParams {
 };
 
 template <
-    bool kIsMeta,
     typename MetadataFormat,
     typename ElementIn,
     typename ElementOut,
@@ -274,10 +273,8 @@ std::tuple<at::Tensor, at::Tensor> sparse24_sm90_sparsify_specialized(
     std::string sp_selection_algo,
     std::optional<at::Tensor> scale) {
   std::optional<at::cuda::CUDAGuard> device_guard;
-  if (!kIsMeta) {
-    TORCH_CHECK(input.is_cuda(), "All tensors must be on GPU");
-    device_guard.emplace(input.device());
-  }
+  TORCH_CHECK(input.is_cuda(), "All tensors must be on GPU");
+  device_guard.emplace(input.device());
 
   TORCH_CHECK(input.dim() == 2, "Can only sparsify 2d tensors");
   TORCH_CHECK(
@@ -306,9 +303,6 @@ std::tuple<at::Tensor, at::Tensor> sparse24_sm90_sparsify_specialized(
   auto launchKernel = [&](auto algo, std::string const& algo_name) {
     if (algo_name == sp_selection_algo) {
       kernel_launched = true;
-      if (kIsMeta) {
-        return;
-      }
       using Params = SparsifyKernelParams<
           ElementIn,
           ElementOut,
@@ -347,7 +341,6 @@ struct SquaredReLU {
   }
 };
 
-template <bool kIsMeta = false>
 std::tuple<at::Tensor, at::Tensor> sparse24_sm90_sparsify(
     at::Tensor input,
     std::string metadata_fmt,
@@ -363,7 +356,6 @@ std::tuple<at::Tensor, at::Tensor> sparse24_sm90_sparsify(
     using ElementIn = decltype(in_type);
     using ElementOut = decltype(out_type);
     return sparse24_sm90_sparsify_specialized<
-        kIsMeta,
         decltype(mdatafmt),
         ElementIn,
         ElementOut>(input, act, sp_selection_algo, scale);
@@ -409,11 +401,5 @@ std::tuple<at::Tensor, at::Tensor> sparse24_sm90_sparsify(
 TORCH_LIBRARY_IMPL(torchao, CUDA, m) {
   m.impl(
       TORCH_SELECTIVE_NAME("torchao::sparse24_sm90_sparsify"),
-      TORCH_FN(sparse24_sm90_sparsify<false>));
-}
-
-TORCH_LIBRARY_IMPL(torchao, Meta, m) {
-  m.impl(
-      TORCH_SELECTIVE_NAME("torchao::sparse24_sm90_sparsify"),
-      TORCH_FN(sparse24_sm90_sparsify<true>));
+      TORCH_FN(sparse24_sm90_sparsify));
 }
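
Same story for the sparsify kernel: the CUDA code no longer carries a meta branch, and shape inference comes from the Python registration below. A hedged sketch of the resulting shape propagation — for an (M, K) input the meta function allocates (M, K // 2) packed values and (M, K // 8) uint8 metadata. The metadata_format/activation/algorithm strings are placeholders, not necessarily valid choices, and the call assumes the new registration handles meta-device dispatch:

import torch

x = torch.empty(128, 256, dtype=torch.bfloat16, device="meta")
packed, sp_meta = torch.ops.torchao.sparse24_sm90_sparsify(
    x, "cutlass", "identity", "largest"
)
print(packed.shape)   # torch.Size([128, 128]): shape[1] // 2, same dtype as x when no dtype is requested
print(sp_meta.shape)  # torch.Size([128, 32]):  shape[1] // 8, uint8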

torchao/ops.py

Lines changed: 43 additions & 5 deletions
@@ -843,15 +843,39 @@ def sparse24_sm90_sparsify(
     )
 
 
+@register_custom_op("torchao::sparse24_sm90_sparsify")
+def _(
+    input_tensor: Tensor,
+    metadata_format: str,
+    activation: str,
+    algorithm: str,
+    dtype=None,
+    scale=None,
+):
+    out_dtype = dtype if dtype is not None else input_tensor.dtype
+    return (
+        torch.empty(
+            (input_tensor.shape[0], input_tensor.shape[1] // 2),
+            dtype=out_dtype,
+            device=input_tensor.device,
+        ),
+        torch.empty(
+            (input_tensor.shape[0], input_tensor.shape[1] // 8),
+            dtype=torch.uint8,
+            device=input_tensor.device,
+        ),
+    )
+
+
 def sparse24_fp8_sm90_cutlass_gemm(
     a: Tensor,
     meta: Tensor,
     b: Tensor,
-    a_scale: Optional[Tensor],
-    b_scale: Optional[Tensor],
-    swizzle_size: int,
-    swizzle_axis: str,
-    sm_count: int,
+    a_scale: Optional[Tensor] = None,
+    b_scale: Optional[Tensor] = None,
+    swizzle_size: int = 8,
+    swizzle_axis: str = "n",
+    sm_count: int = 128,
 ) -> Tensor:
     return torch.ops.torchao.sparse24_fp8_sm90_cutlass_gemm(
         a,
@@ -865,6 +889,20 @@ def sparse24_fp8_sm90_cutlass_gemm(
     )
 
 
+@register_custom_op("torchao::sparse24_fp8_sm90_cutlass_gemm")
+def _(
+    a: Tensor,
+    meta: Tensor,
+    b: Tensor,
+    a_scale: Optional[Tensor] = None,
+    b_scale: Optional[Tensor] = None,
+    swizzle_size: int = 8,
+    swizzle_axis: str = "n",
+    sm_count: int = 128,
+):
+    return torch.empty((a.shape[0], b.shape[1]), dtype=torch.bfloat16, device=a.device)
+
+
 def swizzle_mm(
     mat1: Tensor, mat2: Tensor, mat1_is_swizzled: bool, mat2_is_swizzled: bool
 ) -> Tensor:
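
These Python meta functions are what makes the ops traceable with symbolic shapes. A hedged end-to-end sketch of that motivation, not torchao test code: it requires an SM90 GPU and a torchao build with the CUTLASS kernels, the argument strings are placeholders, and torch._dynamo.mark_dynamic is used only to force a symbolic batch dimension.

import torch

def f(x):
    packed, sp_meta = torch.ops.torchao.sparse24_sm90_sparsify(
        x, "cutlass", "identity", "largest"
    )
    return packed

x = torch.randn(128, 256, device="cuda", dtype=torch.bfloat16)
torch._dynamo.mark_dynamic(x, 0)          # treat the row count as symbolic
compiled = torch.compile(f, fullgraph=True)
out = compiled(x)                          # shape inference runs through the Python meta fn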
