
Commit 517b712

cthi authored and facebook-github-bot committed
Blackwell FP4 Grouped refactor (#4847)
Summary:
Pull Request resolved: #4847
X-link: facebookresearch/FBGEMM#1874

We plan to integrate NVFP4 + MXFP4 into torch, which will require updating the kernel a bit. Before we do this, some quick cosmetic and code refactors:

- Remove the TensorList API. There are no plans for its continued use given its inefficiency and the fact that we have token shuffling now; removing it simplifies a lot of the existing code.
- Coalesce the current `set_stacked_kernel_args_kernel` into a single path, as the only difference is whether the global scale is passed in, which we can determine at the call site.
- Compute the alignment for A and B explicitly instead of hard-coding it to 32.

Reviewed By: jiawenliu64

Differential Revision: D82046862

fbshipit-source-id: 31fcbe10c8a3715b59d1b2da0f366af3bc3f134b
1 parent b8c0b33 commit 517b712

16 files changed: +157, -1019 lines
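As a rough illustration of the last bullet in the summary (computing the A/B alignment explicitly rather than hard-coding 32), here is a minimal sketch assuming 128-bit vectorized accesses; the constant and helper below are illustrative and are not the FBGEMM implementation.

#include <cstdint>

// Assumed vector access width. FP4 elements are 4 bits wide, so 128 / 4 = 32,
// which reproduces the previously hard-coded alignment of 32 elements.
constexpr int64_t kAccessBits = 128;

constexpr int64_t alignment_for(int64_t element_bits) {
  return kAccessBits / element_bits;
}

static_assert(alignment_for(/*FP4 e2m1*/ 4) == 32, "matches the old constant");

Deriving the value this way keeps the same result for FP4 while making the access-width assumption explicit for other element types.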

fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py

Lines changed: 0 additions & 101 deletions
@@ -2426,107 +2426,6 @@ def cuda(self) -> bool:
         return True
 
 
-@register_quantize_op
-class MXFP4GroupedGemm(QuantizeOpBase):
-    """
-    MXFP4 grouped matmul with blockwise scaling.
-    """
-
-    def preprocess(self, x, w):
-        wq, w_scale = zip(*[triton_quantize_mx4_unpack(i) for i in w])
-        return x, wq, w_scale
-
-    def quantize(self, x, wq, w_scale):
-        xq, x_scale = zip(*[triton_quantize_mx4_unpack(i) for i in x])
-        return xq, wq, x_scale, w_scale
-
-    def compute(self, xq, wq, x_scale, w_scale):
-        return torch.ops.fbgemm.f4f4bf16_grouped(
-            xq,
-            wq,
-            x_scale,
-            w_scale,
-        )
-
-    def quantize_and_compute(self, x, wq, w_scale):
-        xq, wq, x_scale, w_scale = self.quantize(x, wq, w_scale)
-        return self.compute(xq, wq, x_scale, w_scale)
-
-    @property
-    def name(self) -> str:
-        return "cutlass_f4f4bf16_grouped"
-
-    @property
-    def hip(self) -> bool:
-        # F4F4BF16_grouped only supported for cuda.
-        return False
-
-    @property
-    def cuda(self) -> bool:
-        return True
-
-
-@register_quantize_op
-class NVFP4GroupedGemm(QuantizeOpBase):
-    """
-    NVFP4 grouped matmul with blockwise scaling.
-    """
-
-    def quantize(self, x, w):
-        def get_global_scale(x, w):
-            x_global_scale = (448.0 * 6.0) / torch.amax(
-                torch.abs(x.flatten()), dim=-1
-            ).to(torch.float32)
-            w_global_scale = (448.0 * 6.0) / torch.amax(
-                torch.abs(w.flatten()), dim=-1
-            ).to(torch.float32)
-            global_scale = 1 / (x_global_scale * w_global_scale)
-            return x_global_scale, w_global_scale, global_scale
-
-        # Compute global scale for each group
-        G = len(x)
-        x_global_scale = []
-        w_global_scale = []
-        global_scale = []
-        for i in range(G):
-            x_global_scale_, w_global_scale_, global_scale_ = get_global_scale(
-                x[i], w[i]
-            )
-            x_global_scale.append(x_global_scale_)
-            w_global_scale.append(w_global_scale_)
-            global_scale.append(global_scale_)
-
-        # Quantize weights and activations
-        wq, w_scale = zip(
-            *[triton_scale_nvfp4_quant(w[i], w_global_scale[i]) for i in range(G)]
-        )
-        xq, x_scale = zip(
-            *[triton_scale_nvfp4_quant(x[i], x_global_scale[i]) for i in range(G)]
-        )
-        return xq, wq, x_scale, w_scale, global_scale
-
-    def compute(self, xq, wq, x_scale, w_scale, global_scale):
-        return torch.ops.fbgemm.f4f4bf16_grouped(
-            xq, wq, x_scale, w_scale, global_scale, use_mx=False
-        )
-
-    def quantize_and_compute(self, x, w):
-        xq, wq, x_scale, w_scale, global_scale = self.quantize(x, w)
-        return self.compute(xq, wq, x_scale, w_scale, global_scale)
-
-    @property
-    def name(self) -> str:
-        return "cutlass_nv_f4f4bf16_grouped"
-
-    @property
-    def hip(self) -> bool:
-        return False
-
-    @property
-    def cuda(self) -> bool:
-        return True
-
-
 @register_quantize_op
 class MXFP4StackedGroupedGemm(QuantizeOpBase):
     """

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f4f4bf16_grouped.cu

Lines changed: 18 additions & 111 deletions
@@ -27,8 +27,7 @@ namespace fbgemm_gpu {
 
 #if defined(CUDA_VERSION) && (CUDA_VERSION >= 12080)
 
-template <typename InputType>
-Kernel_f4f4bf16_grouped<InputType>
+Kernel_f4f4bf16_grouped
 get_kernel_via_heuristics(int total_M, int N, int K, int G, bool use_mx) {
   // MXFP4
   if (use_mx) {
@@ -151,40 +150,35 @@ get_kernel_via_heuristics(int total_M, int N, int K, int G, bool use_mx) {
   }
 }
 
-template <typename InputType>
 at::Tensor dispatch_fp4_grouped_kernel(
     int total_M,
     int N,
     int K,
     int G,
-    InputType XQ, // FP4
-    InputType WQ, // FP4
-    InputType x_scale,
-    InputType w_scale,
+    at::Tensor XQ, // FP4
+    at::Tensor WQ, // FP4
+    at::Tensor x_scale,
+    at::Tensor w_scale,
     at::Tensor output,
     std::optional<at::Tensor> zero_start_index_M = std::nullopt,
     std::optional<at::Tensor> M_sizes = std::nullopt,
-    std::optional<InputType> global_scale = std::nullopt,
+    std::optional<at::Tensor> global_scale = std::nullopt,
     std::optional<at::Tensor> starting_row_after_padding = std::nullopt,
     bool use_mx = true) {
-  if constexpr (std::is_same_v<InputType, at::TensorList>) {
-    TORCH_CHECK(WQ.size() == G);
-  } else {
-    TORCH_CHECK(
-        zero_start_index_M.has_value() != M_sizes.has_value(),
-        "One of zero_start_index_M or M_sizes must be provided.");
-    TORCH_CHECK(M_sizes.has_value(), "M_sizes is assumed to be provided.");
-    TORCH_CHECK(
-        starting_row_after_padding.has_value(),
-        "starting_row_after_padding is assumed to be provided.");
-    at::Tensor starting_row_after_padding_actual =
-        starting_row_after_padding.value_or(at::zeros({0}));
-    TORCH_CHECK(starting_row_after_padding_actual.size(0) % (G + 1) == 0);
-  }
+  TORCH_CHECK(
+      zero_start_index_M.has_value() != M_sizes.has_value(),
+      "One of zero_start_index_M or M_sizes must be provided.");
+  TORCH_CHECK(M_sizes.has_value(), "M_sizes is assumed to be provided.");
+  TORCH_CHECK(
+      starting_row_after_padding.has_value(),
+      "starting_row_after_padding is assumed to be provided.");
+  at::Tensor starting_row_after_padding_actual =
+      starting_row_after_padding.value_or(at::zeros({0}));
+  TORCH_CHECK(starting_row_after_padding_actual.size(0) % (G + 1) == 0);
 
   // Select kernel to run via heuristics.
   auto kernel = [&]() {
-    return get_kernel_via_heuristics<InputType>(total_M, N, K, G, use_mx);
+    return get_kernel_via_heuristics(total_M, N, K, G, use_mx);
   }();
   // Invoke kernel
   return kernel(
@@ -200,82 +194,6 @@ at::Tensor dispatch_fp4_grouped_kernel(
       starting_row_after_padding);
 }
 
-template <typename OutputType>
-OutputType _f4f4bf16_grouped(
-    at::TensorList XQ, // FP4
-    at::TensorList WQ, // FP4
-    at::TensorList x_scale,
-    at::TensorList w_scale,
-    std::optional<at::TensorList> global_scale,
-    bool use_mx) {
-  at::Tensor Y;
-  int64_t total_M = 0;
-  int64_t max_N = 0;
-  int64_t max_K = 0;
-  int64_t G = XQ.size();
-
-  // Allocate output tensor.
-  std::vector<int64_t> output_sizes;
-  int64_t total_output_size = 0;
-  for (int i = 0; i < G; ++i) {
-    int64_t M = XQ[i].size(0);
-    int64_t N = WQ[i].size(0);
-    int64_t K = WQ[i].size(1);
-    total_M += M;
-    if (N > max_N) {
-      max_N = N;
-    }
-    if (K > max_K) {
-      max_K = K;
-    }
-    const int64_t output_size = M * N;
-    total_output_size += output_size;
-    output_sizes.push_back(output_size);
-  }
-  Y = at::empty(total_output_size, XQ[0].options().dtype(at::kBFloat16));
-
-  // Run kernel.
-  at::Tensor g_out = dispatch_fp4_grouped_kernel<at::TensorList>(
-      total_M,
-      max_N,
-      max_K * 2, // Since K is packed
-      G,
-      XQ,
-      WQ,
-      x_scale,
-      w_scale,
-      Y,
-      std::nullopt,
-      std::nullopt,
-      global_scale,
-      std::nullopt,
-      use_mx);
-
-  // Return appropriate output type.
-  if constexpr (std::is_same_v<OutputType, at::Tensor>) {
-    int64_t N = WQ[0].size(0);
-    return g_out.view({total_M, N});
-  } else {
-    // Return grouped view of output.
-    std::vector<at::Tensor> output_group = g_out.split(output_sizes);
-    for (int i = 0; i < G; ++i) {
-      output_group[i] = output_group[i].view({XQ[i].size(0), WQ[i].size(0)});
-    }
-    return output_group;
-  }
-}
-
-std::vector<at::Tensor> f4f4bf16_grouped(
-    at::TensorList XQ, // FP4
-    at::TensorList WQ, // FP4
-    at::TensorList x_scale,
-    at::TensorList w_scale,
-    std::optional<at::TensorList> global_scale = std::nullopt,
-    bool use_mx = true) {
-  return _f4f4bf16_grouped<std::vector<at::Tensor>>(
-      XQ, WQ, x_scale, w_scale, global_scale, use_mx);
-}
-
 at::Tensor f4f4bf16_grouped_stacked(
     at::Tensor XQ, // FP4
     at::Tensor WQ, // FP4
@@ -300,7 +218,7 @@ at::Tensor f4f4bf16_grouped_stacked(
     return Y;
   }
   // Return continuous view of output.
-  return dispatch_fp4_grouped_kernel<at::Tensor>(
+  return dispatch_fp4_grouped_kernel(
      total_M,
      N,
      K * 2, // Since K is packed
@@ -319,17 +237,6 @@ at::Tensor f4f4bf16_grouped_stacked(
 
 #else
 
-std::vector<at::Tensor> f4f4bf16_grouped(
-    at::TensorList XQ, // FP4
-    at::TensorList WQ, // FP4
-    at::TensorList x_scale,
-    at::TensorList w_scale,
-    std::optional<at::TensorList> global_scale = std::nullopt,
-    bool use_mx = true) {
-  throw std::runtime_error(
-      "CUDA version is older than 12.8"); // requires CUDA>=12.8
-}
-
 at::Tensor f4f4bf16_grouped_stacked(
     at::Tensor XQ, // FP4
     at::Tensor WQ, // FP4
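With the TensorList overloads removed, dispatch_fp4_grouped_kernel is no longer templated on its input type and there is a single code path for MXFP4 and NVFP4. Below is a minimal sketch of a call site under the new signature, assuming the caller distinguishes the two formats by whether a global scale tensor is available; the wrapper name and argument plumbing are illustrative only, not FBGEMM code.

#include <ATen/ATen.h>
#include <optional>

// Forward declaration matching the refactored signature in the file above.
at::Tensor dispatch_fp4_grouped_kernel(
    int total_M,
    int N,
    int K,
    int G,
    at::Tensor XQ,
    at::Tensor WQ,
    at::Tensor x_scale,
    at::Tensor w_scale,
    at::Tensor output,
    std::optional<at::Tensor> zero_start_index_M,
    std::optional<at::Tensor> M_sizes,
    std::optional<at::Tensor> global_scale,
    std::optional<at::Tensor> starting_row_after_padding,
    bool use_mx);

// Hypothetical call site: NVFP4 when a global scale is provided, MXFP4 otherwise.
at::Tensor run_fp4_grouped(
    at::Tensor XQ,
    at::Tensor WQ,
    at::Tensor x_scale,
    at::Tensor w_scale,
    at::Tensor output,
    at::Tensor M_sizes,
    at::Tensor starting_row_after_padding,
    std::optional<at::Tensor> global_scale,
    int total_M,
    int N,
    int packed_K,
    int G) {
  const bool use_mx = !global_scale.has_value();
  return dispatch_fp4_grouped_kernel(
      total_M,
      N,
      packed_K * 2, // K is packed: two FP4 values per byte
      G,
      XQ,
      WQ,
      x_scale,
      w_scale,
      output,
      /*zero_start_index_M=*/std::nullopt,
      M_sizes,
      global_scale,
      starting_row_after_padding,
      use_mx);
}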

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f4f4bf16_grouped/f4f4bf16_grouped_128_128_256_1_1_1_f.cu

Lines changed: 0 additions & 33 deletions
@@ -24,39 +24,6 @@ at::Tensor f4f4bf16_grouped_128_128_256_1_1_1_f(
     std::optional<at::Tensor> global_scale,
     std::optional<at::Tensor> starting_row_after_padding) {
   return f4f4bf16_grouped_impl<
-      at::Tensor,
-      cutlass::nv_float4_t<cutlass::float_e2m1_t>,
-      128,
-      128,
-      256,
-      1,
-      1,
-      1>(
-      XQ,
-      WQ,
-      x_scale,
-      w_scale,
-      output,
-      G,
-      zero_start_index_M,
-      M_sizes,
-      global_scale,
-      starting_row_after_padding);
-}
-
-at::Tensor f4f4bf16_grouped_128_128_256_1_1_1_f(
-    at::TensorList XQ, // FP4
-    at::TensorList WQ, // FP4
-    at::TensorList x_scale,
-    at::TensorList w_scale,
-    at::Tensor output,
-    int64_t G,
-    std::optional<at::Tensor> zero_start_index_M,
-    std::optional<at::Tensor> M_sizes,
-    std::optional<at::TensorList> global_scale,
-    std::optional<at::Tensor> starting_row_after_padding) {
-  return f4f4bf16_grouped_impl<
-      at::TensorList,
       cutlass::nv_float4_t<cutlass::float_e2m1_t>,
       128,
       128,

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f4f4bf16_grouped/f4f4bf16_grouped_128_128_256_1_1_1_t.cu

Lines changed: 0 additions & 33 deletions
@@ -24,39 +24,6 @@ at::Tensor f4f4bf16_grouped_128_128_256_1_1_1_t(
     std::optional<at::Tensor> global_scale,
     std::optional<at::Tensor> starting_row_after_padding) {
   return f4f4bf16_grouped_impl<
-      at::Tensor,
-      cutlass::mx_float4_t<cutlass::float_e2m1_t>,
-      128,
-      128,
-      256,
-      1,
-      1,
-      1>(
-      XQ,
-      WQ,
-      x_scale,
-      w_scale,
-      output,
-      G,
-      zero_start_index_M,
-      M_sizes,
-      global_scale,
-      starting_row_after_padding);
-}
-
-at::Tensor f4f4bf16_grouped_128_128_256_1_1_1_t(
-    at::TensorList XQ, // FP4
-    at::TensorList WQ, // FP4
-    at::TensorList x_scale,
-    at::TensorList w_scale,
-    at::Tensor output,
-    int64_t G,
-    std::optional<at::Tensor> zero_start_index_M,
-    std::optional<at::Tensor> M_sizes,
-    std::optional<at::TensorList> global_scale,
-    std::optional<at::Tensor> starting_row_after_padding) {
-  return f4f4bf16_grouped_impl<
-      at::TensorList,
       cutlass::mx_float4_t<cutlass::float_e2m1_t>,
       128,
       128,
