[Refactor] Remove unused cutlass moe problem size function (vllm-project#32047)

yewentao256 · web-flow · commit eebc58df0c52 · 2026-01-18T12:46:59.000-08:00
Signed-off-by: yewentao256 &lt;zhyanwentao@126.com&gt;
diff --git a/csrc/ops.h b/csrc/ops.h
@@ -260,12 +260,6 @@ void get_cutlass_moe_mm_data(
     const int64_t num_experts, const int64_t n, const int64_t k,
     const std::optional<torch::Tensor>& blockscale_offsets);
 
-void get_cutlass_moe_mm_problem_sizes(
-    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
-    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
-    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
-    std::optional<bool> force_swap_ab = std::nullopt);
-
 void get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
     const torch::Tensor& expert_first_token_offset,
     torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
diff --git a/csrc/quantization/w8a8/cutlass/moe/moe_data.cu b/csrc/quantization/w8a8/cutlass/moe/moe_data.cu
@@ -130,26 +130,6 @@ inline void launch_compute_problem_sizes(const torch::Tensor& topk_ids,
 }
 }  // namespace
 
-void get_cutlass_moe_mm_problem_sizes_caller(
-    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
-    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
-    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
-    std::optional<bool> force_swap_ab = std::nullopt) {
-  auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index());
-  auto options_int32 =
-      torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device());
-  torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32);
-
-  // Swap-AB should be disabled for FP4 path
-  bool may_swap_ab =
-      force_swap_ab.value_or((!blockscale_offsets.has_value()) &&
-                             (topk_ids.numel() <= SWAP_AB_THRESHOLD));
-
-  launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2,
-                               atomic_buffer, num_experts, n, k, stream,
-                               may_swap_ab);
-}
-
 template <bool SWAP_AB>
 __global__ void compute_problem_sizes_from_expert_offsets(
     const int64_t* __restrict__ expert_first_token_offset,
diff --git a/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu b/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu
@@ -77,12 +77,6 @@ void get_cutlass_moe_mm_data_caller(
     const int64_t num_experts, const int64_t n, const int64_t k,
     const std::optional<torch::Tensor>& blockscale_offsets);
 
-void get_cutlass_moe_mm_problem_sizes_caller(
-    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
-    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
-    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
-    std::optional<bool> force_swap_ab = std::nullopt);
-
 void get_cutlass_moe_mm_problem_sizes_from_expert_offsets_caller(
     const torch::Tensor& expert_first_token_offset,
     torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
@@ -306,27 +300,6 @@ void get_cutlass_moe_mm_data(
       version_num, ". Required capability: 90, 100, or 120");
 }
 
-void get_cutlass_moe_mm_problem_sizes(
-    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
-    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
-    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
-    std::optional<bool> force_swap_ab = std::nullopt) {
-  int32_t version_num = get_sm_version_num();
-#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) ||   \
-    (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
-    (defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
-  get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1,
-                                          problem_sizes2, num_experts, n, k,
-                                          blockscale_offsets, force_swap_ab);
-  return;
-#endif
-  TORCH_CHECK_NOT_IMPLEMENTED(
-      false,
-      "No compiled get_cutlass_moe_mm_problem_sizes: no cutlass_scaled_mm "
-      "kernel for CUDA device capability: ",
-      version_num, ". Required capability: 90, 100, or 120");
-}
-
 void get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
     const torch::Tensor& expert_first_token_offset,
     torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
@@ -474,19 +474,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "()");
   ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data);
 
-  // A function that computes problem sizes for each expert's multiplication
-  // used by the two mms called from fused MoE operation. It takes topk_ids as
-  // an input, and computes problem_sizes1 and problem_sizes2 only.
-  ops.def(
-      "get_cutlass_moe_mm_problem_sizes(Tensor topk_ids, "
-      "                                 Tensor! problem_sizes1, "
-      "                                 Tensor! problem_sizes2, "
-      "                                 int num_experts, int n, int k, "
-      "                                 Tensor? blockscale_offsets, "
-      "                                 bool? force_swap_ab) -> ()");
-  ops.impl("get_cutlass_moe_mm_problem_sizes", torch::kCUDA,
-           &get_cutlass_moe_mm_problem_sizes);
-
   // compute per-expert problem sizes from expert_first_token_offset
   // produced by vLLM's moe_permute kernel
   ops.def(
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
@@ -1044,41 +1044,6 @@ def get_cutlass_moe_mm_data(
     )
 
 
-def get_cutlass_moe_mm_problem_sizes(
-    topk_ids: torch.Tensor,
-    problem_sizes1: torch.Tensor,
-    problem_sizes2: torch.Tensor,
-    num_experts: int,
-    n: int,
-    k: int,
-    blockscale_offsets: torch.Tensor | None = None,
-    force_swap_ab: bool | None = None,
-):
-    """
-    Compute only the per-expert problem sizes needed by the two grouped matrix
-    multiplications used in CUTLASS-based fused MoE.
-
-    The function takes in topk_ids (token→expert mapping) and computes:
-    - problem_sizes1, problem_sizes2: M×N×K sizes of each expert's
-                                    multiplication for the two grouped MMs
-                                    used in the fused MoE operation.
-    Optional:
-    - force_swap_ab: If set to True or False, explicitly enable or disable the
-                     A/B input swap optimization. If None (default), the swap
-                     is selected automatically based on tensor sizes.
-    """
-    return torch.ops._C.get_cutlass_moe_mm_problem_sizes(
-        topk_ids,
-        problem_sizes1,
-        problem_sizes2,
-        num_experts,
-        n,
-        k,
-        blockscale_offsets,
-        force_swap_ab,
-    )
-
-
 def get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
     expert_first_token_offset: torch.Tensor,
     problem_sizes1: torch.Tensor,