Skip to content

Commit eebc58d

Browse files
authored
[Refactor] Remove unused cutlass moe problem size function (vllm-project#32047)
Signed-off-by: yewentao256 <[email protected]>
1 parent 16de822 commit eebc58d

File tree

5 files changed

+0
-101
lines changed

5 files changed

+0
-101
lines changed

csrc/ops.h

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -260,12 +260,6 @@ void get_cutlass_moe_mm_data(
260260
const int64_t num_experts, const int64_t n, const int64_t k,
261261
const std::optional<torch::Tensor>& blockscale_offsets);
262262

263-
void get_cutlass_moe_mm_problem_sizes(
264-
const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
265-
torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
266-
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
267-
std::optional<bool> force_swap_ab = std::nullopt);
268-
269263
void get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
270264
const torch::Tensor& expert_first_token_offset,
271265
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,

csrc/quantization/w8a8/cutlass/moe/moe_data.cu

Lines changed: 0 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -130,26 +130,6 @@ inline void launch_compute_problem_sizes(const torch::Tensor& topk_ids,
130130
}
131131
} // namespace
132132

133-
void get_cutlass_moe_mm_problem_sizes_caller(
134-
const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
135-
torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
136-
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
137-
std::optional<bool> force_swap_ab = std::nullopt) {
138-
auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index());
139-
auto options_int32 =
140-
torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device());
141-
torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32);
142-
143-
// Swap-AB should be disabled for FP4 path
144-
bool may_swap_ab =
145-
force_swap_ab.value_or((!blockscale_offsets.has_value()) &&
146-
(topk_ids.numel() <= SWAP_AB_THRESHOLD));
147-
148-
launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2,
149-
atomic_buffer, num_experts, n, k, stream,
150-
may_swap_ab);
151-
}
152-
153133
template <bool SWAP_AB>
154134
__global__ void compute_problem_sizes_from_expert_offsets(
155135
const int64_t* __restrict__ expert_first_token_offset,

csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu

Lines changed: 0 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -77,12 +77,6 @@ void get_cutlass_moe_mm_data_caller(
7777
const int64_t num_experts, const int64_t n, const int64_t k,
7878
const std::optional<torch::Tensor>& blockscale_offsets);
7979

80-
void get_cutlass_moe_mm_problem_sizes_caller(
81-
const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
82-
torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
83-
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
84-
std::optional<bool> force_swap_ab = std::nullopt);
85-
8680
void get_cutlass_moe_mm_problem_sizes_from_expert_offsets_caller(
8781
const torch::Tensor& expert_first_token_offset,
8882
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
@@ -306,27 +300,6 @@ void get_cutlass_moe_mm_data(
306300
version_num, ". Required capability: 90, 100, or 120");
307301
}
308302

309-
void get_cutlass_moe_mm_problem_sizes(
310-
const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
311-
torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
312-
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
313-
std::optional<bool> force_swap_ab = std::nullopt) {
314-
int32_t version_num = get_sm_version_num();
315-
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
316-
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
317-
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
318-
get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1,
319-
problem_sizes2, num_experts, n, k,
320-
blockscale_offsets, force_swap_ab);
321-
return;
322-
#endif
323-
TORCH_CHECK_NOT_IMPLEMENTED(
324-
false,
325-
"No compiled get_cutlass_moe_mm_problem_sizes: no cutlass_scaled_mm "
326-
"kernel for CUDA device capability: ",
327-
version_num, ". Required capability: 90, 100, or 120");
328-
}
329-
330303
void get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
331304
const torch::Tensor& expert_first_token_offset,
332305
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,

csrc/torch_bindings.cpp

Lines changed: 0 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -474,19 +474,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
474474
"()");
475475
ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data);
476476

477-
// A function that computes problem sizes for each expert's multiplication
478-
// used by the two mms called from fused MoE operation. It takes topk_ids as
479-
// an input, and computes problem_sizes1 and problem_sizes2 only.
480-
ops.def(
481-
"get_cutlass_moe_mm_problem_sizes(Tensor topk_ids, "
482-
" Tensor! problem_sizes1, "
483-
" Tensor! problem_sizes2, "
484-
" int num_experts, int n, int k, "
485-
" Tensor? blockscale_offsets, "
486-
" bool? force_swap_ab) -> ()");
487-
ops.impl("get_cutlass_moe_mm_problem_sizes", torch::kCUDA,
488-
&get_cutlass_moe_mm_problem_sizes);
489-
490477
// compute per-expert problem sizes from expert_first_token_offset
491478
// produced by vLLM's moe_permute kernel
492479
ops.def(

vllm/_custom_ops.py

Lines changed: 0 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -1044,41 +1044,6 @@ def get_cutlass_moe_mm_data(
10441044
)
10451045

10461046

1047-
def get_cutlass_moe_mm_problem_sizes(
1048-
topk_ids: torch.Tensor,
1049-
problem_sizes1: torch.Tensor,
1050-
problem_sizes2: torch.Tensor,
1051-
num_experts: int,
1052-
n: int,
1053-
k: int,
1054-
blockscale_offsets: torch.Tensor | None = None,
1055-
force_swap_ab: bool | None = None,
1056-
):
1057-
"""
1058-
Compute only the per-expert problem sizes needed by the two grouped matrix
1059-
multiplications used in CUTLASS-based fused MoE.
1060-
1061-
The function takes in topk_ids (token→expert mapping) and computes:
1062-
- problem_sizes1, problem_sizes2: M×N×K sizes of each expert's
1063-
multiplication for the two grouped MMs
1064-
used in the fused MoE operation.
1065-
Optional:
1066-
- force_swap_ab: If set to True or False, explicitly enable or disable the
1067-
A/B input swap optimization. If None (default), the swap
1068-
is selected automatically based on tensor sizes.
1069-
"""
1070-
return torch.ops._C.get_cutlass_moe_mm_problem_sizes(
1071-
topk_ids,
1072-
problem_sizes1,
1073-
problem_sizes2,
1074-
num_experts,
1075-
n,
1076-
k,
1077-
blockscale_offsets,
1078-
force_swap_ab,
1079-
)
1080-
1081-
10821047
def get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
10831048
expert_first_token_offset: torch.Tensor,
10841049
problem_sizes1: torch.Tensor,

0 commit comments

Comments
 (0)