
Commit 961a896

shixianc authored and amd-xiaoyu12 committed

[Kernel] CUTLASS MoE FP8: Integrate cuda moe permute/unpermute (vllm-project#23045)

Signed-off-by: Shixian Cui <[email protected]>
Signed-off-by: Xiao Yu <[email protected]>

1 parent 2f689df commit 961a896

File tree

15 files changed: +369 -121 lines changed

benchmarks/kernels/benchmark_grouped_gemm_cutlass.py

Lines changed: 34 additions & 1 deletion

@@ -80,6 +80,11 @@ def bench_run(
         a, score, topk, renormalize=False
     )

+    ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
+    ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
+    c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
+    c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
+
     def run_triton_moe(
         a: torch.Tensor,
         w1: torch.Tensor,
@@ -111,6 +116,10 @@ def run_cutlass_moe(
         w2: torch.Tensor,
         w1_scale: torch.Tensor,
         w2_scale: torch.Tensor,
+        ab_strides1: torch.Tensor,
+        ab_strides2: torch.Tensor,
+        c_strides1: torch.Tensor,
+        c_strides2: torch.Tensor,
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         per_act_token: bool,
@@ -125,6 +134,10 @@ def run_cutlass_moe(
             topk_ids,
             w1_scale,
             w2_scale,
+            ab_strides1,
+            ab_strides2,
+            c_strides1,
+            c_strides2,
             per_act_token,
             a1_scale=None,
         )
@@ -136,6 +149,10 @@ def run_cutlass_from_graph(
         w2_q: torch.Tensor,
         w1_scale: torch.Tensor,
         w2_scale: torch.Tensor,
+        ab_strides1: torch.Tensor,
+        ab_strides2: torch.Tensor,
+        c_strides1: torch.Tensor,
+        c_strides2: torch.Tensor,
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
     ):
@@ -150,6 +167,10 @@ def run_cutlass_from_graph(
             topk_ids,
             w1_scale,
             w2_scale,
+            ab_strides1,
+            ab_strides2,
+            c_strides1,
+            c_strides2,
             per_act_token,
             a1_scale=None,
         )
@@ -194,6 +215,10 @@ def replay_graph(graph, num_repeats):
             w2_q,
             w1_scale,
             w2_scale,
+            ab_strides1,
+            ab_strides2,
+            c_strides1,
+            c_strides2,
             topk_weights,
             topk_ids,
         )
@@ -231,6 +256,10 @@ def replay_graph(graph, num_repeats):
         "w1_scale": w1_scale,
         "w2_scale": w2_scale,
         "per_act_token": per_act_token,
+        "ab_strides1": ab_strides1,
+        "ab_strides2": ab_strides2,
+        "c_strides1": c_strides1,
+        "c_strides2": c_strides2,
         # cuda graph params
         "cutlass_graph": cutlass_graph,
         "triton_graph": triton_graph,
@@ -289,6 +318,10 @@ def replay_graph(graph, num_repeats):
             w2_q,
             w1_scale,
             w2_scale,
+            ab_strides1,
+            ab_strides2,
+            c_strides1,
+            c_strides2,
             topk_weights,
             topk_ids,
             per_act_token,
@@ -297,7 +330,7 @@ def replay_graph(graph, num_repeats):

     results.append(
         benchmark.Timer(
-            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, per_act_token, num_runs)",  # noqa: E501
+            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, ab_strides1, ab_strides2, c_strides1, c_strides2, topk_weights, topk_ids, per_act_token, num_runs)",  # noqa: E501
             globals=globals,
             label=label,
             sub_label=sub_label,
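The four stride tensors built above encode, per expert, the leading dimensions of the two grouped GEMMs: with w1 of shape (E, 2n, k) and w2 of shape (E, k, n), the first GEMM reads its A/B operands with stride k and writes a (m_e, 2n) output, and the second reads with stride n and writes a (m_e, k) output. A minimal sketch of that layout logic (the helper name is illustrative, not a vLLM API):

    import torch

    def make_moe_strides(num_experts: int, n: int, k: int, device="cuda"):
        # GEMM1: (m_e, k) @ (k, 2n) -> (m_e, 2n); GEMM2: (m_e, n) @ (n, k) -> (m_e, k)
        ab_strides1 = torch.full((num_experts,), k, device=device, dtype=torch.int64)
        ab_strides2 = torch.full((num_experts,), n, device=device, dtype=torch.int64)
        c_strides1 = torch.full((num_experts,), 2 * n, device=device, dtype=torch.int64)
        c_strides2 = torch.full((num_experts,), k, device=device, dtype=torch.int64)
        return ab_strides1, ab_strides2, c_strides1, c_strides2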

csrc/moe/moe_permute_unpermute_op.cu

Lines changed: 14 additions & 19 deletions

@@ -45,8 +45,6 @@ void moe_permute(
   auto copy_topk_ids = topk_ids.clone();  // copy topk_ids for preprocess
   auto permuted_experts_id = torch::empty_like(topk_ids);
   auto sorted_row_idx = torch::empty_like(inv_permuted_idx);
-  auto align_expert_first_token_offset =
-      torch::zeros_like(expert_first_token_offset);

   CubKeyValueSorter sorter{};
   int64_t* valid_num_ptr = nullptr;
@@ -85,12 +83,14 @@ void moe_permute(
   });

   // get m_indices and update expert_first_token_offset with align block
-  getMIndices(get_ptr<int64_t>(expert_first_token_offset),
-              get_ptr<int64_t>(align_expert_first_token_offset),
-              get_ptr<int>(m_indices), n_local_expert, align_block_size_value,
-              stream);
+  // this is only required for DeepGemm and not required for CUTLASS group gemm
   if (align_block_size.has_value()) {
-    // update align_expert_first_token_offset
+    auto align_expert_first_token_offset =
+        torch::zeros_like(expert_first_token_offset);
+    getMIndices(get_ptr<int64_t>(expert_first_token_offset),
+                get_ptr<int64_t>(align_expert_first_token_offset),
+                get_ptr<int>(m_indices), n_local_expert, align_block_size_value,
+                stream);
     expert_first_token_offset.copy_(align_expert_first_token_offset);
   }
 }
@@ -195,19 +195,14 @@ void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
                  torch::Tensor& expert_first_token_offset,
                  torch::Tensor& src_row_id2dst_row_id_map,
                  torch::Tensor& m_indices) {
-  TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0");
+  TORCH_CHECK(false, "moe_permute is not supported on CUDA < 12.0");
 }

-void moe_unpermute(const torch::Tensor& input,
-                   const torch::Tensor& topk_weights, torch::Tensor& topk_ids,
-                   const torch::Tensor& token_expert_indices,
-                   const std::optional<torch::Tensor>& expert_map,
-                   int64_t n_expert, int64_t n_local_expert, int64_t topk,
-                   const std::optional<int64_t>& align_block_size,
-                   torch::Tensor& permuted_input,
-                   torch::Tensor& expert_first_token_offset,
-                   torch::Tensor& src_row_id2dst_row_id_map,
-                   torch::Tensor& m_indices) {
+void moe_unpermute(
+    const torch::Tensor& permuted_hidden_states,
+    const torch::Tensor& topk_weights, const torch::Tensor& inv_permuted_idx,
+    const std::optional<torch::Tensor>& expert_first_token_offset, int64_t topk,
+    torch::Tensor& hidden_states) {
   TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0");
 }

@@ -224,4 +219,4 @@ bool moe_permute_unpermute_supported() {
 TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
   m.impl("moe_permute", &moe_permute);
   m.impl("moe_unpermute", &moe_unpermute);
-}
+}
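For reference, the dataflow the new moe_unpermute signature exposes (permuted activations in, inv_permuted_idx plus topk_weights to gather and reduce) can be stated in plain PyTorch. This is a semantic sketch, not the CUDA implementation:

    import torch

    def ref_permute(hidden_states: torch.Tensor, topk_ids: torch.Tensor):
        # Group the (token, k) copies by expert id; inv_permuted_idx maps each
        # original (token, k) slot to its row in the permuted buffer.
        n_token, topk = topk_ids.shape
        sorted_idx = torch.argsort(topk_ids.reshape(-1), stable=True)
        inv_permuted_idx = torch.empty_like(sorted_idx)
        inv_permuted_idx[sorted_idx] = torch.arange(sorted_idx.numel(),
                                                    device=sorted_idx.device)
        return hidden_states[sorted_idx // topk], inv_permuted_idx.view(n_token, topk)

    def ref_unpermute(permuted: torch.Tensor, topk_weights: torch.Tensor,
                      inv_permuted_idx: torch.Tensor) -> torch.Tensor:
        # Gather each token's top-k expert outputs and reduce with its weights.
        n_token, topk = inv_permuted_idx.shape
        gathered = permuted[inv_permuted_idx.reshape(-1)].view(n_token, topk, -1)
        return (gathered * topk_weights.unsqueeze(-1)).sum(dim=1)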

csrc/ops.h

Lines changed: 5 additions & 0 deletions

@@ -229,6 +229,11 @@ void get_cutlass_moe_mm_data(
     const int64_t num_experts, const int64_t n, const int64_t k,
     const std::optional<torch::Tensor>& blockscale_offsets);

+void get_cutlass_moe_mm_problem_sizes(
+    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
+    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
+    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets);
+
 void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
                                   torch::Tensor& problem_sizes1,
                                   torch::Tensor& problem_sizes2,

csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh

Lines changed: 4 additions & 2 deletions

@@ -10,7 +10,7 @@

 template <typename ElementAB, typename ElementC, typename ElementAccumulator>
 __global__ void get_group_gemm_starts(
-    int32_t* expert_offsets, ElementAB** a_offsets, ElementAB** b_offsets,
+    int64_t* expert_offsets, ElementAB** a_offsets, ElementAB** b_offsets,
     ElementC** out_offsets, ElementAccumulator** a_scales_offsets,
     ElementAccumulator** b_scales_offsets, ElementAB* a_base_as_int,
     ElementAB* b_base_as_int, ElementC* out_base_as_int,
@@ -34,7 +34,7 @@ __global__ void get_group_gemm_starts(
   else if (out_tensors.dtype() == TENSOR_C_TYPE) {                        \
     get_group_gemm_starts<cutlass::float_e4m3_t, C_TYPE, float>           \
         <<<1, num_experts, 0, stream>>>(                                  \
-            static_cast<int32_t*>(expert_offsets.data_ptr()),             \
+            static_cast<int64_t*>(expert_offsets.data_ptr()),             \
             static_cast<cutlass::float_e4m3_t**>(a_ptrs.data_ptr()),      \
             static_cast<cutlass::float_e4m3_t**>(b_ptrs.data_ptr()),      \
             static_cast<C_TYPE**>(out_ptrs.data_ptr()),                   \
@@ -61,6 +61,8 @@ void run_get_group_gemm_starts(
   TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn);
   TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
   TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
+  // expect int64_t to avoid overflow during offset calculations
+  TORCH_CHECK(expert_offsets.dtype() == torch::kInt64);

   int num_experts = static_cast<int>(expert_offsets.size(0));
   bool per_act_token = a_scales.numel() != 1;
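The int32 to int64 widening matters because expert_offsets is turned into raw element offsets into the activation and output tensors, and with large batches those products overflow 32 bits. Rough arithmetic with illustrative sizes:

    # offset ~ (rows routed before this expert) * hidden_size, in elements
    tokens, topk, hidden = 65_536, 8, 7_168
    max_offset = tokens * topk * hidden   # 3,758,096,384
    assert max_offset > 2**31 - 1         # exceeds INT32_MAX -> needs int64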

csrc/quantization/cutlass_w8a8/moe/moe_data.cu

Lines changed: 50 additions & 15 deletions

@@ -104,6 +104,53 @@ __global__ void compute_arg_sorts(const int32_t* __restrict__ topk_ids,
   }
 }

+namespace {
+inline void launch_compute_problem_sizes(const torch::Tensor& topk_ids,
+                                         torch::Tensor& problem_sizes1,
+                                         torch::Tensor& problem_sizes2,
+                                         torch::Tensor& atomic_buffer,
+                                         int64_t num_experts, int64_t n,
+                                         int64_t k, cudaStream_t stream,
+                                         const bool swap_ab) {
+  int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel());
+
+  const int32_t* topk_ptr = static_cast<const int32_t*>(topk_ids.data_ptr());
+  int32_t* ps1_ptr = static_cast<int32_t*>(problem_sizes1.data_ptr());
+  int32_t* ps2_ptr = static_cast<int32_t*>(problem_sizes2.data_ptr());
+  int32_t* atomic_ptr = static_cast<int32_t*>(atomic_buffer.data_ptr());
+
+  if (swap_ab) {
+    compute_problem_sizes<true><<<num_experts, num_threads, 0, stream>>>(
+        topk_ptr, ps1_ptr, ps2_ptr, atomic_ptr,
+        static_cast<int>(topk_ids.numel()), static_cast<int>(n),
+        static_cast<int>(k));
+  } else {
+    compute_problem_sizes<false><<<num_experts, num_threads, 0, stream>>>(
+        topk_ptr, ps1_ptr, ps2_ptr, atomic_ptr,
+        static_cast<int>(topk_ids.numel()), static_cast<int>(n),
+        static_cast<int>(k));
+  }
+}
+}  // namespace
+
+void get_cutlass_moe_mm_problem_sizes_caller(
+    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
+    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
+    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets) {
+  auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index());
+  auto options_int32 =
+      torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device());
+  torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32);
+
+  // Swap-AB should be disabled for FP4 path
+  bool may_swap_ab = (!blockscale_offsets.has_value()) &&
+                     (topk_ids.numel() <= SWAP_AB_THRESHOLD);
+
+  launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2,
+                               atomic_buffer, num_experts, n, k, stream,
+                               may_swap_ab);
+}
+
 void get_cutlass_moe_mm_data_caller(
     const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
     torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
@@ -121,21 +168,9 @@ void get_cutlass_moe_mm_data_caller(
   bool may_swap_ab = (!blockscale_offsets.has_value()) &&
                      (topk_ids.numel() <= SWAP_AB_THRESHOLD);

-  if (may_swap_ab) {
-    compute_problem_sizes<true><<<num_experts, num_threads, 0, stream>>>(
-        static_cast<const int32_t*>(topk_ids.data_ptr()),
-        static_cast<int32_t*>(problem_sizes1.data_ptr()),
-        static_cast<int32_t*>(problem_sizes2.data_ptr()),
-        static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(), n,
-        k);
-  } else {
-    compute_problem_sizes<false><<<num_experts, num_threads, 0, stream>>>(
-        static_cast<const int32_t*>(topk_ids.data_ptr()),
-        static_cast<int32_t*>(problem_sizes1.data_ptr()),
-        static_cast<int32_t*>(problem_sizes2.data_ptr()),
-        static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(), n,
-        k);
-  }
+  launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2,
+                               atomic_buffer, num_experts, n, k, stream,
+                               may_swap_ab);

   if (blockscale_offsets.has_value()) {
     // fp4 path
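What the new entry point computes can be stated compactly: each expert's M dimension is the number of (token, k) pairs routed to it, and N/K come from the layer shape. A CPU reference under the non-swap-AB layout (a sketch; when swap-AB is taken, the kernel swaps the roles of M and N within each triple):

    import torch

    def ref_problem_sizes(topk_ids: torch.Tensor, num_experts: int, n: int, k: int):
        # m_e = number of routed (token, k) pairs per expert
        m = torch.bincount(topk_ids.reshape(-1).long(), minlength=num_experts)
        full = lambda v: torch.full_like(m, v)
        ps1 = torch.stack([m, full(2 * n), full(k)], dim=1).int()  # GEMM1: (m_e, 2n, k)
        ps2 = torch.stack([m, full(k), full(n)], dim=1).int()      # GEMM2: (m_e, k, n)
        return ps1, ps2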

csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu

Lines changed: 24 additions & 0 deletions

@@ -76,6 +76,11 @@ void get_cutlass_moe_mm_data_caller(
     const int64_t num_experts, const int64_t n, const int64_t k,
     const std::optional<torch::Tensor>& blockscale_offsets);

+void get_cutlass_moe_mm_problem_sizes_caller(
+    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
+    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
+    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets);
+
 void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets,
                                          torch::Tensor& problem_sizes1,
                                          torch::Tensor& problem_sizes2,
@@ -293,6 +298,25 @@ void get_cutlass_moe_mm_data(
       version_num, ". Required capability: 90 or 100");
 }

+void get_cutlass_moe_mm_problem_sizes(
+    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
+    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
+    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets) {
+  int32_t version_num = get_sm_version_num();
+#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
+    (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100)
+  get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1,
+                                          problem_sizes2, num_experts, n, k,
+                                          blockscale_offsets);
+  return;
+#endif
+  TORCH_CHECK_NOT_IMPLEMENTED(
+      false,
+      "No compiled get_cutlass_moe_mm_problem_sizes: no cutlass_scaled_mm "
+      "kernel for CUDA device capability: ",
+      version_num, ". Required capability: 90 or 100");
+}
+
 void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
                                   torch::Tensor& problem_sizes1,
                                   torch::Tensor& problem_sizes2,
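The new wrapper follows the existing guard pattern: it dispatches only when the extension was compiled with ENABLE_CUTLASS_MOE_SM90/SM100, otherwise it raises with the device's SM version. An illustrative caller-side check mirroring that error message (not a vLLM API):

    import torch

    def cutlass_moe_supported(device: int = 0) -> bool:
        # The entry point requires capability 90 or 100 per its error message.
        major, minor = torch.cuda.get_device_capability(device)
        return 10 * major + minor in (90, 100)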

csrc/torch_bindings.cpp

Lines changed: 13 additions & 0 deletions

@@ -440,6 +440,19 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       {stride_tag});
   ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data);

+  // A function that computes problem sizes for each expert's multiplication
+  // used by the two mms called from fused MoE operation. It takes topk_ids as
+  // an input, and computes problem_sizes1 and problem_sizes2 only.
+  ops.def(
+      "get_cutlass_moe_mm_problem_sizes(Tensor topk_ids, "
+      "                                 Tensor! problem_sizes1, "
+      "                                 Tensor! problem_sizes2, "
+      "                                 int num_experts, int n, int k, "
+      "                                 Tensor? blockscale_offsets) -> ()",
+      {stride_tag});
+  ops.impl("get_cutlass_moe_mm_problem_sizes", torch::kCUDA,
+           &get_cutlass_moe_mm_problem_sizes);
+
   // A function that computes data required to run fused MoE with w8a8 grouped
   // GEMM and PPLX. It takes expert_num_tokens and non_zero_expert_idxs
   // as an input, and computes expert_offsets (token start indices of each
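Once registered, the op is callable from Python through torch.ops. A hedged usage sketch, assuming the problem-size tensors follow the (num_experts, 3) int32 convention used by get_cutlass_moe_mm_data:

    import torch

    num_experts, n, k = 8, 1024, 2048
    topk_ids = torch.randint(0, num_experts, (256, 2),
                             dtype=torch.int32, device="cuda")
    problem_sizes1 = torch.empty((num_experts, 3), dtype=torch.int32, device="cuda")
    problem_sizes2 = torch.empty((num_experts, 3), dtype=torch.int32, device="cuda")
    torch.ops._C.get_cutlass_moe_mm_problem_sizes(
        topk_ids, problem_sizes1, problem_sizes2, num_experts, n, k, None)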

tests/kernels/moe/test_cutlass_moe.py

Lines changed: 14 additions & 4 deletions

@@ -207,6 +207,10 @@ def run_8_bit(moe_tensors: MOETensors8Bit,
         'topk_ids': topk_ids,
         'w1_scale': moe_tensors.w1_scale,
         'w2_scale': moe_tensors.w2_scale,
+        'ab_strides1': moe_tensors.ab_strides1,
+        'ab_strides2': moe_tensors.ab_strides2,
+        'c_strides1': moe_tensors.c_strides1,
+        'c_strides2': moe_tensors.c_strides2,
         'per_act_token': per_act_token,
         'a1_scale': None  #moe_tensors.a_scale
     }
@@ -424,8 +428,8 @@ def test_run_cutlass_moe_fp8(
     topk_ids[0][1] = 1

     workspace13_shape = (m * topk, max(2 * n, k))
-    workspace2_shape = (m * topk, n)
-    output_shape = (m * topk, k)
+    workspace2_shape = (m * topk, max(n, k))
+    output_shape = (m, k)

     workspace13 = torch.empty(prod(workspace13_shape),
                               device="cuda",
@@ -440,6 +444,11 @@ def test_run_cutlass_moe_fp8(
         expert_map[start:end] = list(range(num_local_experts))
     expert_map = torch.tensor(expert_map, dtype=torch.int32, device="cuda")

+    ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
+    ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64)
+    c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64)
+    c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
+
     activation = lambda o, i: torch.ops._C.silu_and_mul(o, i)
     a1q, a1q_scale = moe_kernel_quantize_input(mt.a, mt.a_scale,
                                                torch.float8_e4m3fn,
@@ -448,8 +457,9 @@ def test_run_cutlass_moe_fp8(
     func = lambda output: run_cutlass_moe_fp8(
         output, a1q, mt.w1_q, mt.w2_q, topk_ids, activation,
         global_num_experts, expert_map, mt.w1_scale, mt.w2_scale,
-        a1q_scale, None, workspace13, workspace2, None, mt.a.dtype,
-        per_act_token, per_out_channel, False)
+        a1q_scale, None, ab_strides1, ab_strides2, c_strides1, c_strides2,
+        workspace13, workspace2, None, mt.a.dtype, per_act_token,
+        per_out_channel, False, topk_weights)

     workspace13.random_()
     output_random_workspace = torch.empty(output_shape,
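The reshaped buffers follow from the fused unpermute: the two GEMMs still operate on one row per (token, expert) pair, but the kernel now applies the top-k weighted reduction itself (hence topk_weights in the call), so the final output shrinks from (m * topk, k) to (m, k). Illustrative sizing with hypothetical dimensions:

    m, n, k, topk = 224, 1024, 2048, 2
    workspace13_shape = (m * topk, max(2 * n, k))  # GEMM1 output / GEMM2 staging
    workspace2_shape = (m * topk, max(n, k))       # activation / unpermute staging
    output_shape = (m, k)                          # reduced over top-k per token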

tests/kernels/moe/test_moe_permute_unpermute.py

Lines changed: 5 additions & 1 deletion

@@ -238,7 +238,11 @@ def test_moe_permute_unpermute(n_token: int, n_hidden: int, topk: int,
                                atol=0,
                                rtol=0)
     # check mindice
-    torch.testing.assert_close(gold_m_indices, m_indices, atol=0, rtol=0)
+    # current kernel usage assumes deepgemm requires align_block_size
+    # when it's not provided then we don't compute m_indices (for cutlass)
+    if align_block_size is not None:
+        torch.testing.assert_close(gold_m_indices, m_indices, atol=0, rtol=0)
+
     # check permuted_hidden_states, only valid token
     torch.testing.assert_close(gold_permuted_hidden_states[valid_row_idx],
                                permuted_hidden_states[valid_row_idx],
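The relaxed check reflects that m_indices is now produced only on the DeepGEMM path (align_block_size set). A sketch, under that assumption, of what a reference m_indices looks like: each expert's id repeated once per row of its block-aligned token range (hypothetical helper, for illustration only):

    import torch

    def ref_m_indices(expert_first_token_offset: torch.Tensor,
                      align_block_size: int) -> torch.Tensor:
        # per-expert token counts, padded up to a multiple of align_block_size
        counts = expert_first_token_offset[1:] - expert_first_token_offset[:-1]
        padded = (counts + align_block_size - 1) // align_block_size * align_block_size
        experts = torch.arange(counts.numel(), dtype=torch.int32,
                               device=counts.device)
        return torch.repeat_interleave(experts, padded)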
