
Commit d0eb60a

chore: run pre-commit fixes

Signed-off-by: EdalatiAli <[email protected]>

1 parent 0b2be10 · commit d0eb60a

7 files changed · +259 −293 lines

csrc/moe/mxfp8_grouped_gemm/es_sm100_mxfp8_blockscaled.cu

Lines changed: 24 additions & 18 deletions
@@ -8,39 +8,45 @@
 #include "es_sm100_mxfp8_blockscaled_launcher.cuh"
 
 void es_sm100_mxfp8_blockscaled_grouped_mm(
-    const torch::Tensor& a,
-    const torch::Tensor& b,
-    const torch::Tensor& sfa,
-    const torch::Tensor& sfb,
-    torch::Tensor& d,
-    const torch::Tensor& problem_sizes,
-    const torch::Tensor& expert_offsets,
+    const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& sfa,
+    const torch::Tensor& sfb, torch::Tensor& d,
+    const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets,
     const torch::Tensor& blockscale_offsets) {
 #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
   TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor");
-  TORCH_CHECK(problem_sizes.size(1) == 3, "problem_sizes must have shape (num_experts, 3)");
-  TORCH_CHECK(
-      problem_sizes.size(0) == expert_offsets.size(0), "Number of experts in problem_sizes must match expert_offsets");
-  TORCH_CHECK(problem_sizes.dtype() == torch::kInt32, "problem_sizes must be int32");
+  TORCH_CHECK(problem_sizes.size(1) == 3,
+              "problem_sizes must have shape (num_experts, 3)");
+  TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0),
+              "Number of experts in problem_sizes must match expert_offsets");
+  TORCH_CHECK(problem_sizes.dtype() == torch::kInt32,
+              "problem_sizes must be int32");
   TORCH_CHECK(a.dim() == 2, "a must be a 2D tensor of shape (num_tokens, k)");
-  TORCH_CHECK(b.dim() == 3, "b must be a 3D tensor of shape (num_experts, k, n)");
-  TORCH_CHECK(a.size(1) == b.size(1) && a.size(1) % 128 == 0, "k should align 128");
+  TORCH_CHECK(b.dim() == 3,
+              "b must be a 3D tensor of shape (num_experts, k, n)");
+  TORCH_CHECK(a.size(1) == b.size(1) && a.size(1) % 128 == 0,
+              "k should align 128");
   TORCH_CHECK(b.size(2) % 128 == 0, "n should align 128");
   TORCH_CHECK(a.strides()[1] == 1, "a must be row major");
   TORCH_CHECK(b.strides()[1] == 1, "b must be column major");
 
   auto stream = at::cuda::getCurrentCUDAStream();
   if (d.dtype() == torch::kBFloat16) {
-    expert_specialization::es_sm100_mxfp8_blockscaled_group_mm_dispatch_out_dtype<cutlass::bfloat16_t>(
-        a, b, sfa, sfb, d, problem_sizes, expert_offsets, blockscale_offsets, stream);
+    expert_specialization::
+        es_sm100_mxfp8_blockscaled_group_mm_dispatch_out_dtype<
+            cutlass::bfloat16_t>(a, b, sfa, sfb, d, problem_sizes,
+                                 expert_offsets, blockscale_offsets, stream);
   } else if (d.dtype() == torch::kFloat16) {
-    expert_specialization::es_sm100_mxfp8_blockscaled_group_mm_dispatch_out_dtype<cutlass::half_t>(
-        a, b, sfa, sfb, d, problem_sizes, expert_offsets, blockscale_offsets, stream);
+    expert_specialization::
+        es_sm100_mxfp8_blockscaled_group_mm_dispatch_out_dtype<cutlass::half_t>(
+            a, b, sfa, sfb, d, problem_sizes, expert_offsets,
+            blockscale_offsets, stream);
   } else {
     TORCH_CHECK(false, "dtype must be kFloat16 or kBFloat16");
   }
 #else
-  TORCH_CHECK(false, "No implemented es_sm100_mxfp8_blockscaled_grouped_mm for current device");
+  TORCH_CHECK(false,
+              "No implemented es_sm100_mxfp8_blockscaled_grouped_mm for "
+              "current device");
 #endif
 }
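For readers of this diff: the TORCH_CHECKs above pin down a fairly specific input contract. The sketch below constructs host-side tensors that satisfy it; all sizes and variable names are illustrative assumptions (not part of this commit), the scale-factor tensors sfa/sfb are omitted because their shapes depend on the blockscale layout, and a libtorch build with FP8 dtypes is assumed.

// Sketch only: inputs that satisfy the checks in
// es_sm100_mxfp8_blockscaled_grouped_mm. Sizes are illustrative.
#include <torch/torch.h>

void make_conforming_inputs() {
  const int64_t num_tokens = 256, k = 256, n = 512, num_experts = 4;
  auto cuda = torch::TensorOptions().device(torch::kCUDA);

  // a: (num_tokens, k) with k % 128 == 0 and strides()[1] == 1 (row major).
  auto a = torch::empty({num_tokens, k}, cuda.dtype(torch::kFloat8_e4m3fn));

  // b: (num_experts, k, n) with strides()[1] == 1, i.e. each expert's
  // (k, n) slice is column major. Allocate (num_experts, n, k) and swap
  // the last two dims so the k dimension stays contiguous.
  auto b = torch::empty({num_experts, n, k}, cuda.dtype(torch::kFloat8_e4m3fn))
               .transpose(1, 2);

  // problem_sizes: int32, shape (num_experts, 3), one (m, n, k) per expert.
  auto problem_sizes = torch::tensor(
      {{64, 512, 256}, {64, 512, 256}, {64, 512, 256}, {64, 512, 256}},
      cuda.dtype(torch::kInt32));

  // expert_offsets: starting row of each expert's tokens within a; its
  // length must equal problem_sizes.size(0), as the checks require.
  auto expert_offsets =
      torch::tensor({0, 64, 128, 192}, cuda.dtype(torch::kInt32));
}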

csrc/moe/mxfp8_grouped_gemm/es_sm100_mxfp8_blockscaled_functor.cuh

Lines changed: 22 additions & 20 deletions
@@ -38,18 +38,10 @@ struct Sm100Mxfp8BlockScaledOffsetFunctor {
 
   Sm100Mxfp8BlockScaledOffsetFunctor() = default;
   Sm100Mxfp8BlockScaledOffsetFunctor(
-      int* _expert_offsets,
-      int* _blockscale_offsets,
-      ElementA* _a_base,
-      ElementB* _b_base,
-      ElementSF* _sfa_base,
-      ElementSF* _sfb_base,
-      ElementD* _d_base,
-      ElementA** _a_offsets,
-      ElementB** _b_offsets,
-      ElementSF** _sfa_offsets,
-      ElementSF** _sfb_offsets,
-      ElementD** _d_offsets)
+      int* _expert_offsets, int* _blockscale_offsets, ElementA* _a_base,
+      ElementB* _b_base, ElementSF* _sfa_base, ElementSF* _sfb_base,
+      ElementD* _d_base, ElementA** _a_offsets, ElementB** _b_offsets,
+      ElementSF** _sfa_offsets, ElementSF** _sfb_offsets, ElementD** _d_offsets)
       : expert_offsets{_expert_offsets},
         blockscale_offsets{_blockscale_offsets},
         a_base(_a_base),
@@ -65,7 +57,8 @@ struct Sm100Mxfp8BlockScaledOffsetFunctor {
 
   void CUTE_DEVICE operator()(int64_t expert_id, int m, int n, int k) {
     int64_t expert_offset = static_cast<int64_t>(expert_offsets[expert_id]);
-    int64_t blockscale_offset = static_cast<int64_t>(blockscale_offsets[expert_id]);
+    int64_t blockscale_offset =
+        static_cast<int64_t>(blockscale_offsets[expert_id]);
     int64_t a_stride = expert_offset * k;
     int64_t b_stride = expert_id * k * n;
     int64_t d_stride = expert_offset * n;
@@ -89,14 +82,17 @@ struct Sm100Mxfp8BlockScaledLayoutFunctor {
   LayoutSFB* layout_sfb_base{nullptr};
 
   Sm100Mxfp8BlockScaledLayoutFunctor() = default;
-  Sm100Mxfp8BlockScaledLayoutFunctor(LayoutSFA* _layout_sfa_base, LayoutSFB* _layout_sfb_base)
+  Sm100Mxfp8BlockScaledLayoutFunctor(LayoutSFA* _layout_sfa_base,
+                                     LayoutSFB* _layout_sfb_base)
       : layout_sfa_base(_layout_sfa_base), layout_sfb_base(_layout_sfb_base) {}
 
   void CUTE_DEVICE operator()(int64_t expert_id, int m, int n, int k) {
     LayoutSFA* layout_sfa_ptr = layout_sfa_base + expert_id;
     LayoutSFB* layout_sfb_ptr = layout_sfb_base + expert_id;
-    *layout_sfa_ptr = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(m, n, k, 1));
-    *layout_sfb_ptr = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(m, n, k, 1));
+    *layout_sfa_ptr = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(
+        cute::make_shape(m, n, k, 1));
+    *layout_sfb_ptr = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB(
+        cute::make_shape(m, n, k, 1));
   }
 };
 
@@ -110,8 +106,12 @@ struct Sm100Mxfp8BlockScaledStrideFunctor {
   StrideD* stride_D_base{nullptr};
 
   Sm100Mxfp8BlockScaledStrideFunctor() = default;
-  Sm100Mxfp8BlockScaledStrideFunctor(StrideA* _stride_A_base, StrideB* _stride_B_base, StrideD* _stride_D_base)
-      : stride_A_base(_stride_A_base), stride_B_base(_stride_B_base), stride_D_base(_stride_D_base) {}
+  Sm100Mxfp8BlockScaledStrideFunctor(StrideA* _stride_A_base,
+                                     StrideB* _stride_B_base,
+                                     StrideD* _stride_D_base)
+      : stride_A_base(_stride_A_base),
+        stride_B_base(_stride_B_base),
+        stride_D_base(_stride_D_base) {}
 
   void CUTE_DEVICE operator()(int64_t expert_id, int m, int n, int k) {
     StrideA* stride_A = stride_A_base + expert_id;
@@ -123,9 +123,11 @@ struct Sm100Mxfp8BlockScaledStrideFunctor {
   }
 };
 
-template <typename OffsetFunctor, typename LayoutFunctor, typename StrideFunctor>
+template <typename OffsetFunctor, typename LayoutFunctor,
+          typename StrideFunctor>
 __global__ void sm100Mxfp8BlockscaledGroupedGemmPreComputeKernel(
-    int* problem_sizes, OffsetFunctor offset_functor, LayoutFunctor layout_functor, StrideFunctor stride_functor) {
+    int* problem_sizes, OffsetFunctor offset_functor,
+    LayoutFunctor layout_functor, StrideFunctor stride_functor) {
   int64_t expert_id = static_cast<int64_t>(threadIdx.x);
   int m = problem_sizes[expert_id * 3 + 0];
   int n = problem_sizes[expert_id * 3 + 1];
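A note on the precompute kernel in the last hunk: it indexes problem_sizes with threadIdx.x, so it is evidently meant to run with one thread per expert. A minimal launch sketch under that assumption; the host-side launcher is not shown in this diff, and all names below are illustrative:

// Illustrative launch: one block, one thread per expert, so threadIdx.x
// serves directly as expert_id in the kernel body. Template arguments are
// deduced from the functor instances, which the host launcher is assumed
// to have constructed (problem_sizes_ptr likewise).
// Inside, each thread applies the three functors for its expert, e.g. the
// offset functor computes a_stride = expert_offsets[expert_id] * k because
// each expert's tokens are packed consecutively in a.
sm100Mxfp8BlockscaledGroupedGemmPreComputeKernel<<<1, num_experts, 0, stream>>>(
    problem_sizes_ptr, offset_functor, layout_functor, stride_functor);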

csrc/moe/mxfp8_grouped_gemm/es_sm100_mxfp8_blockscaled_group_quant.cu

Lines changed: 13 additions & 10 deletions
@@ -8,11 +8,9 @@
 #include "es_sm100_mxfp8_blockscaled_group_quant.cuh"
 
 void es_sm100_mxfp8_blockscaled_grouped_quant(
-    const torch::Tensor& input,
-    const torch::Tensor& problem_sizes,
+    const torch::Tensor& input, const torch::Tensor& problem_sizes,
     const torch::Tensor& expert_offsets,
-    const torch::Tensor& blockscale_offsets,
-    torch::Tensor& quant_output,
+    const torch::Tensor& blockscale_offsets, torch::Tensor& quant_output,
     torch::Tensor& scale_factor) {
 #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
   TORCH_CHECK(input.dim() == 2, "input must be 2D tensor");
@@ -26,20 +24,25 @@ void es_sm100_mxfp8_blockscaled_grouped_quant(
       "expert_offsets must be 1D and have size equal to the number of groups");
   TORCH_CHECK(
       blockscale_offsets.dim() == 1 && blockscale_offsets.size(0) == groups,
-      "blockscale_offsets must be 1D and have size equal to the number of groups");
+      "blockscale_offsets must be 1D and have size equal to the number of "
+      "groups");
 
   auto stream = at::cuda::getCurrentCUDAStream();
   if (input.dtype() == torch::kBFloat16) {
-    expert_specialization::launch_es_sm100_mxfp8_blockscaled_grouped_quant<__nv_bfloat16>(
-        input, problem_sizes, expert_offsets, blockscale_offsets, quant_output, scale_factor);
+    expert_specialization::launch_es_sm100_mxfp8_blockscaled_grouped_quant<
+        __nv_bfloat16>(input, problem_sizes, expert_offsets, blockscale_offsets,
+                       quant_output, scale_factor);
   } else if (input.dtype() == torch::kFloat16) {
-    expert_specialization::launch_es_sm100_mxfp8_blockscaled_grouped_quant<__half>(
-        input, problem_sizes, expert_offsets, blockscale_offsets, quant_output, scale_factor);
+    expert_specialization::launch_es_sm100_mxfp8_blockscaled_grouped_quant<
+        __half>(input, problem_sizes, expert_offsets, blockscale_offsets,
+                quant_output, scale_factor);
   } else {
     TORCH_CHECK(false, "dtype must be kFloat16 or kBFloat16");
  }
 #else
-  TORCH_CHECK(false, "No implemented es_sm100_mxfp8_blockscaled_grouped_quant for current device");
+  TORCH_CHECK(false,
+              "No implemented es_sm100_mxfp8_blockscaled_grouped_quant for "
+              "current device");
 #endif
 }
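Putting the two entry points together: the quant kernel appears to produce the MXFP8 activations and scale factors that the grouped GEMM in the first file then consumes as a and sfa, since both take the same problem_sizes, expert_offsets, and blockscale_offsets. A hedged sketch of that host-side flow, using only the signatures visible in this diff; the wrapper function and the assumption that all buffers are preallocated are hypothetical:

#include <torch/torch.h>

// Forward declarations copied from the signatures in this commit.
void es_sm100_mxfp8_blockscaled_grouped_quant(
    const torch::Tensor& input, const torch::Tensor& problem_sizes,
    const torch::Tensor& expert_offsets,
    const torch::Tensor& blockscale_offsets, torch::Tensor& quant_output,
    torch::Tensor& scale_factor);
void es_sm100_mxfp8_blockscaled_grouped_mm(
    const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& sfa,
    const torch::Tensor& sfb, torch::Tensor& d,
    const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets,
    const torch::Tensor& blockscale_offsets);

// Hypothetical wrapper: quantize the grouped activations, then run the
// blockscaled grouped GEMM on the quantized result.
void mxfp8_grouped_forward(
    const torch::Tensor& x,             // (num_tokens, k) bf16/fp16 input
    const torch::Tensor& b,             // (num_experts, k, n) fp8 weights
    const torch::Tensor& sfb,           // weight scale factors
    const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets,
    const torch::Tensor& blockscale_offsets,
    torch::Tensor& quant_output,        // fp8 activations (filled by quant)
    torch::Tensor& scale_factor,        // activation scale factors
    torch::Tensor& d) {                 // (num_tokens, n) bf16/fp16 output
  es_sm100_mxfp8_blockscaled_grouped_quant(x, problem_sizes, expert_offsets,
                                           blockscale_offsets, quant_output,
                                           scale_factor);
  es_sm100_mxfp8_blockscaled_grouped_mm(quant_output, b, scale_factor, sfb, d,
                                        problem_sizes, expert_offsets,
                                        blockscale_offsets);
}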
