Commit 22a62ea

bugfix: fixed cutlass fused moe usage of FP4QuantizationSFLayout::SWIZZLED (#1371)
## 📌 Description

cutlass fused moe modules are broken after #1355 because the structure of `FP4QuantizationSFLayout` has changed. This PR fixes the issue.

## 🔍 Related Issues

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).

## Reviewer Notes

cc @wenscarl @ttyio
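For context, the breakage comes from #1355 no longer exposing a bare `SWIZZLED` member on the layout enum, so each call site now has to name the tile size explicitly. The sketch below is illustrative only: apart from `SWIZZLED_128x4`, the member names and comments are assumptions, not the actual definition in the FP4 quantization headers.

```cpp
// Illustrative sketch of the enum change behind this fix (not the real header).
// Only SWIZZLED_128x4 is confirmed by this diff; the other member and its
// semantics are assumptions made for illustration.
enum class FP4QuantizationSFLayout {
  SWIZZLED_128x4,  // scale factors stored in 128x4 swizzled tiles (previously the bare SWIZZLED member)
  LINEAR,          // assumed: a non-swizzled, row-major layout variant
};

// Call sites in cutlass_fused_moe_kernels.cuh now pass the qualified name, e.g.:
//   cvt_quant_to_fp4_get_sf_out_offset<ElementSF, NumThreadsPerSF, VecSize>(
//       ..., act_sf_expert, FP4QuantizationSFLayout::SWIZZLED_128x4);
```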
1 parent dbb438c commit 22a62ea

File tree

1 file changed: +3 −3 lines

csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh (3 additions, 3 deletions)
@@ -983,7 +983,7 @@ __device__ auto quantizePackedFPXValue(
   auto sf_out = cvt_quant_to_fp4_get_sf_out_offset<TmaWarpSpecializedGroupedGemmInput::ElementSF,
                                                    NumThreadsPerSF, VecSize>(
       std::nullopt /* batchIdx */, token_id - num_tokens_before_expert, elem_idx,
-      std::nullopt /* numRows */, num_cols, act_sf_expert, FP4QuantizationSFLayout::SWIZZLED);
+      std::nullopt /* numRows */, num_cols, act_sf_expert, FP4QuantizationSFLayout::SWIZZLED_128x4);
 
   // Do the conversion and set the output and scaling factor
   auto func = [&]() {
@@ -1023,15 +1023,15 @@ __device__ void writeSF(int64_t num_tokens_before_expert, int64_t expert_id,
   auto sf_out = cvt_quant_to_fp4_get_sf_out_offset<TmaWarpSpecializedGroupedGemmInput::ElementSF,
                                                    NumThreadsPerSF, VecSize>(
       std::nullopt /* batchIdx */, token_id - num_tokens_before_expert, elem_idx,
-      std::nullopt /* numRows */, num_cols, act_sf_expert, FP4QuantizationSFLayout::SWIZZLED);
+      std::nullopt /* numRows */, num_cols, act_sf_expert, FP4QuantizationSFLayout::SWIZZLED_128x4);
   if (sf_out) {
     if (input_sf) {
       auto const sf_in =
           cvt_quant_to_fp4_get_sf_out_offset<TmaWarpSpecializedGroupedGemmInput::ElementSF,
                                              NumThreadsPerSF, VecSize>(
               std::nullopt /* batchIdx */, source_token_id, elem_idx, std::nullopt /* numRows */,
               num_cols, const_cast<TmaWarpSpecializedGroupedGemmInput::ElementSF*>(input_sf),
-              FP4QuantizationSFLayout::SWIZZLED);
+              FP4QuantizationSFLayout::SWIZZLED_128x4);
       *sf_out = *sf_in;
     } else {
       *sf_out = 0x00;
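Note that the same layout constant is passed on both the write path (`sf_out`, the per-expert activation scale factors) and the read path (`sf_in`, from `input_sf`). Presumably `cvt_quant_to_fp4_get_sf_out_offset` derives the element offset from this layout, so if the two call sites disagreed, the copied scale factors would land at mismatched positions; keeping all three sites on `SWIZZLED_128x4` restores the behavior that existed before #1355.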
