Commit d3e9b44

bugfix: add check for empty MoE tactics and allow sm121 to use sm120 config (#1861)
## 📌 Description

- Add a safety check for empty tactics to provide a clear error message.
- Allow SM121 devices to use SM120 kernel configurations.
- Mark `test_moe_mxfp8_mxfp4` as xfail (kernel not yet implemented) for SM120/SM121.

## 🔍 Related Issues

<!-- Link any related issues here -->

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

## Reviewer Notes

cc: @djmmoss @yzh119 @bkryu @cyx-6 @nv-yunzheq
1 parent c691768 · commit d3e9b44

File tree: 3 files changed (+10 / −3 lines)

csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_binding.cu

Lines changed: 5 additions & 0 deletions

```diff
@@ -220,6 +220,11 @@ class FusedMoeRunner : public tvm::ffi::ModuleObj {

     mProfiler = std::make_shared<kernels::GemmProfilerBackend>();
     mAllProfiles = mKernelRunner->getTactics();
+    TVM_FFI_ICHECK(!mAllProfiles.empty())
+        << "No valid tactics available for fused moe op with the requested input combination "
+           "Activation: "
+        << DLDataTypeToString(mActivationDtype) << ", Weight: " << DLDataTypeToString(mWeightDtype)
+        << ", Output: " << DLDataTypeToString(mOutputDtype);
   }

   void runMoe(Tensor output, Tensor input, Tensor token_selected_experts,
```
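For context, a minimal Python sketch of the behavior this guard introduces: if the tactic lookup for the requested dtype combination comes back empty, fail fast with a descriptive error instead of crashing later during profiling. The `get_tactics` function and the registry contents are hypothetical stand-ins, not FlashInfer's actual API.

```python
# Illustrative sketch only -- get_tactics and the registry below are
# hypothetical stand-ins for mKernelRunner->getTactics() on the C++ side.
def get_tactics(activation_dtype: str, weight_dtype: str, output_dtype: str) -> list:
    supported = {
        # (activation, weight, output) -> candidate kernel tactics
        ("bfloat16", "mxfp4", "bfloat16"): ["tactic_a", "tactic_b"],
    }
    tactics = supported.get((activation_dtype, weight_dtype, output_dtype), [])
    if not tactics:  # mirrors TVM_FFI_ICHECK(!mAllProfiles.empty())
        raise ValueError(
            "No valid tactics available for fused moe op with the requested "
            f"input combination Activation: {activation_dtype}, "
            f"Weight: {weight_dtype}, Output: {output_dtype}"
        )
    return tactics
```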

csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h

Lines changed: 3 additions & 1 deletion

```diff
@@ -731,7 +731,9 @@ void MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::dispatchToArch(
   // 110 below logging helps confirming the cutlass pipeline matches the device major version
   bool is_sm110 = inputs.gemm_config.sm_version == 100 && sm_ == 110;
   bool is_sm103 = inputs.gemm_config.sm_version == 100 && sm_ == 103;
-  TLLM_CHECK_WITH_INFO(is_same_sm || is_sm110 || is_sm103,
+  // SM120 and SM121 are architecturally identical
+  bool is_sm120 = (inputs.gemm_config.sm_version == 120) && (sm_ == 120 || sm_ == 121);
+  TLLM_CHECK_WITH_INFO(is_same_sm || is_sm110 || is_sm103 || is_sm120,
                        "Using SM %d configuration for SM %d device",
                        inputs.gemm_config.sm_version, sm_);
   TLLM_CHECK_WITH_INFO(inputs.biases != nullptr || hopper_inputs.ptr_c == nullptr,
```
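The dispatch check reads as a small compatibility predicate over (configuration SM, device SM) pairs. A Python sketch, assuming only the pairings visible in the diff are allowed; `config_matches_device` is an illustrative name, not a function in the codebase.

```python
def config_matches_device(config_sm: int, device_sm: int) -> bool:
    """Illustrative mirror of the TLLM_CHECK_WITH_INFO condition above."""
    if config_sm == device_sm:
        return True  # is_same_sm
    if config_sm == 100 and device_sm in (110, 103):
        return True  # is_sm110 / is_sm103: these devices reuse SM100 configs
    if config_sm == 120 and device_sm == 121:
        return True  # is_sm120: SM120 and SM121 are architecturally identical
    return False

assert config_matches_device(120, 121)      # newly allowed by this commit
assert not config_matches_device(120, 100)  # still rejected
```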

tests/moe/test_trtllm_cutlass_fused_moe.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -1093,8 +1093,8 @@ def dequant_mxfp4_batches(
     ("alpha", "beta", "limit"), [(None, None, None), (0.5, 0.0, 7.0), (1.702, 1.0, 7.0)]
 )
 @pytest.mark.skipif(
-    torch.cuda.get_device_capability()[0] not in [10, 11, 12],
-    reason="MXFP8xMXFP4 is only supported on SM100, SM110 and SM120",
+    torch.cuda.get_device_capability()[0] not in [10, 11],
+    reason="MXFP8xMXFP4 is only supported on SM100 and SM110",
 )
 def test_moe_mxfp8_mxfp4(
     batch_size,
```
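On the test side, `torch.cuda.get_device_capability()` returns a `(major, minor)` tuple, so SM120 reports `(12, 0)` and SM121 reports `(12, 1)`; the narrowed `skipif` keeps only SM10x/SM11x devices in scope. A hedged sketch of how the xfail described in the commit message could be expressed for SM120/SM121; the `xfail_sm120_121` marker is hypothetical, not the test file's actual code.

```python
import pytest
import torch

def _device_sm() -> int:
    major, minor = torch.cuda.get_device_capability()
    return major * 10 + minor  # e.g. (12, 1) -> 121

# Hypothetical marker: expect failure on SM120/SM121 until the
# MXFP8xMXFP4 kernel is implemented for those architectures.
xfail_sm120_121 = pytest.mark.xfail(
    torch.cuda.is_available() and _device_sm() in (120, 121),
    reason="MXFP8xMXFP4 kernel not yet implemented for SM120/SM121",
)
```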
