feat: Enable multiple fused-moe backends (#1472)

amirkl94 · web-flow · commit 89d146f5ceaf · 2025-08-12T12:55:38.000-07:00
diff --git a/flashinfer/fused_moe/core.py b/flashinfer/fused_moe/core.py
@@ -236,7 +236,7 @@ def gen_cutlass_fused_moe_sm100_module(use_fast_build: bool = False) -> JitSpec:
         raise RuntimeError(f"Failed to generate Cutlass kernels: {e}") from e
 
     return gen_jit_spec(
-        "fused_moe_sm100",
+        "fused_moe_cutlass_sm100",
         [
             jit_env.FLASHINFER_CSRC_DIR
             / "nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_tma_warp_specialized_input.cu",
@@ -322,7 +322,7 @@ def gen_cutlass_fused_moe_sm100_module(use_fast_build: bool = False) -> JitSpec:
 
 @functools.cache
 def get_cutlass_fused_moe_sm100_module(use_fast_build: bool = False):
-    gen_cutlass_fused_moe_sm100_module(use_fast_build).build_and_load(
+    FusedMoeRunner = gen_cutlass_fused_moe_sm100_module(use_fast_build).build_and_load(
         class_name="FusedMoeRunner"
     )
 
@@ -385,15 +385,13 @@ def __init__(
             )
 
             if instance_key not in MoERunner.runner_dict:
-                MoERunner.runner_dict[instance_key] = (
-                    torch.classes.fused_moe_sm100.FusedMoeRunner(
-                        x_dtype,
-                        weight_dtype,
-                        output_dtype,
-                        use_deepseek_fp8_block_scale,
-                        use_w4a8_group_scaling,
-                        use_mxfp8_act_scaling,
-                    )
+                MoERunner.runner_dict[instance_key] = FusedMoeRunner(
+                    x_dtype,
+                    weight_dtype,
+                    output_dtype,
+                    use_deepseek_fp8_block_scale,
+                    use_w4a8_group_scaling,
+                    use_mxfp8_act_scaling,
                 )
 
             self.fused_moe_runner = MoERunner.runner_dict[instance_key]
@@ -819,7 +817,7 @@ def trtllm_gen_fused_moe_sm100_module() -> JitSpec:
     ]
 
     return gen_jit_spec(
-        "fused_moe_sm100",
+        "fused_moe_trtllm_sm100",
         [
             jit_env.FLASHINFER_CSRC_DIR / "nv_internal/cpp/common/envUtils.cpp",
             jit_env.FLASHINFER_CSRC_DIR / "nv_internal/cpp/common/logger.cpp",