Commit f601746

refactor: unify autotuner for fp4 backends
Signed-off-by: Vincent Huang <[email protected]>
1 parent ade3885 commit f601746

flashinfer/gemm.py

Lines changed: 111 additions & 121 deletions
@@ -359,70 +359,22 @@ def forward(
             tactic: int = -1,
             do_preparation: bool = False,
         ):
-            a, b, a_descale, b_descale, alpha, out, workspace_buffer = inputs
+            workspace_buffer, a, b, a_descale, b_descale, alpha, out = inputs
             module.fp4_gemm.default(
                 a, b, a_descale, b_descale, alpha, out, workspace_buffer, tactic
             )
             return out

     @register_custom_op(
-        "flashinfer::cutlass_fp4_gemm",
+        "flashinfer::cutlass_fp4_gemm_runner",
         mutates_args=(""),
     )
-    def cutlass_fp4_gemm(
-        a: torch.Tensor,
-        b: torch.Tensor,
-        a_descale: torch.Tensor,
-        b_descale: torch.Tensor,
-        alpha: torch.Tensor,
-        out: torch.Tensor,
-        workspace_buffer: torch.Tensor,
-    ):
-        tuner = AutoTuner.get()
-
-        a_tensor_index = 0
-        a_scale_tensor_index = 2
-        out_tensor_index = 5
-
-        def pad_up(x, y):
-            return ((x + y - 1) // y) * y
-
-        tuning_config = TuningConfig(
-            dynamic_tensor_specs=(
-                DynamicTensorSpec(
-                    a_tensor_index,
-                    0,
-                    get_last_power_of_2_num_tokens_buckets,
-                    last_positive_power_of_2,
-                ),
-            ),
-            constraint_specs=(
-                ConstraintSpec(
-                    a_scale_tensor_index,
-                    0,
-                    lambda shapes: pad_up(shapes[a_tensor_index][0], 128),
-                ),
-                ConstraintSpec(
-                    out_tensor_index, 0, lambda shapes: shapes[a_tensor_index][0]
-                ),
-            ),
-        )
-
-        fp4_runner = CutlassFp4GemmRunner()
-
-        inputs = [a, b, a_descale, b_descale, alpha, out, workspace_buffer]
-        _, tactic = tuner.choose_one(
-            "cutlass_fp4_gemm",
-            [fp4_runner],
-            tuning_config,
-            inputs,
-        )
-
-        fp4_runner(inputs=inputs, tactic=tactic)
+    def cutlass_fp4_gemm_runner():
+        return CutlassFp4GemmRunner()

     # Register the module
     return SimpleNamespace(
-        cutlass_fp4_gemm=cutlass_fp4_gemm,
+        cutlass_fp4_gemm_runner=cutlass_fp4_gemm_runner,
     )


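Note on the `inputs` reordering in `forward` above: the cutlass runner now unpacks its inputs in the order the trtllm runner already uses, so the shared tuning config in the new `fp4_gemm_sm100` (last hunk below) can address tensors purely by position. A sketch of that positional layout, derived from the indices in this commit; the name `INPUT_LAYOUT` is illustrative only and not code from the diff:

    # Positions in the `inputs` list shared by both fp4 runners, matching
    # a_tensor_index=1, a_scale_tensor_index=3, out_tensor_index=6 in fp4_gemm_sm100.
    INPUT_LAYOUT = {
        0: "workspace_buffer",
        1: "a",          # dim 0 (num tokens) is the dynamically tuned dimension
        2: "b",
        3: "a_descale",  # dim 0 constrained to pad_up(a.shape[0], 8 or 128)
        4: "b_descale",
        5: "alpha",
        6: "out",        # dim 0 constrained to a.shape[0]
    }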
@@ -1316,7 +1268,7 @@ def mm_fp4(
     out: Optional[torch.Tensor] = None,
     block_size: int = 16,
     use_8x4_sf_layout: bool = False,
-    backend: Literal["cudnn", "trtllm", "cutlass"] = "cudnn",
+    backend: Literal["cudnn", "trtllm", "cutlass", "auto"] = "cudnn",
 ) -> torch.Tensor:
     r"""MM FP4

@@ -1470,25 +1422,47 @@ def mm_fp4(
                 f"Only torch.bfloat16 is supported for TRTLLM FP4 GEMM operations."
             )

-        get_trtllm_fp4_gemm_module().trtllm_fp4_gemm(
+        fp4_gemm_sm100(
             a,
             b.T,
             a_descale,
             b_descale.T,
             alpha,
             out,
-            use_8x4_sf_layout=use_8x4_sf_layout,
-            workspace_buffer=workspace_buffer,
+            workspace_buffer,
+            use_8x4_sf_layout,
+            ["trtllm"],
         )
     elif backend == "cutlass":
         # cutlass require uint8 scale when a/b is fp4 packed uint8.
         if a.dtype == torch.uint8 and a_descale.dtype == torch.float8_e4m3fn:
             a_descale = a_descale.view(torch.uint8)
         if b.dtype == torch.uint8 and b_descale.dtype == torch.float8_e4m3fn:
             b_descale = b_descale.view(torch.uint8)
-        get_gemm_sm100_module_cutlass_fp4().cutlass_fp4_gemm(
-            a, b.T, a_descale, b_descale.T, alpha, out, workspace_buffer
+        fp4_gemm_sm100(
+            a,
+            b.T,
+            a_descale,
+            b_descale.T,
+            alpha,
+            out,
+            workspace_buffer,
+            use_8x4_sf_layout,
+            ["cutlass"],
         )
+    elif backend == "auto":
+        fp4_gemm_sm100(
+            a,
+            b.T,
+            a_descale,
+            b_descale.T,
+            alpha,
+            out,
+            workspace_buffer,
+            use_8x4_sf_layout,
+            ["trtllm", "cutlass"],
+        )
+
     return out


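The three `fp4_gemm_sm100` call sites above differ only in the runner-name list they pass. A condensed sketch of that mapping (the dict and its name are illustrative, not code from this commit; the cudnn backend keeps its own separate path):

    # mm_fp4 backend argument -> candidate runner names passed to fp4_gemm_sm100.
    # "auto" lets the autotuner benchmark both backends and keep the faster one.
    BACKEND_TO_RUNNER_NAMES = {
        "trtllm": ["trtllm"],
        "cutlass": ["cutlass"],
        "auto": ["trtllm", "cutlass"],
    }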
@@ -1782,76 +1756,92 @@ def forward(
             return out

     @register_custom_op(
-        "flashinfer::trtllm_fp4_gemm",
+        "flashinfer::trtllm_fp4_gemm_runner",
         mutates_args=(""),
     )
-    def trtllm_fp4_gemm(
-        a: torch.Tensor,
-        b: torch.Tensor,
-        a_descale: torch.Tensor,
-        b_descale: torch.Tensor,
-        alpha: torch.Tensor,
-        out: torch.Tensor,
-        use_8x4_sf_layout: bool,
-        workspace_buffer: torch.Tensor,
-    ):
-        tuner = AutoTuner.get()
-
-        a_tensor_index = 1
-        a_scale_tensor_index = 3
-        out_tensor_index = 6
-
-        def pad_up(x, y):
-            return ((x + y - 1) // y) * y
-
-        tuning_config = TuningConfig(
-            dynamic_tensor_specs=(
-                DynamicTensorSpec(
-                    a_tensor_index,
-                    0,
-                    get_last_power_of_2_num_tokens_buckets,
-                    last_positive_power_of_2,
-                ),
-            ),
-            constraint_specs=(
-                ConstraintSpec(
-                    a_scale_tensor_index,
-                    0,
-                    lambda shapes: pad_up(
-                        shapes[a_tensor_index][0], 8 if use_8x4_sf_layout else 128
-                    ),
-                ),
-                ConstraintSpec(
-                    out_tensor_index, 0, lambda shapes: shapes[a_tensor_index][0]
-                ),
-            ),
-        )
+    def trtllm_fp4_gemm_runner(use_8x4_sf_layout):
+        return TrtllmFp4GemmRunner(use_8x4_sf_layout)

-        fp4_runner = TrtllmFp4GemmRunner(use_8x4_sf_layout)
+    # Register the module
+    return SimpleNamespace(
+        trtllm_fp4_gemm_runner=trtllm_fp4_gemm_runner,
+    )

-        inputs = [
-            workspace_buffer,
-            a,
-            b,
-            a_descale,
-            b_descale,
-            alpha,
-            out,
-        ]
-        _, tactic = tuner.choose_one(
-            "trtllm_fp4_gemm_8x4" if use_8x4_sf_layout else "trtllm_fp4_gemm_128x4",
-            [fp4_runner],
-            tuning_config,
-            inputs,
-        )

-        fp4_runner(inputs=inputs, tactic=tactic)
+def fp4_gemm_sm100(
+    a: torch.Tensor,
+    b: torch.Tensor,
+    a_descale: torch.Tensor,
+    b_descale: torch.Tensor,
+    alpha: torch.Tensor,
+    out: torch.Tensor,
+    workspace_buffer: torch.Tensor,
+    use_8x4_sf_layout: bool,
+    runner_names: List[str],
+):
+    runners = []
+
+    if "trtllm" in runner_names:
+        runners.append(
+            get_trtllm_fp4_gemm_module().trtllm_fp4_gemm_runner(
+                use_8x4_sf_layout=use_8x4_sf_layout
+            )
+        )
+    if "cutlass" in runner_names and not use_8x4_sf_layout:
+        runners.append(get_gemm_sm100_module_cutlass_fp4().cutlass_fp4_gemm_runner())
+    if len(runners) == 0:
+        raise ValueError("No runner specified")
+
+    tuner = AutoTuner.get()
+
+    a_tensor_index = 1
+    a_scale_tensor_index = 3
+    out_tensor_index = 6
+
+    def pad_up(x, y):
+        return ((x + y - 1) // y) * y
+
+    tuning_config = TuningConfig(
+        dynamic_tensor_specs=(
+            DynamicTensorSpec(
+                a_tensor_index,
+                0,
+                get_last_power_of_2_num_tokens_buckets,
+                last_positive_power_of_2,
+            ),
+        ),
+        constraint_specs=(
+            ConstraintSpec(
+                a_scale_tensor_index,
+                0,
+                lambda shapes: pad_up(
+                    shapes[a_tensor_index][0], 8 if use_8x4_sf_layout else 128
+                ),
+            ),
+            ConstraintSpec(
+                out_tensor_index, 0, lambda shapes: shapes[a_tensor_index][0]
+            ),
+        ),
+    )

-    # Register the module
-    return SimpleNamespace(
-        trtllm_fp4_gemm=trtllm_fp4_gemm,
+    inputs = [
+        workspace_buffer,
+        a,
+        b,
+        a_descale,
+        b_descale,
+        alpha,
+        out,
+    ]
+    runner, tactic = tuner.choose_one(
+        "fp4_gemm_auto_128x4",
+        runners,
+        tuning_config,
+        inputs,
     )

+    runner(inputs=inputs, tactic=tactic)
+

 def gemm_fp8_nt_blockscaled(
     a: torch.Tensor,

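For context on the constraint specs in the new `fp4_gemm_sm100`: `pad_up` rounds the scale-factor tensor's token dimension up to the layout's row granularity (8 for the 8x4 scale-factor layout, 128 for the 128x4 layout), while the output's dim 0 is pinned to `a`'s. A small standalone check of the helper exactly as it appears in the diff:

    def pad_up(x, y):
        # Round x up to the next multiple of y.
        return ((x + y - 1) // y) * y

    # Scale-factor rows are padded to the layout granularity used during tuning:
    assert pad_up(1000, 128) == 1024  # 128x4 layout
    assert pad_up(1000, 8) == 1000    # 8x4 layout; 1000 is already a multiple of 8
    assert pad_up(7, 8) == 8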