flashinfer-ai · hypdeb · Oct 4, 2025 · Oct 4, 2025 · Oct 5, 2025 · Oct 5, 2025
diff --git a/benchmarks/bench_mm_fp8.py b/benchmarks/bench_mm_fp8.py
@@ -0,0 +1,98 @@
+"""
+Copyright (c) 2025 by FlashInfer team.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from typing import Dict
+from flashinfer.autotuner import autotune
+from flashinfer.trtllm_low_latency_gemm import prepare_low_latency_gemm_weights
+import numpy as np
+import torch
+
+from flashinfer import mm_fp8
+from flashinfer.testing.utils import bench_gpu_time
+
+_cache_permute_indices: Dict[torch.Size, torch.Tensor] = {}
+
+
+def to_float8(
+    x: torch.Tensor, dtype=torch.float8_e4m3fn
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Quantize ``x`` to ``dtype`` using one scale for the whole tensor.
+
+    The scale maps the largest absolute value of ``x`` onto the largest
+    finite value of ``dtype``; the scaled tensor is then saturated to the
+    representable range of ``dtype`` and cast.
+
+    Args:
+        x: Tensor to quantize.
+        dtype: Target floating-point dtype (defaults to ``float8_e4m3fn``).
+
+    Returns:
+        ``(quantized, inverse_scale)`` where ``quantized`` is ``x`` scaled
+        and cast to ``dtype`` and ``inverse_scale`` is the float32
+        reciprocal of the scale that was applied.
+    """
+    limits = torch.finfo(dtype)
+    lowest, highest = x.aminmax()
+    # Floor the absolute maximum at 1e-12 to guard the division below
+    # against a zero maximum.
+    abs_max = torch.maximum(lowest.abs(), highest.abs()).clamp(min=1e-12)
+    scale = limits.max / abs_max
+    saturated = (x * scale).clamp(min=limits.min, max=limits.max)
+    inverse_scale = scale.float().reciprocal()
+    return saturated.to(dtype), inverse_scale
+
+
+def bench_mm_fp8(m, n, k, in_dtype, out_dtype):
+    """Benchmark ``mm_fp8`` for one problem size and print the result.
+
+    Random bfloat16 operands of shape ``[m, k]`` and ``[n, k]`` are created
+    on the CUDA device, quantized with ``to_float8``, and the second operand
+    is passed through ``prepare_low_latency_gemm_weights``. One call is made
+    under ``autotune(True)`` before the timed runs.
+
+    Args:
+        m: Number of rows of the input operand and of the output.
+        n: Number of rows of the weight operand; columns of the output.
+        k: Shared inner dimension of both operands.
+        in_dtype: FP8 dtype both operands are quantized to.
+        out_dtype: Dtype of the preallocated ``[m, n]`` output buffer.
+
+    Returns:
+        None. A one-line summary (TFLOPs/s, median time, TB/s) is printed.
+    """
+    torch.manual_seed(123)
+    input_tensor = torch.randn([m, k], device="cuda", dtype=torch.bfloat16)
+    input_fp8, input_inv_s = to_float8(input_tensor, dtype=in_dtype)
+
+    # mat2 row major -> column major
+    mat2 = torch.randn([n, k], device="cuda", dtype=torch.bfloat16)
+    mat2_fp8, mat2_inv_s = to_float8(mat2, dtype=in_dtype)
+
+    res = torch.zeros([m, n], device="cuda", dtype=out_dtype)
+    # Both operands were scaled up before the cast, so the product of the two
+    # inverse scales is what undoes the quantization on the GEMM result.
+    global_scale = input_inv_s * mat2_inv_s
+
+    # Do row shuffling.
+    prepared_weights = prepare_low_latency_gemm_weights(
+        mat2_fp8, _cache_permute_indices
+    )
+
+    with autotune(True):
+        mm_fp8(
+            input_fp8,
+            prepared_weights,
+            global_scale,
+            out=res,
+        )
+
+    # Pass the output buffer by keyword, exactly like the autotuning call
+    # above. A bare fourth positional argument is not guaranteed to bind to
+    # `out` (flashinfer documents the fourth parameter of mm_fp8 as
+    # `out_dtype` -- confirm against the installed version).
+    measurements = bench_gpu_time(
+        lambda: mm_fp8(
+            input_fp8,
+            prepared_weights,
+            global_scale,
+            out=res,
+        ),
+        dry_run_time_ms=500,
+        repeat_time_ms=2500,
+        use_cuda_graph=True,
+    )
+    ms = np.median(measurements)
+    # 2*m*n*k FLOPs; with `ms` in milliseconds: FLOPs / (ms * 1e-3) / 1e12.
+    tflops_per_second = 2 * m * n * k * 1e-9 / ms
+
+    # Bytes of both operands plus the output; with `ms` in milliseconds:
+    # bytes / (ms * 1e-3) / 1e12 gives TB/s.
+    bandwidth = (
+        (
+            input_fp8.numel() * input_fp8.element_size()
+            + prepared_weights.numel() * prepared_weights.element_size()
+            + res.numel() * res.element_size()
+        )
+        / ms
+        / 1e9
+    )
+
+    print(
+        f"mm_fp8 m={m} n={n} k={k} in_dtype={in_dtype} out_dtype={out_dtype}: {tflops_per_second:.2f} TFLOPs/s over {ms:.6f} ms, {bandwidth:.2f} TB/s"
+    )
+
+
+if __name__ == "__main__":
+    # Sweep every (m, n, k) combination with FP8 e4m3 inputs and a bfloat16
+    # output, in the same order as three nested loops over m, n, then k.
+    m_values = [1, 2, 4, 8, 16, 32, 64]
+    n_values = [2560, 5120, 8192]
+    k_values = [16384, 32768]
+    for m in m_values:
+        for n in n_values:
+            for k in k_values:
+                bench_mm_fp8(m, n, k, torch.float8_e4m3fn, torch.bfloat16)
diff --git a/csrc/trtllm_gemm_runner.cu b/csrc/trtllm_gemm_runner.cu
@@ -43,19 +43,20 @@ struct TrtllmGenGemmRunnerOptions {
 int64_t select_kernel_fp8(int32_t M, int32_t N, int32_t K,
                           const gemm::gemm::GemmInterface& interface) {
   static constexpr const char* KERNEL_NAME_HIGH_N_K_RATIO =
-      "gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s6_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_"
+      "gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s6_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_"
+      "transOut_"
       "noShflA_dsFp8_schedP2x2x1x3_sm100f";
 
   static constexpr const char* KERNEL_NAME_LOW_N_K_RATIO =
-      "gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_"
+      "gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_"
       "transOut_noShflA_dsFp8_schedS_sm100f";
 
   static constexpr const char* KERNEL_NAME_LARGE_N =
-      "gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_"
+      "gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_"
       "transOut_noShflA_dsFp8_schedP2x2x1x3_sm100f";
 
   static constexpr const char* KERNEL_NAME_DEFAULT =
-      "gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_"
+      "gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_"
       "transOut_noShflA_dsFp8_schedS_sm100f";
 
   double const n_k_ratio = static_cast<double>(N) / static_cast<double>(K);