
Commit 35e099e

tests: Update support for tgv_gemm to SM100 only and add to ut (#1810)
## 📌 Description

Add `tgv_gemm` to the tests and update the support surface.

## 🔍 Related Issues

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [ ] I have installed the hooks with `pre-commit install`.
- [ ] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [ ] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

## Reviewer Notes
1 parent f765a2a commit 35e099e

File tree: 6 files changed (+45, -10 lines)


flashinfer/aot.py (15 additions & 3 deletions)

```diff
@@ -53,7 +53,7 @@
     gen_gemm_sm100_module,
     gen_gemm_sm100_module_cutlass_fp4,
     gen_gemm_sm100_module_cutlass_fp8,
-    gen_gemm_sm100_module_tgv,
+    gen_tgv_gemm_sm10x_module,
     gen_gemm_sm120_module,
     gen_gemm_sm120_module_cutlass_fp4,
     gen_trtllm_gen_gemm_module,
@@ -412,6 +412,7 @@ def gen_all_modules(
     jit_specs: List[JitSpec] = []
     has_sm90 = sm_capabilities.get("sm90", False)
     has_sm100 = sm_capabilities.get("sm100", False)
+    has_sm100f = sm_capabilities.get("sm100f", False)
     has_sm103 = sm_capabilities.get("sm103", False)
     has_sm110 = sm_capabilities.get("sm110", False)
     has_sm120 = sm_capabilities.get("sm120", False)
@@ -449,11 +450,21 @@ def gen_all_modules(
         jit_specs.append(gen_gemm_sm100_module_cutlass_fp4())
         jit_specs.append(gen_gemm_sm100_module_cutlass_fp8())
         # Add TGV GEMM modules for both bf16 and fp16
-        jit_specs.append(gen_gemm_sm100_module_tgv(torch.bfloat16))
-        jit_specs.append(gen_gemm_sm100_module_tgv(torch.float16))
+        jit_specs.append(
+            gen_tgv_gemm_sm10x_module(torch.bfloat16, use_sm_100f=False)
+        )
+        jit_specs.append(
+            gen_tgv_gemm_sm10x_module(torch.float16, use_sm_100f=False)
+        )
         jit_specs.append(gen_mxfp8_quantization_sm100_module())
         jit_specs.append(gen_trtllm_gen_gemm_module())
         jit_specs.append(gen_trtllm_gen_fused_moe_sm100_module())
+    if has_sm100f:
+        # Add TGV GEMM modules compiled with SM100f flags for both bf16 and fp16
+        jit_specs.append(
+            gen_tgv_gemm_sm10x_module(torch.bfloat16, use_sm_100f=True)
+        )
+        jit_specs.append(gen_tgv_gemm_sm10x_module(torch.float16, use_sm_100f=True))
     if has_sm103:
         jit_specs.append(gen_fp4_quantization_sm103_module())
     if has_sm110:
@@ -588,6 +599,7 @@ def has_sm(compute: str, version: str) -> bool:
     return {
         "sm90": has_sm("compute_90", "12.3"),
         "sm100": has_sm("compute_100", "12.8"),
+        "sm100f": has_sm("compute_100", "12.9"),
         "sm103": has_sm("compute_103", "12.8"),
         "sm110": has_sm("compute_110", "12.9"),
         "sm120": has_sm("compute_120", "13.0"),
```

flashinfer/gemm.py (18 additions & 7 deletions)

```diff
@@ -40,6 +40,7 @@
 from .jit.cubin_loader import get_cubin
 from .utils import (
     is_sm100a_supported,
+    is_sm100f_supported,
     is_sm120a_supported,
     is_sm121a_supported,
     LibraryError,
@@ -65,6 +66,7 @@
     gen_jit_spec,
     sm90a_nvcc_flags,
     sm100a_nvcc_flags,
+    sm100f_nvcc_flags,
     current_compilation_context,
 )
 from .jit.cubin_loader import setup_cubin_loader
@@ -869,12 +871,16 @@ def get_gemm_sm120_module_cutlass_fp4():
     )
 
 
-def gen_gemm_sm100_module_tgv(dtype: torch.dtype = torch.bfloat16) -> JitSpec:
+def gen_tgv_gemm_sm10x_module(
+    dtype: torch.dtype = torch.bfloat16, use_sm_100f: bool = False
+) -> JitSpec:
     """
     Generate TGV GEMM module for SM100 architecture.
 
     Args:
         dtype: Data type for the GEMM operation (torch.bfloat16 or torch.float16)
+        use_sm_100f: Whether to compile with SM100f flags (default: False), which makes the compiled kernel
+            compatible with both B200 and B300 GPUs. However, it's only available with CUDA 12.9+.
 
     Returns:
         JitSpec for the TGV GEMM module
@@ -926,7 +932,7 @@ def gen_gemm_sm100_module_tgv(dtype: torch.dtype = torch.bfloat16) -> JitSpec:
     return gen_jit_spec(
         module_name,
         source_paths,
-        extra_cuda_cflags=sm100a_nvcc_flags,
+        extra_cuda_cflags=sm100f_nvcc_flags if use_sm_100f else sm100a_nvcc_flags,
         extra_include_paths=[
             jit_env.FLASHINFER_INCLUDE_DIR,
             jit_env.FLASHINFER_CSRC_DIR,
@@ -935,17 +941,21 @@ def gen_gemm_sm100_module_tgv(dtype: torch.dtype = torch.bfloat16) -> JitSpec:
 
 
 @functools.cache
-def get_gemm_sm100_module_tgv(dtype: torch.dtype = torch.bfloat16):
+def get_tgv_gemm_sm10x_module(
+    dtype: torch.dtype = torch.bfloat16, use_sm_100f: bool = False
+):
     """
     Get and build the TGV GEMM module for the specified dtype.
 
     Args:
         dtype: Data type for the GEMM operation (torch.bfloat16 or torch.float16)
+        use_sm_100f: Whether to compile with SM100f flags (default: False), which makes the compiled kernel
+            compatible with both B200 and B300 GPUs. However, it's only available with CUDA 12.9+.
 
     Returns:
         SimpleNamespace with the runner function
     """
-    module = gen_gemm_sm100_module_tgv(dtype).build_and_load()
+    module = gen_tgv_gemm_sm10x_module(dtype, use_sm_100f).build_and_load()
 
     def tgv_gemm_runner():
         class TGVGemmRunner(TunableRunner):
@@ -1013,8 +1023,8 @@ def tgv_gemm_sm100(
     - Tensor b is expected to be in column-major layout (transposed from typical PyTorch row-major)
     """
     # Verify SM100 architecture support
-    if not _match_sm_version(a.device, ["100", "103", "110"]):
-        raise ValueError("TGV GEMM requires SM100, SM103, or SM110 architecture")
+    if not _match_sm_version(a.device, ["100", "103"]):
+        raise ValueError("TGV GEMM requires SM100, SM103 architecture")
 
     # Verify dtype support
     if a.dtype not in [torch.bfloat16, torch.float16]:
@@ -1028,7 +1038,8 @@
     )
 
     runners = []
-    runners.append(get_gemm_sm100_module_tgv(a.dtype).tgv_gemm_runner())
+    use_sm_100f = is_sm100f_supported(a.device)
+    runners.append(get_tgv_gemm_sm10x_module(a.dtype, use_sm_100f).tgv_gemm_runner())
 
     tuner = AutoTuner.get()
     a_tensor_index = 0
```
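Callers who want to pre-build the renamed module can follow the same pattern that `tgv_gemm_sm100` now uses internally. A minimal sketch, assuming a CUDA-capable SM 10.x device and the import paths shown in this diff; anything beyond `build_and_load()` and `tgv_gemm_runner()` is out of scope here.

```python
# Sketch: selecting and pre-building the TGV GEMM module for the current device.
import torch

from flashinfer.gemm import gen_tgv_gemm_sm10x_module, get_tgv_gemm_sm10x_module
from flashinfer.utils import is_sm100f_supported

device = torch.device("cuda:0")

# Prefer the SM100f (B200/B300-compatible) build when CUDA 12.9+ is available.
use_sm_100f = is_sm100f_supported(device)

# Build the JitSpec directly ...
module = gen_tgv_gemm_sm10x_module(torch.bfloat16, use_sm_100f=use_sm_100f).build_and_load()

# ... or go through the cached accessor that tgv_gemm_sm100 uses internally.
runner = get_tgv_gemm_sm10x_module(torch.bfloat16, use_sm_100f).tgv_gemm_runner()
```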

flashinfer/jit/__init__.py (1 addition & 0 deletions)

```diff
@@ -63,6 +63,7 @@
 from .core import gen_jit_spec as gen_jit_spec
 from .core import sm90a_nvcc_flags as sm90a_nvcc_flags
 from .core import sm100a_nvcc_flags as sm100a_nvcc_flags
+from .core import sm100f_nvcc_flags as sm100f_nvcc_flags
 from .core import sm103a_nvcc_flags as sm103a_nvcc_flags
 from .core import sm110a_nvcc_flags as sm110a_nvcc_flags
 from .core import sm120a_nvcc_flags as sm120a_nvcc_flags
```

flashinfer/jit/core.py (1 addition & 0 deletions)

```diff
@@ -75,6 +75,7 @@ def clear_cache_dir():
 sm90a_nvcc_flags = ["-gencode=arch=compute_90a,code=sm_90a"] + common_nvcc_flags
 sm100a_nvcc_flags = ["-gencode=arch=compute_100a,code=sm_100a"] + common_nvcc_flags
 sm103a_nvcc_flags = ["-gencode=arch=compute_103a,code=sm_103a"] + common_nvcc_flags
+sm100f_nvcc_flags = ["-gencode=arch=compute_100f,code=sm_100f"] + common_nvcc_flags
 sm110a_nvcc_flags = ["-gencode=arch=compute_110a,code=sm_110a"] + common_nvcc_flags
 sm120a_nvcc_flags = ["-gencode=arch=compute_120a,code=sm_120a"] + common_nvcc_flags
 sm121a_nvcc_flags = ["-gencode=arch=compute_121a,code=sm_121a"] + common_nvcc_flags
```

flashinfer/utils.py (5 additions & 0 deletions)

```diff
@@ -466,6 +466,11 @@ def is_sm100a_supported(device: torch.device) -> bool:
     return major == 10 and version_at_least(torch.version.cuda, "12.8")
 
 
+def is_sm100f_supported(device: torch.device) -> bool:
+    major, _ = get_compute_capability(device)
+    return major == 10 and version_at_least(torch.version.cuda, "12.9")
+
+
 def is_sm110a_supported(device: torch.device) -> bool:
     major, _ = get_compute_capability(device)
     return major == 11 and version_at_least(torch.version.cuda, "13.0")
```
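The new helper differs from `is_sm100a_supported` only in the CUDA floor (12.9 instead of 12.8), so a runtime dispatch can fall back cleanly. A hedged sketch; the dispatch itself is illustrative and not part of this commit.

```python
# Sketch: picking a TGV GEMM build flavor from the capability helpers.
import torch

from flashinfer.utils import is_sm100a_supported, is_sm100f_supported

device = torch.device("cuda:0")

if is_sm100f_supported(device):       # SM 10.x GPU and CUDA >= 12.9
    flavor = "sm100f (B200/B300-compatible)"
elif is_sm100a_supported(device):     # SM 10.x GPU and CUDA >= 12.8
    flavor = "sm100a"
else:
    flavor = "TGV GEMM not supported on this device"
print(flavor)
```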

tests/unlisted/test_tgv_gemm.py renamed to tests/GEMM/test_tgv_gemm.py (5 additions & 0 deletions)

```diff
@@ -6,6 +6,8 @@
     tgv_gemm_sm100,
 )
 
+from flashinfer.gemm import _match_sm_version
+
 
 @pytest.mark.parametrize("m", [1, 8, 16, 32, 64])
 @pytest.mark.parametrize("n", [1024, 2048, 4096])
@@ -17,6 +19,9 @@ def test_tgv_gemm_sm100(m, n, k, dtype):
     B = torch.randn(n, k, device="cuda", dtype=dtype).t()  # column major
     bias = torch.randn(n, device="cuda", dtype=dtype)
 
+    if not _match_sm_version(A.device, ["100", "103"]):
+        pytest.skip("TGV GEMM requires SM100, SM103 architecture")
+
     print(
         f"Input tensors: A {A.shape}, B {B.shape}, bias {bias.shape}, dtype: {A.dtype}",
         flush=True,
```
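With the file now under `tests/GEMM/`, it can be invoked directly; a minimal sketch using pytest's Python API (the path comes from the rename above, the verbosity flag is just an example).

```python
# Sketch: running the relocated TGV GEMM test file on its own.
import pytest

if __name__ == "__main__":
    pytest.main(["-v", "tests/GEMM/test_tgv_gemm.py"])
```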
