@@ -23,11 +23,12 @@
 import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup
+from torch.utils.cpp_extension import _get_cuda_arch_flags
 
 from ..jit import JitSpec
 from ..jit import env as jit_env
 from ..jit import gen_jit_spec, sm100a_nvcc_flags
-from ..utils import register_custom_op, round_up
+from ..utils import register_custom_op, round_up, version_at_least
 from .cuda_ipc import create_shared_buffer, cudart, free_shared_buffer
 
 
@@ -96,15 +97,18 @@ class FP4QuantizationSFLayout:
 
 
 def gen_trtllm_comm_module() -> JitSpec:
-    major, minor = torch.cuda.get_device_capability()
+    gencode_flags = _get_cuda_arch_flags()
+    has_sm100 = any(
+        "compute_100" in flag for flag in gencode_flags
+    ) and version_at_least(torch.version.cuda, "12.8")
     return gen_jit_spec(
         "trtllm_comm",
         [
             jit_env.FLASHINFER_CSRC_DIR / "trtllm_allreduce.cu",
             jit_env.FLASHINFER_CSRC_DIR / "trtllm_allreduce_fusion.cu",
             jit_env.FLASHINFER_CSRC_DIR / "trtllm_moe_allreduce_fusion.cu",
         ],
-        extra_cuda_cflags=sm100a_nvcc_flags if major >= 10 and minor >= 0 else [],
+        extra_cuda_cflags=sm100a_nvcc_flags if has_sm100 else [],
     )
 
 
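For context, the new gating keys off the architectures PyTorch is generating code for, rather than the compute capability of the locally visible GPU as before. Below is a minimal, self-contained sketch of the same check; the targets_sm100 helper name is made up for illustration, and the inline version comparison stands in for flashinfer's version_at_least utility:

# Illustrative sketch only -- mirrors the gating logic added in this diff.
# _get_cuda_arch_flags() is a private PyTorch helper that returns flags such as
# "-gencode=arch=compute_100,code=sm_100", derived from TORCH_CUDA_ARCH_LIST
# or the GPUs visible at build time.
from typing import Optional

import torch
from torch.utils.cpp_extension import _get_cuda_arch_flags


def targets_sm100(cuda_version: Optional[str]) -> bool:
    """True when compute capability 10.0 is among the compile targets and the
    CUDA version reported by torch is at least 12.8 (the threshold the diff uses)."""
    if cuda_version is None:  # CPU-only torch build
        return False
    has_compute_100 = any("compute_100" in flag for flag in _get_cuda_arch_flags())
    major, minor = (int(part) for part in cuda_version.split(".")[:2])
    return has_compute_100 and (major, minor) >= (12, 8)


# Usage: pass the sm100a-specific nvcc flags only when the build can use them.
# extra_cuda_cflags = sm100a_nvcc_flags if targets_sm100(torch.version.cuda) else []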