Commit 8186180 (parent a070b9d)

support gemm+allreduce only on arch >= blackwell

Signed-off-by: benzh-2025 <[email protected]>

2 files changed: 24 additions, 3 deletions

tensorrt_llm/_torch/models/modeling_llama.py (13 additions, 2 deletions)

@@ -676,8 +676,19 @@ def __init__(
         dtype_supported = config.torch_dtype in (torch.float16, torch.bfloat16)
         tp_valid = self.mapping.tp_size > 1
         quant_valid = self.is_nvfp4 is not None and self.is_nvfp4
-        use_fused_gemm_allreduce = all(
-            [mpi_enabled, dtype_supported, tp_valid, quant_valid])
+
+        device_supported = False
+        if torch.cuda.is_available():
+            capability = torch.cuda.get_device_capability(
+                torch.device('cuda:0'))
+            sm_version = capability[0] * 10 + capability[1]
+            if sm_version >= 100:
+                device_supported = True
+
+        use_fused_gemm_allreduce = all([
+            mpi_enabled, dtype_supported, tp_valid, quant_valid,
+            device_supported
+        ])
 
         def check_in_out_features(in_features, out_features):
             in_feature_valid = in_features % 128 == 0 and in_features >= 1024

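Both hunks gate the fused GEMM+AllReduce path on compute capability. As a minimal standalone sketch of that check (the helper name is_blackwell_or_newer is ours, not from the commit): torch.cuda.get_device_capability returns a (major, minor) tuple, e.g. (9, 0) on Hopper (SM90) and (10, 0) on Blackwell B100/B200 (SM100), so major * 10 + minor >= 100 accepts Blackwell and newer parts.

import torch

def is_blackwell_or_newer(device_index: int = 0) -> bool:
    """True when the CUDA device is SM100 (Blackwell) or newer.

    A sketch; the helper name is hypothetical and not part of this commit.
    """
    if not torch.cuda.is_available():
        return False
    major, minor = torch.cuda.get_device_capability(
        torch.device(f'cuda:{device_index}'))
    # SM version packs (major, minor) as major*10 + minor, e.g. (10, 0) -> 100.
    return major * 10 + minor >= 100
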
tensorrt_llm/_torch/modules/linear.py (11 additions, 1 deletion)

@@ -2163,9 +2163,19 @@ def __init__(
         tp_valid = self.tp_mode is not None and self.tp_mode == TensorParallelMode.ROW and self.tp_size > 1
         quant_valid = self.quant_config is not None and self.quant_config.layer_quant_mode.has_nvfp4(
         )
+
+        device_supported = False
+        if torch.cuda.is_available():
+            capability = torch.cuda.get_device_capability(
+                torch.device('cuda:0'))
+            sm_version = capability[0] * 10 + capability[1]
+            if sm_version >= 100:
+                device_supported = True
+
         self.use_fused_gemm_allreduce = all([
             self.reduce_output, mpi_enabled, dtype_supported,
-            in_features_aligned, out_features_aligned, tp_valid, quant_valid
+            in_features_aligned, out_features_aligned, tp_valid, quant_valid,
+            device_supported
         ])
 
         self.enable_cuda_core = False

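To confirm whether the new device gate passes on a given machine, here is a quick runtime probe (a sketch, not part of the commit; get_device_name and get_device_capability are standard torch.cuda calls):

import torch

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability(torch.device('cuda:0'))
    sm_version = major * 10 + minor
    # The fused path additionally requires MPI, fp16/bf16, NVFP4, row-parallel
    # TP > 1, and aligned feature sizes; this only probes the device gate.
    print(f'{torch.cuda.get_device_name(0)}: SM{sm_version} '
          f'(device gate passes: {sm_version >= 100})')
else:
    print('No CUDA device: fused GEMM+AllReduce is disabled.')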