Make GPU architecture detection optional for evoformer

sdvillal · sdvillal · commit 01d56b9a2f8a · 2026-01-17T16:49:45.000+01:00
diff --git a/docs/_tutorials/ds4sci_evoformerattention.md b/docs/_tutorials/ds4sci_evoformerattention.md
@@ -27,7 +27,8 @@ export CUTLASS_PATH=/path/to/cutlass
 The kernels will be compiled when `DS4Sci_EvoformerAttention` is called for the first time.
 
 `DS4Sci_EvoformerAttention` requires GPUs with compute capability 7.0 or higher (NVIDIA V100 or later GPUs) and the minimal CUDA version is 11.3. It is recommended to use CUDA 11.7 or later for better performance. Besides, the performance of backward kernel on V100 kernel is not as good as that on A100 for now.
-The extension checks both requirements and fails if any is not met. To disable the check, for example for cross-compiling in a system without GPUs, you can set the environment variable ```DS_IGNORE_CUDA_DETECTION=TRUE```.
+The extension checks both requirements and fails if any is not met. To disable the check, for example for cross-compiling in a system without GPUs, you can set the environment variable ```DS_IGNORE_CUDA_DETECTION=TRUE```
+and the environment variable ```DS_EVOFORMER_GPU_ARCH={70|75|80}```, which selects the target GPU architecture (80 is the highest supported value and covers NVIDIA Ampere and later GPUs).
 
 ### 3.2 Unit test and benchmark
 
diff --git a/op_builder/evoformer_attn.py b/op_builder/evoformer_attn.py
@@ -16,6 +16,10 @@ def __init__(self, name=None):
         name = self.NAME if name is None else name
         super().__init__(name=name)
         self.cutlass_path = os.environ.get('CUTLASS_PATH')
+        # Target GPU architecture
+        # Current useful values: 70, 75, 80 (passed verbatim as -DGPU_ARCH), see gemm_kernel_utils.h
+        # For modern GPUs (NVIDIA Ampere and later), 80 is the right value
+        self.gpu_arch = os.environ.get('DS_EVOFORMER_GPU_ARCH')
 
     def absolute_name(self):
         return f'deepspeed.ops.{self.NAME}_op'
@@ -32,14 +36,17 @@ def sources(self):
 
     def nvcc_args(self):
         args = super().nvcc_args()
-        try:
-            import torch
-        except ImportError:
-            self.warning("Please install torch if trying to pre-compile kernels")
-            return args
-        major = torch.cuda.get_device_properties(0).major  #ignore-cuda
-        minor = torch.cuda.get_device_properties(0).minor  #ignore-cuda
-        args.append(f"-DGPU_ARCH={major}{minor}")
+        if not self.gpu_arch:
+            try:
+                import torch
+            except ImportError:
+                self.warning("Please install torch if trying to pre-compile kernels")
+                return args
+            major = torch.cuda.get_device_properties(0).major  #ignore-cuda
+            minor = torch.cuda.get_device_properties(0).minor  #ignore-cuda
+            args.append(f"-DGPU_ARCH={major}{minor}")
+        else:
+            args.append(f"-DGPU_ARCH={self.gpu_arch}")
         return args
 
     def is_compatible(self, verbose=False):