setup.py: Support TORCH_CUDA_ARCH_LIST for targeted CUDA builds

ryan-williams · claude · ryan-williams · commit 2beeed67eafe · 2025-08-18T17:10:51.000-04:00
Allow specifying specific CUDA architectures via TORCH_CUDA_ARCH_LIST environment variable to significantly speed up builds in CI/testing. When TORCH_CUDA_ARCH_LIST is set (e.g., "8.6" for A10G or "8.9" for L4), only build for that specific architecture instead of all supported ones. This reduces build time from 30+ minutes to ~3 minutes on single-GPU instances. Falls back to building for all architectures when not set, preserving existing behavior for production builds. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
diff --git a/setup.py b/setup.py
@@ -172,25 +172,39 @@ def append_nvcc_threads(nvcc_extra_args):
                     "Note: make sure nvcc has a supported version by running nvcc -V."
                 )
 
-        cc_flag.append("-gencode")
-        cc_flag.append("arch=compute_53,code=sm_53")
-        cc_flag.append("-gencode")
-        cc_flag.append("arch=compute_62,code=sm_62")
-        cc_flag.append("-gencode")
-        cc_flag.append("arch=compute_70,code=sm_70")
-        cc_flag.append("-gencode")
-        cc_flag.append("arch=compute_72,code=sm_72")
-        cc_flag.append("-gencode")
-        cc_flag.append("arch=compute_80,code=sm_80")
-        cc_flag.append("-gencode")
-        cc_flag.append("arch=compute_87,code=sm_87")
-
-        if bare_metal_version >= Version("11.8"):
+        # Check for TORCH_CUDA_ARCH_LIST environment variable (for CI/testing)
+        # Format: "7.5" or "7.5;8.6" or "7.5 8.6"
+        cuda_arch_list = os.getenv("TORCH_CUDA_ARCH_LIST", "").replace(";", " ").split()
+
+        if cuda_arch_list:
+            # Use only the specified architectures
+            print(f"Building for specific CUDA architectures: {cuda_arch_list}")
+            for arch in cuda_arch_list:
+                arch_num = arch.replace(".", "")
+                cc_flag.append("-gencode")
+                cc_flag.append(f"arch=compute_{arch_num},code=sm_{arch_num}")
+        else:
+            # Default: build for all supported architectures
+            print("Building for all supported CUDA architectures (set TORCH_CUDA_ARCH_LIST to override)")
             cc_flag.append("-gencode")
-            cc_flag.append("arch=compute_90,code=sm_90")
-        if bare_metal_version >= Version("12.8"):
+            cc_flag.append("arch=compute_53,code=sm_53")
             cc_flag.append("-gencode")
-            cc_flag.append("arch=compute_100,code=sm_100")
+            cc_flag.append("arch=compute_62,code=sm_62")
+            cc_flag.append("-gencode")
+            cc_flag.append("arch=compute_70,code=sm_70")
+            cc_flag.append("-gencode")
+            cc_flag.append("arch=compute_72,code=sm_72")
+            cc_flag.append("-gencode")
+            cc_flag.append("arch=compute_80,code=sm_80")
+            cc_flag.append("-gencode")
+            cc_flag.append("arch=compute_87,code=sm_87")
+
+            if bare_metal_version >= Version("11.8"):
+                cc_flag.append("-gencode")
+                cc_flag.append("arch=compute_90,code=sm_90")
+            if bare_metal_version >= Version("12.8"):
+                cc_flag.append("-gencode")
+                cc_flag.append("arch=compute_100,code=sm_100")
 
 
     # HACK: The compiler flag -D_GLIBCXX_USE_CXX11_ABI is set to be the same as