Improve CUDA capability handling (#329)

danieldk · web-flow · commit 9ea57a83f784 · 2025-12-18T12:09:32.000+01:00
We computed a kernel's capabilities by taking the loose intersection of
the stated kernel capabilities (or the default) and the capabilities
reported to be supported by CMake/Torch. However, this led to issues
with e.g. capability 8.9, which is not in these lists (anymore?), but is
fine to compile for.

To solve this issue, we will ignore the capabilities reported by
CMake/Torch and instead use our own list of capabilities for the loose
intersection with the kernel capabilities. This list is the list of all
capabilities supported by a CUDA version minus some really old
capabilities that are not supported by Torch anyway. This behavior is
selected by enabling the new `BUILD_ALL_SUPPORTED_ARCHS` CMake option
(which the Nix and Windows builders enable by default).

When `BUILD_ALL_SUPPORTED_ARCHS` is not set, we will try to detect the
capability of the user's CUDA GPU. This speeds up development - since
one then only has to compile for a single capability. If this fails for
some reason, we'll revert to using all capabilities as if
`BUILD_ALL_SUPPORTED_ARCHS` was set.
diff --git a/build2cmake/src/templates/cuda/kernel.cmake b/build2cmake/src/templates/cuda/kernel.cmake
@@ -18,7 +18,7 @@ if(GPU_LANG STREQUAL "CUDA")
   {% if cuda_capabilities %}
     cuda_archs_loose_intersection({{kernel_name}}_ARCHS "{{ cuda_capabilities|join(";") }}" "${CUDA_ARCHS}")
   {% else %}
-    cuda_archs_loose_intersection({{kernel_name}}_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}" "${CUDA_ARCHS}")
+    set({{kernel_name}}_ARCHS "${CUDA_KERNEL_ARCHS}")
   {% endif %}
   message(STATUS "Capabilities for kernel {{kernel_name}}: {{ '${' + kernel_name + '_ARCHS}'}}")
   set_gencode_flags_for_srcs(SRCS {{'"${' + kernel_name + '_SRC}"'}} CUDA_ARCHS "{{ '${' + kernel_name + '_ARCHS}'}}")
diff --git a/build2cmake/src/templates/cuda/preamble.cmake b/build2cmake/src/templates/cuda/preamble.cmake
@@ -9,8 +9,6 @@ include(FetchContent)
 file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
 message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
 
-set(CUDA_SUPPORTED_ARCHS "{{ cuda_supported_archs }}")
-
 set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
 
 include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
@@ -50,6 +48,8 @@ if (NOT TARGET_DEVICE STREQUAL "cuda" AND
     return()
 endif()
 
+option(BUILD_ALL_SUPPORTED_ARCHS "Build all supported architectures" off)
+
 if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
    CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
  set(CUDA_DEFAULT_KERNEL_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0+PTX")
@@ -90,13 +90,26 @@ endif()
 
 
 if(GPU_LANG STREQUAL "CUDA")
-  clear_cuda_arches(CUDA_ARCH_FLAGS)
-  extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
-  message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
-  # Filter the target architectures by the supported supported archs
-  # since for some files we will build for all CUDA_ARCHS.
-  cuda_archs_loose_intersection(CUDA_ARCHS "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
-  message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
+  # This clears out -gencode arguments from `CMAKE_CUDA_FLAGS`, which we need
+  # to set our own set of capabilities.
+  clear_gencode_flags()
+
+  # Get the capabilities without +PTX suffixes, so that we can use them as
+  # the target archs in the loose intersection with a kernel's capabilities.
+  cuda_remove_ptx_suffixes(CUDA_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}")
+  message(STATUS "CUDA supported base architectures: ${CUDA_ARCHS}")
+
+  # Select the capabilities stored in `CUDA_KERNEL_ARCHS`: either every
+  # default capability, or only the capability reported by
+  # `torch.cuda.get_device_capability()` on this machine.
+  if(BUILD_ALL_SUPPORTED_ARCHS)
+    set(CUDA_KERNEL_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}")
+  else()
+    # `try_run_python` takes exactly (OUT SUCCESS EXPR); it has no error
+    # message parameter, so none is passed.
+    try_run_python(CUDA_KERNEL_ARCHS SUCCESS "import torch; cc=torch.cuda.get_device_capability(); print(f\"{cc[0]}.{cc[1]}\")")
+    # Accept only a bare `<major>.<minor>` value; anything else printed on
+    # stdout is treated as a failed detection.
+    if(NOT SUCCESS OR NOT "${CUDA_KERNEL_ARCHS}" MATCHES "^[0-9]+\\.[0-9]+$")
+      message(WARNING "Failed to detect CUDA capability, using default capabilities.")
+      set(CUDA_KERNEL_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}")
+    endif()
+  endif()
+
+  message(STATUS "CUDA supported kernel architectures: ${CUDA_KERNEL_ARCHS}")
 
   if(NVCC_THREADS AND GPU_LANG STREQUAL "CUDA")
     list(APPEND GPU_FLAGS "--threads=${NVCC_THREADS}")
diff --git a/build2cmake/src/templates/utils.cmake b/build2cmake/src/templates/utils.cmake
@@ -42,6 +42,29 @@ function (run_python OUT EXPR ERR_MSG)
   set(${OUT} ${PYTHON_OUT} PARENT_SCOPE)
 endfunction()
 
+#
+# Run `EXPR` in python without aborting the configure step on failure.  The
+# standard output of python is stored in `OUT` with trailing whitespace
+# stripped; python's standard error is discarded.  `SUCCESS` is set to TRUE
+# when python exits with status 0 and to FALSE otherwise, in which case `OUT`
+# is set to the empty string.
+#
+function (try_run_python OUT SUCCESS EXPR)
+  execute_process(
+    COMMAND
+    "${Python3_EXECUTABLE}" "-c" "${EXPR}"
+    OUTPUT_VARIABLE PYTHON_OUT
+    RESULT_VARIABLE PYTHON_ERROR_CODE
+    ERROR_QUIET
+    OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+  if(NOT PYTHON_ERROR_CODE EQUAL 0)
+    set(${SUCCESS} FALSE PARENT_SCOPE)
+    set(${OUT} "" PARENT_SCOPE)
+  else()
+    set(${SUCCESS} TRUE PARENT_SCOPE)
+    # Quoted so that empty output defines `OUT` as an empty string instead of
+    # unsetting it in the caller's scope.
+    set(${OUT} "${PYTHON_OUT}" PARENT_SCOPE)
+  endif()
+endfunction()
+
 # Run `EXPR` in python after importing `PKG`. Use the result of this to extend
 # `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported.
 macro (append_cmake_prefix_path PKG EXPR)
@@ -152,34 +175,28 @@ macro(string_to_ver OUT_VER IN_STR)
 endmacro()
 
 #
-# Clear all `-gencode` flags from `CMAKE_CUDA_FLAGS` and store them in
-# `CUDA_ARCH_FLAGS`.
+# Clear all `-gencode` flags from `CMAKE_CUDA_FLAGS`.
 #
 # Example:
 #   CMAKE_CUDA_FLAGS="-Wall -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75"
-#   clear_cuda_arches(CUDA_ARCH_FLAGS)
-#   CUDA_ARCH_FLAGS="-gencode arch=compute_70,code=sm_70;-gencode arch=compute_75,code=sm_75"
+#   clear_gencode_flags()
 #   CMAKE_CUDA_FLAGS="-Wall"
 #
-macro(clear_cuda_arches CUDA_ARCH_FLAGS)
-    # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS`
-    string(REGEX MATCHALL "-gencode arch=[^ ]+" CUDA_ARCH_FLAGS
-      ${CMAKE_CUDA_FLAGS})
-
+macro(clear_gencode_flags)
     # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified
     # and passed back via the `CUDA_ARCHITECTURES` property.
+    # The input is quoted: unquoted, an empty `CMAKE_CUDA_FLAGS` expands to no
+    # argument at all and `string(REGEX REPLACE)` fails for lack of an input.
     string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS
-      ${CMAKE_CUDA_FLAGS})
+      "${CMAKE_CUDA_FLAGS}")
 endmacro()
 
 #
-# Extract unique CUDA architectures from a list of compute capabilities codes in 
-# the form `<major><minor>[<letter>]`, convert them to the form sort 
-# `<major>.<minor>`, dedupes them and then sorts them in ascending order and 
+# Extract unique CUDA architectures from a list of compute capabilities codes in
+# the form `<major><minor>[<letter>]`, convert them to the form sort
+# `<major>.<minor>`, dedupes them and then sorts them in ascending order and
 # stores them in `OUT_ARCHES`.
 #
 # Example:
-#   CUDA_ARCH_FLAGS="-gencode arch=compute_75,code=sm_75;...;-gencode arch=compute_90a,code=sm_90a" 
+#   CUDA_ARCH_FLAGS="-gencode arch=compute_75,code=sm_75;...;-gencode arch=compute_90a,code=sm_90a"
 #   extract_unique_cuda_archs_ascending(OUT_ARCHES CUDA_ARCH_FLAGS)
 #   OUT_ARCHES="7.5;...;9.0"
 function(extract_unique_cuda_archs_ascending OUT_ARCHES CUDA_ARCH_FLAGS)
@@ -200,15 +217,15 @@ function(extract_unique_cuda_archs_ascending OUT_ARCHES CUDA_ARCH_FLAGS)
 endfunction()
 
 #
-# For a specific file set the `-gencode` flag in compile options conditionally 
-# for the CUDA language. 
+# For a specific file set the `-gencode` flag in compile options conditionally
+# for the CUDA language.
 #
 # Example:
 #   set_gencode_flag_for_srcs(
 #     SRCS "foo.cu"
 #     ARCH "compute_75"
 #     CODE "sm_75")
-#   adds: "-gencode arch=compute_75,code=sm_75" to the compile options for 
+#   adds: "-gencode arch=compute_75,code=sm_75" to the compile options for
 #    `foo.cu` (only for the CUDA language).
 #
 macro(set_gencode_flag_for_srcs)
@@ -228,14 +245,14 @@ macro(set_gencode_flag_for_srcs)
 endmacro(set_gencode_flag_for_srcs)
 
 #
-# For a list of source files set the `-gencode` flags in the files specific 
+# For a list of source files set the `-gencode` flags in the files specific
 #  compile options (specifically for the CUDA language).
 #
 # arguments are:
 #  SRCS: list of source files
 #  CUDA_ARCHS: list of CUDA architectures in the form `<major>.<minor>[letter]`
 #  BUILD_PTX_FOR_ARCH: if set to true, then the PTX code will be built
-#    for architecture `BUILD_PTX_FOR_ARCH` if there is a CUDA_ARCH in CUDA_ARCHS 
+#    for architecture `BUILD_PTX_FOR_ARCH` if there is a CUDA_ARCH in CUDA_ARCHS
 #    that is larger than BUILD_PTX_FOR_ARCH.
 #
 macro(set_gencode_flags_for_srcs)
@@ -383,12 +400,14 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
   endforeach()
   set(_CUDA_ARCHS ${_FINAL_ARCHS})
 
+  list(SORT _CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
+
   set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE)
 endfunction()
 
 #
-# For the given `SRC_ROCM_ARCHS` list of architecture versions in the form 
-# `<name>` compute the "loose intersection" with the `TGT_ROCM_ARCHS` list. 
+# For the given `SRC_ROCM_ARCHS` list of architecture versions in the form
+# `<name>` compute the "loose intersection" with the `TGT_ROCM_ARCHS` list.
 # The loose intersection is defined as:
 #   { max{ x \in tgt | x <= y } | y \in src, { x \in tgt | x <= y } != {} }
 #  where `<=` is the version comparison operator.
@@ -404,28 +423,48 @@ endfunction()
 #
 function(hip_archs_loose_intersection OUT_ROCM_ARCHS SRC_ROCM_ARCHS TGT_ROCM_ARCHS)
   list(REMOVE_DUPLICATES SRC_ROCM_ARCHS)
-  
+
   # ROCm architectures are typically in format gfxNNN or gfxNNNx where N is a digit
   # and x is a letter. We can sort them by string comparison which works for this format.
   list(SORT SRC_ROCM_ARCHS COMPARE STRING ORDER ASCENDING)
-  
+
   set(_ROCM_ARCHS)
-  
+
   # Find the intersection of supported architectures
   foreach(_SRC_ARCH ${SRC_ROCM_ARCHS})
     if(_SRC_ARCH IN_LIST TGT_ROCM_ARCHS)
       list(APPEND _ROCM_ARCHS ${_SRC_ARCH})
     endif()
   endforeach()
-  
+
   list(REMOVE_DUPLICATES _ROCM_ARCHS)
   set(${OUT_ROCM_ARCHS} ${_ROCM_ARCHS} PARENT_SCOPE)
 endfunction()
 
+#
+# Strip the `+PTX` suffix from every entry of the `CUDA_ARCHS` list, remove
+# duplicates, sort the result in ascending natural order and store it in
+# `OUT_CUDA_ARCHS`.
+#
+# Example:
+#   cuda_remove_ptx_suffixes(OUT_CUDA_ARCHS "8.0;9.0;12.0+PTX")
+#   OUT_CUDA_ARCHS="8.0;9.0;12.0"
+#
+function(cuda_remove_ptx_suffixes OUT_CUDA_ARCHS CUDA_ARCHS)
+  set(_CUDA_ARCHS "${CUDA_ARCHS}")
+
+  # Only a trailing `+PTX` is a suffix, so anchor the match to the end of
+  # each entry.
+  list(TRANSFORM _CUDA_ARCHS REPLACE "\\+PTX$" "")
+
+  list(REMOVE_DUPLICATES _CUDA_ARCHS)
+  list(SORT _CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
+
+  # Quoted so that an empty result is returned as an empty string rather than
+  # unsetting the caller's variable.
+  set(${OUT_CUDA_ARCHS} "${_CUDA_ARCHS}" PARENT_SCOPE)
+endfunction()
+
+
+
 #
 # Override the GPU architectures detected by cmake/torch and filter them by
 # `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in
-# `GPU_ARCHES`. This only applies to the HIP language since for CUDA we set 
+# `GPU_ARCHES`. This only applies to the HIP language since for CUDA we set
 # the architectures on a per file basis.
 #
 # Note: this is defined as a macro since it updates `CMAKE_CUDA_FLAGS`.
diff --git a/build2cmake/src/torch/cuda.rs b/build2cmake/src/torch/cuda.rs
@@ -18,13 +18,6 @@ static CMAKE_UTILS: &str = include_str!("../templates/utils.cmake");
 static WINDOWS_UTILS: &str = include_str!("../templates/windows.cmake");
 static REGISTRATION_H: &str = include_str!("../templates/registration.h");
 static HIPIFY: &str = include_str!("../templates/cuda/hipify.py");
-static CUDA_SUPPORTED_ARCHS_JSON: &str = include_str!("../cuda_supported_archs.json");
-
-fn cuda_supported_archs() -> String {
-    let supported_archs: Vec<String> = serde_json::from_str(CUDA_SUPPORTED_ARCHS_JSON)
-        .expect("Error parsing supported CUDA archs");
-    supported_archs.join(";")
-}
 
 pub fn write_torch_ext_cuda(
     env: &Environment,
@@ -417,7 +410,6 @@ pub fn render_preamble(
                 cuda_maxver => cuda_maxver.map(|v| v.to_string()),
                 torch_minver => torch_minver.map(|v| v.to_string()),
                 torch_maxver => torch_maxver.map(|v| v.to_string()),
-                cuda_supported_archs => cuda_supported_archs(),
                 platform => env::consts::OS
             },
             &mut *write,
diff --git a/lib/torch-extension/arch.nix b/lib/torch-extension/arch.nix
@@ -216,6 +216,7 @@ stdenv.mkDerivation (prevAttrs: {
   dontSetupCUDAToolkitCompilers = true;
 
   cmakeFlags = [
+    (lib.cmakeBool "BUILD_ALL_SUPPORTED_ARCHS" true)
     (lib.cmakeFeature "Python_EXECUTABLE" "${python3.withPackages (ps: [ torch ])}/bin/python")
     # Fix: file RPATH_CHANGE could not write new RPATH, we are rewriting
     # rpaths anyway.
diff --git a/pkgs/build2cmake/default.nix b/pkgs/build2cmake/default.nix
@@ -21,7 +21,6 @@ rustPlatform.buildRustPackage {
         || file.name == "Cargo.lock"
         || file.name == "pyproject.toml"
         || file.name == "pyproject_universal.toml"
-        || file.name == "cuda_supported_archs.json"
         || file.name == "python_dependencies.json"
         || (builtins.any file.hasExt [
           "cmake"
diff --git a/scripts/windows/builder.ps1 b/scripts/windows/builder.ps1
@@ -350,6 +350,9 @@ function Get-CMakeConfigureArgs {
         $kwargs = @("..", "-G", "Visual Studio 17 2022", "-A", $vsArch)
     }
 
+    # Build for all supported GPU archs, not just the detected arch.
+    # CMake only accepts cache definitions of the form `-D<var>=<value>`.
+    $kwargs += "-DBUILD_ALL_SUPPORTED_ARCHS=ON"
+
     # Detect Python from current environment
     $pythonExe = (Get-Command python -ErrorAction SilentlyContinue).Source
     if ($pythonExe) {

Original file line number	Diff line number	Diff line change
`@@ -350,6 +350,9 @@ function Get-CMakeConfigureArgs {`
`350`	`350`	`$kwargs = @("..", "-G", "Visual Studio 17 2022", "-A", $vsArch)`
`351`	`351`	`}`
`352`	`352`
	`353`	`+ # Build for all supported GPU archs, not just the detected arch.`
	`354`	`+ $kwargs += "-DBUILD_ALL_SUPPORTED_ARCHS"`
	`355`	`+`
`353`	`356`	`# Detect Python from current environment`
`354`	`357`	`$pythonExe = (Get-Command python -ErrorAction SilentlyContinue).Source`
`355`	`358`	`if ($pythonExe) {`