Increase the minimum CUDA version to 12.0 and the default arch to compute_75.

nnethercote · nnethercote · commit c8ffbdfcd06e · 2025-11-20T16:38:26.000+11:00
CUDA 12.0 was released in December 2022, and CUDA 13.0 was released in August 2025. It feels like a good time to drop CUDA 11.x support. This means later Kepler devices with compute capabilities of 3.5 and 3.7 will no longer be supported. The minimum version of NVVM IR increases from 1.6 to 2.0, because that's what CUDA 12.0 uses. Along with this, change the default compiler target to `compute_75`. This is a good choice because it's the minimum supported by CUDA 13.0, and gets Rust CUDA a step closer to working with CUDA 13.0. The existing defaults were all over the place. - `NvvmArch::default()` was `compute_52`. - `CudaBuilder`'s default was `compute_61`. - compiletest's default was `compute_70`. This commit makes the latter two determined by `NvvmArch::default()`, which is changed to `compute_75`. Currently CI runs compiletests on `compute_61`, `compute_70`, and `compute_90`; this commit changes the `compute_70` to `compute_75`. It seems sensible to have the default value as one of the things tested by CI. This commit also adds a comment on NvvmArch with a table of CUDA/`compute_*` values, which I found very useful. Resources: - https://en.wikipedia.org/wiki/CUDA#GPUs_supported for compute capabilities supported by different CUDA versions. - https://docs.nvidia.com/cuda/archive/12.0.0/cuda-toolkit-release-notes/index.html for NVVM IR version information.
diff --git a/.github/workflows/ci_linux.yml b/.github/workflows/ci_linux.yml
@@ -192,4 +192,4 @@ jobs:
         shell: bash
         run: shopt -s globstar && rustfmt --check tests/compiletests/ui/**/*.rs
       - name: Compiletest
-        run: cargo run -p compiletests --release --no-default-features -- --target-arch compute_61,compute_70,compute_90
+        run: cargo run -p compiletests --release --no-default-features -- --target-arch compute_61,compute_75,compute_90
diff --git a/.github/workflows/ci_windows.yml b/.github/workflows/ci_windows.yml
@@ -105,6 +105,6 @@ jobs:
           RUSTDOCFLAGS: -Dwarnings
         run: cargo doc --workspace --all-features --document-private-items --no-deps --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "vecadd*" --exclude "gemm*" --exclude "ex0*" --exclude "cudnn*" --exclude "sha2*" --exclude "cust_raw"
 
-      # Disabled due to dll issues, someone with  Windows knowledge needed
+      # Disabled due to dll issues, someone with Windows knowledge needed
       # - name: Compiletest
-      #  run: cargo run -p compiletests --release --no-default-features -- --target-arch compute_61,compute_70,compute_90
+      #  run: cargo run -p compiletests --release --no-default-features -- --target-arch compute_61,compute_75,compute_90
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/crates/cuda_builder/src/lib.rs b/crates/cuda_builder/src/lib.rs
@@ -91,10 +91,7 @@ pub struct CudaBuilder {
     /// will not work on older capabilities. It means that if it uses certain features
     /// it may not work.
     ///
-    /// This currently defaults to `6.1`. Which corresponds to Pascal, GPUs such as the
-    /// GTX 1030, GTX 1050, GTX 1080, Tesla P40, etc. We default to this because Maxwell
-    /// (5.x) will be deprecated in CUDA 12 and we anticipate for that. Moreover, `6.x`
-    /// contains support for things like f64 atomic add and half precision float ops.
+    /// This defaults to the default value of `NvvmArch`.
     ///
     /// Starting with CUDA 12.9, architectures can have suffixes:
     ///
@@ -207,7 +204,7 @@ impl CudaBuilder {
             ptx_file_copy_path: None,
             generate_line_info: true,
             nvvm_opts: true,
-            arch: NvvmArch::Compute61,
+            arch: NvvmArch::default(),
             ftz: false,
             fast_sqrt: false,
             fast_div: false,
diff --git a/crates/nvvm/src/lib.rs b/crates/nvvm/src/lib.rs
@@ -253,20 +253,66 @@ impl FromStr for NvvmOption {
     }
 }
 
-/// Nvvm architecture, default is `Compute52`
+/// Nvvm architecture.
+///
+/// The following table indicates which `compute_*` values are supported by which CUDA versions.
+///
+/// ```text
+/// -----------------------------------------------------------------------------
+///             | Supported `compute_*` values (written vertically)
+/// -----------------------------------------------------------------------------
+/// CUDA        |                                 1 1 1 1 1 1
+/// Toolkit     | 5 5 5 6 6 6 7 7 7 7 8 8 8 8 8 9 0 0 0 1 2 2
+/// version     | 0 2 3 0 1 2 0 2 3 5 0 6 7 8 9 0 0 1 3 0 0 1
+/// -----------------------------------------------------------------------------
+/// 12.[01].0   | b b b b b b b b b b b b - - b b - - - - - -
+/// 12.2.0      | b b b b b b b b b b b b - - b a - - - - - -
+/// 12.[3456].0 | b b b b b b b b b b b b b - b a - - - - - -
+/// 12.8.0      | b b b b b b b b b b b b b - b a a a - - a -
+/// 12.9.0      | b b b b b b b b - b b b b - b a f f f - f f
+/// 13.0.0      | - - - - - - - - - b b b b b b a f - f f f f
+/// -----------------------------------------------------------------------------
+/// Legend:
+/// - 'b': baseline features only
+/// - 'a': baseline + architecture-specific features
+/// - 'f': baseline + architecture-specific + family-specific features
+///
+/// Note: there was no 12.7 release.
+/// ```
+///
+/// For example, CUDA 12.9.0 supports `compute_89`, `compute_90{,a}`, `compute_100{,a,f}`.
+///
+/// This information is from "PTX Compiler APIs" documents under
+/// <https://developer.nvidia.com/cuda-toolkit-archive>, e.g.
+/// <https://docs.nvidia.com/cuda/archive/13.0.0/ptx-compiler-api/index.html>. (Adjust the version
+/// in that URL as necessary.) Specifically, the `compute-*` values allowed with the `--gpu-name`
+/// option.
+///
+/// # Example
+///
+/// ```
+/// // The default value is `NvvmArch::Compute75`.
+/// # use nvvm::NvvmArch;
+/// assert_eq!(NvvmArch::default(), NvvmArch::Compute75);
+/// ```
 #[derive(Clone, Copy, Debug, Default, PartialEq, Eq, strum::EnumIter)]
 pub enum NvvmArch {
     Compute35,
     Compute37,
     Compute50,
-    #[default]
     Compute52,
     Compute53,
     Compute60,
     Compute61,
     Compute62,
     Compute70,
     Compute72,
+    /// This default value of 7.5 corresponds to Turing and later devices. We default to this
+    /// because it is the minimum supported by CUDA 13.0 while being in the middle of the range
+    /// supported by CUDA 12.x.
+    // WARNING: If you change the default, consider updating the `--target-arch` values used for
+    // compiletests in `.github/workflows/ci_{linux,windows}.yml`.
+    #[default]
     Compute75,
     Compute80,
     Compute86,
diff --git a/crates/rustc_codegen_nvvm/src/nvvm.rs b/crates/rustc_codegen_nvvm/src/nvvm.rs
@@ -44,7 +44,7 @@ impl Display for CodegenErr {
 }
 
 /// Take a list of bitcode module bytes and their names and codegen it
-/// into ptx bytes. The final PTX *should* be utf8, but just to be on the safe side
+/// into PTX bytes. The final PTX *should* be utf8, but just to be on the safe side
 /// it returns a vector of bytes.
 ///
 /// Note that this will implicitly try to find libdevice and add it, so don't do that
@@ -57,15 +57,15 @@ pub fn codegen_bitcode_modules(
 ) -> Result<Vec<u8>, CodegenErr> {
     debug!("Codegenning bitcode to PTX");
 
-    // make sure the nvvm version is high enough so users don't get confusing compilation errors.
+    // Make sure the nvvm version is high enough so users don't get confusing compilation errors.
     let (major, minor) = nvvm::ir_version();
 
-    if major <= 1 && minor < 6 {
+    if major < 2 {
         sess.dcx()
-            .fatal("rustc_codegen_nvvm requires at least libnvvm 1.6 (CUDA 11.2)");
+            .fatal("rustc_codegen_nvvm requires at least libnvvm 2.0 (CUDA 12.0)");
     }
 
-    // first, create the nvvm program we will add modules to.
+    // First, create the nvvm program we will add modules to.
     let prog = NvvmProgram::new()?;
 
     let module = merge_llvm_modules(modules, llcx);
diff --git a/guide/src/guide/getting_started.md b/guide/src/guide/getting_started.md
@@ -6,7 +6,9 @@ This section covers how to get started writing GPU crates with `cuda_std` and `c
 
 Before you can use the project to write GPU crates, you will need a couple of prerequisites:
 
-- [The CUDA SDK](https://developer.nvidia.com/cuda-downloads), version 11.2 or later (and the appropriate driver - [see CUDA release notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html)).
+- [The CUDA SDK](https://developer.nvidia.com/cuda-downloads), version 12.0 or later (and the
+  appropriate driver - [see CUDA release
+  notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html)).
 
   This is only for building GPU crates, to execute built PTX you only need CUDA `9+`.
 
diff --git a/tests/compiletests/Cargo.toml b/tests/compiletests/Cargo.toml
@@ -10,6 +10,7 @@ path = "src/main.rs"
 [dependencies]
 compiletest_rs = "0.11"
 clap = { version = "4.5", features = ["derive"] }
+nvvm = { path = "../../crates/nvvm" }
 tracing = "0.1"
 tracing-subscriber = { version = "0.3", features = ["env-filter"] }
 cuda_builder = { workspace = true }
diff --git a/tests/compiletests/README.md b/tests/compiletests/README.md
@@ -22,7 +22,7 @@ cargo run --release
 ### Options
 
 - `--bless` - Update expected output files
-- `--target-arch=compute_61,compute_70,compute_90` - Test multiple CUDA compute capabilities (comma-separated)
+- `--target-arch=compute_61,compute_75,compute_90` - Test multiple CUDA compute capabilities (comma-separated)
 - Filter by test name: `cargo compiletest simple`
 - `RUST_LOG=info` - Enable progress logging
 - `RUST_LOG=debug` - Enable detailed debug logging
diff --git a/tests/compiletests/src/main.rs b/tests/compiletests/src/main.rs
@@ -1,4 +1,5 @@
 use clap::Parser;
+use nvvm::NvvmArch;
 use std::env;
 use std::io;
 use std::path::{Path, PathBuf};
@@ -13,17 +14,18 @@ struct Opt {
 
     /// The CUDA compute capability to target (e.g., compute_70, compute_80, compute_90).
     /// Can specify multiple architectures comma-separated.
-    #[arg(long, default_value = "compute_70", value_delimiter = ',')]
-    target_arch: Vec<String>,
+    // Uses `NvvmArch::default()`, which keeps this in sync with the default on `CudaBuilder::arch`.
+    #[arg(long, default_values_t = [NvvmArch::default()], value_delimiter = ',')]
+    target_arch: Vec<NvvmArch>,
 
     /// Only run tests that match these filters.
     #[arg(name = "FILTER")]
     filters: Vec<String>,
 }
 
 impl Opt {
-    pub fn architectures(&self) -> impl Iterator<Item = &str> {
-        self.target_arch.iter().map(|s| s.as_str())
+    pub fn architectures(&self) -> impl Iterator<Item = NvvmArch> + use<'_> {
+        self.target_arch.iter().copied()
     }
 }
 
@@ -136,18 +138,18 @@ impl Runner {
             extra_flags: "",
         }];
 
-        for (arch, variation) in self
-            .opt
-            .architectures()
-            .flat_map(|arch| VARIATIONS.iter().map(move |variation| (arch, variation)))
-        {
+        for (arch, variation) in self.opt.architectures().flat_map(|arch| {
+            VARIATIONS
+                .iter()
+                .map(move |variation| (arch.target_feature(), variation))
+        }) {
             // HACK(eddyb) in order to allow *some* tests to have separate output
             // in different testing variations (i.e. experimental features), while
             // keeping *most* of the tests unchanged, we make use of "stage IDs",
             // which offer `// only-S` and `// ignore-S` for any stage ID `S`.
             let stage_id = if variation.name == "default" {
                 // Use the architecture name as the stage ID.
-                arch.to_string()
+                arch.clone()
             } else {
                 // Include the variation name in the stage ID.
                 format!("{}-{}", arch, variation.name)
@@ -159,7 +161,7 @@ impl Runner {
                 &self.deps_target_dir,
                 &self.codegen_backend_path,
                 CUDA_TARGET,
-                arch,
+                &arch,
             );
             let mut flags = test_rustc_flags(
                 &self.codegen_backend_path,
@@ -172,7 +174,7 @@ impl Runner {
                         .deps_target_dir
                         .join(DepKind::ProcMacro.target_dir_suffix(CUDA_TARGET)),
                 ],
-                arch,
+                &arch,
             );
             flags += variation.extra_flags;