Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 1 addition & 7 deletions .github/workflows/ci_linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,6 @@ jobs:
fail-fast: false
matrix:
variance:
# - name: Ubuntu-22.04/CUDA-11.8.0
# image: "ghcr.io/rust-gpu/rust-cuda-ubuntu22-cuda11:latest"
# runner: ubuntu-latest
- name: Ubuntu-22.04 / CUDA-12.8.1 / x86_64
image: "ghcr.io/rust-gpu/rust-cuda-ubuntu22-cuda12:latest"
runner: ubuntu-latest
Expand Down Expand Up @@ -235,9 +232,6 @@ jobs:
matrix:
variance:
# Must match the build job's matrix definition
# - name: Ubuntu-22.04 / CUDA-11.8.0
# image: "ghcr.io/rust-gpu/rust-cuda-ubuntu22-cuda11:latest"
# runner: ubuntu-latest
- name: Ubuntu-22.04 / CUDA-12.8.1 / x86_64
image: "ghcr.io/rust-gpu/rust-cuda-ubuntu22-cuda12:latest"
runner: ubuntu-latest
Expand Down Expand Up @@ -289,4 +283,4 @@ jobs:
shell: bash
run: shopt -s globstar && rustfmt --check tests/compiletests/ui/**/*.rs
- name: Compiletest
run: cargo run -p compiletests --release --no-default-features -- --target-arch compute_61,compute_70,compute_90
run: cargo run -p compiletests --release --no-default-features -- --target-arch compute_61,compute_75,compute_90
4 changes: 2 additions & 2 deletions .github/workflows/ci_windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,6 @@ jobs:
RUSTDOCFLAGS: -Dwarnings
run: cargo doc --workspace --all-features --document-private-items --no-deps --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "vecadd*" --exclude "gemm*" --exclude "ex0*" --exclude "cudnn*" --exclude "sha2*" --exclude "cust_raw"

# Disabled due to dll issues, someone with Windows knowledge needed
# Disabled due to dll issues, someone with Windows knowledge needed
# - name: Compiletest
# run: cargo run -p compiletests --release --no-default-features -- --target-arch compute_61,compute_70,compute_90
# run: cargo run -p compiletests --release --no-default-features -- --target-arch compute_61,compute_75,compute_90
5 changes: 0 additions & 5 deletions .github/workflows/container_images.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,6 @@ jobs:
- runner: ubuntu-24.04-arm
arch: arm64
variance:
- name: Ubuntu-22.04/CUDA-11.8.0
image: "rust-gpu/rust-cuda-ubuntu22-cuda11"
dockerfile: ./container/ubuntu22-cuda11/Dockerfile
- name: Ubuntu-22.04/CUDA-12.8.1
image: "rust-gpu/rust-cuda-ubuntu22-cuda12"
dockerfile: ./container/ubuntu22-cuda12/Dockerfile
Expand Down Expand Up @@ -157,8 +154,6 @@ jobs:
fail-fast: false
matrix:
variance:
- name: Ubuntu-22.04/CUDA-11.8.0
image: "rust-gpu/rust-cuda-ubuntu22-cuda11"
- name: Ubuntu-22.04/CUDA-12.8.1
image: "rust-gpu/rust-cuda-ubuntu22-cuda12"
- name: Ubuntu-24.04/CUDA-12.8.1
Expand Down
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

89 changes: 0 additions & 89 deletions container/ubuntu22-cuda11/Dockerfile

This file was deleted.

24 changes: 2 additions & 22 deletions crates/cuda_builder/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,10 +91,7 @@ pub struct CudaBuilder {
/// will not work on older capabilities. It means that if it uses certain features
/// it may not work.
///
/// This currently defaults to `6.1`. Which corresponds to Pascal, GPUs such as the
/// GTX 1030, GTX 1050, GTX 1080, Tesla P40, etc. We default to this because Maxwell
/// (5.x) will be deprecated in CUDA 12 and we anticipate for that. Moreover, `6.x`
/// contains support for things like f64 atomic add and half precision float ops.
/// This defaults to the default value of `NvvmArch`.
///
/// Starting with CUDA 12.9, architectures can have suffixes:
///
Expand Down Expand Up @@ -207,7 +204,7 @@ impl CudaBuilder {
ptx_file_copy_path: None,
generate_line_info: true,
nvvm_opts: true,
arch: NvvmArch::Compute61,
arch: NvvmArch::default(),
ftz: false,
fast_sqrt: false,
fast_div: false,
Expand Down Expand Up @@ -257,23 +254,6 @@ impl CudaBuilder {
self
}

/// The virtual compute architecture to target for PTX generation. This
/// dictates how certain things are codegenned and may affect performance
/// and/or which gpus the code can run on.
///
/// You should generally try to pick an arch that will work with most
/// GPUs you want your program to work with.
///
/// If you are unsure, either leave this option to default, or pick something around 5.2 to 7.x.
///
/// You can find a list of features supported on each arch and a list of GPUs for every
/// arch [`here`](https://en.wikipedia.org/wiki/CUDA#Version_features_and_specifications).
///
/// NOTE that this does not necessarily mean that code using a certain capability
/// will not work on older capabilities. It means that if it uses certain
/// features it may not work.
///
/// The chosen architecture enables target features for conditional compilation.
/// See the documentation on the `arch` field for more details.
pub fn arch(mut self, arch: NvvmArch) -> Self {
self.arch = arch;
Expand Down
139 changes: 93 additions & 46 deletions crates/nvvm/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -243,54 +243,59 @@ impl FromStr for NvvmOption {
}
_ if s.starts_with("-arch=") => {
let slice = &s[6..];
if !slice.starts_with("compute_") {
return Err(format!("unknown -arch value: {slice}"));
match NvvmArch::from_str(slice) {
Ok(arch) => Self::Arch(arch),
Err(_) => return Err(format!("unknown -arch value: {slice}")),
}
let arch_num = &slice[8..];
let arch = match arch_num {
"35" => NvvmArch::Compute35,
"37" => NvvmArch::Compute37,
"50" => NvvmArch::Compute50,
"52" => NvvmArch::Compute52,
"53" => NvvmArch::Compute53,
"60" => NvvmArch::Compute60,
"61" => NvvmArch::Compute61,
"62" => NvvmArch::Compute62,
"70" => NvvmArch::Compute70,
"72" => NvvmArch::Compute72,
"75" => NvvmArch::Compute75,
"80" => NvvmArch::Compute80,
"86" => NvvmArch::Compute86,
"87" => NvvmArch::Compute87,
"89" => NvvmArch::Compute89,
"90" => NvvmArch::Compute90,
"90a" => NvvmArch::Compute90a,
"100" => NvvmArch::Compute100,
"100f" => NvvmArch::Compute100f,
"100a" => NvvmArch::Compute100a,
"101" => NvvmArch::Compute101,
"101f" => NvvmArch::Compute101f,
"101a" => NvvmArch::Compute101a,
"103" => NvvmArch::Compute103,
"103f" => NvvmArch::Compute103f,
"103a" => NvvmArch::Compute103a,
"120" => NvvmArch::Compute120,
"120f" => NvvmArch::Compute120f,
"120a" => NvvmArch::Compute120a,
"121" => NvvmArch::Compute121,
"121f" => NvvmArch::Compute121f,
"121a" => NvvmArch::Compute121a,
_ => return Err(format!("unknown -arch=compute_NN value: {arch_num}")),
};
Self::Arch(arch)
}
_ => return Err(format!("unknown option: {s}")),
})
}
}

/// Nvvm architecture, default is `Compute52`
#[derive(Debug, Clone, Copy, PartialEq, Eq, strum::EnumIter)]
/// Nvvm architecture.
///
/// The following table indicates which `compute_*` values are supported by which CUDA versions.
///
/// ```text
/// -----------------------------------------------------------------------------
/// | Supported `compute_*` values (written vertically)
/// -----------------------------------------------------------------------------
/// CUDA | 1 1 1 1 1 1
/// Toolkit | 5 5 5 6 6 6 7 7 7 7 8 8 8 8 8 9 0 0 0 1 2 2
/// version | 0 2 3 0 1 2 0 2 3 5 0 6 7 8 9 0 0 1 3 0 0 1
/// -----------------------------------------------------------------------------
/// 12.[01].0 | b b b b b b b b b b b b - - b b - - - - - -
/// 12.2.0 | b b b b b b b b b b b b - - b a - - - - - -
/// 12.[3456].0 | b b b b b b b b b b b b b - b a - - - - - -
/// 12.8.0 | b b b b b b b b b b b b b - b a a a - - a -
/// 12.9.0 | b b b b b b b b - b b b b - b a f f f - f f
/// 13.0.0 | - - - - - - - - - b b b b b b a f - f f f f
/// -----------------------------------------------------------------------------
/// Legend:
/// - 'b': baseline features only
/// - 'a': baseline + architecture-specific features
/// - 'f': baseline + architecture-specific + family-specific features
///
/// Note: there was no 12.7 release.
/// ```
///
/// For example, CUDA 12.9.0 supports `compute_89`, `compute_90{,a}`, `compute_100{,a,f}`.
///
/// This information is from "PTX Compiler APIs" documents under
/// <https://developer.nvidia.com/cuda-toolkit-archive>, e.g.
/// <https://docs.nvidia.com/cuda/archive/13.0.0/ptx-compiler-api/index.html>. (Adjust the version
/// in that URL as necessary.) Specifically, the `compute-*` values allowed with the `--gpu-name`
/// option.
///
/// # Example
///
/// ```
/// // The default value is `NvvmArch::Compute75`.
/// # use nvvm::NvvmArch;
/// assert_eq!(NvvmArch::default(), NvvmArch::Compute75);
/// ```
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, strum::EnumIter)]
pub enum NvvmArch {
Compute35,
Compute37,
Expand All @@ -302,6 +307,12 @@ pub enum NvvmArch {
Compute62,
Compute70,
Compute72,
/// This default value of 7.5 corresponds to Turing and later devices. We default to this
/// because it is the minimum supported by CUDA 13.0 while being in the middle of the range
/// supported by CUDA 12.x.
    // WARNING: If you change the default, consider updating the `--target-arch` values used for
    // compiletests in `.github/workflows/ci_{linux,windows}.yml`.
#[default]
Compute75,
Compute80,
Compute86,
Expand Down Expand Up @@ -340,9 +351,45 @@ impl Display for NvvmArch {
}
}

impl Default for NvvmArch {
fn default() -> Self {
Self::Compute52
impl FromStr for NvvmArch {
type Err = &'static str;

fn from_str(s: &str) -> Result<Self, Self::Err> {
Ok(match s {
"compute_35" => NvvmArch::Compute35,
"compute_37" => NvvmArch::Compute37,
"compute_50" => NvvmArch::Compute50,
"compute_52" => NvvmArch::Compute52,
"compute_53" => NvvmArch::Compute53,
"compute_60" => NvvmArch::Compute60,
"compute_61" => NvvmArch::Compute61,
"compute_62" => NvvmArch::Compute62,
"compute_70" => NvvmArch::Compute70,
"compute_72" => NvvmArch::Compute72,
"compute_75" => NvvmArch::Compute75,
"compute_80" => NvvmArch::Compute80,
"compute_86" => NvvmArch::Compute86,
"compute_87" => NvvmArch::Compute87,
"compute_89" => NvvmArch::Compute89,
"compute_90" => NvvmArch::Compute90,
"compute_90a" => NvvmArch::Compute90a,
"compute_100" => NvvmArch::Compute100,
"compute_100f" => NvvmArch::Compute100f,
"compute_100a" => NvvmArch::Compute100a,
"compute_101" => NvvmArch::Compute101,
"compute_101f" => NvvmArch::Compute101f,
"compute_101a" => NvvmArch::Compute101a,
"compute_103" => NvvmArch::Compute103,
"compute_103f" => NvvmArch::Compute103f,
"compute_103a" => NvvmArch::Compute103a,
"compute_120" => NvvmArch::Compute120,
"compute_120f" => NvvmArch::Compute120f,
"compute_120a" => NvvmArch::Compute120a,
"compute_121" => NvvmArch::Compute121,
"compute_121f" => NvvmArch::Compute121f,
"compute_121a" => NvvmArch::Compute121a,
_ => return Err("unknown compile target"),
})
}
}

Expand Down Expand Up @@ -1116,8 +1163,8 @@ mod tests {
err("blah", "unknown option: blah");
err("-aardvark", "unknown option: -aardvark");
err("-arch=compute75", "unknown -arch value: compute75");
err("-arch=compute_10", "unknown -arch=compute_NN value: 10");
err("-arch=compute_100x", "unknown -arch=compute_NN value: 100x");
err("-arch=compute_10", "unknown -arch value: compute_10");
err("-arch=compute_100x", "unknown -arch value: compute_100x");
err("-opt=3", "-opt=3 is the default");
err("-opt=99", "unknown -opt value: 99");
}
Expand Down
10 changes: 5 additions & 5 deletions crates/rustc_codegen_nvvm/src/nvvm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ impl Display for CodegenErr {
}

/// Take a list of bitcode module bytes and their names and codegen it
/// into ptx bytes. The final PTX *should* be utf8, but just to be on the safe side
/// into PTX bytes. The final PTX *should* be utf8, but just to be on the safe side
/// it returns a vector of bytes.
///
/// Note that this will implicitly try to find libdevice and add it, so don't do that
Expand All @@ -57,15 +57,15 @@ pub fn codegen_bitcode_modules(
) -> Result<Vec<u8>, CodegenErr> {
debug!("Codegenning bitcode to PTX");

// make sure the nvvm version is high enough so users don't get confusing compilation errors.
// Make sure the nvvm version is high enough so users don't get confusing compilation errors.
let (major, minor) = nvvm::ir_version();

if major <= 1 && minor < 6 {
if major <= 2 && minor < 0 {
sess.dcx()
.fatal("rustc_codegen_nvvm requires at least libnvvm 1.6 (CUDA 11.2)");
.fatal("rustc_codegen_nvvm requires at least libnvvm 2.0 (CUDA 12.0)");
}

// first, create the nvvm program we will add modules to.
// First, create the nvvm program we will add modules to.
let prog = NvvmProgram::new()?;

let module = merge_llvm_modules(modules, llcx);
Expand Down
Loading
Loading