diff --git a/.github/workflows/ci_linux.yml b/.github/workflows/ci_linux.yml index e60d2f3f..92b1f70b 100644 --- a/.github/workflows/ci_linux.yml +++ b/.github/workflows/ci_linux.yml @@ -28,9 +28,6 @@ jobs: fail-fast: false matrix: variance: - # - name: Ubuntu-22.04/CUDA-11.8.0 - # image: "ghcr.io/rust-gpu/rust-cuda-ubuntu22-cuda11:latest" - # runner: ubuntu-latest - name: Ubuntu-22.04 / CUDA-12.8.1 / x86_64 image: "ghcr.io/rust-gpu/rust-cuda-ubuntu22-cuda12:latest" runner: ubuntu-latest @@ -235,9 +232,6 @@ jobs: matrix: variance: # Must match the build job's matrix definition - # - name: Ubuntu-22.04 / CUDA-11.8.0 - # image: "ghcr.io/rust-gpu/rust-cuda-ubuntu22-cuda11:latest" - # runner: ubuntu-latest - name: Ubuntu-22.04 / CUDA-12.8.1 / x86_64 image: "ghcr.io/rust-gpu/rust-cuda-ubuntu22-cuda12:latest" runner: ubuntu-latest @@ -289,4 +283,4 @@ jobs: shell: bash run: shopt -s globstar && rustfmt --check tests/compiletests/ui/**/*.rs - name: Compiletest - run: cargo run -p compiletests --release --no-default-features -- --target-arch compute_61,compute_70,compute_90 + run: cargo run -p compiletests --release --no-default-features -- --target-arch compute_61,compute_75,compute_90 diff --git a/.github/workflows/ci_windows.yml b/.github/workflows/ci_windows.yml index a5910cf4..cb7e7167 100644 --- a/.github/workflows/ci_windows.yml +++ b/.github/workflows/ci_windows.yml @@ -105,6 +105,6 @@ jobs: RUSTDOCFLAGS: -Dwarnings run: cargo doc --workspace --all-features --document-private-items --no-deps --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "vecadd*" --exclude "gemm*" --exclude "ex0*" --exclude "cudnn*" --exclude "sha2*" --exclude "cust_raw" - # Disabled due to dll issues, someone with Windows knowledge needed + # Disabled due to dll issues, someone with Windows knowledge needed # - name: Compiletest - # run: cargo run -p compiletests --release --no-default-features -- --target-arch compute_61,compute_70,compute_90 + # run: cargo run -p 
compiletests --release --no-default-features -- --target-arch compute_61,compute_75,compute_90 diff --git a/.github/workflows/container_images.yml b/.github/workflows/container_images.yml index eb66d98a..e3fcd8af 100644 --- a/.github/workflows/container_images.yml +++ b/.github/workflows/container_images.yml @@ -30,9 +30,6 @@ jobs: - runner: ubuntu-24.04-arm arch: arm64 variance: - - name: Ubuntu-22.04/CUDA-11.8.0 - image: "rust-gpu/rust-cuda-ubuntu22-cuda11" - dockerfile: ./container/ubuntu22-cuda11/Dockerfile - name: Ubuntu-22.04/CUDA-12.8.1 image: "rust-gpu/rust-cuda-ubuntu22-cuda12" dockerfile: ./container/ubuntu22-cuda12/Dockerfile @@ -157,8 +154,6 @@ jobs: fail-fast: false matrix: variance: - - name: Ubuntu-22.04/CUDA-11.8.0 - image: "rust-gpu/rust-cuda-ubuntu22-cuda11" - name: Ubuntu-22.04/CUDA-12.8.1 image: "rust-gpu/rust-cuda-ubuntu22-cuda12" - name: Ubuntu-24.04/CUDA-12.8.1 diff --git a/Cargo.lock b/Cargo.lock index 1af88d55..f21cd1d4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -599,6 +599,7 @@ dependencies = [ "clap 4.5.45", "compiletest_rs", "cuda_builder", + "nvvm", "tracing", "tracing-subscriber", ] diff --git a/container/ubuntu22-cuda11/Dockerfile b/container/ubuntu22-cuda11/Dockerfile deleted file mode 100644 index 6f4996ec..00000000 --- a/container/ubuntu22-cuda11/Dockerfile +++ /dev/null @@ -1,89 +0,0 @@ -FROM nvcr.io/nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 AS llvm-builder - -RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -qq -y install \ - build-essential \ - clang \ - curl \ - libffi-dev \ - libedit-dev \ - libncurses5-dev \ - libssl-dev \ - libtinfo-dev \ - libxml2-dev \ - cmake \ - ninja-build \ - pkg-config \ - python3 \ - xz-utils \ - zlib1g-dev && \ - rm -rf /var/lib/apt/lists/* - -WORKDIR /data/llvm7 - -# Download and build LLVM 7.1.0 for all architectures. 
-RUN curl -sSf -L -O https://github.com/llvm/llvm-project/releases/download/llvmorg-7.1.0/llvm-7.1.0.src.tar.xz && \ - tar -xf llvm-7.1.0.src.tar.xz && \ - cd llvm-7.1.0.src && \ - mkdir build && cd build && \ - ARCH=$(dpkg --print-architecture) && \ - if [ "$ARCH" = "amd64" ]; then \ - TARGETS="X86;NVPTX"; \ - else \ - TARGETS="AArch64;NVPTX"; \ - fi && \ - cmake -G Ninja \ - -DCMAKE_BUILD_TYPE=Release \ - -DLLVM_TARGETS_TO_BUILD="$TARGETS" \ - -DLLVM_BUILD_LLVM_DYLIB=ON \ - -DLLVM_LINK_LLVM_DYLIB=ON \ - -DLLVM_ENABLE_ASSERTIONS=OFF \ - -DLLVM_ENABLE_BINDINGS=OFF \ - -DLLVM_INCLUDE_EXAMPLES=OFF \ - -DLLVM_INCLUDE_TESTS=OFF \ - -DLLVM_INCLUDE_BENCHMARKS=OFF \ - -DLLVM_ENABLE_ZLIB=ON \ - -DLLVM_ENABLE_TERMINFO=ON \ - -DCMAKE_INSTALL_PREFIX=/opt/llvm-7 \ - .. && \ - ninja -j$(nproc) && \ - ninja install && \ - cd ../.. && \ - rm -rf llvm-7.1.0.src* - -FROM nvcr.io/nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 - -RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -qq -y install \ - build-essential \ - clang \ - curl \ - libssl-dev \ - libtinfo-dev \ - pkg-config \ - xz-utils \ - zlib1g-dev \ - cmake \ - libfontconfig-dev \ - libx11-xcb-dev \ - libxcursor-dev \ - libxi-dev \ - libxinerama-dev \ - libxrandr-dev && \ - rm -rf /var/lib/apt/lists/* - -COPY --from=llvm-builder /opt/llvm-7 /opt/llvm-7 -RUN ln -s /opt/llvm-7/bin/llvm-config /usr/bin/llvm-config && \ - ln -s /opt/llvm-7/bin/llvm-config /usr/bin/llvm-config-7 - -# Get Rust (install rustup; toolchain installed from rust-toolchain.toml below) -RUN curl -sSf -L https://sh.rustup.rs | bash -s -- -y --profile minimal --default-toolchain none -ENV PATH="/root/.cargo/bin:${PATH}" - -# Setup the workspace -WORKDIR /data/rust-cuda -RUN --mount=type=bind,source=rust-toolchain.toml,target=/data/rust-cuda/rust-toolchain.toml \ - rustup show - -# Add nvvm to LD_LIBRARY_PATH. 
-ENV LD_LIBRARY_PATH="/usr/local/cuda/nvvm/lib64:${LD_LIBRARY_PATH}" -ENV LLVM_LINK_STATIC=1 -ENV RUST_LOG=info diff --git a/crates/cuda_builder/src/lib.rs b/crates/cuda_builder/src/lib.rs index 47df458d..f187ede2 100644 --- a/crates/cuda_builder/src/lib.rs +++ b/crates/cuda_builder/src/lib.rs @@ -91,10 +91,7 @@ pub struct CudaBuilder { /// will not work on older capabilities. It means that if it uses certain features /// it may not work. /// - /// This currently defaults to `6.1`. Which corresponds to Pascal, GPUs such as the - /// GTX 1030, GTX 1050, GTX 1080, Tesla P40, etc. We default to this because Maxwell - /// (5.x) will be deprecated in CUDA 12 and we anticipate for that. Moreover, `6.x` - /// contains support for things like f64 atomic add and half precision float ops. + /// This defaults to the default value of `NvvmArch`. /// /// Starting with CUDA 12.9, architectures can have suffixes: /// @@ -207,7 +204,7 @@ impl CudaBuilder { ptx_file_copy_path: None, generate_line_info: true, nvvm_opts: true, - arch: NvvmArch::Compute61, + arch: NvvmArch::default(), ftz: false, fast_sqrt: false, fast_div: false, @@ -257,23 +254,6 @@ impl CudaBuilder { self } - /// The virtual compute architecture to target for PTX generation. This - /// dictates how certain things are codegenned and may affect performance - /// and/or which gpus the code can run on. - /// - /// You should generally try to pick an arch that will work with most - /// GPUs you want your program to work with. - /// - /// If you are unsure, either leave this option to default, or pick something around 5.2 to 7.x. - /// - /// You can find a list of features supported on each arch and a list of GPUs for every - /// arch [`here`](https://en.wikipedia.org/wiki/CUDA#Version_features_and_specifications). - /// - /// NOTE that this does not necessarily mean that code using a certain capability - /// will not work on older capabilities. It means that if it uses certain - /// features it may not work. 
- /// - /// The chosen architecture enables target features for conditional compilation. /// See the documentation on the `arch` field for more details. pub fn arch(mut self, arch: NvvmArch) -> Self { self.arch = arch; diff --git a/crates/nvvm/src/lib.rs b/crates/nvvm/src/lib.rs index 0ff7a017..d3f34c4e 100644 --- a/crates/nvvm/src/lib.rs +++ b/crates/nvvm/src/lib.rs @@ -243,54 +243,59 @@ impl FromStr for NvvmOption { } _ if s.starts_with("-arch=") => { let slice = &s[6..]; - if !slice.starts_with("compute_") { - return Err(format!("unknown -arch value: {slice}")); + match NvvmArch::from_str(slice) { + Ok(arch) => Self::Arch(arch), + Err(_) => return Err(format!("unknown -arch value: {slice}")), } - let arch_num = &slice[8..]; - let arch = match arch_num { - "35" => NvvmArch::Compute35, - "37" => NvvmArch::Compute37, - "50" => NvvmArch::Compute50, - "52" => NvvmArch::Compute52, - "53" => NvvmArch::Compute53, - "60" => NvvmArch::Compute60, - "61" => NvvmArch::Compute61, - "62" => NvvmArch::Compute62, - "70" => NvvmArch::Compute70, - "72" => NvvmArch::Compute72, - "75" => NvvmArch::Compute75, - "80" => NvvmArch::Compute80, - "86" => NvvmArch::Compute86, - "87" => NvvmArch::Compute87, - "89" => NvvmArch::Compute89, - "90" => NvvmArch::Compute90, - "90a" => NvvmArch::Compute90a, - "100" => NvvmArch::Compute100, - "100f" => NvvmArch::Compute100f, - "100a" => NvvmArch::Compute100a, - "101" => NvvmArch::Compute101, - "101f" => NvvmArch::Compute101f, - "101a" => NvvmArch::Compute101a, - "103" => NvvmArch::Compute103, - "103f" => NvvmArch::Compute103f, - "103a" => NvvmArch::Compute103a, - "120" => NvvmArch::Compute120, - "120f" => NvvmArch::Compute120f, - "120a" => NvvmArch::Compute120a, - "121" => NvvmArch::Compute121, - "121f" => NvvmArch::Compute121f, - "121a" => NvvmArch::Compute121a, - _ => return Err(format!("unknown -arch=compute_NN value: {arch_num}")), - }; - Self::Arch(arch) } _ => return Err(format!("unknown option: {s}")), }) } } -/// Nvvm architecture, default 
is `Compute52` -#[derive(Debug, Clone, Copy, PartialEq, Eq, strum::EnumIter)] +/// Nvvm architecture. +/// +/// The following table indicates which `compute_*` values are supported by which CUDA versions. +/// +/// ```text +/// ----------------------------------------------------------------------------- +/// | Supported `compute_*` values (written vertically) +/// ----------------------------------------------------------------------------- +/// CUDA | 1 1 1 1 1 1 +/// Toolkit | 5 5 5 6 6 6 7 7 7 7 8 8 8 8 8 9 0 0 0 1 2 2 +/// version | 0 2 3 0 1 2 0 2 3 5 0 6 7 8 9 0 0 1 3 0 0 1 +/// ----------------------------------------------------------------------------- +/// 12.[01].0 | b b b b b b b b b b b b - - b b - - - - - - +/// 12.2.0 | b b b b b b b b b b b b - - b a - - - - - - +/// 12.[3456].0 | b b b b b b b b b b b b b - b a - - - - - - +/// 12.8.0 | b b b b b b b b b b b b b - b a a a - - a - +/// 12.9.0 | b b b b b b b b - b b b b - b a f f f - f f +/// 13.0.0 | - - - - - - - - - b b b b b b a f - f f f f +/// ----------------------------------------------------------------------------- +/// Legend: +/// - 'b': baseline features only +/// - 'a': baseline + architecture-specific features +/// - 'f': baseline + architecture-specific + family-specific features +/// +/// Note: there was no 12.7 release. +/// ``` +/// +/// For example, CUDA 12.9.0 supports `compute_89`, `compute_90{,a}`, `compute_100{,a,f}`. +/// +/// This information is from "PTX Compiler APIs" documents under +/// , e.g. +/// . (Adjust the version +/// in that URL as necessary.) Specifically, the `compute-*` values allowed with the `--gpu-name` +/// option. +/// +/// # Example +/// +/// ``` +/// // The default value is `NvvmArch::Compute75`. 
+/// # use nvvm::NvvmArch; +/// assert_eq!(NvvmArch::default(), NvvmArch::Compute75); +/// ``` +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, strum::EnumIter)] pub enum NvvmArch { Compute35, Compute37, @@ -302,6 +307,12 @@ pub enum NvvmArch { Compute62, Compute70, Compute72, + /// This default value of 7.5 corresponds to Turing and later devices. We default to this + /// because it is the minimum supported by CUDA 13.0 while being in the middle of the range + /// supported by CUDA 12.x. + // WARNING: If you change the default, consider updating the `--target-arch` values used for + // compiletests in `ci_linux.yml` and `.github/workflows/ci_{linux,windows}.yml`. + #[default] Compute75, Compute80, Compute86, @@ -340,9 +351,45 @@ impl Display for NvvmArch { } } -impl Default for NvvmArch { - fn default() -> Self { - Self::Compute52 +impl FromStr for NvvmArch { + type Err = &'static str; + + fn from_str(s: &str) -> Result { + Ok(match s { + "compute_35" => NvvmArch::Compute35, + "compute_37" => NvvmArch::Compute37, + "compute_50" => NvvmArch::Compute50, + "compute_52" => NvvmArch::Compute52, + "compute_53" => NvvmArch::Compute53, + "compute_60" => NvvmArch::Compute60, + "compute_61" => NvvmArch::Compute61, + "compute_62" => NvvmArch::Compute62, + "compute_70" => NvvmArch::Compute70, + "compute_72" => NvvmArch::Compute72, + "compute_75" => NvvmArch::Compute75, + "compute_80" => NvvmArch::Compute80, + "compute_86" => NvvmArch::Compute86, + "compute_87" => NvvmArch::Compute87, + "compute_89" => NvvmArch::Compute89, + "compute_90" => NvvmArch::Compute90, + "compute_90a" => NvvmArch::Compute90a, + "compute_100" => NvvmArch::Compute100, + "compute_100f" => NvvmArch::Compute100f, + "compute_100a" => NvvmArch::Compute100a, + "compute_101" => NvvmArch::Compute101, + "compute_101f" => NvvmArch::Compute101f, + "compute_101a" => NvvmArch::Compute101a, + "compute_103" => NvvmArch::Compute103, + "compute_103f" => NvvmArch::Compute103f, + "compute_103a" => 
NvvmArch::Compute103a, + "compute_120" => NvvmArch::Compute120, + "compute_120f" => NvvmArch::Compute120f, + "compute_120a" => NvvmArch::Compute120a, + "compute_121" => NvvmArch::Compute121, + "compute_121f" => NvvmArch::Compute121f, + "compute_121a" => NvvmArch::Compute121a, + _ => return Err("unknown compile target"), + }) } } @@ -1116,8 +1163,8 @@ mod tests { err("blah", "unknown option: blah"); err("-aardvark", "unknown option: -aardvark"); err("-arch=compute75", "unknown -arch value: compute75"); - err("-arch=compute_10", "unknown -arch=compute_NN value: 10"); - err("-arch=compute_100x", "unknown -arch=compute_NN value: 100x"); + err("-arch=compute_10", "unknown -arch value: compute_10"); + err("-arch=compute_100x", "unknown -arch value: compute_100x"); err("-opt=3", "-opt=3 is the default"); err("-opt=99", "unknown -opt value: 99"); } diff --git a/crates/rustc_codegen_nvvm/src/nvvm.rs b/crates/rustc_codegen_nvvm/src/nvvm.rs index 2c1ae5b2..e165225f 100644 --- a/crates/rustc_codegen_nvvm/src/nvvm.rs +++ b/crates/rustc_codegen_nvvm/src/nvvm.rs @@ -44,7 +44,7 @@ impl Display for CodegenErr { } /// Take a list of bitcode module bytes and their names and codegen it -/// into ptx bytes. The final PTX *should* be utf8, but just to be on the safe side +/// into PTX bytes. The final PTX *should* be utf8, but just to be on the safe side /// it returns a vector of bytes. /// /// Note that this will implicitly try to find libdevice and add it, so don't do that @@ -57,15 +57,15 @@ pub fn codegen_bitcode_modules( ) -> Result, CodegenErr> { debug!("Codegenning bitcode to PTX"); - // make sure the nvvm version is high enough so users don't get confusing compilation errors. + // Make sure the nvvm version is high enough so users don't get confusing compilation errors. 
let (major, minor) = nvvm::ir_version(); - if major <= 1 && minor < 6 { + if (major, minor) < (2, 0) { sess.dcx() - .fatal("rustc_codegen_nvvm requires at least libnvvm 1.6 (CUDA 11.2)"); + .fatal("rustc_codegen_nvvm requires at least libnvvm 2.0 (CUDA 12.0)"); } - // first, create the nvvm program we will add modules to. + // First, create the nvvm program we will add modules to. let prog = NvvmProgram::new()?; let module = merge_llvm_modules(modules, llcx); diff --git a/guide/src/guide/getting_started.md b/guide/src/guide/getting_started.md index be30c946..e7a8f728 100644 --- a/guide/src/guide/getting_started.md +++ b/guide/src/guide/getting_started.md @@ -6,7 +6,9 @@ This section covers how to get started writing GPU crates with `cuda_std` and `c Before you can use the project to write GPU crates, you will need a couple of prerequisites: -- [The CUDA SDK](https://developer.nvidia.com/cuda-downloads), version 11.2 or later (and the appropriate driver - [see CUDA release notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html)). +- [The CUDA SDK](https://developer.nvidia.com/cuda-downloads), version 12.0 or later (and the + appropriate driver - [see CUDA release + notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html)). This is only for building GPU crates, to execute built PTX you only need CUDA `9+`.
diff --git a/tests/compiletests/Cargo.toml b/tests/compiletests/Cargo.toml index 48102ec7..989726d6 100644 --- a/tests/compiletests/Cargo.toml +++ b/tests/compiletests/Cargo.toml @@ -10,6 +10,7 @@ path = "src/main.rs" [dependencies] compiletest_rs = "0.11" clap = { version = "4.5", features = ["derive"] } +nvvm = { path = "../../crates/nvvm" } tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter"] } cuda_builder = { workspace = true } diff --git a/tests/compiletests/README.md b/tests/compiletests/README.md index 55f4bcd1..2f638816 100644 --- a/tests/compiletests/README.md +++ b/tests/compiletests/README.md @@ -22,7 +22,7 @@ cargo run --release ### Options - `--bless` - Update expected output files -- `--target-arch=compute_61,compute_70,compute_90` - Test multiple CUDA compute capabilities (comma-separated) +- `--target-arch=compute_61,compute_75,compute_90` - Test multiple CUDA compute capabilities (comma-separated) - Filter by test name: `cargo compiletest simple` - `RUST_LOG=info` - Enable progress logging - `RUST_LOG=debug` - Enable detailed debug logging diff --git a/tests/compiletests/src/main.rs b/tests/compiletests/src/main.rs index 927a8c06..e797f11b 100644 --- a/tests/compiletests/src/main.rs +++ b/tests/compiletests/src/main.rs @@ -1,4 +1,5 @@ use clap::Parser; +use nvvm::NvvmArch; use std::env; use std::io; use std::path::{Path, PathBuf}; @@ -13,8 +14,9 @@ struct Opt { /// The CUDA compute capability to target (e.g., compute_70, compute_80, compute_90). /// Can specify multiple architectures comma-separated. - #[arg(long, default_value = "compute_70", value_delimiter = ',')] - target_arch: Vec, + // WARNING: This should be kept in sync with the default on `CudaBuilder::arch`. + #[arg(long, default_values_t = [NvvmArch::default()], value_delimiter = ',')] + target_arch: Vec, /// Only run tests that match these filters. 
#[arg(name = "FILTER")] @@ -22,8 +24,8 @@ struct Opt { } impl Opt { - pub fn architectures(&self) -> impl Iterator { - self.target_arch.iter().map(|s| s.as_str()) + pub fn architectures(&self) -> impl Iterator + use<'_> { + self.target_arch.iter().copied() } } @@ -136,18 +138,18 @@ impl Runner { extra_flags: "", }]; - for (arch, variation) in self - .opt - .architectures() - .flat_map(|arch| VARIATIONS.iter().map(move |variation| (arch, variation))) - { + for (arch, variation) in self.opt.architectures().flat_map(|arch| { + VARIATIONS + .iter() + .map(move |variation| (arch.target_feature(), variation)) + }) { // HACK(eddyb) in order to allow *some* tests to have separate output // in different testing variations (i.e. experimental features), while // keeping *most* of the tests unchanged, we make use of "stage IDs", // which offer `// only-S` and `// ignore-S` for any stage ID `S`. let stage_id = if variation.name == "default" { // Use the architecture name as the stage ID. - arch.to_string() + arch.clone() } else { // Include the variation name in the stage ID. format!("{}-{}", arch, variation.name) @@ -159,7 +161,7 @@ impl Runner { &self.deps_target_dir, &self.codegen_backend_path, CUDA_TARGET, - arch, + &arch, ); let mut flags = test_rustc_flags( &self.codegen_backend_path, @@ -172,7 +174,7 @@ impl Runner { .deps_target_dir .join(DepKind::ProcMacro.target_dir_suffix(CUDA_TARGET)), ], - arch, + &arch, ); flags += variation.extra_flags;