diff --git a/.github/workflows/ci_linux.yml b/.github/workflows/ci_linux.yml index e60d2f3f..92b1f70b 100644 --- a/.github/workflows/ci_linux.yml +++ b/.github/workflows/ci_linux.yml @@ -28,9 +28,6 @@ jobs: fail-fast: false matrix: variance: - # - name: Ubuntu-22.04/CUDA-11.8.0 - # image: "ghcr.io/rust-gpu/rust-cuda-ubuntu22-cuda11:latest" - # runner: ubuntu-latest - name: Ubuntu-22.04 / CUDA-12.8.1 / x86_64 image: "ghcr.io/rust-gpu/rust-cuda-ubuntu22-cuda12:latest" runner: ubuntu-latest @@ -235,9 +232,6 @@ jobs: matrix: variance: # Must match the build job's matrix definition - # - name: Ubuntu-22.04 / CUDA-11.8.0 - # image: "ghcr.io/rust-gpu/rust-cuda-ubuntu22-cuda11:latest" - # runner: ubuntu-latest - name: Ubuntu-22.04 / CUDA-12.8.1 / x86_64 image: "ghcr.io/rust-gpu/rust-cuda-ubuntu22-cuda12:latest" runner: ubuntu-latest @@ -289,4 +283,4 @@ jobs: shell: bash run: shopt -s globstar && rustfmt --check tests/compiletests/ui/**/*.rs - name: Compiletest - run: cargo run -p compiletests --release --no-default-features -- --target-arch compute_61,compute_70,compute_90 + run: cargo run -p compiletests --release --no-default-features -- --target-arch compute_61,compute_75,compute_90 diff --git a/.github/workflows/ci_windows.yml b/.github/workflows/ci_windows.yml index a5910cf4..cb7e7167 100644 --- a/.github/workflows/ci_windows.yml +++ b/.github/workflows/ci_windows.yml @@ -105,6 +105,6 @@ jobs: RUSTDOCFLAGS: -Dwarnings run: cargo doc --workspace --all-features --document-private-items --no-deps --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "vecadd*" --exclude "gemm*" --exclude "ex0*" --exclude "cudnn*" --exclude "sha2*" --exclude "cust_raw" - # Disabled due to dll issues, someone with Windows knowledge needed + # Disabled due to dll issues, someone with Windows knowledge needed # - name: Compiletest - # run: cargo run -p compiletests --release --no-default-features -- --target-arch compute_61,compute_70,compute_90 + # run: cargo run -p 
compiletests --release --no-default-features -- --target-arch compute_61,compute_75,compute_90 diff --git a/.github/workflows/container_images.yml b/.github/workflows/container_images.yml index eb66d98a..e3fcd8af 100644 --- a/.github/workflows/container_images.yml +++ b/.github/workflows/container_images.yml @@ -30,9 +30,6 @@ jobs: - runner: ubuntu-24.04-arm arch: arm64 variance: - - name: Ubuntu-22.04/CUDA-11.8.0 - image: "rust-gpu/rust-cuda-ubuntu22-cuda11" - dockerfile: ./container/ubuntu22-cuda11/Dockerfile - name: Ubuntu-22.04/CUDA-12.8.1 image: "rust-gpu/rust-cuda-ubuntu22-cuda12" dockerfile: ./container/ubuntu22-cuda12/Dockerfile @@ -157,8 +154,6 @@ jobs: fail-fast: false matrix: variance: - - name: Ubuntu-22.04/CUDA-11.8.0 - image: "rust-gpu/rust-cuda-ubuntu22-cuda11" - name: Ubuntu-22.04/CUDA-12.8.1 image: "rust-gpu/rust-cuda-ubuntu22-cuda12" - name: Ubuntu-24.04/CUDA-12.8.1 diff --git a/Cargo.lock b/Cargo.lock index 1af88d55..f21cd1d4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -599,6 +599,7 @@ dependencies = [ "clap 4.5.45", "compiletest_rs", "cuda_builder", + "nvvm", "tracing", "tracing-subscriber", ] diff --git a/container/ubuntu22-cuda11/Dockerfile b/container/ubuntu22-cuda11/Dockerfile deleted file mode 100644 index 6f4996ec..00000000 --- a/container/ubuntu22-cuda11/Dockerfile +++ /dev/null @@ -1,89 +0,0 @@ -FROM nvcr.io/nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 AS llvm-builder - -RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -qq -y install \ - build-essential \ - clang \ - curl \ - libffi-dev \ - libedit-dev \ - libncurses5-dev \ - libssl-dev \ - libtinfo-dev \ - libxml2-dev \ - cmake \ - ninja-build \ - pkg-config \ - python3 \ - xz-utils \ - zlib1g-dev && \ - rm -rf /var/lib/apt/lists/* - -WORKDIR /data/llvm7 - -# Download and build LLVM 7.1.0 for all architectures. 
-RUN curl -sSf -L -O https://github.com/llvm/llvm-project/releases/download/llvmorg-7.1.0/llvm-7.1.0.src.tar.xz && \ - tar -xf llvm-7.1.0.src.tar.xz && \ - cd llvm-7.1.0.src && \ - mkdir build && cd build && \ - ARCH=$(dpkg --print-architecture) && \ - if [ "$ARCH" = "amd64" ]; then \ - TARGETS="X86;NVPTX"; \ - else \ - TARGETS="AArch64;NVPTX"; \ - fi && \ - cmake -G Ninja \ - -DCMAKE_BUILD_TYPE=Release \ - -DLLVM_TARGETS_TO_BUILD="$TARGETS" \ - -DLLVM_BUILD_LLVM_DYLIB=ON \ - -DLLVM_LINK_LLVM_DYLIB=ON \ - -DLLVM_ENABLE_ASSERTIONS=OFF \ - -DLLVM_ENABLE_BINDINGS=OFF \ - -DLLVM_INCLUDE_EXAMPLES=OFF \ - -DLLVM_INCLUDE_TESTS=OFF \ - -DLLVM_INCLUDE_BENCHMARKS=OFF \ - -DLLVM_ENABLE_ZLIB=ON \ - -DLLVM_ENABLE_TERMINFO=ON \ - -DCMAKE_INSTALL_PREFIX=/opt/llvm-7 \ - .. && \ - ninja -j$(nproc) && \ - ninja install && \ - cd ../.. && \ - rm -rf llvm-7.1.0.src* - -FROM nvcr.io/nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 - -RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -qq -y install \ - build-essential \ - clang \ - curl \ - libssl-dev \ - libtinfo-dev \ - pkg-config \ - xz-utils \ - zlib1g-dev \ - cmake \ - libfontconfig-dev \ - libx11-xcb-dev \ - libxcursor-dev \ - libxi-dev \ - libxinerama-dev \ - libxrandr-dev && \ - rm -rf /var/lib/apt/lists/* - -COPY --from=llvm-builder /opt/llvm-7 /opt/llvm-7 -RUN ln -s /opt/llvm-7/bin/llvm-config /usr/bin/llvm-config && \ - ln -s /opt/llvm-7/bin/llvm-config /usr/bin/llvm-config-7 - -# Get Rust (install rustup; toolchain installed from rust-toolchain.toml below) -RUN curl -sSf -L https://sh.rustup.rs | bash -s -- -y --profile minimal --default-toolchain none -ENV PATH="/root/.cargo/bin:${PATH}" - -# Setup the workspace -WORKDIR /data/rust-cuda -RUN --mount=type=bind,source=rust-toolchain.toml,target=/data/rust-cuda/rust-toolchain.toml \ - rustup show - -# Add nvvm to LD_LIBRARY_PATH. 
-ENV LD_LIBRARY_PATH="/usr/local/cuda/nvvm/lib64:${LD_LIBRARY_PATH}" -ENV LLVM_LINK_STATIC=1 -ENV RUST_LOG=info diff --git a/crates/cuda_builder/src/lib.rs b/crates/cuda_builder/src/lib.rs index 47df458d..f187ede2 100644 --- a/crates/cuda_builder/src/lib.rs +++ b/crates/cuda_builder/src/lib.rs @@ -91,10 +91,7 @@ pub struct CudaBuilder { /// will not work on older capabilities. It means that if it uses certain features /// it may not work. /// - /// This currently defaults to `6.1`. Which corresponds to Pascal, GPUs such as the - /// GTX 1030, GTX 1050, GTX 1080, Tesla P40, etc. We default to this because Maxwell - /// (5.x) will be deprecated in CUDA 12 and we anticipate for that. Moreover, `6.x` - /// contains support for things like f64 atomic add and half precision float ops. + /// This defaults to the default value of `NvvmArch`. /// /// Starting with CUDA 12.9, architectures can have suffixes: /// @@ -207,7 +204,7 @@ impl CudaBuilder { ptx_file_copy_path: None, generate_line_info: true, nvvm_opts: true, - arch: NvvmArch::Compute61, + arch: NvvmArch::default(), ftz: false, fast_sqrt: false, fast_div: false, @@ -257,23 +254,6 @@ impl CudaBuilder { self } - /// The virtual compute architecture to target for PTX generation. This - /// dictates how certain things are codegenned and may affect performance - /// and/or which gpus the code can run on. - /// - /// You should generally try to pick an arch that will work with most - /// GPUs you want your program to work with. - /// - /// If you are unsure, either leave this option to default, or pick something around 5.2 to 7.x. - /// - /// You can find a list of features supported on each arch and a list of GPUs for every - /// arch [`here`](https://en.wikipedia.org/wiki/CUDA#Version_features_and_specifications). - /// - /// NOTE that this does not necessarily mean that code using a certain capability - /// will not work on older capabilities. It means that if it uses certain - /// features it may not work. 
- /// - /// The chosen architecture enables target features for conditional compilation. /// See the documentation on the `arch` field for more details. pub fn arch(mut self, arch: NvvmArch) -> Self { self.arch = arch; diff --git a/crates/nvvm/src/lib.rs b/crates/nvvm/src/lib.rs index 0ff7a017..d3f34c4e 100644 --- a/crates/nvvm/src/lib.rs +++ b/crates/nvvm/src/lib.rs @@ -243,54 +243,59 @@ impl FromStr for NvvmOption { } _ if s.starts_with("-arch=") => { let slice = &s[6..]; - if !slice.starts_with("compute_") { - return Err(format!("unknown -arch value: {slice}")); + match NvvmArch::from_str(slice) { + Ok(arch) => Self::Arch(arch), + Err(_) => return Err(format!("unknown -arch value: {slice}")), } - let arch_num = &slice[8..]; - let arch = match arch_num { - "35" => NvvmArch::Compute35, - "37" => NvvmArch::Compute37, - "50" => NvvmArch::Compute50, - "52" => NvvmArch::Compute52, - "53" => NvvmArch::Compute53, - "60" => NvvmArch::Compute60, - "61" => NvvmArch::Compute61, - "62" => NvvmArch::Compute62, - "70" => NvvmArch::Compute70, - "72" => NvvmArch::Compute72, - "75" => NvvmArch::Compute75, - "80" => NvvmArch::Compute80, - "86" => NvvmArch::Compute86, - "87" => NvvmArch::Compute87, - "89" => NvvmArch::Compute89, - "90" => NvvmArch::Compute90, - "90a" => NvvmArch::Compute90a, - "100" => NvvmArch::Compute100, - "100f" => NvvmArch::Compute100f, - "100a" => NvvmArch::Compute100a, - "101" => NvvmArch::Compute101, - "101f" => NvvmArch::Compute101f, - "101a" => NvvmArch::Compute101a, - "103" => NvvmArch::Compute103, - "103f" => NvvmArch::Compute103f, - "103a" => NvvmArch::Compute103a, - "120" => NvvmArch::Compute120, - "120f" => NvvmArch::Compute120f, - "120a" => NvvmArch::Compute120a, - "121" => NvvmArch::Compute121, - "121f" => NvvmArch::Compute121f, - "121a" => NvvmArch::Compute121a, - _ => return Err(format!("unknown -arch=compute_NN value: {arch_num}")), - }; - Self::Arch(arch) } _ => return Err(format!("unknown option: {s}")), }) } } -/// Nvvm architecture, default 
is `Compute52` -#[derive(Debug, Clone, Copy, PartialEq, Eq, strum::EnumIter)] +/// Nvvm architecture. +/// +/// The following table indicates which `compute_*` values are supported by which CUDA versions. +/// +/// ```text +/// ----------------------------------------------------------------------------- +/// | Supported `compute_*` values (written vertically) +/// ----------------------------------------------------------------------------- +/// CUDA | 1 1 1 1 1 1 +/// Toolkit | 5 5 5 6 6 6 7 7 7 7 8 8 8 8 8 9 0 0 0 1 2 2 +/// version | 0 2 3 0 1 2 0 2 3 5 0 6 7 8 9 0 0 1 3 0 0 1 +/// ----------------------------------------------------------------------------- +/// 12.[01].0 | b b b b b b b b b b b b - - b b - - - - - - +/// 12.2.0 | b b b b b b b b b b b b - - b a - - - - - - +/// 12.[3456].0 | b b b b b b b b b b b b b - b a - - - - - - +/// 12.8.0 | b b b b b b b b b b b b b - b a a a - - a - +/// 12.9.0 | b b b b b b b b - b b b b - b a f f f - f f +/// 13.0.0 | - - - - - - - - - b b b b b b a f - f f f f +/// ----------------------------------------------------------------------------- +/// Legend: +/// - 'b': baseline features only +/// - 'a': baseline + architecture-specific features +/// - 'f': baseline + architecture-specific + family-specific features +/// +/// Note: there was no 12.7 release. +/// ``` +/// +/// For example, CUDA 12.9.0 supports `compute_89`, `compute_90{,a}`, `compute_100{,a,f}`. +/// +/// This information is from "PTX Compiler APIs" documents under +/// , e.g. +/// . (Adjust the version +/// in that URL as necessary.) Specifically, the `compute-*` values allowed with the `--gpu-name` +/// option. +/// +/// # Example +/// +/// ``` +/// // The default value is `NvvmArch::Compute75`. 
+/// # use nvvm::NvvmArch; +/// assert_eq!(NvvmArch::default(), NvvmArch::Compute75); +/// ``` +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, strum::EnumIter)] pub enum NvvmArch { Compute35, Compute37, @@ -302,6 +307,12 @@ pub enum NvvmArch { Compute62, Compute70, Compute72, + /// This default value of 7.5 corresponds to Turing and later devices. We default to this + /// because it is the minimum supported by CUDA 13.0 while being in the middle of the range + /// supported by CUDA 12.x. + // WARNING: If you change the default, consider updating the `--target-arch` values used for + // compiletests in `ci_linux.yml` and `.github/workflows/ci_{linux,windows}.yml`. + #[default] Compute75, Compute80, Compute86, @@ -340,9 +351,45 @@ impl Display for NvvmArch { } } -impl Default for NvvmArch { - fn default() -> Self { - Self::Compute52 +impl FromStr for NvvmArch { + type Err = &'static str; + + fn from_str(s: &str) -> Result { + Ok(match s { + "compute_35" => NvvmArch::Compute35, + "compute_37" => NvvmArch::Compute37, + "compute_50" => NvvmArch::Compute50, + "compute_52" => NvvmArch::Compute52, + "compute_53" => NvvmArch::Compute53, + "compute_60" => NvvmArch::Compute60, + "compute_61" => NvvmArch::Compute61, + "compute_62" => NvvmArch::Compute62, + "compute_70" => NvvmArch::Compute70, + "compute_72" => NvvmArch::Compute72, + "compute_75" => NvvmArch::Compute75, + "compute_80" => NvvmArch::Compute80, + "compute_86" => NvvmArch::Compute86, + "compute_87" => NvvmArch::Compute87, + "compute_89" => NvvmArch::Compute89, + "compute_90" => NvvmArch::Compute90, + "compute_90a" => NvvmArch::Compute90a, + "compute_100" => NvvmArch::Compute100, + "compute_100f" => NvvmArch::Compute100f, + "compute_100a" => NvvmArch::Compute100a, + "compute_101" => NvvmArch::Compute101, + "compute_101f" => NvvmArch::Compute101f, + "compute_101a" => NvvmArch::Compute101a, + "compute_103" => NvvmArch::Compute103, + "compute_103f" => NvvmArch::Compute103f, + "compute_103a" => 
NvvmArch::Compute103a, + "compute_120" => NvvmArch::Compute120, + "compute_120f" => NvvmArch::Compute120f, + "compute_120a" => NvvmArch::Compute120a, + "compute_121" => NvvmArch::Compute121, + "compute_121f" => NvvmArch::Compute121f, + "compute_121a" => NvvmArch::Compute121a, + _ => return Err("unknown compile target"), + }) } } @@ -1116,8 +1163,8 @@ mod tests { err("blah", "unknown option: blah"); err("-aardvark", "unknown option: -aardvark"); err("-arch=compute75", "unknown -arch value: compute75"); - err("-arch=compute_10", "unknown -arch=compute_NN value: 10"); - err("-arch=compute_100x", "unknown -arch=compute_NN value: 100x"); + err("-arch=compute_10", "unknown -arch value: compute_10"); + err("-arch=compute_100x", "unknown -arch value: compute_100x"); err("-opt=3", "-opt=3 is the default"); err("-opt=99", "unknown -opt value: 99"); } diff --git a/crates/rustc_codegen_nvvm/src/nvvm.rs b/crates/rustc_codegen_nvvm/src/nvvm.rs index 2c1ae5b2..e165225f 100644 --- a/crates/rustc_codegen_nvvm/src/nvvm.rs +++ b/crates/rustc_codegen_nvvm/src/nvvm.rs @@ -44,7 +44,7 @@ impl Display for CodegenErr { } /// Take a list of bitcode module bytes and their names and codegen it -/// into ptx bytes. The final PTX *should* be utf8, but just to be on the safe side +/// into PTX bytes. The final PTX *should* be utf8, but just to be on the safe side /// it returns a vector of bytes. /// /// Note that this will implicitly try to find libdevice and add it, so don't do that @@ -57,15 +57,15 @@ pub fn codegen_bitcode_modules( ) -> Result, CodegenErr> { debug!("Codegenning bitcode to PTX"); - // make sure the nvvm version is high enough so users don't get confusing compilation errors. + // Make sure the nvvm version is high enough so users don't get confusing compilation errors. 
let (major, minor) = nvvm::ir_version(); - if major <= 1 && minor < 6 { + if (major, minor) < (2, 0) { sess.dcx() - .fatal("rustc_codegen_nvvm requires at least libnvvm 1.6 (CUDA 11.2)"); + .fatal("rustc_codegen_nvvm requires at least libnvvm 2.0 (CUDA 12.0)"); } - // first, create the nvvm program we will add modules to. + // First, create the nvvm program we will add modules to. let prog = NvvmProgram::new()?; let module = merge_llvm_modules(modules, llcx); diff --git a/guide/src/guide/getting_started.md b/guide/src/guide/getting_started.md index be30c946..e7a8f728 100644 --- a/guide/src/guide/getting_started.md +++ b/guide/src/guide/getting_started.md @@ -6,7 +6,9 @@ This section covers how to get started writing GPU crates with `cuda_std` and `c Before you can use the project to write GPU crates, you will need a couple of prerequisites: -- [The CUDA SDK](https://developer.nvidia.com/cuda-downloads), version 11.2 or later (and the appropriate driver - [see CUDA release notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html)). +- [The CUDA SDK](https://developer.nvidia.com/cuda-downloads), version 12.0 or later (and the + appropriate driver - [see CUDA release + notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html)). This is only for building GPU crates, to execute built PTX you only need CUDA `9+`.
diff --git a/tests/compiletests/Cargo.toml b/tests/compiletests/Cargo.toml index 48102ec7..989726d6 100644 --- a/tests/compiletests/Cargo.toml +++ b/tests/compiletests/Cargo.toml @@ -10,6 +10,7 @@ path = "src/main.rs" [dependencies] compiletest_rs = "0.11" clap = { version = "4.5", features = ["derive"] } +nvvm = { path = "../../crates/nvvm" } tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter"] } cuda_builder = { workspace = true } diff --git a/tests/compiletests/README.md b/tests/compiletests/README.md index 55f4bcd1..2f638816 100644 --- a/tests/compiletests/README.md +++ b/tests/compiletests/README.md @@ -22,7 +22,7 @@ cargo run --release ### Options - `--bless` - Update expected output files -- `--target-arch=compute_61,compute_70,compute_90` - Test multiple CUDA compute capabilities (comma-separated) +- `--target-arch=compute_61,compute_75,compute_90` - Test multiple CUDA compute capabilities (comma-separated) - Filter by test name: `cargo compiletest simple` - `RUST_LOG=info` - Enable progress logging - `RUST_LOG=debug` - Enable detailed debug logging diff --git a/tests/compiletests/src/main.rs b/tests/compiletests/src/main.rs index 927a8c06..e797f11b 100644 --- a/tests/compiletests/src/main.rs +++ b/tests/compiletests/src/main.rs @@ -1,4 +1,5 @@ use clap::Parser; +use nvvm::NvvmArch; use std::env; use std::io; use std::path::{Path, PathBuf}; @@ -13,8 +14,9 @@ struct Opt { /// The CUDA compute capability to target (e.g., compute_70, compute_80, compute_90). /// Can specify multiple architectures comma-separated. - #[arg(long, default_value = "compute_70", value_delimiter = ',')] - target_arch: Vec, + // WARNING: This should be kept in sync with the default on `CudaBuilder::arch`. + #[arg(long, default_values_t = [NvvmArch::default()], value_delimiter = ',')] + target_arch: Vec, /// Only run tests that match these filters. 
#[arg(name = "FILTER")] @@ -22,8 +24,8 @@ struct Opt { } impl Opt { - pub fn architectures(&self) -> impl Iterator { - self.target_arch.iter().map(|s| s.as_str()) + pub fn architectures(&self) -> impl Iterator + use<'_> { + self.target_arch.iter().copied() } } @@ -136,18 +138,18 @@ impl Runner { extra_flags: "", }]; - for (arch, variation) in self - .opt - .architectures() - .flat_map(|arch| VARIATIONS.iter().map(move |variation| (arch, variation))) - { + for (arch, variation) in self.opt.architectures().flat_map(|arch| { + VARIATIONS + .iter() + .map(move |variation| (arch.target_feature(), variation)) + }) { // HACK(eddyb) in order to allow *some* tests to have separate output // in different testing variations (i.e. experimental features), while // keeping *most* of the tests unchanged, we make use of "stage IDs", // which offer `// only-S` and `// ignore-S` for any stage ID `S`. let stage_id = if variation.name == "default" { // Use the architecture name as the stage ID. - arch.to_string() + arch.clone() } else { // Include the variation name in the stage ID. format!("{}-{}", arch, variation.name) @@ -159,7 +161,7 @@ impl Runner { &self.deps_target_dir, &self.codegen_backend_path, CUDA_TARGET, - arch, + &arch, ); let mut flags = test_rustc_flags( &self.codegen_backend_path, @@ -172,7 +174,7 @@ impl Runner { .deps_target_dir .join(DepKind::ProcMacro.target_dir_suffix(CUDA_TARGET)), ], - arch, + &arch, ); flags += variation.extra_flags;