# Workflow file for the run "ci: test gpu on self-hosted runners #33".

# Workflow name as shown in the GitHub Actions UI.
name: CI

# Run on every pull request, and on pushes to `master` only.
on:
  pull_request:
  push:
    branches:
      - master
# Cancel a run that is still in progress when a newer one is started for the
# same ref. The workflow name is part of the group key because concurrency
# groups are shared across the whole repository: without it, any other workflow
# using the bare ref as its group would cancel (or be cancelled by) this one.
# Based on https://stackoverflow.com/questions/58895283/stop-already-running-workflow-job-in-github-actions/67223051#67223051
concurrency:
  group: '${{ github.workflow }}-${{ github.ref }}'
  cancel-in-progress: true
# Environment variables set for every job in this workflow.
env:
  # Numeric-looking values are quoted: environment variables are strings.
  CARGO_INCREMENTAL: '0'
  RUST_BACKTRACE: '1'
  # Faster crates.io index checkout.
  CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
  RUST_LOG: debug
  # Build the kernel only for the single architecture. This should reduce the
  # overall compile-time significantly.
  EC_GPU_CUDA_NVCC_ARGS: --fatbin --gpu-architecture=sm_75 --generate-code=arch=compute_75,code=sm_75
  BELLMAN_CUDA_NVCC_ARGS: --fatbin --gpu-architecture=sm_75 --generate-code=arch=compute_75,code=sm_75
  NEPTUNE_CUDA_NVCC_ARGS: --fatbin --gpu-architecture=sm_75 --generate-code=arch=compute_75,code=sm_75
jobs:
  # Lint all targets of the workspace; any clippy warning fails the job.
  check_clippy:
    runs-on: ubuntu-24.04
    name: Clippy
    steps:
      - uses: actions/checkout@v4
      - name: Install required packages
        run: |
          sudo apt-get update
          sudo apt-get install --no-install-recommends --yes libhwloc-dev nvidia-cuda-toolkit ocl-icd-opencl-dev
      - name: Install cargo clippy
        run: rustup component add clippy
      - name: Run cargo clippy
        run: cargo clippy --all-targets --workspace -- -D warnings

  # Fail if any file is not formatted the way rustfmt would format it.
  check_fmt:
    runs-on: ubuntu-24.04
    name: Checking fmt
    steps:
      - uses: actions/checkout@v4
      - name: Install cargo fmt
        run: rustup component add rustfmt
      - name: Run cargo fmt
        run: cargo fmt --all -- --check

  # Run the test suite on a self-hosted runner carrying the `2xlarge+gpu` label.
  test:
    runs-on: ['self-hosted', 'linux', 'x64', '2xlarge+gpu']
    name: Test
    steps:
      - uses: actions/checkout@v4
      # TODO: Move the driver installation to the AMI.
      # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/install-nvidia-driver.html
      # https://www.nvidia.com/en-us/drivers/
      - name: Install CUDA drivers
        run: |
          # The driver version appears in the download URL, the package file
          # name and the local repository directory; keep it in one place.
          driver_version=570.148.08
          deb="nvidia-driver-local-repo-ubuntu2404-${driver_version}_1.0-1_amd64.deb"
          # --fail: abort on an HTTP error instead of saving the error page as
          # the package file.
          curl --fail --location --output "${deb}" "https://us.download.nvidia.com/tesla/${driver_version}/${deb}"
          sudo dpkg -i "${deb}"
          # The glob must stay outside the quotes so that the shell expands it.
          sudo cp "/var/nvidia-driver-local-repo-ubuntu2404-${driver_version}"/nvidia-driver-local-*-keyring.gpg /usr/share/keyrings/
          sudo apt-get update
          sudo apt-get install --no-install-recommends --yes cuda-drivers
          rm "${deb}"
      - name: Install required packages
        run: |
          sudo apt-get update
          sudo apt-get install --no-install-recommends --yes libhwloc-dev nvidia-cuda-toolkit ocl-icd-opencl-dev
      # TODO: Remove this and other Rust installation directives from jobs
      # running [the sentence is cut off in the original; presumably "on
      # self-hosted runners", once the runner image provides the toolchain -
      # confirm].
      - uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          # Quoted so that YAML keeps the version a string instead of a float.
          toolchain: '1.83'
      - name: Test
        run: cargo test --verbose

  # Run the test suite in the release profile on a GitHub-hosted macOS runner.
  test_macos:
    runs-on: macos-latest
    name: Test in release mode on MacOS
    steps:
      - uses: actions/checkout@v4
      - name: Install required packages
        run: HOMEBREW_NO_AUTO_UPDATE=1 brew install hwloc
      - name: Run usual tests in release profile
        # CUDA isn't supported on macOS, hence the default features are
        # disabled.
        # NOTE(review): the original comment said "only enable OpenCL", but no
        # `--features` flag is passed here - confirm that OpenCL is really
        # enabled in this configuration.
        run: cargo test --verbose --release --no-default-features -- --nocapture