Skip to content

Commit aac9023

Browse files
authored
ci: test gpu on self-hosted runners
1 parent b06f9fb commit aac9023

File tree

1 file changed

+29
-8
lines changed

1 file changed

+29
-8
lines changed

.github/workflows/ci.yml

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
11
name: CI
22

3-
on: [pull_request, push]
3+
on:
4+
pull_request:
5+
push:
6+
branches:
7+
- master
48

5-
# Cancel a job if there's a new on on the same branch started.
9+
# Cancel a running job when a new one is started on the same branch.
610
# Based on https://stackoverflow.com/questions/58895283/stop-already-running-workflow-job-in-github-actions/67223051#67223051
711
concurrency:
812
group: ${{ github.ref }}
@@ -14,8 +18,7 @@ env:
1418
# Faster crates.io index checkout.
1519
CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
1620
RUST_LOG: debug
17-
# Build the kernel only for the single architecture . This should reduce
18-
# the overall compile-time significantly.
21+
# Build the kernel for a single GPU architecture only. This should reduce the overall compile time significantly.
1922
EC_GPU_CUDA_NVCC_ARGS: --fatbin --gpu-architecture=sm_75 --generate-code=arch=compute_75,code=sm_75
2023
BELLMAN_CUDA_NVCC_ARGS: --fatbin --gpu-architecture=sm_75 --generate-code=arch=compute_75,code=sm_75
2124
NEPTUNE_CUDA_NVCC_ARGS: --fatbin --gpu-architecture=sm_75 --generate-code=arch=compute_75,code=sm_75
@@ -27,7 +30,9 @@ jobs:
2730
steps:
2831
- uses: actions/checkout@v4
2932
- name: Install required packages
30-
run: sudo apt install --no-install-recommends --yes libhwloc-dev nvidia-cuda-toolkit ocl-icd-opencl-dev
33+
run: |
34+
sudo apt-get update
35+
sudo apt-get install --no-install-recommends --yes libhwloc-dev ocl-icd-opencl-dev
3136
- name: Install cargo clippy
3237
run: rustup component add clippy
3338
- name: Run cargo clippy
@@ -44,13 +49,29 @@ jobs:
4449
run: cargo fmt --all -- --check
4550

4651
test:
47-
runs-on: ubuntu-24.04
52+
runs-on: ['self-hosted', 'linux', 'x64', '2xlarge+gpu']
4853
name: Test
4954
steps:
5055
- uses: actions/checkout@v4
56+
# TODO: Move the driver installation to the AMI.
57+
# https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/install-nvidia-driver.html
58+
# https://www.nvidia.com/en-us/drivers/
59+
- name: Install CUDA drivers
60+
run: |
61+
curl -L -o nvidia-driver-local-repo-ubuntu2404-570.148.08_1.0-1_amd64.deb https://us.download.nvidia.com/tesla/570.148.08/nvidia-driver-local-repo-ubuntu2404-570.148.08_1.0-1_amd64.deb
62+
sudo dpkg -i nvidia-driver-local-repo-ubuntu2404-570.148.08_1.0-1_amd64.deb
63+
sudo cp /var/nvidia-driver-local-repo-ubuntu2404-570.148.08/nvidia-driver-local-*-keyring.gpg /usr/share/keyrings/
64+
sudo apt-get update
65+
sudo apt-get install --no-install-recommends --yes cuda-drivers
66+
rm nvidia-driver-local-repo-ubuntu2404-570.148.08_1.0-1_amd64.deb
5167
- name: Install required packages
52-
run: sudo apt install --no-install-recommends --yes libhwloc-dev nvidia-cuda-toolkit ocl-icd-opencl-dev
53-
# In case no GPUs are available, it's using the CPU fallback.
68+
run: |
69+
sudo apt-get update
70+
sudo apt-get install --no-install-recommends --yes libhwloc-dev nvidia-cuda-toolkit ocl-icd-opencl-dev
71+
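# TODO: Remove this and other Rust installation directives from jobs running on self-hosted runners (presumably once the toolchain is preinstalled on the runner image; confirm).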
72+
- uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
73+
with:
74+
toolchain: 1.83
5475
- name: Test
5576
run: cargo test --verbose
5677

0 commit comments

Comments
 (0)