From 4b0b81305d1d2416f03ed87a24b3f15308153d73 Mon Sep 17 00:00:00 2001 From: Matt Crees Date: Wed, 16 Apr 2025 10:05:37 +0100 Subject: [PATCH 1/3] Correct cuda_samples_path for secure nfs-homedirs The ``rocky`` user's homedir has moved from ``/home/rocky`` to ``/var/lib/rocky``. This updates ``cuda_samples_path`` to use that. --- ansible/roles/cuda/defaults/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/cuda/defaults/main.yml b/ansible/roles/cuda/defaults/main.yml index 31cfe23d5..a401973a5 100644 --- a/ansible/roles/cuda/defaults/main.yml +++ b/ansible/roles/cuda/defaults/main.yml @@ -7,7 +7,7 @@ cuda_packages: # _cuda_version_tuple: # discovered from installed package e.g. ('12', '1', '0') cuda_version_short: "{{ _cuda_version_tuple[0] }}.{{ _cuda_version_tuple[1] }}" cuda_samples_release_url: "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v{{ cuda_version_short }}.tar.gz" -cuda_samples_path: "/home/{{ ansible_user }}/cuda_samples" +cuda_samples_path: "/var/lib/{{ ansible_user }}/cuda_samples" cuda_samples_programs: - deviceQuery - bandwidthTest From f80f0044d473de83c23ac66b1265a19a403911ea Mon Sep 17 00:00:00 2001 From: Matt Crees Date: Wed, 16 Apr 2025 10:07:58 +0100 Subject: [PATCH 2/3] Hardcode cuda_version_short and remove lookup We already hardcode the version of cuda which we install. This ensures that we will also use the requested version when multiple cuda versions are installed. Moves ``cuda_version_short`` to be next to ``cuda_package_version`` so it's harder to miss updating one when the other is changed. 
--- ansible/roles/cuda/defaults/main.yml | 3 +-- ansible/roles/cuda/tasks/samples.yml | 9 --------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/ansible/roles/cuda/defaults/main.yml b/ansible/roles/cuda/defaults/main.yml index a401973a5..d7df77821 100644 --- a/ansible/roles/cuda/defaults/main.yml +++ b/ansible/roles/cuda/defaults/main.yml @@ -1,11 +1,10 @@ cuda_repo_url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}/cuda-rhel{{ ansible_distribution_major_version }}.repo" cuda_nvidia_driver_stream: '570-open' cuda_package_version: '12.8.1-1' +cuda_version_short: '12.8' cuda_packages: - "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}" - nvidia-gds -# _cuda_version_tuple: # discovered from installed package e.g. ('12', '1', '0') -cuda_version_short: "{{ _cuda_version_tuple[0] }}.{{ _cuda_version_tuple[1] }}" cuda_samples_release_url: "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v{{ cuda_version_short }}.tar.gz" cuda_samples_path: "/var/lib/{{ ansible_user }}/cuda_samples" cuda_samples_programs: diff --git a/ansible/roles/cuda/tasks/samples.yml b/ansible/roles/cuda/tasks/samples.yml index 679ce5644..38ce3339d 100644 --- a/ansible/roles/cuda/tasks/samples.yml +++ b/ansible/roles/cuda/tasks/samples.yml @@ -1,12 +1,3 @@ -- name: Read CUDA version file - slurp: - src: /usr/local/cuda/version.json - register: _cuda_samples_version - -- name: Set fact for discovered CUDA version - set_fact: - _cuda_version_tuple: "{{ (_cuda_samples_version.content | b64decode | from_json).cuda.version | split('.') }}" # e.g. 
'12.1.0' - - name: Ensure cuda_samples_path exists file: state: directory From 9d6985427ab6ed2ec1b505b26fc732f971df7e5e Mon Sep 17 00:00:00 2001 From: Matt Crees Date: Wed, 16 Apr 2025 10:12:18 +0100 Subject: [PATCH 3/3] Install cmake and cuda-toolkit --- ansible/roles/cuda/README.md | 2 +- ansible/roles/cuda/defaults/main.yml | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/ansible/roles/cuda/README.md b/ansible/roles/cuda/README.md index be6439cd5..1e74d07f3 100644 --- a/ansible/roles/cuda/README.md +++ b/ansible/roles/cuda/README.md @@ -10,6 +10,6 @@ Requires OFED to be installed to provide required kernel-* packages. - `cuda_repo_url`: Optional. URL of `.repo` file. Default is upstream for appropriate OS/architecture. - `cuda_nvidia_driver_stream`: Optional. Version of `nvidia-driver` stream to enable. This controls whether the open or proprietary drivers are installed and the major version. Changing this once the drivers are installed does not change the version. -- `cuda_packages`: Optional. Default: `['cuda', 'nvidia-gds']`. +- `cuda_packages`: Optional. Default: `['cuda', 'nvidia-gds', 'cmake', 'cuda-toolkit-12-8']`. - `cuda_package_version`: Optional. Default `latest` which will install the latest packages if not installed but won't upgrade already-installed packages. Use `'none'` to skip installing CUDA. - `cuda_persistenced_state`: Optional. State of systemd `nvidia-persistenced` service. Values as [ansible.builtin.systemd:state](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/systemd_module.html#parameter-state). Default `started`. 
diff --git a/ansible/roles/cuda/defaults/main.yml b/ansible/roles/cuda/defaults/main.yml
index d7df77821..fd4bf37c8 100644
--- a/ansible/roles/cuda/defaults/main.yml
+++ b/ansible/roles/cuda/defaults/main.yml
@@ -5,6 +5,8 @@ cuda_version_short: '12.8'
 cuda_packages:
   - "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}"
   - nvidia-gds
+  - cmake  # build tool installed alongside the CUDA packages
+  - "cuda-toolkit-{{ cuda_version_short | replace('.', '-') }}"  # derived from cuda_version_short, e.g. cuda-toolkit-12-8
 cuda_samples_release_url: "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v{{ cuda_version_short }}.tar.gz"
 cuda_samples_path: "/var/lib/{{ ansible_user }}/cuda_samples"
 cuda_samples_programs: