From c52fd2d593bddf711898f5560d130ad932f78d10 Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Wed, 28 May 2025 12:57:51 +0200 Subject: [PATCH] Bump CUDA to 12.9 and NVIDIA driver to 575 The bandwidthTest utility was removed from CUDA Samples v12.9 [1]. Remove it from the samples playbook for now, together with the deviceQuery run and the cuda_devices fact that supplied its device list; bandwidthTest can be replaced by nvbandwidth [2] later. [1] https://github.com/NVIDIA/cuda-samples/releases/tag/v12.9 [2] https://github.com/NVIDIA/nvbandwidth --- ansible/roles/cuda/README.md | 2 +- ansible/roles/cuda/defaults/main.yml | 8 +++---- ansible/roles/cuda/tasks/samples.yml | 33 ---------------------------- 3 files changed, 5 insertions(+), 38 deletions(-) diff --git a/ansible/roles/cuda/README.md b/ansible/roles/cuda/README.md index 1e74d07f3..c968532b5 100644 --- a/ansible/roles/cuda/README.md +++ b/ansible/roles/cuda/README.md @@ -10,6 +10,6 @@ Requires OFED to be installed to provide required kernel-* packages. - `cuda_repo_url`: Optional. URL of `.repo` file. Default is upstream for appropriate OS/architecture. - `cuda_nvidia_driver_stream`: Optional. Version of `nvidia-driver` stream to enable. This controls whether the open or proprietary drivers are installed and the major version. Changing this once the drivers are installed does not change the version. -- `cuda_packages`: Optional. Default: `['cuda', 'nvidia-gds', 'cmake', 'cuda-toolkit-12-8']`. +- `cuda_packages`: Optional. Default: `['cuda', 'nvidia-gds', 'cmake', 'cuda-toolkit-12-9']`. - `cuda_package_version`: Optional. Default `latest` which will install the latest packages if not installed but won't upgrade already-installed packages. Use `'none'` to skip installing CUDA. - `cuda_persistenced_state`: Optional. State of systemd `nvidia-persistenced` service. Values as [ansible.builtin.systemd:state](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/systemd_module.html#parameter-state). Default `started`. 
diff --git a/ansible/roles/cuda/defaults/main.yml b/ansible/roles/cuda/defaults/main.yml index fd4bf37c8..6a67b1f12 100644 --- a/ansible/roles/cuda/defaults/main.yml +++ b/ansible/roles/cuda/defaults/main.yml @@ -1,12 +1,12 @@ cuda_repo_url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}/cuda-rhel{{ ansible_distribution_major_version }}.repo" -cuda_nvidia_driver_stream: '570-open' -cuda_package_version: '12.8.1-1' -cuda_version_short: '12.8' +cuda_nvidia_driver_stream: '575-open' +cuda_package_version: '12.9.0-1' +cuda_version_short: '12.9' cuda_packages: - "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}" - nvidia-gds - cmake - - cuda-toolkit-12-8 + - cuda-toolkit-12-9 cuda_samples_release_url: "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v{{ cuda_version_short }}.tar.gz" cuda_samples_path: "/var/lib/{{ ansible_user }}/cuda_samples" cuda_samples_programs: diff --git a/ansible/roles/cuda/tasks/samples.yml b/ansible/roles/cuda/tasks/samples.yml index 38ce3339d..b2bccd74d 100644 --- a/ansible/roles/cuda/tasks/samples.yml +++ b/ansible/roles/cuda/tasks/samples.yml @@ -25,36 +25,3 @@ cmd: . /etc/profile.d/sh.local && cmake .. 
&& make -j {{ ansible_processor_vcpus }} chdir: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}/build" creates: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}/build/Samples/1_Utilities/deviceQuery/deviceQuery" - -- name: Run CUDA deviceQuery - command: - cmd: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}/build/Samples/1_Utilities/deviceQuery/deviceQuery" - register: _cuda_devicequery - -- name: Set fact for CUDA devices - set_fact: - cuda_devices: "{{ _cuda_devicequery.stdout | regex_findall('Device (\\d+):') }}" - -- name: Run CUDA bandwidth test - command: - cmd: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}/build/Samples/1_Utilities/bandwidthTest/bandwidthTest --device={{ item }}" - register: _cuda_bandwidthtest - loop: "{{ cuda_devices }}" - loop_control: - label: "Device {{ item }}" # e.g '0' - -- name: Summarise bandwidth test output - debug: - msg: | - {{ _parts[1].splitlines()[0] | trim }} - Bandwidths: (Gb/s) - Host to Device: {{ _parts[2].split()[-1] }} - Device to Host: {{ _parts[3].split()[-1] }} - Device to Device: {{ _parts[4].split()[-1] }} - {{ ': '.join(_parts[5].split('=') | map('trim')) }} - {{ _parts[6] }} - loop: "{{ _cuda_bandwidthtest.results }}" - vars: - _parts: "{{ item.stdout.split('\n\n') }}" - loop_control: - label: "Device {{ item.item }}" # e.g '0'