From 779a29cae5f37d6ea90ee369801b340e63e58f79 Mon Sep 17 00:00:00 2001
From: Pierre Riteau
Date: Fri, 21 Mar 2025 11:55:39 +0100
Subject: [PATCH 1/2] Bump CUDA and NVIDIA driver versions

---
Reviewer notes (free text between the '---' line and the diffstat; not
part of the commit message):
  * Changes the role defaults only: the NVIDIA driver stream moves from
    '560-open' to '570-open' and the CUDA package pin from '12.6.3-1'
    to '12.8.1-1'.
  * The inline remark about 565-open on the old driver-stream line is
    removed together with that line.

 ansible/roles/cuda/defaults/main.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ansible/roles/cuda/defaults/main.yml b/ansible/roles/cuda/defaults/main.yml
index 05f1e093d..31cfe23d5 100644
--- a/ansible/roles/cuda/defaults/main.yml
+++ b/ansible/roles/cuda/defaults/main.yml
@@ -1,6 +1,6 @@
 cuda_repo_url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}/cuda-rhel{{ ansible_distribution_major_version }}.repo"
-cuda_nvidia_driver_stream: '560-open' # 565-open has problems with cuda packages
-cuda_package_version: '12.6.3-1'
+cuda_nvidia_driver_stream: '570-open'
+cuda_package_version: '12.8.1-1'
 cuda_packages:
   - "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}"
   - nvidia-gds

From 07563c877f00c37ebfa50dda4b74efc93a327cbd Mon Sep 17 00:00:00 2001
From: Pierre Riteau
Date: Fri, 21 Mar 2025 15:19:46 +0100
Subject: [PATCH 2/2] Update CUDA samples playbook for CUDA 12.8

---
Reviewer notes (free text between the '---' line and the diffstat; not
part of the commit message):
  * The samples are no longer built with a per-sample `make` loop over
    cuda_samples_programs; a single `cmake .. && make` now runs in a new
    build/ directory under the extracted samples tree.
  * deviceQuery and bandwidthTest are run from
    build/Samples/1_Utilities/<name>/<name> instead of
    bin/x86_64/linux/release/<name>.
  * gather_facts is switched on in the ad hoc playbook. The build command
    references ansible_processor_vcpus, which presumably needs gathered
    facts -- confirm this is the reason for the change.
  * NOTE(review): the build task's `creates` guard checks only the
    deviceQuery binary, so the build is skipped on later runs whenever
    that one file exists.
  * The unarchive task gains a `creates` guard on the extracted
    cuda-samples-<version> directory.

 ansible/adhoc/cudatests.yml          |  2 +-
 ansible/roles/cuda/tasks/samples.yml | 32 +++++++++++++++++-----------
 2 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/ansible/adhoc/cudatests.yml b/ansible/adhoc/cudatests.yml
index 3f5fb143f..59af8568a 100644
--- a/ansible/adhoc/cudatests.yml
+++ b/ansible/adhoc/cudatests.yml
@@ -1,6 +1,6 @@
 - hosts: cuda
   become: yes
-  gather_facts: no
+  gather_facts: yes
   tags: cuda_samples
   tasks:
     - import_role:
diff --git a/ansible/roles/cuda/tasks/samples.yml b/ansible/roles/cuda/tasks/samples.yml
index bf48c4aa4..679ce5644 100644
--- a/ansible/roles/cuda/tasks/samples.yml
+++ b/ansible/roles/cuda/tasks/samples.yml
@@ -1,9 +1,9 @@
-- name: Read cuda version file
+- name: Read CUDA version file
   slurp:
     src: /usr/local/cuda/version.json
   register: _cuda_samples_version
 
-- name: Set fact for discovered cuda version
+- name: Set fact for discovered CUDA version
   set_fact:
     _cuda_version_tuple: "{{ (_cuda_samples_version.content | b64decode | from_json).cuda.version | split('.') }}" # e.g. '12.1.0'
 
@@ -14,33 +14,39 @@
     owner: "{{ ansible_user }}"
     group: "{{ ansible_user }}"
 
-- name: Download cuda sample release
+- name: Download CUDA samples release
   unarchive:
     remote_src: yes
     src: "{{ cuda_samples_release_url }}"
     dest: "{{ cuda_samples_path }}"
     owner: "{{ ansible_user }}"
     group: "{{ ansible_user }}"
+    creates: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}"
 
-- name: Build cuda samples
+- name: Create CUDA samples build directory
+  file:
+    state: directory
+    path: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}/build"
+
+- name: Build CUDA samples
   shell:
-    cmd: make
-    chdir: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}/Samples/1_Utilities/{{ item }}"
-    creates: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}/bin/x86_64/linux/release/{{ item }}"
-  loop: "{{ cuda_samples_programs }}"
+    # We need to source /etc/profile.d/sh.local to add CUDA to the PATH
+    cmd: . /etc/profile.d/sh.local && cmake .. && make -j {{ ansible_processor_vcpus }}
+    chdir: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}/build"
+    creates: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}/build/Samples/1_Utilities/deviceQuery/deviceQuery"
 
-- name: Run cuda deviceQuery
+- name: Run CUDA deviceQuery
   command:
-    cmd: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}/bin/x86_64/linux/release/deviceQuery"
+    cmd: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}/build/Samples/1_Utilities/deviceQuery/deviceQuery"
   register: _cuda_devicequery
 
-- name: Set fact for cuda devices
+- name: Set fact for CUDA devices
   set_fact:
     cuda_devices: "{{ _cuda_devicequery.stdout | regex_findall('Device (\\d+):') }}"
 
-- name: Run cuda bandwidth test
+- name: Run CUDA bandwidth test
   command:
-    cmd: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}/bin/x86_64/linux/release/bandwidthTest --device={{ item }}"
+    cmd: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}/build/Samples/1_Utilities/bandwidthTest/bandwidthTest --device={{ item }}"
   register: _cuda_bandwidthtest
   loop: "{{ cuda_devices }}"
   loop_control: