From cc6ff8daa408952792baf073b780107a8779ddf8 Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Thu, 7 Nov 2024 16:31:14 +0000
Subject: [PATCH 1/2] tried pinning nvidia-driver version

---
Notes (text between the three-dash line and the diff is not applied):
    The "Enable nvidia driver module" task now takes the nvidia-driver dnf
    module stream from cuda_driver_stream; the value 'default' maps to
    open-dkms. packer/openhpc_extravars.yml sets cuda_driver_stream to
    560-open.
    NOTE(review): line breaks and indentation in this series were restored
    from a whitespace-collapsed copy - confirm the context lines against the
    target tree before applying.

 ansible/roles/cuda/tasks/main.yml | 2 +-
 packer/openhpc_extravars.yml      | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/ansible/roles/cuda/tasks/main.yml b/ansible/roles/cuda/tasks/main.yml
index 22f8e9e8e..bb130eb5f 100644
--- a/ansible/roles/cuda/tasks/main.yml
+++ b/ansible/roles/cuda/tasks/main.yml
@@ -25,7 +25,7 @@
   register: _cuda_driver_module_enabled
 
 - name: Enable nvidia driver module
-  ansible.builtin.command: "dnf module enable -y nvidia-driver:open-dkms"
+  ansible.builtin.command: "dnf module enable -y nvidia-driver:{{ 'open-dkms' if cuda_driver_stream == 'default' else cuda_driver_stream }}"
   register: _cuda_driver_module_enable
   when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
   changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout"
diff --git a/packer/openhpc_extravars.yml b/packer/openhpc_extravars.yml
index 66f668649..7a78e3e51 100644
--- a/packer/openhpc_extravars.yml
+++ b/packer/openhpc_extravars.yml
@@ -1 +1,2 @@
 workaround_ansible_issue_61497: yes # extravars files can't be empty
+cuda_driver_stream: 560-open # pinned to fix incompatibility cuda latest (12.6.2) being out of date for nvidia-driver latest (565.57.01), remove when fixed

From 211d2308b831b330086bf73d43d8186c04ab0133 Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Fri, 8 Nov 2024 08:27:29 +0000
Subject: [PATCH 2/2] simplified templating

---
Notes (text between the three-dash line and the diff is not applied):
    The cuda role default for cuda_driver_stream changes from 'default' to
    'open-dkms', so the "Enable nvidia driver module" task interpolates the
    variable directly instead of special-casing 'default'.

 ansible/roles/cuda/defaults/main.yml | 2 +-
 ansible/roles/cuda/tasks/main.yml    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ansible/roles/cuda/defaults/main.yml b/ansible/roles/cuda/defaults/main.yml
index 33a25d9b4..f41c17f70 100644
--- a/ansible/roles/cuda/defaults/main.yml
+++ b/ansible/roles/cuda/defaults/main.yml
@@ -1,6 +1,6 @@
 cuda_distro: "rhel{{ ansible_distribution_major_version }}"
 cuda_repo: "https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo"
-cuda_driver_stream: default
+cuda_driver_stream: open-dkms
 cuda_package_version: 'latest'
 cuda_packages:
   - "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}"
diff --git a/ansible/roles/cuda/tasks/main.yml b/ansible/roles/cuda/tasks/main.yml
index bb130eb5f..3dbc45268 100644
--- a/ansible/roles/cuda/tasks/main.yml
+++ b/ansible/roles/cuda/tasks/main.yml
@@ -25,7 +25,7 @@
   register: _cuda_driver_module_enabled
 
 - name: Enable nvidia driver module
-  ansible.builtin.command: "dnf module enable -y nvidia-driver:{{ 'open-dkms' if cuda_driver_stream == 'default' else cuda_driver_stream }}"
+  ansible.builtin.command: "dnf module enable -y nvidia-driver:{{ cuda_driver_stream }}"
   register: _cuda_driver_module_enable
   when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
   changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout"