Skip to content

Commit 48d06f3

Browse files
committed
24.11.0 release fix
1 parent 9aaa251 commit 48d06f3

File tree

1 file changed

+9
-8
lines changed

1 file changed

+9
-8
lines changed

playbooks/nvidia-driver.yaml

Lines changed: 9 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -237,14 +237,14 @@
237237
- set_fact:
238238
driver_version: "{{ dversion.stdout }}"
239239

240-
- name: check if NVSwitch/NVlink
241-
shell: lspci | grep -i nvidia | egrep -i 'bridge|nvlink|nvswitch'
242-
register: nvlink
240+
- name: check if NVSwitches
241+
shell: ls -A /proc/driver/nvidia-nvswitch/devices | wc -l | tr -d '\n'
242+
register: nvswitch
243243
failed_when: false
244244

245245
- name: check nvlink status with NVIDIA SMI
246-
shell: nvidia-smi nvlink -s -i 0 | tail -1f | awk '{print $NF}' | tr -dd '\n'
247-
register: nvlink_status
246+
shell: nvidia-smi nvlink -s -i 0 | tail -1f | awk '{print $NF}' | tr -d '\n'
247+
register: nvlink
248248
failed_when: false
249249

250250
- name: check dgx
@@ -261,7 +261,8 @@
261261

262262
- name: Install NVIDIA Fabric Manager on Ubuntu
263263
become: true
264-
when: ansible_distribution == 'Ubuntu' and dgx.stat.exists == True or ansible_distribution == 'Ubuntu' and l4t.stat.exists == True or ansible_distribution == 'Ubuntu' and nvlink_status.stdout != 'inActive'
264+
when: ansible_distribution == 'Ubuntu' and dgx.stat.exists == True or ansible_distribution == 'Ubuntu' and l4t.stat.exists == True or ansible_distribution == 'Ubuntu' and nvswitch.stdout | int > 0
265+
ignore_errors: true
265266
ignore_errors: true
266267
apt:
267268
name: "nvidia-fabricmanager-{{ driver_version }}={{ gpu_driver_version}}-1"
@@ -271,15 +272,15 @@
271272

272273
- name: Install NVIDIA Fabric Manager for NVSwitch on RHEL
273274
become: true
274-
when: "ansible_distribution == 'RedHat' and dgx.stat.exists == True or ansible_distribution == 'RedHat' and l4t.stat.exists == True or ansible_distribution == 'RedHat' and and nvlink_status.stdout != 'inActive'"
275+
when: "ansible_distribution == 'RedHat' and dgx.stat.exists == True or ansible_distribution == 'RedHat' and l4t.stat.exists == True or ansible_distribution == 'RedHat' and nvswitch.stdout | int > 0"
275276
ignore_errors: true
276277
yum:
277278
name: "nvidia-fabric-manager-{{ gpu_driver_version}}-1"
278279
state: present
279280
update_cache: true
280281

281282
- name: Enable and restart NVIDIA Fabric manager
282-
when: dgx.stat.exists == True or l4t.stat.exists == True or nvlink_status.stdout != 'inActive'
283+
when: dgx.stat.exists == True or l4t.stat.exists == True or nvswitch.stdout | int > 0
283284
ignore_errors: true
284285
become: true
285286
systemd_service:

0 commit comments

Comments (0)