Commit 9aaa251

24.11.0 release fix
1 parent 4d97cb7

File tree

3 files changed: +95 -21 lines changed

playbooks/nvidia-driver.yaml

Lines changed: 46 additions & 21 deletions
@@ -149,28 +149,14 @@
     become: true
     get_url:
       url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}/cuda-rhel{{ ansible_distribution_major_version }}.repo"
-      dest: /tmp//cuda-{{ ansible_distribution | lower }}.repo
+      dest: /etc/yum.repos.d/cuda-{{ ansible_distribution | lower }}.repo
 
   - name: Download NVIDIA CUDA keyring package for RHEL
     when: ansible_distribution == 'RedHat' and ansible_architecture == 'aarch64'
     become: true
     get_url:
       url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel{{ ansible_distribution_major_version }}/sbsa/cuda-rhel{{ ansible_distribution_major_version }}.repo"
-      dest: /tmp//cuda-{{ ansible_distribution | lower }}.repo
-
-  - name: Install NVIDIA CUDA keyring package for RHEL
-    when: ansible_distribution == 'RedHat'
-    become: true
-    block:
-      - name: Install CUDA keyring package
-        yum:
-          name: /tmp/cuda-{{ ansible_distribution | lower }}.repo
-          state: present
-
-      - name: Clean up the downloaded package
-        file:
-          path: /tmp//cuda-{{ ansible_distribution | lower }}.repo
-          state: absent
+      dest: /etc/yum.repos.d/cuda-{{ ansible_distribution | lower }}.repo
 
   - name: ensure we have kernel-headers installed for the current kernel on RHEL
     when: "cns_version >= 10.0 and ansible_distribution == 'RedHat'"
@@ -251,14 +237,53 @@
   - set_fact:
      driver_version: "{{ dversion.stdout }}"
 
+  - name: check if NVSwitch/NVLink is present
+    shell: lspci | grep -i nvidia | egrep -i 'bridge|nvlink|nvswitch'
+    register: nvlink
+    failed_when: false
+
+  - name: check NVLink status with nvidia-smi
+    shell: nvidia-smi nvlink -s -i 0 | tail -1 | awk '{print $NF}' | tr -d '\n'
+    register: nvlink_status
+    failed_when: false
+
+  - name: check for DGX
+    ignore_errors: true
+    stat:
+      path: /etc/dgx-release
+    register: dgx
+
+  - name: check for L4T
+    ignore_errors: true
+    stat:
+      path: /etc/l4t-release
+    register: l4t
+
   - name: Install NVIDIA Fabric Manager on Ubuntu
     become: true
-    when: ansible_distribution == 'Ubuntu'
+    when: ansible_distribution == 'Ubuntu' and (dgx.stat.exists or l4t.stat.exists or nvlink_status.stdout != 'inActive')
     ignore_errors: true
-    shell: "apt update; apt install nvidia-fabricmanager-{{ driver_version }} -y; sudo systemctl --now enable nvidia-fabricmanager; sudo systemctl daemon-reload"
+    apt:
+      name: "nvidia-fabricmanager-{{ driver_version }}={{ gpu_driver_version }}-1"
+      state: present
+      update_cache: true
+      force: yes
 
-  - name: Install NVIDIA Fabric Manager on RHEL
+  - name: Install NVIDIA Fabric Manager for NVSwitch on RHEL
     become: true
-    when: ansible_distribution == 'RedHat'
+    when: "ansible_distribution == 'RedHat' and (dgx.stat.exists or l4t.stat.exists or nvlink_status.stdout != 'inActive')"
     ignore_errors: true
-    shell: "yum update -y; yum install nvidia-fabricmanager-{{ driver_version }} -y; sudo systemctl --now enable nvidia-fabricmanager; sudo systemctl daemon-reload"
+    yum:
+      name: "nvidia-fabric-manager-{{ gpu_driver_version }}-1"
+      state: present
+      update_cache: true
+
+  - name: Enable and restart NVIDIA Fabric Manager
+    when: dgx.stat.exists or l4t.stat.exists or nvlink_status.stdout != 'inActive'
+    ignore_errors: true
+    become: true
+    systemd_service:
+      name: nvidia-fabricmanager
+      enabled: true
+      state: started
+      daemon_reload: true
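The Fabric Manager tasks now fire only when a DGX or L4T release file exists or nvidia-smi reports an active NVLink, and the ad-hoc shell strings are replaced by the apt/yum and systemd_service modules. A short follow-up task one could add to confirm the service actually came up (the task and check are illustrative, not part of this commit):

  - name: verify NVIDIA Fabric Manager is active   # hypothetical check, not in this commit
    when: dgx.stat.exists or l4t.stat.exists or nvlink_status.stdout != 'inActive'
    become: true
    shell: systemctl is-active nvidia-fabricmanager
    register: fm_state
    failed_when: fm_state.stdout != 'active'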

playbooks/operators-install.yaml

Lines changed: 4 additions & 0 deletions
@@ -272,6 +272,10 @@
     when: "confidential_computing == false and enable_gpu_operator == true and use_open_kernel_module == true and enable_mig == true and enable_rdma == true and enable_vgpu == false and enable_gds == false and enable_secure_boot == false and gpu_operator.rc == 1 and network_operator_valid.rc == 1 and cns_nvidia_driver == false and ngc_registry_password == ''"
     shell: helm install --version {{ gpu_operator_version }} --values {{ ansible_user_dir }}/values.yaml --create-namespace --namespace nvidia-gpu-operator --devel '{{ gpu_operator_helm_chart }}' --set driver.rdma.enabled=true,driver.rdma.useHostMofed=true,mig.strategy='{{ mig_strategy }}',driver.version='{{ gpu_driver_version }}',driver.repository='{{ gpu_operator_driver_registry }}',driver.useOpenKernelModules=true --wait --generate-name
 
+  - name: Installing the GDS with Open RM GPU Operator on NVIDIA Cloud Native Stack
+    when: "confidential_computing == false and enable_gpu_operator == true and use_open_kernel_module == true and enable_mig == false and enable_rdma == false and enable_vgpu == false and enable_gds == true and enable_secure_boot == false and gpu_operator.rc == 1 and network_operator_valid.rc == 1 and cns_nvidia_driver == false and ngc_registry_password == ''"
+    shell: helm install --version {{ gpu_operator_version }} --values {{ ansible_user_dir }}/values.yaml --create-namespace --namespace nvidia-gpu-operator --devel '{{ gpu_operator_helm_chart }}' --set gds.enabled=true,driver.version='{{ gpu_driver_version }}',driver.repository='{{ gpu_operator_driver_registry }}',driver.useOpenKernelModules=true --wait --generate-name
+
   - name: Installing the GPU Operator with RDMA and Host MOFED on NVIDIA Cloud Native Stack
     when: "confidential_computing == false and enable_gpu_operator == true and use_open_kernel_module == false and enable_mig == false and enable_rdma == true and enable_vgpu == false and enable_gds == false and enable_secure_boot == false and gpu_operator.rc == 1 and network_operator_valid.rc == 1 and cns_nvidia_driver == false and ngc_registry_password == ''"
     shell: helm install --version {{ gpu_operator_version }} --values {{ ansible_user_dir }}/values.yaml --create-namespace --namespace nvidia-gpu-operator --devel '{{ gpu_operator_helm_chart }}' --set driver.rdma.enabled=true,driver.rdma.useHostMofed=true,driver.version='{{ gpu_driver_version }}',driver.repository='{{ gpu_operator_driver_registry }}' --wait --generate-name
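The added GDS task mirrors the neighbouring installs, differing only in the --set flags it passes (gds.enabled=true instead of the RDMA/MIG toggles). A sketch of a post-install check, assuming the GPU Operator's default ClusterPolicy resource is named cluster-policy (illustrative, not part of this commit):

  - name: confirm GDS is enabled in the GPU Operator ClusterPolicy   # hypothetical check, not in this commit
    shell: kubectl get clusterpolicy cluster-policy -o jsonpath='{.spec.gds.enabled}'
    register: gds_state
    failed_when: gds_state.stdout != 'true'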

playbooks/prerequisites.yaml

Lines changed: 45 additions & 0 deletions
@@ -87,6 +87,51 @@
       name: ['containers-common','net-tools', 'libseccomp', 'curl', 'ca-certificates', 'kubelet-{{ k8s_version }}', 'kubeadm-{{ k8s_version }}', 'kubectl-{{ k8s_version }}']
       state: present
 
+  - name: check update GCC for build essentials
+    when: "ansible_distribution == 'Ubuntu'"
+    become: true
+    block:
+      - name: capture gcc expected version
+        shell: cat /proc/version | tr -d '(' | tr -d ')' | awk '{print $8}' | cut -d '.' -f 1
+        register: gcc_expected_version
+
+      - name: install expected gcc version
+        apt:
+          name: "gcc-{{ gcc_expected_version.stdout }}"
+          state: present
+
+      - name: install expected g++ version
+        apt:
+          name: "g++-{{ gcc_expected_version.stdout }}"
+          state: present
+
+      - name: capture gcc binary path
+        shell: "which gcc"
+        register: gcc_bin_path
+
+      - name: capture g++ binary path
+        shell: "which g++"
+        register: g_plus_plus_bin_path
+
+      - name: capture gcc expected version binary path
+        shell: "which gcc-{{ gcc_expected_version.stdout }}"
+        register: gcc_expected_version_bin_path
+
+      - name: capture g++ expected version binary path
+        shell: "which g++-{{ gcc_expected_version.stdout }}"
+        register: g_plus_plus_expected_version_bin_path
+
+      - name: setup expected gcc as primary gcc
+        community.general.alternatives:
+          name: gcc
+          link: "{{ gcc_bin_path.stdout }}"
+          path: "{{ gcc_expected_version_bin_path.stdout }}"
+          subcommands:
+            - name: g++
+              link: "{{ g_plus_plus_bin_path.stdout }}"
+              path: "{{ g_plus_plus_expected_version_bin_path.stdout }}"
+          state: auto
+
   - name: Hold the installed Packages
     become: true
     when: "ansible_distribution == 'Ubuntu'"
