Skip to content

Commit 50cc965

Browse files
committed
Address some code review comments
1 parent 0f04ca2 commit 50cc965

File tree

5 files changed

+45
-19
lines changed

5 files changed

+45
-19
lines changed

ansible/.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,3 +90,6 @@ roles/*
9090
!roles/gateway/**
9191
!roles/alertmanager/
9292
!roles/alertmanager/**
93+
# Un-ignore the role directory itself first: git cannot re-include files whose parent directory is still excluded by roles/*.
!roles/slurm_recompile/
94+
!roles/slurm_recompile/**
95+

ansible/fatimage.yml

Lines changed: 5 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -254,25 +254,11 @@
254254
hosts: cuda
255255
become: yes
256256
tasks:
257-
- name: Get facts about CUDA installation
258-
import_role: cuda
259-
tasks_from: facts.yml
260-
261-
- name: Recompile and install slurm packages
262-
shell: |
263-
#!/bin/bash
264-
set -eux
265-
dnf download -y --source slurm-slurmd-ohpc
266-
rpm -i slurm-ohpc-*.src.rpm
267-
dnf install -y @'Development Tools'
268-
cd /root/rpmbuild/SPECS
269-
dnf builddep -y slurm.spec
270-
rpmbuild -bb -D "_with_nvml --with-nvml=/usr/local/cuda-{{ cuda_facts_version_short }}/targets/x86_64-linux/" slurm.spec
271-
dnf reinstall -y /root/rpmbuild/RPMS/x86_64/*.rpm
272-
# Workaround path issue: https://groups.google.com/g/slurm-users/c/cvGb4JnK8BY
273-
if [[ -e /lib64/libnvidia-ml.so ]]; then
274-
ln -s /lib64/libnvidia-ml.so.1 /lib64/libnvidia-ml.so
275-
fi
257+
# Rebuild the Slurm packages through the slurm_recompile role.
- name: Recompile slurm
258+
  import_role:
259+
    # Must match the role directory name (roles/slurm_recompile).
    name: slurm_recompile
260+
  vars:
261+
    # The role reads slurm_recompile_nvml; enable NVML only when the cuda group has hosts.
    slurm_recompile_nvml: "{{ groups.cuda | length > 0 }}"
276262

277263
- name: Run post.yml hook
278264
vars:

ansible/roles/cuda/tasks/facts.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
3+
# Copies the value of cuda_version_short into the host fact cuda_facts_version_short.
# NOTE(review): cuda_version_short is presumably defined by the cuda role — confirm.
- name: Set cuda_facts_version_short
4+
set_fact:
5+
cuda_facts_version_short: "{{ cuda_version_short }}"
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
---
2+
# When true, the rpmbuild step adds the `_with_nvml --with-nvml=<CUDA path>` define; off by default.
slurm_recompile_nvml: false
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
---
2+
# Load the cuda role's facts.yml so cuda_facts_version_short is available to the build step.
- name: Get facts about CUDA installation
3+
  import_role:
    # import_role takes the role via `name:`; `tasks_from:` is a sibling option, not a task key.
    name: cuda
4+
    tasks_from: facts.yml
  # The fact is only consumed by the NVML build option, so skip it when NVML is off.
  when: slurm_recompile_nvml | bool
5+
6+
# Downloads the slurm-slurmd-ohpc source RPM, installs it, pulls in the spec's build
# dependencies, rebuilds the packages (adding the NVML define when slurm_recompile_nvml
# is true) and reinstalls the resulting x86_64 RPMs.
# NOTE(review): the "#!/bin/bash" line is part of the script text; confirm which shell actually runs it.
- name: Recompile and install slurm packages
7+
shell: |
8+
#!/bin/bash
9+
set -eux
10+
dnf download -y --source slurm-slurmd-ohpc
11+
rpm -i slurm-ohpc-*.src.rpm
12+
cd /root/rpmbuild/SPECS
13+
dnf builddep -y slurm.spec
14+
rpmbuild -bb{% if slurm_recompile_nvml | bool %} -D "_with_nvml --with-nvml=/usr/local/cuda-{{ cuda_facts_version_short }}/targets/x86_64-linux/"{% endif %} slurm.spec
15+
dnf reinstall -y /root/rpmbuild/RPMS/x86_64/*.rpm
16+
become: true
17+
18+
# Create the unversioned libnvidia-ml.so link that points at libnvidia-ml.so.1.
- name: Workaround missing symlink
19+
  # Workaround path issue: https://groups.google.com/g/slurm-users/c/cvGb4JnK8BY
20+
  command: ln -s /lib64/libnvidia-ml.so.1 /lib64/libnvidia-ml.so
21+
  args:
22+
    # Skip when the path already exists, which keeps the task idempotent.
    creates: /lib64/libnvidia-ml.so
  # Only relevant for NVML builds; avoids a dangling link on hosts without the library.
  when: slurm_recompile_nvml | bool
  # Writes under /lib64, so escalate like the other tasks in this file.
  become: true
23+
24+
# Undo the most recent dnf transaction whose command line was `builddep -y slurm.spec`,
# removing the build dependencies installed for the recompile.
- name: Cleanup Dependencies
25+
  shell: |
26+
    #!/bin/bash
27+
    set -eux
28+
    set -o pipefail
29+
    dnf history list | grep Install | grep 'builddep -y slurm.spec' | head -n 1 | awk '{print $1}' | xargs dnf history -y undo
30+
  become: true
  args:
    # The shell module runs the script with /bin/sh unless told otherwise; the in-script
    # shebang is not honoured, and `set -o pipefail` needs bash.
    executable: /bin/bash

0 commit comments

Comments
 (0)