Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/extra.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,11 @@ jobs:
# Build-matrix entries for the "extra" images, one per base OS release.
# Each entry previously listed `volume_size` twice (30, then 35). Duplicate
# mapping keys are invalid YAML and most parsers silently keep the last one,
# so only the effective value (35) is retained.
- image_name: openhpc-extra-RL8
  source_image_name_key: RL8  # key into environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
  inventory_groups: doca,cuda,lustre
  volume_size: 35  # needed for cuda
- image_name: openhpc-extra-RL9
  source_image_name_key: RL9
  inventory_groups: doca,cuda,lustre
  volume_size: 35  # needed for cuda
env:
ANSIBLE_FORCE_COLOR: True
OS_CLOUD: openstack
Expand Down
2 changes: 2 additions & 0 deletions ansible/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -90,3 +90,5 @@ roles/*
!roles/gateway/**
!roles/alertmanager/
!roles/alertmanager/**
# Re-include the role directory itself before its contents: `roles/*` excludes
# the directory, and git cannot re-include files under an excluded parent.
!roles/slurm_recompile/
!roles/slurm_recompile/**
14 changes: 14 additions & 0 deletions ansible/extras.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,20 @@
name: cuda
tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'install.yml' }}"

# Runs the stackhpc.linux.vgpu role on hosts in the `vgpu` group. Only the
# role's configure tasks run when appliances_mode is 'configure'; otherwise
# its install tasks run.
- name: Setup vGPU
  hosts: vgpu
  tags:
    - vgpu
  gather_facts: true
  become: true
  tasks:
    - ansible.builtin.include_role:
        name: stackhpc.linux.vgpu
        tasks_from: "{{ (appliances_mode == 'configure') | ternary('configure.yml', 'install.yml') }}"
  handlers:
    # Defines a handler named `reboot` that fails loudly instead of rebooting.
    - name: reboot
      ansible.builtin.fail:
        msg: 'Reboot handler for stackhpc.linux.vgpu role fired unexpectedly. This was supposed to be unreachable.'

- name: Persist hostkeys across rebuilds
# Must be after filesystems.yml (for storage)
# and before portal.yml (where OOD login node hostkeys are scanned)
Expand Down
10 changes: 10 additions & 0 deletions ansible/fatimage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,16 @@
name: cloudalchemy.grafana
tasks_from: install.yml

# Rebuilds the Slurm packages on hosts in the `cuda` group via the
# slurm_recompile role. NVML linking is requested whenever the `cuda` group
# has at least one member.
- name: Add support for NVIDIA GPU auto detection to Slurm
  hosts: cuda
  become: true
  tasks:
    - name: Recompile slurm
      vars:
        slurm_recompile_with_nvml: "{{ (groups['cuda'] | length) > 0 }}"
      ansible.builtin.import_role:
        name: slurm_recompile

- name: Run post.yml hook
vars:
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
Expand Down
1 change: 1 addition & 0 deletions ansible/roles/compute_init/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ it also requires an image build with the role name added to the
| extras.yml | basic_users | All functionality [6] | No |
| extras.yml | eessi | All functionality [7] | No |
| extras.yml | cuda | None required - use image build | Yes [8] |
| extras.yml | vgpu | All functionality | Yes |
| extras.yml | persist_hostkeys | Not relevant for compute nodes | n/a |
| extras.yml | compute_init (export) | Not relevant for compute nodes | n/a |
| extras.yml | k9s (install) | Not relevant during boot | n/a |
Expand Down
7 changes: 7 additions & 0 deletions ansible/roles/compute_init/files/compute-init.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
enable_basic_users: "{{ os_metadata.meta.basic_users | default(false) | bool }}"
enable_eessi: "{{ os_metadata.meta.eessi | default(false) | bool }}"
enable_chrony: "{{ os_metadata.meta.chrony | default(false) | bool }}"
# Reads the `vgpu` metadata key, consistent with the sibling enable_* flags,
# each of which reads the metadata key matching its own name. The key was
# previously misspelt `vpgu`, so the flag stayed false unless the metadata
# itself carried the same misspelling.
enable_vgpu: "{{ os_metadata.meta.vgpu | default(false) | bool }}"

# TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects
resolv_conf_nameservers: []
Expand Down Expand Up @@ -295,6 +296,12 @@
cmd: "cvmfs_config setup"
when: enable_eessi

# Applies only the configure tasks of the stackhpc.linux.vgpu role, and only
# when the enable_vgpu flag is set.
- name: Configure VGPUs
  when: enable_vgpu
  ansible.builtin.include_role:
    name: stackhpc.linux.vgpu
    tasks_from: configure.yml

# NB: don't need conditional block on enable_compute as have already exited
# if not the case
- name: Write Munge key
Expand Down
4 changes: 4 additions & 0 deletions ansible/roles/cuda/tasks/facts.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
# Re-exports the cuda role's `cuda_version_short` variable as the fact
# `cuda_facts_version_short`.
- name: Set cuda_facts_version_short
  ansible.builtin.set_fact:
    cuda_facts_version_short: "{{ cuda_version_short }}"
28 changes: 28 additions & 0 deletions ansible/roles/slurm_recompile/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
slurm_recompile
===============

Recompiles slurm from source RPMs and installs the packages that were built.

Requirements
------------

Role Variables
--------------

See `defaults/main.yml`.

Dependencies
------------

Example Playbook
----------------

    - hosts: compute
      tasks:
        - import_role:
            name: slurm_recompile

License
-------

Apache-2.0
4 changes: 4 additions & 0 deletions ansible/roles/slurm_recompile/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
# Whether to link slurm against the NVIDIA management library (NVML).
# When true, the role's tasks pass `--with-nvml=...` (pointing into the CUDA
# install under /usr/local/cuda-<version>) to rpmbuild and create the
# /lib64/libnvidia-ml.so symlink.
slurm_recompile_with_nvml: false

41 changes: 41 additions & 0 deletions ansible/roles/slurm_recompile/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
---
# Pulls in the cuda role's facts.yml, which sets cuda_facts_version_short
# (used below to build the NVML path).
- name: Get facts about CUDA installation
  ansible.builtin.import_role:
    tasks_from: facts.yml
    name: cuda

# Populate ansible_facts.packages so the installed slurm version can be read.
- name: Gather the package facts
  ansible.builtin.package_facts:
    manager: auto

# Keep the first recorded entry for slurm-slurmd-ohpc; its version and
# release are used to download the matching source RPM.
- name: Set fact containing slurm package facts
  ansible.builtin.set_fact:
    slurm_package: "{{ ansible_facts.packages['slurm-slurmd-ohpc'] | first }}"

# Downloads the source RPM matching the installed slurm-slurmd-ohpc package,
# installs its build dependencies, rebuilds the binary RPMs (optionally with
# NVML support) and reinstalls them.
#
# The script uses `source`, a bash feature, so bash is selected explicitly via
# `executable`; a `#!/bin/bash` line inside a shell-module script is only a
# comment and does not choose the interpreter.
#
# NB: the "Cleanup Dependencies" task locates the build-dependency transaction
# by grepping dnf history for the exact text `builddep -y slurm.spec`, so
# keep that command line unchanged.
- name: Recompile and install slurm packages
  ansible.builtin.shell: |
    source /etc/profile
    set -eux
    dnf download -y --source slurm-slurmd-ohpc-{{ slurm_package.version }}-{{ slurm_package.release }}
    rpm -i slurm-ohpc-*.src.rpm
    cd /root/rpmbuild/SPECS
    dnf builddep -y slurm.spec
    rpmbuild -bb{% if slurm_recompile_with_nvml | bool %} -D "_with_nvml --with-nvml=/usr/local/cuda-{{ cuda_facts_version_short }}/targets/x86_64-linux/"{% endif %} slurm.spec
    dnf reinstall -y /root/rpmbuild/RPMS/x86_64/*.rpm
  args:
    executable: /bin/bash
  become: true

# Creates the unversioned libnvidia-ml.so link when NVML support is enabled;
# skipped if the link path already exists.
# Workaround path issue: https://groups.google.com/g/slurm-users/c/cvGb4JnK8BY
- name: Workaround missing symlink
  when: slurm_recompile_with_nvml | bool
  ansible.builtin.command:
    argv:
      - ln
      - "-s"
      - /lib64/libnvidia-ml.so.1
      - /lib64/libnvidia-ml.so
    creates: /lib64/libnvidia-ml.so

# Undoes the dnf transaction created by `dnf builddep -y slurm.spec` in the
# recompile task, removing the build-only dependencies again.
#
# `set -o pipefail` is a bash option, so bash is selected explicitly via
# `executable` rather than relying on the default shell. `xargs -r` prevents
# `dnf history -y undo` from being invoked with no transaction id when no
# matching history entry is found; the pipeline still fails in that case
# because of pipefail.
- name: Cleanup Dependencies
  ansible.builtin.shell: |
    set -eux
    set -o pipefail
    dnf history list | grep Install | grep 'builddep -y slurm.spec' | head -n 1 | awk '{print $1}' | xargs -r dnf history -y undo
  args:
    executable: /bin/bash
  become: true
10 changes: 10 additions & 0 deletions ansible/validate.yml
Original file line number Diff line number Diff line change
Expand Up @@ -88,3 +88,13 @@
- import_role:
name: lustre
tasks_from: validate.yml

# Runs the validate tasks of the stackhpc.linux.vgpu role against hosts in
# the `vgpu` group.
- name: Validate vGPU configuration
  hosts: vgpu
  tags:
    - vgpu
  gather_facts: true
  become: true
  tasks:
    - ansible.builtin.include_role:
        tasks_from: validate.yml
        name: stackhpc.linux.vgpu
Loading