Skip to content

Commit 3645bfb

Browse files
committed
Merge branch 'main' into feat/additional-nodes
2 parents de86177 + e1de488 commit 3645bfb

File tree

40 files changed

+662
-76
lines changed

40 files changed

+662
-76
lines changed

.github/workflows/extra.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,11 @@ jobs:
3232
- image_name: openhpc-extra-RL8
3333
source_image_name_key: RL8 # key into environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
3434
inventory_groups: doca,cuda,lustre
35-
volume_size: 30 # needed for cuda
35+
volume_size: 35 # needed for cuda
3636
- image_name: openhpc-extra-RL9
3737
source_image_name_key: RL9
3838
inventory_groups: doca,cuda,lustre
39-
volume_size: 30 # needed for cuda
39+
volume_size: 35 # needed for cuda
4040
env:
4141
ANSIBLE_FORCE_COLOR: True
4242
OS_CLOUD: openstack

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@ venv
55
*.pyc
66
packer/openhpc2
77
.vscode
8+
requirements.yml.last

ansible/.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,3 +90,7 @@ roles/*
9090
!roles/gateway/**
9191
!roles/alertmanager/
9292
!roles/alertmanager/**
93+
!roles/slurm_recompile/
94+
!roles/slurm_recompile/**
95+
!roles/nhc/
96+
!roles/nhc/**

ansible/bootstrap.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@
134134

135135
- hosts: dnf_repos
136136
become: yes
137+
tags: dnf_repos
137138
tasks:
138139
- name: Check that creds won't be leaked to users
139140
ansible.builtin.assert:

ansible/disable-repos.yml

Lines changed: 0 additions & 8 deletions
This file was deleted.

ansible/extras.yml

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,20 @@
4848
name: cuda
4949
tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'install.yml' }}"
5050

51+
- name: Setup vGPU
52+
hosts: vgpu
53+
become: yes
54+
gather_facts: yes
55+
tags: vgpu
56+
tasks:
57+
- include_role:
58+
name: stackhpc.linux.vgpu
59+
tasks_from: "{{ 'configure.yml' if appliances_mode == 'configure' else 'install.yml' }}"
60+
handlers:
61+
- name: reboot
62+
fail:
63+
msg: Reboot handler for stackhpc.linux.vgpu role fired unexpectedly. This was supposed to be unreachable.
64+
5165
- name: Persist hostkeys across rebuilds
5266
# Must be after filesystems.yml (for storage)
5367
# and before portal.yml (where OOD login node hostkeys are scanned)
@@ -58,17 +72,6 @@
5872
- import_role:
5973
name: persist_hostkeys
6074

61-
62-
- name: Setup NFS export for compute node configuration
63-
hosts: compute_init:!builder
64-
# NB: has to be after eeesi and os-manila-mount
65-
tags: compute_init
66-
become: yes
67-
tasks:
68-
- include_role:
69-
name: compute_init
70-
tasks_from: export.yml
71-
7275
- name: Install k9s
7376
become: yes
7477
hosts: k9s

ansible/fatimage.yml

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -250,14 +250,24 @@
250250
name: cloudalchemy.grafana
251251
tasks_from: install.yml
252252

253+
- name: Add support for NVIDIA GPU auto detection to Slurm
254+
hosts: cuda
255+
become: yes
256+
tasks:
257+
- name: Recompile slurm
258+
import_role:
259+
name: slurm_recompile
260+
vars:
261+
slurm_recompile_with_nvml: "{{ groups.cuda | length > 0 }}"
262+
253263
- name: Run post.yml hook
254264
vars:
255265
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
256266
hook_path: "{{ appliances_environment_root }}/hooks/post.yml"
257267
import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}"
258268
when: hook_path | exists
259269

260-
- import_playbook: disable-repos.yml
270+
- import_playbook: final.yml
261271

262272
- hosts: builder
263273
become: yes

ansible/final.yml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
- hosts: dnf_repos
2+
become: yes
3+
tags: dnf_repos
4+
tasks:
5+
- name: Disable pulp repos
6+
ansible.builtin.include_role:
7+
name: dnf_repos
8+
tasks_from: disable_repos.yml
9+
10+
- name: Setup NFS export for compute_init
11+
hosts: compute_init:!builder
12+
# NB: done last so other roles can prepare configuration etc
13+
tags: compute_init
14+
become: yes
15+
tasks:
16+
- include_role:
17+
name: compute_init
18+
tasks_from: export.yml

ansible/roles/compute_init/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ it also requires an image build with the role name added to the
7575
| extras.yml | basic_users | All functionality [6] | No |
7676
| extras.yml | eessi | All functionality [7] | No |
7777
| extras.yml | cuda | None required - use image build | Yes [8] |
78+
| extras.yml | vgpu | All functionality | Yes |
7879
| extras.yml | persist_hostkeys | Not relevant for compute nodes | n/a |
7980
| extras.yml | compute_init (export) | Not relevant for compute nodes | n/a |
8081
| extras.yml | k9s (install) | Not relevant during boot | n/a |
@@ -84,6 +85,7 @@ it also requires an image build with the role name added to the
8485
| slurm.yml | openhpc [10] | All slurmd functionality | No |
8586
| slurm.yml | (set memory limits) | Fully supported | No |
8687
| slurm.yml | (block ssh) | Fully supported | No |
88+
| slurm.yml | nhc | Fully supported | No |
8789
| portal.yml | (openondemand server) | Not relevant for compute nodes | n/a |
8890
| portal.yml | (openondemand vnc desktop) | None required - use image build | No |
8991
| portal.yml | (openondemand jupyter server) | None required - use image build | No |

ansible/roles/compute_init/files/compute-init.yml

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
enable_basic_users: "{{ os_metadata.meta.basic_users | default(false) | bool }}"
2020
enable_eessi: "{{ os_metadata.meta.eessi | default(false) | bool }}"
2121
enable_chrony: "{{ os_metadata.meta.chrony | default(false) | bool }}"
22+
enable_vgpu: "{{ os_metadata.meta.vgpu | default(false) | bool }}"
23+
enable_nhc: "{{ os_metadata.meta.nhc | default(false) | bool }}"
2224

2325
# TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects
2426
resolv_conf_nameservers: []
@@ -63,12 +65,12 @@
6365
mode: u=rX,g=rwX,o=
6466

6567
- name: Mount /mnt/cluster
66-
mount:
68+
ansible.posix.mount:
6769
path: /mnt/cluster
6870
src: "{{ server_node_ip }}:/exports/cluster"
6971
fstype: nfs
7072
opts: ro,sync
71-
state: mounted
73+
state: ephemeral # will be unmounted after sync, don't want it in fstab
7274
register: _mount_mnt_cluster
7375
ignore_errors: true
7476
# exits from playbook if this failed below, allowing ansible-init to
@@ -295,6 +297,12 @@
295297
cmd: "cvmfs_config setup"
296298
when: enable_eessi
297299

300+
- name: Configure VGPUs
301+
include_role:
302+
name: stackhpc.linux.vgpu
303+
tasks_from: 'configure.yml'
304+
when: enable_vgpu
305+
298306
# NB: don't need conditional block on enable_compute as have already exited
299307
# if not the case
300308
- name: Write Munge key
@@ -350,6 +358,11 @@
350358
enabled: true
351359
state: started
352360

361+
- name: Provide NHC configuration
362+
ansible.builtin.include_role:
363+
name: nhc
364+
tasks_from: boot.yml
365+
when: enable_nhc
353366

354367
- name: Ensure node is resumed
355368
# TODO: consider if this is always safe for all job states?

0 commit comments

Comments
 (0)