Skip to content

Commit 999584f

Browse files
committed
Merge branch 'main' into feat/isolated-env-2
2 parents 192607f + 7509986 commit 999584f

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

64 files changed

+1117
-249
lines changed

.github/workflows/extra.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,11 @@ jobs:
3232
- image_name: openhpc-extra-RL8
3333
source_image_name_key: RL8 # key into environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
3434
inventory_groups: doca,cuda,lustre
35-
volume_size: 30 # needed for cuda
35+
volume_size: 35 # needed for cuda
3636
- image_name: openhpc-extra-RL9
3737
source_image_name_key: RL9
3838
inventory_groups: doca,cuda,lustre
39-
volume_size: 30 # needed for cuda
39+
volume_size: 35 # needed for cuda
4040
env:
4141
ANSIBLE_FORCE_COLOR: True
4242
OS_CLOUD: openstack

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@ venv
55
*.pyc
66
packer/openhpc2
77
.vscode
8+
requirements.yml.last

README.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,6 @@ The default configuration in this repository may be used to create a cluster to
2525
- Persistent state backed by an OpenStack volume.
2626
- NFS-based shared file system backed by another OpenStack volume.
2727

28-
Note that the Open OnDemand portal and its remote apps are not usable with this default configuration.
29-
3028
It requires an OpenStack cloud, and an Ansible "deploy host" with access to that cloud.
3129

3230
Before starting ensure that:

ansible/.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,3 +90,7 @@ roles/*
9090
!roles/gateway/**
9191
!roles/alertmanager/
9292
!roles/alertmanager/**
93+
!roles/slurm_recompile/
94+
!roles/slurm_recompile/**
95+
!roles/nhc/
96+
!roles/nhc/**

ansible/bootstrap.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@
134134

135135
- hosts: dnf_repos
136136
become: yes
137+
tags: dnf_repos
137138
tasks:
138139
- name: Check that creds won't be leaked to users
139140
ansible.builtin.assert:
@@ -143,7 +144,7 @@
143144
- appliances_mode == 'configure'
144145
- not (dnf_repos_allow_insecure_creds | default(false)) # useful for development
145146

146-
- hosts: cacerts:!builder
147+
- hosts: cacerts
147148
tags: cacerts
148149
gather_facts: false
149150
tasks:

ansible/disable-repos.yml

Lines changed: 0 additions & 8 deletions
This file was deleted.

ansible/extras.yml

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,20 @@
4949
name: cuda
5050
tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'install.yml' }}"
5151

52+
- name: Setup vGPU
53+
hosts: vgpu
54+
become: yes
55+
gather_facts: yes
56+
tags: vgpu
57+
tasks:
58+
- include_role:
59+
name: stackhpc.linux.vgpu
60+
tasks_from: "{{ 'configure.yml' if appliances_mode == 'configure' else 'install.yml' }}"
61+
handlers:
62+
- name: reboot
63+
fail:
64+
msg: Reboot handler for stackhpc.linux.vgpu role fired unexpectedly. This was supposed to be unreachable.
65+
5266
- name: Persist hostkeys across rebuilds
5367
# Must be after filesystems.yml (for storage)
5468
# and before portal.yml (where OOD login node hostkeys are scanned)
@@ -59,17 +73,6 @@
5973
- import_role:
6074
name: persist_hostkeys
6175

62-
63-
- name: Setup NFS export for compute node configuration
64-
hosts: compute_init:!builder
65-
# NB: has to be after eeesi and os-manila-mount
66-
tags: compute_init
67-
become: yes
68-
tasks:
69-
- include_role:
70-
name: compute_init
71-
tasks_from: export.yml
72-
7376
- name: Install k9s
7477
become: yes
7578
hosts: k9s

ansible/fatimage.yml

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -252,14 +252,24 @@
252252
- include_role: # done in same play so it can use handlers from cloudalchemy.grafana
253253
name: grafana-dashboards
254254

255+
- name: Add support for NVIDIA GPU auto detection to Slurm
256+
hosts: cuda
257+
become: yes
258+
tasks:
259+
- name: Recompile slurm
260+
import_role:
261+
name: slurm_recompile
262+
vars:
263+
slurm_recompile_with_nvml: "{{ groups.cuda | length > 0 }}"
264+
255265
- name: Run post.yml hook
256266
vars:
257267
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
258268
hook_path: "{{ appliances_environment_root }}/hooks/post.yml"
259269
import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}"
260270
when: hook_path | exists
261271

262-
- import_playbook: disable-repos.yml
272+
- import_playbook: final.yml
263273

264274
- hosts: builder
265275
become: yes

ansible/final.yml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
- hosts: dnf_repos
2+
become: yes
3+
tags: dnf_repos
4+
tasks:
5+
- name: Disable pulp repos
6+
ansible.builtin.include_role:
7+
name: dnf_repos
8+
tasks_from: disable_repos.yml
9+
10+
- name: Setup NFS export for compute_init
11+
hosts: compute_init:!builder
12+
# NB: done last so other roles can prepare configuration etc
13+
tags: compute_init
14+
become: yes
15+
tasks:
16+
- include_role:
17+
name: compute_init
18+
tasks_from: export.yml

ansible/roles/cacerts/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ Configure CA certificates and trusts.
44

55
## Role variables
66

7-
- `ca-certificates`: Optional str. Path to directory containing certificates
7+
- `cacerts_cert_dir`: Optional str. Path to directory containing certificates
88
in PEM or DER format. Any files here will be added to the list of CAs trusted
99
by the system.
1010

0 commit comments

Comments
 (0)