File tree Expand file tree Collapse file tree 8 files changed +51
-1
lines changed
environments/common/inventory Expand file tree Collapse file tree 8 files changed +51
-1
lines changed Original file line number Diff line number Diff line change @@ -90,3 +90,5 @@ roles/*
9090! roles /gateway /**
9191! roles /alertmanager /
9292! roles /alertmanager /**
93+ ! roles /slurm_recompile /
94+ ! roles /slurm_recompile /**
Original file line number Diff line number Diff line change 4848 name : cuda
4949 tasks_from : " {{ 'runtime.yml' if appliances_mode == 'configure' else 'install.yml' }}"
5050
51+ - name : Setup vGPU
52+ hosts : vgpu
53+ become : yes
54+ gather_facts : yes
55+ tags : vgpu
56+ tasks :
57+ - include_role :
58+ name : stackhpc.linux.vgpu
59+ tasks_from : " {{ 'configure.yml' if appliances_mode == 'configure' else 'install.yml' }}"
60+ handlers :
61+ - name : reboot
62+ fail :
63+ msg : Reboot handler for stackhpc.linux.vgpu role fired unexpectedly. This was supposed to be unreachable.
64+
5165- name : Persist hostkeys across rebuilds
5266 # Must be after filesystems.yml (for storage)
5367 # and before portal.yml (where OOD login node hostkeys are scanned)
Original file line number Diff line number Diff line change 250250 name : cloudalchemy.grafana
251251 tasks_from : install.yml
252252
253+ - name : Add support for NVIDIA GPU auto detection to Slurm
254+ hosts : cuda
255+ become : yes
256+ tasks :
257+ - name : Recompile slurm
258+ import_role :
259+ name : slurm_recompile
260+ vars :
261+ slurm_recompile_with_nvml : " {{ groups.cuda | length > 0 }}"
262+
253263- name : Run post.yml hook
254264 vars :
255265 appliances_environment_root : " {{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
Original file line number Diff line number Diff line change @@ -75,6 +75,7 @@ it also requires an image build with the role name added to the
7575| extras.yml | basic_users | All functionality [ 6] | No |
7676| extras.yml | eessi | All functionality [ 7] | No |
7777| extras.yml | cuda | None required - use image build | Yes [ 8] |
78+ | extras.yml | vgpu | All functionality | Yes |
7879| extras.yml | persist_hostkeys | Not relevant for compute nodes | n/a |
7980| extras.yml | compute_init (export) | Not relevant for compute nodes | n/a |
8081| extras.yml | k9s (install) | Not relevant during boot | n/a |
Original file line number Diff line number Diff line change 1919 enable_basic_users : " {{ os_metadata.meta.basic_users | default(false) | bool }}"
2020 enable_eessi : " {{ os_metadata.meta.eessi | default(false) | bool }}"
2121 enable_chrony : " {{ os_metadata.meta.chrony | default(false) | bool }}"
22+ enable_vgpu : " {{ os_metadata.meta.vgpu | default(false) | bool }}"
23+
2224
2325 # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects
2426 resolv_conf_nameservers : []
295297 cmd : " cvmfs_config setup"
296298 when : enable_eessi
297299
300+ - name : Configure VGPUs
301+ include_role :
302+ name : stackhpc.linux.vgpu
303+ tasks_from : ' configure.yml'
304+ when : enable_vgpu
305+
298306 # NB: don't need conditional block on enable_compute as have already exited
299307 # if not the case
300308 - name : Write Munge key
Original file line number Diff line number Diff line change 8888 - import_role :
8989 name : lustre
9090 tasks_from : validate.yml
91+
92+ - name : Validate vGPU configuration
93+ hosts : vgpu
94+ become : yes
95+ gather_facts : yes
96+ tags : vgpu
97+ tasks :
98+ - include_role :
99+ name : stackhpc.linux.vgpu
100+ tasks_from : validate.yml
Original file line number Diff line number Diff line change @@ -112,6 +112,9 @@ freeipa_client
112112[cuda]
113113# Hosts to install NVIDIA CUDA on - see ansible/roles/cuda/README.md
114114
115+ [vgpu]
116+ # Hosts where vGPU/MIG should be configured - see docs/mig.md
117+
115118[eessi]
116119# Hosts on which EESSI stack should be configured
117120
Original file line number Diff line number Diff line change 44 version : v25.3.2
55 name : stackhpc.nfs
66 - src : https://github.com/stackhpc/ansible-role-openhpc.git
7- version : v1.0 .0
7+ version : v1.1 .0
88 name : stackhpc.openhpc
99 - src : https://github.com/stackhpc/ansible-node-exporter.git
1010 version : stackhpc
@@ -55,4 +55,6 @@ collections:
5555 version : 0.0.15
5656 - name : stackhpc.pulp
5757 version : 0.5.5
58+ - name : stackhpc.linux
59+ version : 1.4.0
5860...
You can’t perform that action at this time.
0 commit comments