Skip to content

Commit 6c595ba

Browse files
committed
Adds support for NVIDIA MIG configuration
1 parent 986a6bc commit 6c595ba

File tree

8 files changed

+61
-0
lines changed

8 files changed

+61
-0
lines changed

ansible/extras.yml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,20 @@
4848
name: cuda
4949
tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'install.yml' }}"
5050

51+
- name: Setup vGPU
52+
hosts: vgpu
53+
become: yes
54+
gather_facts: yes
55+
tags: vgpu
56+
tasks:
57+
- include_role:
58+
name: stackhpc.linux.vgpu
59+
tasks_from: "{{ 'configure.yml' if appliances_mode == 'configure' else 'install.yml' }}"
60+
handlers:
61+
- name: reboot
62+
fail:
63+
msg: Reboot handler fired unexpectedly. This was supposed to be unreachable.
64+
5165
- name: Persist hostkeys across rebuilds
5266
# Must be after filesystems.yml (for storage)
5367
# and before portal.yml (where OOD login node hostkeys are scanned)

ansible/roles/compute_init/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ it also requires an image build with the role name added to the
7575
| extras.yml | basic_users | All functionality [6] | No |
7676
| extras.yml | eessi | All functionality [7] | No |
7777
| extras.yml | cuda | None required - use image build | Yes [8] |
78+
| extras.yml | vgpu | All functionality | Yes |
7879
| extras.yml | persist_hostkeys | Not relevant for compute nodes | n/a |
7980
| extras.yml | compute_init (export) | Not relevant for compute nodes | n/a |
8081
| extras.yml | k9s (install) | Not relevant during boot | n/a |

ansible/roles/compute_init/files/compute-init.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
enable_basic_users: "{{ os_metadata.meta.basic_users | default(false) | bool }}"
2020
enable_eessi: "{{ os_metadata.meta.eessi | default(false) | bool }}"
2121
enable_chrony: "{{ os_metadata.meta.chrony | default(false) | bool }}"
22+
enable_vgpu: "{{ os_metadata.meta.vgpu | default(false) | bool }}"
23+
2224

2325
# TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects
2426
resolv_conf_nameservers: []
@@ -295,6 +297,12 @@
295297
cmd: "cvmfs_config setup"
296298
when: enable_eessi
297299

300+
- name: Configure VGPUs
301+
include_role:
302+
name: stackhpc.linux.vgpu
303+
tasks_from: 'configure.yml'
304+
when: enable_vgpu
305+
298306
# NB: don't need conditional block on enable_compute as have already exited
299307
# if not the case
300308
- name: Write Munge key

ansible/validate.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,3 +83,13 @@
8383
- import_role:
8484
name: lustre
8585
tasks_from: validate.yml
86+
87+
- name: Validate vGPU configuration
88+
hosts: vgpu
89+
become: yes
90+
gather_facts: yes
91+
tags: vgpu
92+
tasks:
93+
- include_role:
94+
name: stackhpc.linux.vgpu
95+
tasks_from: validate.yml

docs/mig.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# vGPU/MIG configuration
2+
3+
Use variables from the [stackhpc.linux.vgpu](https://github.com/stackhpc/ansible-collection-linux/tree/main/roles/vgpu) role.
4+
5+
For example, in `environments/<environment>/inventory/group_vars/all/vgpu`:
6+
7+
```
8+
---
9+
vgpu_definitions:
10+
- pci_address: "0000:17:00.0"
11+
mig_devices:
12+
"1g.10gb": 1
13+
"2g.20gb": 3
14+
```
15+
16+
The appliance will use the driver installed via the ``cuda`` role. Use ``lspci`` to determine the PCI
17+
addresses.
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
3+
# Nvidia driver is provided by cuda role.
4+
vgpu_nvidia_driver_install_enabled: false

environments/common/inventory/groups

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,10 @@ freeipa_client
112112
[cuda]
113113
# Hosts to install NVIDIA CUDA on - see ansible/roles/cuda/README.md
114114

115+
[vgpu]
116+
# FIXME: Update once PR merged
117+
# Hosts where vGPU/MIG should be configured - see https://github.com/stackhpc/ansible-collection-linux/pull/43/files#diff-74e43d9a34244aa54721f4dbd12a029baa87957afd762b88c2677aa75414f514R75
118+
115119
[eessi]
116120
# Hosts on which EESSI stack should be configured
117121

requirements.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,4 +55,7 @@ collections:
5555
version: 0.0.15
5656
- name: stackhpc.pulp
5757
version: 0.5.5
58+
- name: https://github.com/stackhpc/ansible-collection-linux
59+
type: git
60+
version: feature/mig-only
5861
...

0 commit comments

Comments
 (0)