2 changes: 2 additions & 0 deletions ansible/.gitignore
@@ -90,3 +90,5 @@ roles/*
!roles/gateway/**
!roles/alertmanager/
!roles/alertmanager/**
!roles/slurm_recompile/
!roles/slurm_recompile/**
14 changes: 14 additions & 0 deletions ansible/extras.yml
@@ -48,6 +48,20 @@
name: cuda
tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'install.yml' }}"

- name: Setup vGPU
  hosts: vgpu
  become: yes
  gather_facts: yes
  tags: vgpu
  tasks:
    - include_role:
        name: stackhpc.linux.vgpu
        tasks_from: "{{ 'configure.yml' if appliances_mode == 'configure' else 'install.yml' }}"
  handlers:
    - name: reboot
      fail:
        msg: Reboot handler for stackhpc.linux.vgpu role fired unexpectedly. This was supposed to be unreachable.

- name: Persist hostkeys across rebuilds
# Must be after filesystems.yml (for storage)
# and before portal.yml (where OOD login node hostkeys are scanned)
10 changes: 10 additions & 0 deletions ansible/fatimage.yml
@@ -250,6 +250,16 @@
name: cloudalchemy.grafana
tasks_from: install.yml

- name: Add support for NVIDIA GPU auto detection to Slurm
  hosts: cuda
  become: yes
  tasks:
    - name: Recompile slurm
      import_role:
        name: slurm_recompile
      vars:
        slurm_recompile_with_nvml: "{{ groups.cuda | length > 0 }}"

- name: Run post.yml hook
vars:
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
1 change: 1 addition & 0 deletions ansible/roles/compute_init/README.md
@@ -75,6 +75,7 @@ it also requires an image build with the role name added to the
| extras.yml | basic_users | All functionality [6] | No |
| extras.yml | eessi | All functionality [7] | No |
| extras.yml | cuda | None required - use image build | Yes [8] |
| extras.yml | vgpu | All functionality | Yes |
| extras.yml | persist_hostkeys | Not relevant for compute nodes | n/a |
| extras.yml | compute_init (export) | Not relevant for compute nodes | n/a |
| extras.yml | k9s (install) | Not relevant during boot | n/a |
7 changes: 7 additions & 0 deletions ansible/roles/compute_init/files/compute-init.yml
@@ -19,6 +19,7 @@
enable_basic_users: "{{ os_metadata.meta.basic_users | default(false) | bool }}"
enable_eessi: "{{ os_metadata.meta.eessi | default(false) | bool }}"
enable_chrony: "{{ os_metadata.meta.chrony | default(false) | bool }}"
enable_vgpu: "{{ os_metadata.meta.vgpu | default(false) | bool }}"

# TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects
resolv_conf_nameservers: []
@@ -295,6 +296,12 @@
cmd: "cvmfs_config setup"
when: enable_eessi

- name: Configure VGPUs
include_role:
name: stackhpc.linux.vgpu
tasks_from: 'configure.yml'
when: enable_vgpu

# NB: don't need conditional block on enable_compute as have already exited
# if not the case
- name: Write Munge key
4 changes: 4 additions & 0 deletions ansible/roles/cuda/tasks/facts.yml
@@ -0,0 +1,4 @@
---
- name: Set cuda_facts_version_short
  set_fact:
    cuda_facts_version_short: "{{ cuda_version_short }}"
28 changes: 28 additions & 0 deletions ansible/roles/slurm_recompile/README.md
@@ -0,0 +1,28 @@
slurm_recompile
===============

Recompiles Slurm from the OpenHPC source RPM and installs the rebuilt packages.

Requirements
------------

Role Variables
--------------

See `defaults/main.yml`. The key variable is `slurm_recompile_with_nvml` (default `false`), which controls whether Slurm is linked against the NVIDIA Management Library (NVML).

Dependencies
------------

Example Playbook
----------------

    - hosts: compute
      tasks:
        - import_role:
            name: slurm_recompile

License
-------

Apache-2.0
4 changes: 4 additions & 0 deletions ansible/roles/slurm_recompile/defaults/main.yml
@@ -0,0 +1,4 @@
---
# Whether to link slurm against the NVIDIA management library
slurm_recompile_with_nvml: false

41 changes: 41 additions & 0 deletions ansible/roles/slurm_recompile/tasks/main.yml
@@ -0,0 +1,41 @@
---
- name: Get facts about CUDA installation
  import_role:
    name: cuda
    tasks_from: facts.yml

- name: Gather the package facts
  ansible.builtin.package_facts:
    manager: auto

- name: Set fact containing slurm package facts
  set_fact:
    slurm_package: "{{ ansible_facts.packages['slurm-slurmd-ohpc'].0 }}"

- name: Recompile and install slurm packages
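  # Rebuilds the installed slurm-slurmd-ohpc version from its OpenHPC source RPM and reinstalls the resulting packages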
  shell: |
    #!/bin/bash
    source /etc/profile
    set -eux
    dnf download -y --source slurm-slurmd-ohpc-{{ slurm_package.version }}-{{ slurm_package.release }}
    rpm -i slurm-ohpc-*.src.rpm
    cd /root/rpmbuild/SPECS
    dnf builddep -y slurm.spec
    rpmbuild -bb{% if slurm_recompile_with_nvml | bool %} -D "_with_nvml --with-nvml=/usr/local/cuda-{{ cuda_facts_version_short }}/targets/x86_64-linux/"{% endif %} slurm.spec
    dnf reinstall -y /root/rpmbuild/RPMS/x86_64/*.rpm
  become: true

- name: Workaround missing symlink
  # Workaround path issue: https://groups.google.com/g/slurm-users/c/cvGb4JnK8BY
  command: ln -s /lib64/libnvidia-ml.so.1 /lib64/libnvidia-ml.so
  args:
    creates: /lib64/libnvidia-ml.so
  when: slurm_recompile_with_nvml | bool

- name: Cleanup Dependencies
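  # Undo the dnf transaction recorded by 'dnf builddep' above, removing the build-only dependencies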
  shell: |
    #!/bin/bash
    set -eux
    set -o pipefail
    dnf history list | grep Install | grep 'builddep -y slurm.spec' | head -n 1 | awk '{print $1}' | xargs dnf history -y undo
  become: true
10 changes: 10 additions & 0 deletions ansible/validate.yml
@@ -88,3 +88,13 @@
- import_role:
name: lustre
tasks_from: validate.yml

- name: Validate vGPU configuration
  hosts: vgpu
  become: yes
  gather_facts: yes
  tags: vgpu
  tasks:
    - include_role:
        name: stackhpc.linux.vgpu
        tasks_from: validate.yml
226 changes: 226 additions & 0 deletions docs/mig.md
@@ -0,0 +1,226 @@
# vGPU/MIG configuration

This page details how to configure Multi-Instance GPU (MIG) in Slurm.

## Pre-requisites

- Image built with CUDA support. This should automatically recompile Slurm
  against NVML. The builder will need to be added to the `vgpu` and `cuda`
  groups (see the inventory sketch below).
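
A minimal sketch of how the image build inventory might look, assuming the image build host is in the usual `builder` group (illustrative only):

```
[cuda:children]
builder

[vgpu:children]
cuda
```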

## Inventory

Add relevant hosts to the ``vgpu`` group, for example in `environments/$ENV/inventory/groups`:

```
[vgpu:children]
cuda
```

## Configuration

Use variables from the [stackhpc.linux.vgpu](https://github.com/stackhpc/ansible-collection-linux/tree/main/roles/vgpu) role.

For example, in `environments/<environment>/inventory/group_vars/all/vgpu`:

```
---
vgpu_definitions:
  - pci_address: "0000:17:00.0"
    mig_devices:
      "1g.10gb": 4
      "4g.40gb": 1
  - pci_address: "0000:81:00.0"
    mig_devices:
      "1g.10gb": 4
      "4g.40gb": 1
```

The appliance will use the driver installed via the ``cuda`` role.

Use ``lspci`` to determine the PCI addresses, e.g.:

```
[root@io-io-gpu-02 ~]# lspci -nn | grep -i nvidia
06:00.0 3D controller [0302]: NVIDIA Corporation GH100 [H100 SXM5 80GB] [10de:2330] (rev a1)
0c:00.0 3D controller [0302]: NVIDIA Corporation GH100 [H100 SXM5 80GB] [10de:2330] (rev a1)
46:00.0 3D controller [0302]: NVIDIA Corporation GH100 [H100 SXM5 80GB] [10de:2330] (rev a1)
4c:00.0 3D controller [0302]: NVIDIA Corporation GH100 [H100 SXM5 80GB] [10de:2330] (rev a1)
```

The supported profiles can be discovered by consulting the [NVIDIA documentation](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#supported-mig-profiles)
or interactively by running the following on one of the compute nodes with GPU resources:

```
[rocky@io-io-gpu-05 ~]$ sudo nvidia-smi -i 0 -mig 1
Enabled MIG Mode for GPU 00000000:06:00.0
All done.
[rocky@io-io-gpu-05 ~]$ sudo nvidia-smi mig -lgip
+-----------------------------------------------------------------------------+
| GPU instance profiles: |
| GPU Name ID Instances Memory P2P SM DEC ENC |
| Free/Total GiB CE JPEG OFA |
|=============================================================================|
| 0 MIG 1g.10gb 19 7/7 9.75 No 16 1 0 |
| 1 1 0 |
+-----------------------------------------------------------------------------+
| 0 MIG 1g.10gb+me 20 1/1 9.75 No 16 1 0 |
| 1 1 1 |
+-----------------------------------------------------------------------------+
| 0 MIG 1g.20gb 15 4/4 19.62 No 26 1 0 |
| 1 1 0 |
+-----------------------------------------------------------------------------+
| 0 MIG 2g.20gb 14 3/3 19.62 No 32 2 0 |
| 2 2 0 |
+-----------------------------------------------------------------------------+
| 0 MIG 3g.40gb 9 2/2 39.50 No 60 3 0 |
| 3 3 0 |
+-----------------------------------------------------------------------------+
| 0 MIG 4g.40gb 5 1/1 39.50 No 64 4 0 |
| 4 4 0 |
+-----------------------------------------------------------------------------+
| 0 MIG 7g.80gb 0 1/1 79.25 No 132 7 0 |
| 8 7 1 |
+-----------------------------------------------------------------------------+
| 1 MIG 1g.10gb 19 7/7 9.75 No 16 1 0 |
| 1 1 0 |
+-----------------------------------------------------------------------------+
| 1 MIG 1g.10gb+me 20 1/1 9.75 No 16 1 0 |
| 1 1 1 |
+-----------------------------------------------------------------------------+
| 1 MIG 1g.20gb 15 4/4 19.62 No 26 1 0 |
| 1 1 0 |
+-----------------------------------------------------------------------------+
| 1 MIG 2g.20gb 14 3/3 19.62 No 32 2 0 |
| 2 2 0 |
+-----------------------------------------------------------------------------+
| 1 MIG 3g.40gb 9 2/2 39.50 No 60 3 0 |
| 3 3 0 |
+-----------------------------------------------------------------------------+
| 1 MIG 4g.40gb 5 1/1 39.50 No 64 4 0 |
| 4 4 0 |
+-----------------------------------------------------------------------------+
| 1 MIG 7g.80gb 0 1/1 79.25 No 132 7 0 |
| 8 7 1 |
+-----------------------------------------------------------------------------+
| 2 MIG 1g.10gb 19 7/7 9.75 No 16 1 0 |
| 1 1 0 |
+-----------------------------------------------------------------------------+
| 2 MIG 1g.10gb+me 20 1/1 9.75 No 16 1 0 |
| 1 1 1 |
+-----------------------------------------------------------------------------+
| 2 MIG 1g.20gb 15 4/4 19.62 No 26 1 0 |
| 1 1 0 |
+-----------------------------------------------------------------------------+
| 2 MIG 2g.20gb 14 3/3 19.62 No 32 2 0 |
| 2 2 0 |
+-----------------------------------------------------------------------------+
| 2 MIG 3g.40gb 9 2/2 39.50 No 60 3 0 |
| 3 3 0 |
+-----------------------------------------------------------------------------+
| 2 MIG 4g.40gb 5 1/1 39.50 No 64 4 0 |
| 4 4 0 |
+-----------------------------------------------------------------------------+
| 2 MIG 7g.80gb 0 1/1 79.25 No 132 7 0 |
| 8 7 1 |
+-----------------------------------------------------------------------------+
| 3 MIG 1g.10gb 19 7/7 9.75 No 16 1 0 |
| 1 1 0 |
+-----------------------------------------------------------------------------+
| 3 MIG 1g.10gb+me 20 1/1 9.75 No 16 1 0 |
| 1 1 1 |
+-----------------------------------------------------------------------------+
| 3 MIG 1g.20gb 15 4/4 19.62 No 26 1 0 |
| 1 1 0 |
+-----------------------------------------------------------------------------+
| 3 MIG 2g.20gb 14 3/3 19.62 No 32 2 0 |
| 2 2 0 |
+-----------------------------------------------------------------------------+
| 3 MIG 3g.40gb 9 2/2 39.50 No 60 3 0 |
| 3 3 0 |
+-----------------------------------------------------------------------------+
| 3 MIG 4g.40gb 5 1/1 39.50 No 64 4 0 |
| 4 4 0 |
+-----------------------------------------------------------------------------+
| 3 MIG 7g.80gb 0 1/1 79.25 No 132 7 0 |
| 8 7 1 |
+-----------------------------------------------------------------------------+
```

## compute_init configuration for Slurm-triggered rebuild (optional)

You only need to configure this if you are using the Slurm-triggered rebuild
feature. Use the ``vgpu`` metadata option to enable creation of MIG devices on
rebuild, as illustrated below.
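
The `compute-init` script on the rebuilt node reads this as the instance metadata key `vgpu`. How the metadata is set depends on how the node is defined; a purely illustrative example using the OpenStack CLI:

```
# Illustrative only - metadata is normally set via your OpenTofu/infrastructure configuration
openstack server set --property vgpu=true <compute-node-name>
```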

## GRES configuration

GPU resources need to be added to the OpenHPC nodegroup definitions (`openhpc_nodegroups`). To
do this, you need to determine the names of the GPU types as detected by Slurm. First
deploy Slurm with the default nodegroup definitions to get a working cluster, then make a temporary
copy of slurm.conf:

```
cp /var/spool/slurm/conf-cache/slurm.conf /tmp/
```

Then create a `/tmp/gres.conf` which enables autodetection:

```
AutoDetect=nvml
```

You can then run `sudo slurmd -f /tmp/slurm.conf -G` on a compute node where GPU resources exist. Example output is shown below:

```
[rocky@io-io-gpu-02 ~]$ sudo slurmd -f /tmp/slurm.conf -G
slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3 Count=1 Index=0 ID=7696487 File=/dev/nvidia0 Links=(null) Flags=HAS_FILE,HAS_TYPE,ENV_NVML,ENV_RSMI,ENV_ONEAPI
,ENV_OPENCL,ENV_DEFAULT
slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3 Count=1 Index=1 ID=7696487 File=/dev/nvidia1 Links=(null) Flags=HAS_FILE,HAS_TYPE,ENV_NVML,ENV_RSMI,ENV_ONEAPI
,ENV_OPENCL,ENV_DEFAULT
slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3_4g.40gb Count=1 Index=291 ID=7696487 File=/dev/nvidia-caps/nvidia-cap291 Links=(null) Flags=HAS_FILE,HAS_TYPE,
ENV_NVML,ENV_RSMI,ENV_ONEAPI,ENV_OPENCL,ENV_DEFAULT
slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3_4g.40gb Count=1 Index=417 ID=7696487 File=/dev/nvidia-caps/nvidia-cap417 Links=(null) Flags=HAS_FILE,HAS_TYPE,
ENV_NVML,ENV_RSMI,ENV_ONEAPI,ENV_OPENCL,ENV_DEFAULT
slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3_1g.10gb Count=1 Index=336 ID=7696487 File=/dev/nvidia-caps/nvidia-cap336 Links=(null) Flags=HAS_FILE,HAS_TYPE,
ENV_NVML,ENV_RSMI,ENV_ONEAPI,ENV_OPENCL,ENV_DEFAULT
slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3_1g.10gb Count=1 Index=345 ID=7696487 File=/dev/nvidia-caps/nvidia-cap345 Links=(null) Flags=HAS_FILE,HAS_TYPE,
ENV_NVML,ENV_RSMI,ENV_ONEAPI,ENV_OPENCL,ENV_DEFAULT
slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3_1g.10gb Count=1 Index=354 ID=7696487 File=/dev/nvidia-caps/nvidia-cap354 Links=(null) Flags=HAS_FILE,HAS_TYPE,
ENV_NVML,ENV_RSMI,ENV_ONEAPI,ENV_OPENCL,ENV_DEFAULT
slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3_1g.10gb Count=1 Index=507 ID=7696487 File=/dev/nvidia-caps/nvidia-cap507 Links=(null) Flags=HAS_FILE,HAS_TYPE,
ENV_NVML,ENV_RSMI,ENV_ONEAPI,ENV_OPENCL,ENV_DEFAULT
slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3_1g.10gb Count=1 Index=516 ID=7696487 File=/dev/nvidia-caps/nvidia-cap516 Links=(null) Flags=HAS_FILE,HAS_TYPE,
ENV_NVML,ENV_RSMI,ENV_ONEAPI,ENV_OPENCL,ENV_DEFAULT
slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3_1g.10gb Count=1 Index=525 ID=7696487 File=/dev/nvidia-caps/nvidia-cap525 Links=(null) Flags=HAS_FILE,HAS_TYPE,
ENV_NVML,ENV_RSMI,ENV_ONEAPI,ENV_OPENCL,ENV_DEFAULT
```

NOTE: If you have already configured a `Gres=` line in slurm.conf, you may have to adjust or remove it.

GRES resources can then be configured manually. An example is shown below
(`environments/<environment>/inventory/group_vars/all/openhpc.yml`):

```
openhpc_partitions:
  - name: cpu
  - name: gpu

openhpc_nodegroups:
  - name: cpu
  - name: gpu
    gres_autodetect: nvml
    gres:
      - conf: "gpu:nvidia_h100_80gb_hbm3:2"
      - conf: "gpu:nvidia_h100_80gb_hbm3_4g.40gb:2"
      - conf: "gpu:nvidia_h100_80gb_hbm3_1g.10gb:6"

openhpc_config:
  GresTypes:
    - gpu

```

Make sure the types (the identifier after `gpu:`) match those collected with `slurmd -G`. Substrings
of the type are also permissible; see the [slurm docs](https://slurm.schedmd.com/gres.html#MIG_Management)
for more details.
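
Once the cluster has been reconfigured, jobs can request MIG devices by GRES type. A purely illustrative example (the type string must match your `slurmd -G` output):

```
srun --partition=gpu --gres=gpu:nvidia_h100_80gb_hbm3_1g.10gb:1 nvidia-smi -L
```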