diff --git a/.github/workflows/extra.yml b/.github/workflows/extra.yml
index 6a075ae16..e00dfdea7 100644
--- a/.github/workflows/extra.yml
+++ b/.github/workflows/extra.yml
@@ -32,11 +32,11 @@ jobs:
           - image_name: openhpc-extra-RL8
             source_image_name_key: RL8 # key into environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
             inventory_groups: doca,cuda,lustre
-            volume_size: 30 # needed for cuda
+            volume_size: 35 # needed for cuda
           - image_name: openhpc-extra-RL9
             source_image_name_key: RL9
             inventory_groups: doca,cuda,lustre
-            volume_size: 30 # needed for cuda
+            volume_size: 35 # needed for cuda
     env:
       ANSIBLE_FORCE_COLOR: True
       OS_CLOUD: openstack
diff --git a/ansible/.gitignore b/ansible/.gitignore
index d7f3e99b1..978e29e91 100644
--- a/ansible/.gitignore
+++ b/ansible/.gitignore
@@ -90,5 +90,7 @@ roles/*
 !roles/gateway/**
 !roles/alertmanager/
 !roles/alertmanager/**
+!roles/slurm_recompile/
+!roles/slurm_recompile/**
 !roles/nhc/
 !roles/nhc/**
diff --git a/ansible/extras.yml b/ansible/extras.yml
index 8e3248d3f..54168e97d 100644
--- a/ansible/extras.yml
+++ b/ansible/extras.yml
@@ -48,6 +48,20 @@
         name: cuda
         tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'install.yml' }}"
 
+- name: Setup vGPU
+  hosts: vgpu
+  become: yes
+  gather_facts: yes
+  tags: vgpu
+  tasks:
+    - include_role:
+        name: stackhpc.linux.vgpu
+        tasks_from: "{{ 'configure.yml' if appliances_mode == 'configure' else 'install.yml' }}"
+  handlers:
+    - name: reboot
+      fail:
+        msg: Reboot handler for stackhpc.linux.vgpu role fired unexpectedly. This was supposed to be unreachable.
+
 - name: Persist hostkeys across rebuilds
   # Must be after filesystems.yml (for storage)
   # and before portal.yml (where OOD login node hostkeys are scanned)
diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml
index 422cc07c4..21a4d4126 100644
--- a/ansible/fatimage.yml
+++ b/ansible/fatimage.yml
@@ -250,6 +250,16 @@
         name: cloudalchemy.grafana
         tasks_from: install.yml
 
+- name: Add support for NVIDIA GPU auto-detection to Slurm
+  hosts: cuda
+  become: yes
+  tasks:
+    - name: Recompile Slurm against NVML
+      import_role:
+        name: slurm_recompile
+      vars:
+        slurm_recompile_with_nvml: "{{ groups.cuda | length > 0 }}"
+
 - name: Run post.yml hook
   vars:
     appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md
index d1a7e854d..7a95d2b74 100644
--- a/ansible/roles/compute_init/README.md
+++ b/ansible/roles/compute_init/README.md
@@ -75,6 +75,7 @@ it also requires an image build with the role name added to the
 | extras.yml | basic_users | All functionality [6] | No |
 | extras.yml | eessi | All functionality [7] | No |
 | extras.yml | cuda | None required - use image build | Yes [8] |
+| extras.yml | vgpu | All functionality | Yes |
 | extras.yml | persist_hostkeys | Not relevant for compute nodes | n/a |
 | extras.yml | compute_init (export) | Not relevant for compute nodes | n/a |
 | extras.yml | k9s (install) | Not relevant during boot | n/a |
diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml
index 96722e95c..397da0126 100644
--- a/ansible/roles/compute_init/files/compute-init.yml
+++ b/ansible/roles/compute_init/files/compute-init.yml
@@ -19,6 +19,7 @@
     enable_basic_users: "{{ os_metadata.meta.basic_users | default(false) | bool }}"
     enable_eessi: "{{ os_metadata.meta.eessi | default(false) | bool }}"
     enable_chrony: "{{ os_metadata.meta.chrony | default(false) | bool }}"
+    enable_vgpu: "{{ os_metadata.meta.vgpu | default(false) | bool }}"
     enable_nhc: "{{ os_metadata.meta.nhc | default(false) | bool }}"
 
     # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects
@@ -296,6 +297,12 @@
         cmd: "cvmfs_config setup"
       when: enable_eessi
 
+    - name: Configure VGPUs
+      include_role:
+        name: stackhpc.linux.vgpu
+        tasks_from: 'configure.yml'
+      when: enable_vgpu
+
     # NB: don't need conditional block on enable_compute as have already exited
     # if not the case
     - name: Write Munge key
diff --git a/ansible/roles/cuda/tasks/facts.yml b/ansible/roles/cuda/tasks/facts.yml
new file mode 100644
index 000000000..0d60457de
--- /dev/null
+++ b/ansible/roles/cuda/tasks/facts.yml
@@ -0,0 +1,4 @@
+---
+- name: Set cuda_facts_version_short
+  set_fact:
+    cuda_facts_version_short: "{{ cuda_version_short }}"
diff --git a/ansible/roles/slurm_recompile/README.md b/ansible/roles/slurm_recompile/README.md
new file mode 100644
index 000000000..e42572aea
--- /dev/null
+++ b/ansible/roles/slurm_recompile/README.md
@@ -0,0 +1,28 @@
+slurm_recompile
+===============
+
+Recompiles Slurm from the OpenHPC source RPM and installs the resulting packages.
+
+Requirements
+------------
+
+Role Variables
+--------------
+
+See `defaults/main.yml`.
+
+Dependencies
+------------
+
+Example Playbook
+----------------
+
+    - hosts: compute
+      tasks:
+        - import_role:
+            name: slurm_recompile
+
+License
+-------
+
+Apache-2.0
diff --git a/ansible/roles/slurm_recompile/defaults/main.yml b/ansible/roles/slurm_recompile/defaults/main.yml
new file mode 100644
index 000000000..85aa72ace
--- /dev/null
+++ b/ansible/roles/slurm_recompile/defaults/main.yml
@@ -0,0 +1,4 @@
+---
+# Whether to link Slurm against the NVIDIA management library (NVML)
+slurm_recompile_with_nvml: false
+
diff --git a/ansible/roles/slurm_recompile/tasks/main.yml b/ansible/roles/slurm_recompile/tasks/main.yml
new file mode 100644
index 000000000..66989945e
--- /dev/null
+++ b/ansible/roles/slurm_recompile/tasks/main.yml
@@ -0,0 +1,41 @@
+---
+- name: Get facts about CUDA installation
+  import_role:
+    name: cuda
+    tasks_from: facts.yml
+
+- name: Gather the package facts
+  ansible.builtin.package_facts:
+    manager: auto
+
+- name: Set fact containing slurm package facts
+  set_fact:
+    slurm_package: "{{ ansible_facts.packages['slurm-slurmd-ohpc'].0 }}"
+
+- name: Recompile and install slurm packages
+  shell: |
+    #!/bin/bash
+    source /etc/profile
+    set -eux
+    # Download the source RPM matching the installed slurmd package, rebuild it
+    # (optionally against NVML so Slurm can autodetect NVIDIA GPUs), and
+    # reinstall the resulting binary RPMs over the stock packages
+    dnf download -y --source slurm-slurmd-ohpc-{{ slurm_package.version }}-{{ slurm_package.release }}
+    rpm -i slurm-ohpc-*.src.rpm
+    cd /root/rpmbuild/SPECS
+    dnf builddep -y slurm.spec
+    rpmbuild -bb{% if slurm_recompile_with_nvml | bool %} -D "_with_nvml --with-nvml=/usr/local/cuda-{{ cuda_facts_version_short }}/targets/x86_64-linux/"{% endif %} slurm.spec
+    dnf reinstall -y /root/rpmbuild/RPMS/x86_64/*.rpm
+  become: true
+
+- name: Workaround missing symlink
+  # Workaround path issue: https://groups.google.com/g/slurm-users/c/cvGb4JnK8BY
+  command: ln -s /lib64/libnvidia-ml.so.1 /lib64/libnvidia-ml.so
+  args:
+    creates: /lib64/libnvidia-ml.so
+  when: slurm_recompile_with_nvml | bool
+
+- name: Clean up build dependencies
+  shell: |
+    #!/bin/bash
+    set -eux
+    set -o pipefail
+    # Undo the most recent dnf transaction created by "dnf builddep" above
+    dnf history list | grep Install | grep 'builddep -y slurm.spec' | head -n 1 | awk '{print $1}' | xargs dnf history -y undo
+  become: true
diff --git a/ansible/validate.yml b/ansible/validate.yml
index 74b23a7ab..9a8aac641 100644
--- a/ansible/validate.yml
+++ b/ansible/validate.yml
@@ -133,3 +133,13 @@
     - import_role:
         name: lustre
         tasks_from: validate.yml
+
+- name: Validate vGPU configuration
+  hosts: vgpu
+  become: yes
+  gather_facts: yes
+  tags: vgpu
+  tasks:
+    - include_role:
+        name: stackhpc.linux.vgpu
+        tasks_from: validate.yml
diff --git a/docs/mig.md b/docs/mig.md
new file mode 100644
index 000000000..0d52f968c
--- /dev/null
+++ b/docs/mig.md
@@ -0,0 +1,226 @@
+# vGPU/MIG configuration
+
+This page details how to configure NVIDIA Multi-Instance GPU (MIG) in Slurm.
+
+## Pre-requisites
+
+- An image built with CUDA support. This should automatically recompile Slurm
+  against NVML. The builder will need to be added to the `vgpu` and `cuda`
+  groups, as in the sketch below.
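+
+As a minimal sketch (assuming your image build host appears in the usual
+``builder`` group - adapt this to however your image build inventory is
+organised), the groups could be extended with:
+
+```
+[vgpu:children]
+builder
+
+[cuda:children]
+builder
+```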
+
+## Inventory
+
+Add the relevant hosts to the ``vgpu`` group, for example in `environments/$ENV/inventory/groups`:
+
+```
+[vgpu:children]
+cuda
+```
+
+## Configuration
+
+Use variables from the [stackhpc.linux.vgpu](https://github.com/stackhpc/ansible-collection-linux/tree/main/roles/vgpu) role.
+
+For example, in `environments/$ENV/inventory/group_vars/all/vgpu`:
+
+```
+---
+vgpu_definitions:
+  - pci_address: "0000:17:00.0"
+    mig_devices:
+      "1g.10gb": 4
+      "4g.40gb": 1
+  - pci_address: "0000:81:00.0"
+    mig_devices:
+      "1g.10gb": 4
+      "4g.40gb": 1
+```
+
+The appliance will use the driver installed via the ``cuda`` role.
+
+Use ``lspci`` to determine the PCI addresses, e.g.:
+
+```
+[root@io-io-gpu-02 ~]# lspci -nn | grep -i nvidia
+06:00.0 3D controller [0302]: NVIDIA Corporation GH100 [H100 SXM5 80GB] [10de:2330] (rev a1)
+0c:00.0 3D controller [0302]: NVIDIA Corporation GH100 [H100 SXM5 80GB] [10de:2330] (rev a1)
+46:00.0 3D controller [0302]: NVIDIA Corporation GH100 [H100 SXM5 80GB] [10de:2330] (rev a1)
+4c:00.0 3D controller [0302]: NVIDIA Corporation GH100 [H100 SXM5 80GB] [10de:2330] (rev a1)
+```
+
+The supported profiles can be discovered by consulting the [NVIDIA documentation](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#supported-mig-profiles)
+or interactively, by running the following on one of the compute nodes with GPU resources:
+
+```
+[rocky@io-io-gpu-05 ~]$ sudo nvidia-smi -i 0 -mig 1
+Enabled MIG Mode for GPU 00000000:06:00.0
+All done.
+
+[rocky@io-io-gpu-05 ~]$ sudo nvidia-smi mig -lgip
++-----------------------------------------------------------------------------+
+| GPU instance profiles:                                                      |
+| GPU   Name             ID    Instances   Memory     P2P    SM    DEC   ENC  |
+|                              Free/Total   GiB              CE    JPEG  OFA  |
+|=============================================================================|
+|   0  MIG 1g.10gb       19     7/7        9.75       No     16     1     0   |
+|                                                             1     1     0   |
++-----------------------------------------------------------------------------+
+|   0  MIG 1g.10gb+me    20     1/1        9.75       No     16     1     0   |
+|                                                             1     1     1   |
++-----------------------------------------------------------------------------+
+|   0  MIG 1g.20gb       15     4/4        19.62      No     26     1     0   |
+|                                                             1     1     0   |
++-----------------------------------------------------------------------------+
+|   0  MIG 2g.20gb       14     3/3        19.62      No     32     2     0   |
+|                                                             2     2     0   |
++-----------------------------------------------------------------------------+
+|   0  MIG 3g.40gb        9     2/2        39.50      No     60     3     0   |
+|                                                             3     3     0   |
++-----------------------------------------------------------------------------+
+|   0  MIG 4g.40gb        5     1/1        39.50      No     64     4     0   |
+|                                                             4     4     0   |
++-----------------------------------------------------------------------------+
+|   0  MIG 7g.80gb        0     1/1        79.25      No    132     7     0   |
+|                                                             8     7     1   |
++-----------------------------------------------------------------------------+
+|   1  MIG 1g.10gb       19     7/7        9.75       No     16     1     0   |
+|                                                             1     1     0   |
++-----------------------------------------------------------------------------+
+|   1  MIG 1g.10gb+me    20     1/1        9.75       No     16     1     0   |
+|                                                             1     1     1   |
++-----------------------------------------------------------------------------+
+|   1  MIG 1g.20gb       15     4/4        19.62      No     26     1     0   |
+|                                                             1     1     0   |
++-----------------------------------------------------------------------------+
+|   1  MIG 2g.20gb       14     3/3        19.62      No     32     2     0   |
+|                                                             2     2     0   |
++-----------------------------------------------------------------------------+
+|   1  MIG 3g.40gb        9     2/2        39.50      No     60     3     0   |
+|                                                             3     3     0   |
++-----------------------------------------------------------------------------+
+|   1  MIG 4g.40gb        5     1/1        39.50      No     64     4     0   |
+|                                                             4     4     0   |
++-----------------------------------------------------------------------------+
+|   1  MIG 7g.80gb        0     1/1        79.25      No    132     7     0   |
+|                                                             8     7     1   |
++-----------------------------------------------------------------------------+
+|   2  MIG 1g.10gb       19     7/7        9.75       No     16     1     0   |
+|                                                             1     1     0   |
++-----------------------------------------------------------------------------+
+|   2  MIG 1g.10gb+me    20     1/1        9.75       No     16     1     0   |
+|                                                             1     1     1   |
++-----------------------------------------------------------------------------+
+|   2  MIG 1g.20gb       15     4/4        19.62      No     26     1     0   |
+|                                                             1     1     0   |
++-----------------------------------------------------------------------------+
+|   2  MIG 2g.20gb       14     3/3        19.62      No     32     2     0   |
+|                                                             2     2     0   |
++-----------------------------------------------------------------------------+
+|   2  MIG 3g.40gb        9     2/2        39.50      No     60     3     0   |
+|                                                             3     3     0   |
++-----------------------------------------------------------------------------+
+|   2  MIG 4g.40gb        5     1/1        39.50      No     64     4     0   |
+|                                                             4     4     0   |
++-----------------------------------------------------------------------------+
+|   2  MIG 7g.80gb        0     1/1        79.25      No    132     7     0   |
+|                                                             8     7     1   |
++-----------------------------------------------------------------------------+
+|   3  MIG 1g.10gb       19     7/7        9.75       No     16     1     0   |
+|                                                             1     1     0   |
++-----------------------------------------------------------------------------+
+|   3  MIG 1g.10gb+me    20     1/1        9.75       No     16     1     0   |
+|                                                             1     1     1   |
++-----------------------------------------------------------------------------+
+|   3  MIG 1g.20gb       15     4/4        19.62      No     26     1     0   |
+|                                                             1     1     0   |
++-----------------------------------------------------------------------------+
+|   3  MIG 2g.20gb       14     3/3        19.62      No     32     2     0   |
+|                                                             2     2     0   |
++-----------------------------------------------------------------------------+
+|   3  MIG 3g.40gb        9     2/2        39.50      No     60     3     0   |
+|                                                             3     3     0   |
++-----------------------------------------------------------------------------+
+|   3  MIG 4g.40gb        5     1/1        39.50      No     64     4     0   |
+|                                                             4     4     0   |
++-----------------------------------------------------------------------------+
+|   3  MIG 7g.80gb        0     1/1        79.25      No    132     7     0   |
+|                                                             8     7     1   |
++-----------------------------------------------------------------------------+
+```
+
+## compute_init configuration for Slurm-triggered rebuild (optional)
+
+You only need to configure this if you are using the Slurm-triggered rebuild
+feature. Use the ``vgpu`` metadata option to enable creation of MIG devices on
+rebuild.
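+
+As an illustration only (normally this would be set via the OpenTofu
+configuration for your environment), the metadata could be added to an
+existing compute instance with the OpenStack CLI, where `$SERVER` is a
+placeholder for the instance name:
+
+```
+openstack server set --property vgpu=true $SERVER
+```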
+
+## GRES configuration
+
+GPU resources need to be added to the OpenHPC nodegroup definitions
+(`openhpc_nodegroups`). To do this, you need to determine the names of the GPU
+types as detected by Slurm. First deploy Slurm with the default nodegroup
+definitions to get a working cluster, then make a temporary copy of slurm.conf:
+
+```
+cp /var/spool/slurm/conf-cache/slurm.conf /tmp/
+```
+
+Then create a `/tmp/gres.conf` which enables autodetection:
+
+```
+AutoDetect=nvml
+```
+
+You will then be able to run `sudo slurmd -f /tmp/slurm.conf -G` on a compute
+node where GPU resources exist. Example output is shown below:
+
+```
+[rocky@io-io-gpu-02 ~]$ sudo slurmd -f /tmp/slurm.conf -G
+slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3 Count=1 Index=0 ID=7696487 File=/dev/nvidia0 Links=(null) Flags=HAS_FILE,HAS_TYPE,ENV_NVML,ENV_RSMI,ENV_ONEAPI,ENV_OPENCL,ENV_DEFAULT
+slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3 Count=1 Index=1 ID=7696487 File=/dev/nvidia1 Links=(null) Flags=HAS_FILE,HAS_TYPE,ENV_NVML,ENV_RSMI,ENV_ONEAPI,ENV_OPENCL,ENV_DEFAULT
+slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3_4g.40gb Count=1 Index=291 ID=7696487 File=/dev/nvidia-caps/nvidia-cap291 Links=(null) Flags=HAS_FILE,HAS_TYPE,ENV_NVML,ENV_RSMI,ENV_ONEAPI,ENV_OPENCL,ENV_DEFAULT
+slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3_4g.40gb Count=1 Index=417 ID=7696487 File=/dev/nvidia-caps/nvidia-cap417 Links=(null) Flags=HAS_FILE,HAS_TYPE,ENV_NVML,ENV_RSMI,ENV_ONEAPI,ENV_OPENCL,ENV_DEFAULT
+slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3_1g.10gb Count=1 Index=336 ID=7696487 File=/dev/nvidia-caps/nvidia-cap336 Links=(null) Flags=HAS_FILE,HAS_TYPE,ENV_NVML,ENV_RSMI,ENV_ONEAPI,ENV_OPENCL,ENV_DEFAULT
+slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3_1g.10gb Count=1 Index=345 ID=7696487 File=/dev/nvidia-caps/nvidia-cap345 Links=(null) Flags=HAS_FILE,HAS_TYPE,ENV_NVML,ENV_RSMI,ENV_ONEAPI,ENV_OPENCL,ENV_DEFAULT
+slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3_1g.10gb Count=1 Index=354 ID=7696487 File=/dev/nvidia-caps/nvidia-cap354 Links=(null) Flags=HAS_FILE,HAS_TYPE,ENV_NVML,ENV_RSMI,ENV_ONEAPI,ENV_OPENCL,ENV_DEFAULT
+slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3_1g.10gb Count=1 Index=507 ID=7696487 File=/dev/nvidia-caps/nvidia-cap507 Links=(null) Flags=HAS_FILE,HAS_TYPE,ENV_NVML,ENV_RSMI,ENV_ONEAPI,ENV_OPENCL,ENV_DEFAULT
+slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3_1g.10gb Count=1 Index=516 ID=7696487 File=/dev/nvidia-caps/nvidia-cap516 Links=(null) Flags=HAS_FILE,HAS_TYPE,ENV_NVML,ENV_RSMI,ENV_ONEAPI,ENV_OPENCL,ENV_DEFAULT
+slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3_1g.10gb Count=1 Index=525 ID=7696487 File=/dev/nvidia-caps/nvidia-cap525 Links=(null) Flags=HAS_FILE,HAS_TYPE,ENV_NVML,ENV_RSMI,ENV_ONEAPI,ENV_OPENCL,ENV_DEFAULT
+```
+
+NOTE: If you have already configured a `Gres=` line in slurm.conf, you may
+have to adjust or remove it.
+
+GRES resources can then be configured manually. An example is shown below
+(`environments/$ENV/inventory/group_vars/all/openhpc.yml`):
+
+```
+openhpc_partitions:
+  - name: cpu
+  - name: gpu
+
+openhpc_nodegroups:
+  - name: cpu
+  - name: gpu
+    gres_autodetect: nvml
+    gres:
+      - conf: "gpu:nvidia_h100_80gb_hbm3:2"
+      - conf: "gpu:nvidia_h100_80gb_hbm3_4g.40gb:2"
+      - conf: "gpu:nvidia_h100_80gb_hbm3_1g.10gb:6"
+
+openhpc_config:
+  GresTypes:
+    - gpu
+
+```
+
+Make sure the types (the identifier after `gpu:`) match those collected with
+`slurmd -G`. Substrings of these types are also permissible; see the
+[Slurm docs](https://slurm.schedmd.com/gres.html#MIG_Management) for more details.
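+
+As a quick sanity check (a sketch, not part of the appliance tooling), once
+Slurm has been redeployed you can list the GRES each node advertises with:
+
+```
+sinfo -o "%N %G"
+```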
diff --git a/environments/common/inventory/group_vars/all/vgpu.yml b/environments/common/inventory/group_vars/all/vgpu.yml
new file mode 100644
index 000000000..72ea342da
--- /dev/null
+++ b/environments/common/inventory/group_vars/all/vgpu.yml
@@ -0,0 +1,7 @@
+---
+
+# NVIDIA driver is provided by the cuda role.
+vgpu_nvidia_driver_install_enabled: false
+
+# Explicitly enable MIG support for the image build
+vgpu_mig_enabled: true
diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups
index 5bf38aaa9..1cc5523fb 100644
--- a/environments/common/inventory/groups
+++ b/environments/common/inventory/groups
@@ -112,6 +112,9 @@ freeipa_client
 [cuda]
 # Hosts to install NVIDIA CUDA on - see ansible/roles/cuda/README.md
 
+[vgpu]
+# Hosts where vGPU/MIG should be configured - see docs/mig.md
+
 [eessi]
 # Hosts on which EESSI stack should be configured
diff --git a/requirements.yml b/requirements.yml
index 729280df5..5e2493bad 100644
--- a/requirements.yml
+++ b/requirements.yml
@@ -4,7 +4,7 @@ roles:
     version: v25.3.2
     name: stackhpc.nfs
   - src: https://github.com/stackhpc/ansible-role-openhpc.git
-    version: v1.0.0
+    version: v1.2.0
     name: stackhpc.openhpc
   - src: https://github.com/stackhpc/ansible-node-exporter.git
     version: stackhpc
@@ -55,4 +55,6 @@ collections:
     version: 0.0.15
   - name: stackhpc.pulp
     version: 0.5.5
+  - name: stackhpc.linux
+    version: 1.5.0
 ...