From 9f53fd82a8794a2903a683688ab5a570f0572236 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 9 Oct 2025 19:13:02 +0000 Subject: [PATCH 1/5] wip - bump openhpc role for testing --- requirements.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.yml b/requirements.yml index 13c6b77fe..f8d9d27a9 100644 --- a/requirements.yml +++ b/requirements.yml @@ -4,7 +4,7 @@ roles: version: v25.3.2 name: stackhpc.nfs - src: https://github.com/stackhpc/ansible-role-openhpc.git - version: v1.4.0 + version: feat/auto-gres # TODO: bump to release name: stackhpc.openhpc - src: https://github.com/stackhpc/ansible-node-exporter.git version: stackhpc From d23904d4c2ff11ecdde67bd3356acbc7d2e4ee5d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 10 Oct 2025 14:04:22 +0000 Subject: [PATCH 2/5] remove GresTypes from MIG docs --- docs/mig.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/docs/mig.md b/docs/mig.md index b8eeae8ad..fcef06f1d 100644 --- a/docs/mig.md +++ b/docs/mig.md @@ -215,9 +215,6 @@ openhpc_nodegroups: - conf: "gpu:nvidia_h100_80gb_hbm3_4g.40gb:2" - conf: "gpu:nvidia_h100_80gb_hbm3_1g.10gb:6" -openhpc_config: - GresTypes: - - gpu ``` Making sure the types (the identifier after `gpu:`) match those collected with `slurmd -G`. Substrings From 67c93f0bba83b9cf389263682692ba24992cb5c3 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 10 Oct 2025 14:50:16 +0000 Subject: [PATCH 3/5] enable nvml autoconfiguration for CaaS --- environments/.caas/inventory/group_vars/all/openhpc.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/environments/.caas/inventory/group_vars/all/openhpc.yml b/environments/.caas/inventory/group_vars/all/openhpc.yml index 56c8b907d..e3ad4c0e4 100644 --- a/environments/.caas/inventory/group_vars/all/openhpc.yml +++ b/environments/.caas/inventory/group_vars/all/openhpc.yml @@ -4,3 +4,6 @@ openhpc_cluster_name: "{{ cluster_name }}" # Provision a single "standard" compute nodegroup using the supplied # node count and flavor openhpc_nodegroups: "{{ hostvars[groups['openstack'][0]]['openhpc_nodegroups'] }}" + +# Enable autoconfiguration of NVIDIA GPUs, if using a suitable (`cuda`) image: +openhpc_gres_autodetect: nvml From 0ed4fab690b55f6f49e8d55821e706965649bd07 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 10 Oct 2025 16:07:28 +0000 Subject: [PATCH 4/5] fix linter problems --- docs/mig.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/mig.md b/docs/mig.md index fcef06f1d..092629fed 100644 --- a/docs/mig.md +++ b/docs/mig.md @@ -214,7 +214,6 @@ openhpc_nodegroups: - conf: "gpu:nvidia_h100_80gb_hbm3:2" - conf: "gpu:nvidia_h100_80gb_hbm3_4g.40gb:2" - conf: "gpu:nvidia_h100_80gb_hbm3_1g.10gb:6" - ``` Making sure the types (the identifier after `gpu:`) match those collected with `slurmd -G`. Substrings From 4ccf8b4248c9ed2656abf5522fe1bff58dbdeb5a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Mon, 27 Oct 2025 09:34:36 +0000 Subject: [PATCH 5/5] bump openhpc to release w/ auto gres support --- requirements.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.yml b/requirements.yml index f8d9d27a9..0d24cdf2d 100644 --- a/requirements.yml +++ b/requirements.yml @@ -4,7 +4,7 @@ roles: version: v25.3.2 name: stackhpc.nfs - src: https://github.com/stackhpc/ansible-role-openhpc.git - version: feat/auto-gres # TODO: bump to release + version: v1.5.0 name: stackhpc.openhpc - src: https://github.com/stackhpc/ansible-node-exporter.git version: stackhpc