Skip to content

Commit a7e8a66

Browse files
committed
auto gres - v1
1 parent ea1736c commit a7e8a66

File tree

5 files changed

+113
-6
lines changed

5 files changed

+113
-6
lines changed

defaults/main.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ openhpc_packages:
1212
openhpc_resume_timeout: 300
1313
openhpc_retry_delay: 10
1414
openhpc_job_maxtime: '60-0' # quote this to avoid ansible converting some formats to seconds, which is interpreted as minutes by Slurm
15+
openhpc_gres_autodetect: 'off'
1516
openhpc_default_config:
1617
# This only defines values which are not Slurm defaults
1718
SlurmctldHost: "{{ openhpc_slurm_control_host }}{% if openhpc_slurm_control_host_address is defined %}({{ openhpc_slurm_control_host_address }}){% endif %}"
@@ -50,6 +51,9 @@ openhpc_cgroup_default_config:
5051
openhpc_config: {}
5152
openhpc_cgroup_config: {}
5253
ohpc_nodegroup_gres_types: >-
54+
{% if openhpc_gres_autodetect == 'nvml' %}
55+
gpu
56+
{% else %}
5357
{{
5458
openhpc_nodegroups |
5559
community.general.json_query('[].gres[].conf') |
@@ -58,6 +62,7 @@ ohpc_nodegroup_gres_types: >-
5862
unique |
5963
join(',')
6064
}}
65+
{% endif %}
6166
openhpc_gres_template: gres.conf.j2
6267
openhpc_cgroup_template: cgroup.conf.j2
6368

library/gpu_info.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
#!/usr/bin/python
2+
3+
# Copyright: (c) 2025, StackHPC
4+
# Apache 2 License
5+
6+
from ansible.module_utils.basic import AnsibleModule
7+
8+
# Module maturity metadata.
ANSIBLE_METADATA = {
    "metadata_version": "0.1",
    "status": ["preview"],
    "supported_by": "community",
}

# Describes this module (gpu_info); it accepts no options.
DOCUMENTATION = """
---
module: gpu_info
short_description: Gathers information about the NVIDIA GPUs on a node
version_added: "2.9"
description:
    - "Runs C(nvidia-smi) to count the GPUs of each model on the node and builds a matching Slurm GRES string"
    - "Reports no GPUs if C(nvidia-smi) is not installed or the NVIDIA driver is not running"
options: {}
requirements:
    - "python >= 3.6"
author:
    - Will Szumski, StackHPC
"""

EXAMPLES = """
- name: Query GPU info
  gpu_info:
  register: _gpu_info
"""

RETURN = """
gpus:
    description: Number of GPUs found for each model, keyed by model name
    returned: always
    type: dict
    sample: {"H200": 8}
gres:
    description: Slurm GRES string for the GPUs found, empty if there are none
    returned: always
    type: str
    sample: "gpu:H200:8"
"""
43+
44+
import collections
45+
46+
def _count_gpu_models(stdout):
    """Count GPUs per model from ``nvidia-smi --query-gpu=name`` output.

    Each non-blank line is one GPU, e.g. 'NVIDIA H200'. The model is the
    second whitespace-separated word ('H200'); a one-word name is used as-is
    rather than raising IndexError.
    """
    models = []
    for line in stdout.splitlines():
        words = line.split()
        if not words:
            # defensive: nvidia-smi currently emits no blank lines
            continue
        models.append(words[1] if len(words) > 1 else words[0])
    return collections.Counter(models)


def _format_gres(gpus):
    """Build a GRES string such as 'gpu:H200:8,gpu:A100:2' ('' if no GPUs)."""
    return ','.join(f"gpu:{model}:{count}" for model, count in gpus.items())


def run_module():
    """Query the node's NVIDIA GPUs and exit with the module result.

    The module takes no arguments. The result always contains:
      - ``changed``: always False
      - ``gpus``: dict mapping GPU model name to the number of GPUs found
      - ``gres``: matching GRES string, '' if no GPUs were found

    A missing ``nvidia-smi`` binary or a driver which is not running both
    give an empty result rather than an error. Any other non-zero return code
    fails the module, including ``rc``, ``stdout`` and ``stderr`` in the output.
    """
    module = AnsibleModule(argument_spec={}, supports_check_mode=True)

    # Pre-bind the outputs so they exist even when the command cannot be run.
    stdout = stderr = ''
    try:
        rc, stdout, stderr = module.run_command(
            "nvidia-smi --query-gpu=name --format=noheader",
            check_rc=False,
            handle_exceptions=False,
        )
    except FileNotFoundError:  # nvidia-smi not installed
        rc = None

    # nvidia-smi return codes: https://docs.nvidia.com/deploy/nvidia-smi/index.html
    gpus = {}
    result = {'changed': False, 'gpus': gpus, 'gres': ''}
    if rc == 0:
        gpus.update(_count_gpu_models(stdout))
    elif rc == 9:
        # nvidia-smi installed but driver not running
        pass
    elif rc is None:
        # nvidia-smi not installed
        pass
    else:
        result.update({'stdout': stdout, 'rc': rc, 'stderr': stderr})
        # fail_json requires a msg argument; omitting it raises TypeError
        # instead of reporting the failure.
        module.fail_json(
            msg=f"nvidia-smi failed with unexpected return code {rc}",
            **result
        )

    if gpus:
        result['gres'] = _format_gres(gpus)

    module.exit_json(**result)
81+
82+
83+
def main():
    """Script entry point: delegate straight to run_module()."""
    run_module()


# Only execute when run as a script, not when imported.
if __name__ == "__main__":
    main()

tasks/runtime.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,16 @@
6363
notify: Restart slurmdbd service
6464
when: openhpc_enable.database | default(false) | bool
6565

66+
# Run the gpu_info module on batch nodes; its result is registered for the next task.
- name: Query GPU info
  gpu_info:
  register: _gpu_info
  # Cast to bool, as the other openhpc_enable conditions do, so string values
  # such as 'false' are not treated as true.
  when: openhpc_enable.batch | default(false) | bool

# Store the GRES string from the gpu_info result as a host fact.
- name: Set fact for node GPU GRES
  set_fact:
    ohpc_node_gpu_gres: "{{ _gpu_info.gres }}"
  when: openhpc_enable.batch | default(false) | bool
75+
6676
- name: Template slurm.conf
6777
template:
6878
src: "{{ openhpc_slurm_conf_template }}"

templates/gres.conf.j2

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
1-
AutoDetect=off
1+
AutoDetect={{ openhpc_gres_autodetect }}
22
{% for nodegroup in openhpc_nodegroups %}
33
{% set gres_list = nodegroup.gres | default([]) %}
4-
{% set gres_autodetect = nodegroup.gres_autodetect | default('off') %}
4+
{% set nodegroup_gres_autodetect = nodegroup.gres_autodetect | default('off') %}
55
{% set inventory_group_name = openhpc_cluster_name ~ '_' ~ nodegroup.name %}
66
{% set inventory_group_hosts = groups.get(inventory_group_name, []) %}
77
{% set hostlist_string = inventory_group_hosts | hostlist_expression | join(',') %}
8-
{% if gres_autodetect != 'off' %}
9-
NodeName={{ hostlist_string }} AutoDetect={{ gres_autodetect }}
8+
{% if nodegroup_gres_autodetect != 'off' %}
9+
NodeName={{ hostlist_string }} AutoDetect={{ nodegroup_gres_autodetect }}
1010
{% else %}
1111
{% for gres in gres_list %}
1212
{% set gres_name, gres_type, _ = gres.conf.split(':') %}
13-
NodeName={{ hostlist_string }} Name={{ gres_name }} Type={{ gres_type }} File={{ gres.file | mandatory('The gres configuration dictionary: ' ~ gres ~ ' is missing the file key, but gres_autodetect is set to off. The error occured on node group: ' ~ nodegroup.name ~ '. Please add the file key or set gres_autodetect.') }}
13+
NodeName={{ hostlist_string }} Name={{ gres_name }} Type={{ gres_type }} File={{ gres.file | mandatory('The gres configuration dictionary: ' ~ gres ~ ' is missing the file key, but gres_autodetect is not specified. The error occured on node group: ' ~ nodegroup.name ~ '. Please add the file key or set gres_autodetect.') }}
1414
{% endfor %}{# gres #}
1515
{% endif %}{# autodetect #}
1616
{% endfor %}{# nodegroup #}

templates/slurm.conf.j2

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,11 @@ NodeName={{ hostlists | join(',') }} {{ '' -}}
3838
CoresPerSocket={{ first_host_hv['ansible_processor_cores'] }} {{ '' -}}
3939
ThreadsPerCore={{ first_host_hv['ansible_processor_threads_per_core'] }} {{ '' -}}
4040
{{ nodegroup.node_params | default({}) | dict2parameters }} {{ '' -}}
41-
{% if 'gres' in nodegroup %}Gres={{ ','.join(nodegroup.gres | map(attribute='conf')) }}{% endif %}
41+
{% if 'gres' in nodegroup -%}
42+
Gres={{ ','.join(nodegroup.gres | map(attribute='conf')) -}}
43+
{% elif openhpc_gres_autodetect == 'nvml' and first_host_hv['ohpc_node_gpu_gres'] != '' -%}
44+
Gres={{ first_host_hv['ohpc_node_gpu_gres'] -}}
45+
{% endif %}
4246

4347
{% endif %}{# 1 or more hosts in inventory #}
4448
NodeSet=nodegroup_{{ nodegroup.name }} Feature=nodegroup_{{ nodegroup.name }}

0 commit comments

Comments
 (0)