From 6913e9c6b9c462982f7c5157d7a242975c61dbf3 Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Wed, 23 Jul 2025 09:23:56 +0200 Subject: [PATCH 1/2] Support defining custom cgroup.conf options --- README.md | 9 ++++++++- defaults/main.yml | 6 ++++++ templates/cgroup.conf.j2 | 9 +++++---- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 022c9ab..a3852eb 100644 --- a/README.md +++ b/README.md @@ -107,7 +107,14 @@ partition configuration for each. [slurm.conf](https://slurm.schedmd.com/slurm.conf.html). Keys are slurm.conf parameter names and values are lists or strings as appropriate. This can be used to supplement or override the template defaults. Templated parameters can -also be removed by setting the value to the literal string`'omit'` - note +also be removed by setting the value to the literal string `'omit'` - note +that this is *not the same* as the Ansible `omit` [special variable](https://docs.ansible.com/ansible/latest/reference_appendices/special_variables.html#term-omit). + +`openhpc_cgroup_config`: Optional. Mapping of additional parameters and values for +[cgroup.conf](https://slurm.schedmd.com/cgroup.conf.html). Keys are cgroup.conf +parameter names and values are lists or strings as appropriate. This can be +used to supplement or override the template defaults. Templated parameters can +also be removed by setting the value to the literal string `'omit'` - note that this is *not the same* as the Ansible `omit` [special variable](https://docs.ansible.com/ansible/latest/reference_appendices/special_variables.html#term-omit). `openhpc_ram_multiplier`: Optional, default `0.95`. Multiplier used in the calculation: `total_memory * openhpc_ram_multiplier` when setting `RealMemory` for the partition in slurm.conf. Can be overriden on a per partition basis using `openhpc_slurm_partitions.ram_multiplier`. Has no effect if `openhpc_slurm_partitions.ram_mb` is set. 
diff --git a/defaults/main.yml b/defaults/main.yml index 94ba868..2b85a4b 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -40,8 +40,14 @@ openhpc_default_config: PropagateResourceLimitsExcept: MEMLOCK Epilog: /etc/slurm/slurm.epilog.clean ReturnToService: 2 +openhpc_cgroup_default_config: + ConstrainCores: "yes" + ConstrainDevices: "yes" + ConstrainRAMSpace: "yes" + ConstrainSwapSpace: "yes" openhpc_config: {} +openhpc_cgroup_config: {} openhpc_gres_template: gres.conf.j2 openhpc_slurm_configless: "{{ 'enable_configless' in openhpc_config.get('SlurmctldParameters', []) }}" diff --git a/templates/cgroup.conf.j2 b/templates/cgroup.conf.j2 index 9186bca..a362ad6 100644 --- a/templates/cgroup.conf.j2 +++ b/templates/cgroup.conf.j2 @@ -5,7 +5,8 @@ # See man slurm.conf and man cgroup.conf for further # information on cgroup configuration parameters #-- -ConstrainCores=yes -ConstrainDevices=yes -ConstrainRAMSpace=yes -ConstrainSwapSpace=yes +{% for k, v in openhpc_cgroup_default_config | combine(openhpc_cgroup_config) | items %} +{% if v != "omit" %}{# allow removing items by setting the value to the literal string 'omit' #} +{{ k }}={{ v | join(',') if (v is sequence and v is not string) else v }} +{% endif %} +{% endfor %} From cfbc4fd0437636c0716a6b7a6d4b706229e6da58 Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Wed, 23 Jul 2025 10:06:14 +0200 Subject: [PATCH 2/2] Restart Slurm if cgroup.conf has changed --- tasks/runtime.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tasks/runtime.yml b/tasks/runtime.yml index b08a451..b09dad8 100644 --- a/tasks/runtime.yml +++ b/tasks/runtime.yml @@ -105,6 +105,10 @@ owner: root group: root when: openhpc_enable.control | default(false) or not openhpc_slurm_configless | bool + notify: + - Restart slurmctld service + register: ohpc_cgroup_conf + # NB uses restart rather than reload as this is needed in some cases - name: Remove local tempfile for slurm.conf templating ansible.builtin.file: @@ -139,7 +143,7 @@ 
changed_when: true when: - openhpc_slurm_control_host in ansible_play_hosts - - hostvars[openhpc_slurm_control_host].ohpc_slurm_conf.changed or hostvars[openhpc_slurm_control_host].ohpc_gres_conf.changed # noqa no-handler + - hostvars[openhpc_slurm_control_host].ohpc_slurm_conf.changed or hostvars[openhpc_slurm_control_host].ohpc_cgroup_conf.changed or hostvars[openhpc_slurm_control_host].ohpc_gres_conf.changed # noqa no-handler notify: - Restart slurmd service