From 59e1222778ca0bf496d38a5fe6cf42b05094e1ee Mon Sep 17 00:00:00 2001
From: Pierre Riteau
Date: Wed, 17 Sep 2025 10:06:49 +0200
Subject: [PATCH] Stop slurmd during slurmctld restart

Some Slurm configuration changes can cause compute nodes to go into
invalid state if slurmctld is restarted while slurmd services are
still running. Stop slurmd services while slurmctld is being
restarted.

This has been tested not to affect running jobs.

Closes #199
---
 tasks/runtime.yml | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/tasks/runtime.yml b/tasks/runtime.yml
index 5edd5fc..9ee9652 100644
--- a/tasks/runtime.yml
+++ b/tasks/runtime.yml
@@ -127,6 +127,25 @@
     - "_openhpc_slurmdbd_state.stdout == 'inactive'"
     - openhpc_enable.database | default(false)
 
+- name: Stop slurmd if configuration has changed
+  service:
+    name: "slurmd"
+    state: stopped
+  retries: 5
+  register: slurmd_stop
+  until: slurmd_stop is success
+  delay: 30
+  when:
+    - openhpc_slurm_service_started | bool
+    - openhpc_enable.batch | default(false) | bool
+    - openhpc_slurm_control_host in ansible_play_hosts
+    - hostvars[openhpc_slurm_control_host].ohpc_slurm_conf.changed or
+      hostvars[openhpc_slurm_control_host].ohpc_cgroup_conf.changed or
+      hostvars[openhpc_slurm_control_host].ohpc_gres_conf.changed # noqa no-handler
+
+- name: Flush handler
+  meta: flush_handlers # This will restart slurmctld while slurmd services are stopped, if needed
+
 - name: Notify handler for slurmd restart
   debug:
     msg: "notifying handlers" # meta: noop doesn't support 'when'