
Commit 36f1e17

testing openhpc in compute script

1 parent fce13ed commit 36f1e17

2 files changed: +221 -2 lines

ansible/roles/compute_init/files/compute-init.yml

Lines changed: 219 additions & 2 deletions
@@ -264,9 +264,226 @@
         value: "{{ item.value }}"
         no_extra_spaces: true
       loop: "{{ cvmfs_config | dict2items }}"
-

     # NOTE: Not clear how to make this idempotent
     - name: Ensure CVMFS config is setup
       command:
-        cmd: "cvmfs_config setup"
+        cmd: "cvmfs_config setup"
+
+
+    - name: Configure openhpc
+      block:
+        - name: Check openhpc_slurm_control_host, openhpc_cluster_name or openhpc_slurm_partitions exist
+          assert:
+            that:
+              - openhpc_slurm_control_host is defined
+              - openhpc_cluster_name is defined
+              - openhpc_cluster_name != ''
+              - openhpc_slurm_partitions is defined
+            fail_msg: "Undefined openhpc_slurm_control_host, openhpc_cluster_name or openhpc_slurm_partitions."
+
+        - name: Fail if control host not in play and munge key not specified
+          fail:
+            msg: "Either the slurm control node must be in the play or `openhpc_munge_key` must be set"
+          when:
+            - openhpc_slurm_control_host not in ansible_play_hosts
+            - not openhpc_munge_key
+
+        # - name: Ensure Slurm directories exist
+        #   file:
+        #     path: "{{ openhpc_state_save_location }}"
+        #     owner: slurm
+        #     group: slurm
+        #     mode: 0755
+        #     state: directory
+        #   when: inventory_hostname == openhpc_slurm_control_host
+
+        # - name: Generate a Munge key on control host
+        #   # NB this is usually a no-op as the package install actually generates a (node-unique) one, so won't usually trigger handler
+        #   command: "dd if=/dev/urandom of=/etc/munge/munge.key bs=1 count=1024"
+        #   args:
+        #     creates: "/etc/munge/munge.key"
+        #   when: inventory_hostname == openhpc_slurm_control_host
+
+        # - name: Retrieve Munge key from control host
+        #   slurp:
+        #     src: "/etc/munge/munge.key"
+        #   register: openhpc_control_munge_key
+        #   delegate_to: "{{ openhpc_slurm_control_host }}"
+        #   when: openhpc_slurm_control_host in ansible_play_hosts
+
+        - name: Fix permissions on /etc to pass Munge startup checks
+          # Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 makes /etc g=rwx rather than g=rx (where group=root)
+          # which fails munged startup checks
+          file:
+            path: /etc
+            state: directory
+            mode: g-w
+
+        - name: Write Munge key
+          copy:
+            content: "{{ openhpc_munge_key or (openhpc_control_munge_key.content | b64decode) }}"
+            dest: "/etc/munge/munge.key"
+            owner: munge
+            group: munge
+            mode: 0400
+          notify:
+            - Restart Munge service
+
+        - name: Ensure JobComp logfile exists
+          file:
+            path: "{{ openhpc_slurm_job_comp_loc }}"
+            state: touch
+            owner: slurm
+            group: slurm
+            mode: 0644
+            access_time: preserve
+            modification_time: preserve
+          when: openhpc_slurm_job_comp_type == 'jobcomp/filetxt'
+
+        - name: Template slurmdbd.conf
+          template:
+            src: slurmdbd.conf.j2
+            dest: /etc/slurm/slurmdbd.conf
+            mode: "0600"
+            owner: slurm
+            group: slurm
+          notify: Restart slurmdbd service
+          when: openhpc_enable.database | default(false) | bool
+
+        - name: Make local tempfile for slurm.conf templating # ensures simultaneous runs don't clobber each other
+          ansible.builtin.tempfile:
+          register: _slurm_conf_tmpfile
+          delegate_to: localhost
+          when: openhpc_enable.control | default(false) or not openhpc_slurm_configless
+          changed_when: false # so molecule doesn't fail
+          become: no
+
+        - name: Template basic slurm.conf
+          template:
+            src: slurm.conf.j2
+            dest: "{{ _slurm_conf_tmpfile.path }}"
+            lstrip_blocks: true
+            mode: 0644
+          delegate_to: localhost
+          when: openhpc_enable.control | default(false) or not openhpc_slurm_configless
+          changed_when: false # so molecule doesn't fail
+          become: no
+
+        - name: Customise slurm.conf
+          community.general.ini_file:
+            path: "{{ _slurm_conf_tmpfile.path }}"
+            option: "{{ item.key }}"
+            section: ''
+            value: "{{ (item.value | join(',')) if (item.value is sequence and item.value is not string) else item.value }}"
+            no_extra_spaces: true
+            create: no
+            mode: 0644
+          loop: "{{ openhpc_config | dict2items }}"
+          delegate_to: localhost
+          when: openhpc_enable.control | default(false) or not openhpc_slurm_configless
+          changed_when: false # so molecule doesn't fail
+          become: no
+
+        - name: Create slurm.conf
+          copy:
+            src: "{{ _slurm_conf_tmpfile.path }}"
+            dest: /etc/slurm/slurm.conf
+            owner: root
+            group: root
+            mode: 0644
+          when: openhpc_enable.control | default(false) or not openhpc_slurm_configless
+          notify:
+            - Restart slurmctld service
+          register: ohpc_slurm_conf
+          # NB uses restart rather than reload as number of nodes might have changed
+
+        - name: Create gres.conf
+          template:
+            src: "{{ openhpc_gres_template }}"
+            dest: /etc/slurm/gres.conf
+            mode: "0600"
+            owner: slurm
+            group: slurm
+          when: openhpc_enable.control | default(false) or not openhpc_slurm_configless
+          notify:
+            - Restart slurmctld service
+          register: ohpc_gres_conf
+          # NB uses restart rather than reload as this is needed in some cases
+
+        - name: Template cgroup.conf
+          # appears to be required even with NO cgroup plugins: https://slurm.schedmd.com/cgroups.html#cgroup_design
+          template:
+            src: cgroup.conf.j2
+            dest: /etc/slurm/cgroup.conf
+            mode: "0644" # perms/ownership based off src from ohpc package
+            owner: root
+            group: root
+          when: openhpc_enable.control | default(false) or not openhpc_slurm_configless
+
+        - name: Remove local tempfile for slurm.conf templating
+          ansible.builtin.file:
+            path: "{{ _slurm_conf_tmpfile.path }}"
+            state: absent
+          when: _slurm_conf_tmpfile.path is defined
+          delegate_to: localhost
+          changed_when: false # so molecule doesn't fail
+          become: no
+
+        - name: Notify handler for slurmd restart
+          debug:
+            msg: "notifying handlers" # meta: noop doesn't support 'when'
+          changed_when: true
+          when:
+            - openhpc_slurm_control_host in ansible_play_hosts
+            - hostvars[openhpc_slurm_control_host].ohpc_slurm_conf.changed or hostvars[openhpc_slurm_control_host].ohpc_gres_conf.changed # noqa no-handler
+          notify:
+            - Restart slurmd service
+
+        - name: Set slurmctld location for configless operation
+          lineinfile:
+            path: /etc/sysconfig/slurmd
+            line: "SLURMD_OPTIONS='--conf-server {{ openhpc_slurm_control_host_address | default(openhpc_slurm_control_host) }}'"
+            regexp: "^SLURMD_OPTIONS="
+            create: yes
+            owner: root
+            group: root
+            mode: 0644
+          when:
+            - openhpc_enable.batch | default(false)
+            - openhpc_slurm_configless
+          notify:
+            - Restart slurmd service
+          # Reloading is sufficient, but using a single handler means no bounce. Realistically this won't regularly change on a running slurmd so restarting is ok.
+
+        # Munge state could be unchanged but the service is not running.
+        # Handle that here.
+        - name: Configure Munge service
+          service:
+            name: munge
+            enabled: "{{ openhpc_slurm_service_enabled | bool }}"
+            state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}"
+
+        - name: Flush handlers
+          meta: flush_handlers # as then subsequent "ensure" is a no-op if slurm services bounced
+
+        - name: Ensure slurmdbd state
+          service:
+            name: slurmdbd
+            enabled: "{{ openhpc_slurm_service_enabled | bool }}"
+            state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}"
+          when: openhpc_enable.database | default(false) | bool
+
+        # - name: Ensure slurmctld state
+        #   service:
+        #     name: slurmctld
+        #     enabled: "{{ openhpc_slurm_service_enabled | bool }}"
+        #     state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}"
+        #   when: openhpc_enable.control | default(false) | bool
+
+        - name: Ensure slurmd state
+          service:
+            name: slurmd
+            enabled: "{{ openhpc_slurm_service_enabled | bool }}"
+            state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}"
+          when: openhpc_enable.batch | default(false) | bool
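
For context on the slurm.conf steps in this hunk: "Template basic slurm.conf" renders the base file to a local tempfile, "Customise slurm.conf" then layers per-site keys from openhpc_config on top (joining list values with commas), and "Create slurm.conf" copies the result into place. A minimal sketch of that mapping, using hypothetical variable values not taken from this commit:

    # hypothetical group_vars
    openhpc_config:
      SlurmctldDebug: info
      SlurmdParameters:
        - contain_spank
        - enable_configless

    # lines the ini_file task would then write into the templated slurm.conf
    SlurmctldDebug=info
    SlurmdParameters=contain_spank,enable_configless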

ansible/roles/compute_init/tasks/main.yml

Lines changed: 2 additions & 0 deletions
@@ -56,6 +56,7 @@
   loop:
     - ../../basic_users/library/terminate_user_sessions.py
     - ../../stackhpc.os-manila-mount/library/os_manila_share.py
+    - ../../stackhpc.openhpc/library/sacct_cluster.py

 - name: Ensure filter_plugins directory exists
   file:
@@ -74,6 +75,7 @@
     mode: 0644
   loop:
     - ../../basic_users/filter_plugins/filter_keys.py
+    - ../../stackhpc.openhpc/filter_plugins/slurm_conf.py

 - name: Add filter_plugins ansible.cfg
   lineinfile:
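
For context: the staged filter plugins only take effect if the node-local ansible.cfg names their directory, which is what the "Add filter_plugins ansible.cfg" lineinfile task manages. A minimal sketch of the resulting entry, with an illustrative path not taken from this commit:

    [defaults]
    filter_plugins = /etc/ansible-init/filter_plugins  # illustrative path; directory populated by the copy loop above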
