Skip to content

Commit 8930d38

Browse files
committed
finish transferring openhpc tasks to compute script
1 parent 36f1e17 commit 8930d38

File tree

2 files changed

+35
-181
lines changed

2 files changed

+35
-181
lines changed

ansible/roles/compute_init/files/compute-init.yml

Lines changed: 14 additions & 181 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,15 @@
5656
cvmfs_config_overrides: {}
5757
cvmfs_config: "{{ cvmfs_config_default | combine(cvmfs_config_overrides) }}"
5858

59+
openhpc_conf_server: control_node_ip
60+
openhpc_gres_template: /etc/ansible-init/templates/gres.conf.j2
61+
openhpc_slurm_service_enabled: true
62+
openhpc_slurm_service_started: "{{ openhpc_slurm_service_enabled }}"
63+
openhpc_enable:
64+
control: false
65+
batch: true
66+
database: false
67+
runtime: true
5968

6069
tasks:
6170
- name: Configure resolv.conf
@@ -223,7 +232,6 @@
223232

224233

225234
- name: Configure EESSI
226-
gather_facts: false
227235
block:
228236
- name: Download Cern GPG key
229237
ansible.builtin.get_url:
@@ -247,15 +255,6 @@
247255
# NOTE: Can't find any docs on obtaining gpg key - maybe downloading directly from github is ok?
248256
disable_gpg_check: true
249257

250-
# Alternative version using official repo - still no GPG key :(
251-
# - name: Add EESSI repo
252-
# dnf:
253-
# name: http://repo.eessi-infra.org/eessi/rhel/8/noarch/eessi-release-0-1.noarch.rpm
254-
255-
# - name: Install EESSI CVMFS config
256-
# dnf:
257-
# name: cvmfs-config-eessi
258-
259258
- name: Add base CVMFS config
260259
community.general.ini_file:
261260
dest: /etc/cvmfs/default.local
@@ -273,45 +272,6 @@
273272

274273
- name: Configure openhpc
275274
block:
276-
- name: Check openhpc_slurm_control_host, openhpc_cluster_name or openhpc_slurm_partitions exist
277-
assert:
278-
that:
279-
- openhpc_slurm_control_host is defined
280-
- openhpc_cluster_name is defined
281-
- openhpc_cluster_name != ''
282-
- openhpc_slurm_partitions is defined
283-
fail_msg: "Undefined openhpc_slurm_control_host, openhpc_cluster_name or openhpc_slurm_partitions."
284-
285-
- name: Fail if control host not in play and munge key not specified
286-
fail:
287-
msg: "Either the slurm control node must be in the play or `openhpc_munge_key` must be set"
288-
when:
289-
- openhpc_slurm_control_host not in ansible_play_hosts
290-
- not openhpc_munge_key
291-
292-
# - name: Ensure Slurm directories exist
293-
# file:
294-
# path: "{{ openhpc_state_save_location }}"
295-
# owner: slurm
296-
# group: slurm
297-
# mode: 0755
298-
# state: directory
299-
# when: inventory_hostname == openhpc_slurm_control_host
300-
301-
# - name: Generate a Munge key on control host
302-
# # NB this is usually a no-op as the package install actually generates a (node-unique) one, so won't usually trigger handler
303-
# command: "dd if=/dev/urandom of=/etc/munge/munge.key bs=1 count=1024"
304-
# args:
305-
# creates: "/etc/munge/munge.key"
306-
# when: inventory_hostname == openhpc_slurm_control_host
307-
308-
# - name: Retrieve Munge key from control host
309-
# slurp:
310-
# src: "/etc/munge/munge.key"
311-
# register: openhpc_control_munge_key
312-
# delegate_to: "{{ openhpc_slurm_control_host }}"
313-
# when: openhpc_slurm_control_host in ansible_play_hosts
314-
315275
- name: Fix permissions on /etc to pass Munge startup checks
316276
# Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 makes /etc g=rwx rather than g=rx (where group=root)
317277
# which fails munged startup checks
@@ -320,83 +280,13 @@
320280
state: directory
321281
mode: g-w
322282

323-
- name: Write Munge key
283+
- name: Copy Munge key from NFS-mounted directory to /etc/munge
324284
copy:
325-
content: "{{ openhpc_munge_key or (openhpc_control_munge_key.content | b64decode) }}"
285+
src: "/mnt/openhpc_munge.key"
326286
dest: "/etc/munge/munge.key"
327287
owner: munge
328288
group: munge
329289
mode: 0400
330-
notify:
331-
- Restart Munge service
332-
333-
- name: Ensure JobComp logfile exists
334-
file:
335-
path: "{{ openhpc_slurm_job_comp_loc }}"
336-
state: touch
337-
owner: slurm
338-
group: slurm
339-
mode: 0644
340-
access_time: preserve
341-
modification_time: preserve
342-
when: openhpc_slurm_job_comp_type == 'jobcomp/filetxt'
343-
344-
- name: Template slurmdbd.conf
345-
template:
346-
src: slurmdbd.conf.j2
347-
dest: /etc/slurm/slurmdbd.conf
348-
mode: "0600"
349-
owner: slurm
350-
group: slurm
351-
notify: Restart slurmdbd service
352-
when: openhpc_enable.database | default(false) | bool
353-
354-
- name: Make local tempfile for slurm.conf templating # ensures simultaneous runs don't clobber each other
355-
ansible.builtin.tempfile:
356-
register: _slurm_conf_tmpfile
357-
delegate_to: localhost
358-
when: openhpc_enable.control | default(false) or not openhpc_slurm_configless
359-
changed_when: false # so molecule doesn't fail
360-
become: no
361-
362-
- name: Template basic slurm.conf
363-
template:
364-
src: slurm.conf.j2
365-
dest: "{{ _slurm_conf_tmpfile.path }}"
366-
lstrip_blocks: true
367-
mode: 0644
368-
delegate_to: localhost
369-
when: openhpc_enable.control | default(false) or not openhpc_slurm_configless
370-
changed_when: false # so molecule doesn't fail
371-
become: no
372-
373-
- name: Customise slurm.conf
374-
community.general.ini_file:
375-
path: "{{ _slurm_conf_tmpfile.path }}"
376-
option: "{{ item.key }}"
377-
section: ''
378-
value: "{{ (item.value | join(',')) if (item.value is sequence and item.value is not string) else item.value }}"
379-
no_extra_spaces: true
380-
create: no
381-
mode: 0644
382-
loop: "{{ openhpc_config | dict2items }}"
383-
delegate_to: localhost
384-
when: openhpc_enable.control | default(false) or not openhpc_slurm_configless
385-
changed_when: false # so molecule doesn't fail
386-
become: no
387-
388-
- name: Create slurm.conf
389-
copy:
390-
src: "{{ _slurm_conf_tmpfile.path }}"
391-
dest: /etc/slurm/slurm.conf
392-
owner: root
393-
group: root
394-
mode: 0644
395-
when: openhpc_enable.control | default(false) or not openhpc_slurm_configless
396-
notify:
397-
- Restart slurmctld service
398-
register: ohpc_slurm_conf
399-
# NB uses restart rather than reload as number of nodes might have changed
400290

401291
- name: Create gres.conf
402292
template:
@@ -405,82 +295,25 @@
405295
mode: "0600"
406296
owner: slurm
407297
group: slurm
408-
when: openhpc_enable.control | default(false) or not openhpc_slurm_configless
409-
notify:
410-
- Restart slurmctld service
298+
when: openhpc_enable.control | default(false)
411299
register: ohpc_gres_conf
412-
# NB uses restart rather than reload as this is needed in some cases
413-
414-
- name: Template cgroup.conf
415-
# appears to be required even with NO cgroup plugins: https://slurm.schedmd.com/cgroups.html#cgroup_design
416-
template:
417-
src: cgroup.conf.j2
418-
dest: /etc/slurm/cgroup.conf
419-
mode: "0644" # perms/ownership based off src from ohpc package
420-
owner: root
421-
group: root
422-
when: openhpc_enable.control | default(false) or not openhpc_slurm_configless
423-
424-
- name: Remove local tempfile for slurm.conf templating
425-
ansible.builtin.file:
426-
path: "{{ _slurm_conf_tmpfile.path }}"
427-
state: absent
428-
when: _slurm_conf_tmpfile.path is defined
429-
delegate_to: localhost
430-
changed_when: false # so molecule doesn't fail
431-
become: no
432-
433-
- name: Notify handler for slurmd restart
434-
debug:
435-
msg: "notifying handlers" # meta: noop doesn't support 'when'
436-
changed_when: true
437-
when:
438-
- openhpc_slurm_control_host in ansible_play_hosts
439-
- hostvars[openhpc_slurm_control_host].ohpc_slurm_conf.changed or hostvars[openhpc_slurm_control_host].ohpc_gres_conf.changed # noqa no-handler
440-
notify:
441-
- Restart slurmd service
442300

443301
- name: Set slurmctld location for configless operation
444302
lineinfile:
445303
path: /etc/sysconfig/slurmd
446-
line: "SLURMD_OPTIONS='--conf-server {{ openhpc_slurm_control_host_address | default(openhpc_slurm_control_host) }}'"
304+
line: "SLURMD_OPTIONS='--conf-server {{ openhpc_conf_server }}'"
447305
regexp: "^SLURMD_OPTIONS="
448306
create: yes
449307
owner: root
450308
group: root
451309
mode: 0644
452-
when:
453-
- openhpc_enable.batch | default(false)
454-
- openhpc_slurm_configless
455-
notify:
456-
- Restart slurmd service
457-
# Reloading is sufficient, but using a single handler means no bounce. Realistically this won't regularly change on a running slurmd so restarting is ok.
458-
459-
# Munge state could be unchanged but the service is not running.
460-
# Handle that here.
310+
461311
- name: Configure Munge service
462312
service:
463313
name: munge
464314
enabled: "{{ openhpc_slurm_service_enabled | bool }}"
465315
state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}"
466316

467-
- name: Flush handler
468-
meta: flush_handlers # as then subsequent "ensure" is a no-op if slurm services bounced
469-
470-
- name: Ensure slurmdbd state
471-
service:
472-
name: slurmdbd
473-
enabled: "{{ openhpc_slurm_service_enabled | bool }}"
474-
state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}"
475-
when: openhpc_enable.database | default(false) | bool
476-
477-
# - name: Ensure slurmctld state
478-
# service:
479-
# name: slurmctld
480-
# enabled: "{{ openhpc_slurm_service_enabled | bool }}"
481-
# state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}"
482-
# when: openhpc_enable.control | default(false) | bool
483-
484317
- name: Ensure slurmd state
485318
service:
486319
name: slurmd

ansible/roles/compute_init/tasks/main.yml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
- ../../resolv_conf/templates/resolv.conf.j2
2020
- ../../stackhpc.os-manila-mount/templates/ceph.conf.j2
2121
- ../../stackhpc.os-manila-mount/templates/ceph.keyring.j2
22+
- ../../stackhpc.openhpc/templates/gres.conf.j2
2223

2324
- name: Ensure files directory exists
2425
file:
@@ -86,6 +87,26 @@
8687
group: root
8788
mode: 0644
8889

90+
- name: Ensure /exports/cluster directory exists
91+
file:
92+
path: /exports/cluster
93+
state: directory
94+
owner: root
95+
group: root
96+
mode: 0644
97+
delegate_to: "{{ groups['control'] | first }}"
98+
99+
- name: Write openhpc munge key
100+
copy:
101+
content: "{{ vault_openhpc_mungekey | b64decode }}"
102+
dest: "/exports/cluster/openhpc_munge.key"
103+
owner: munge
104+
group: munge
105+
mode: 0400
106+
become: true
107+
delegate_to: "{{ groups['control'] | first }}"
108+
109+
89110
- name: Inject compute initialisation playbook
90111
copy:
91112
src: compute-init.yml

0 commit comments

Comments
 (0)