Skip to content

Commit 7f5941c

Browse files
committed
Merge branch 'master' into feat/no-ohpc
2 parents d2d4d3f + 190f8ca commit 7f5941c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

56 files changed

+889
-698
lines changed

.github/workflows/ci.yml

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ jobs:
1414
# as the container command is flaky.
1515
# This job builds an image using the upstream rockylinux:9.3 image which ensures
1616
# that the image used for the molecule workflow is always updated.
17-
runs-on: ubuntu-22.04
17+
runs-on: ubuntu-latest
1818
defaults:
1919
run:
2020
working-directory: molecule/images
@@ -36,7 +36,7 @@ jobs:
3636

3737
molecule:
3838
name: Molecule
39-
runs-on: ubuntu-22.04
39+
runs-on: ubuntu-latest
4040
needs: build
4141
strategy:
4242
fail-fast: false
@@ -51,17 +51,17 @@ jobs:
5151
- test2
5252
- test3
5353
- test4
54-
- test5
5554
- test6
56-
- test7
5755
- test8
5856
- test9
5957
- test10
6058
- test11
6159
- test12
6260
- test13
63-
- test14
64-
exclude: []
61+
exclude:
62+
# mariadb package provides /usr/bin/mysql on RL8 which doesn't work with geerlingguy/mysql role
63+
- scenario: test4
64+
image: 'rockylinux:8.9'
6565

6666
steps:
6767
- name: Check out the codebase.
@@ -107,7 +107,7 @@ jobs:
107107

108108
checks:
109109
name: Checks
110-
runs-on: ubuntu-22.04
110+
runs-on: ubuntu-latest
111111
steps:
112112
- name: Check out the codebase.
113113
uses: actions/checkout@v3

README.md

Lines changed: 269 additions & 64 deletions
Large diffs are not rendered by default.

defaults/main.yml

Lines changed: 46 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,30 +4,57 @@ openhpc_slurm_service_started: "{{ openhpc_slurm_service_enabled }}"
44
openhpc_slurm_service:
55
openhpc_slurm_control_host: "{{ inventory_hostname }}"
66
#openhpc_slurm_control_host_address:
7-
openhpc_slurm_partitions: []
7+
openhpc_partitions: "{{ openhpc_nodegroups }}"
8+
openhpc_nodegroups: []
89
openhpc_cluster_name:
910
openhpc_packages:
1011
- slurm-libpmi-ohpc
1112
openhpc_resume_timeout: 300
1213
openhpc_retry_delay: 10
1314
openhpc_job_maxtime: '60-0' # quote this to avoid ansible converting some formats to seconds, which is interpreted as minutes by Slurm
14-
openhpc_config: "{{ openhpc_extra_config | default({}) }}"
15+
openhpc_default_config:
16+
# This only defines values which are not Slurm defaults
17+
SlurmctldHost: "{{ openhpc_slurm_control_host }}{% if openhpc_slurm_control_host_address is defined %}({{ openhpc_slurm_control_host_address }}){% endif %}"
18+
ProctrackType: proctrack/linuxproc # TODO: really want cgroup but needs cgroup.conf and workaround for CI
19+
SlurmdSpoolDir: /var/spool/slurm # NB: not OpenHPC default!
20+
SlurmUser: slurm
21+
StateSaveLocation: "{{ openhpc_state_save_location }}"
22+
SlurmctldTimeout: 300
23+
SchedulerType: sched/backfill
24+
SelectType: select/cons_tres
25+
SelectTypeParameters: CR_Core
26+
PriorityWeightPartition: 1000
27+
PreemptType: preempt/partition_prio
28+
PreemptMode: SUSPEND,GANG
29+
AccountingStoragePass: "{{ openhpc_slurm_accounting_storage_pass | default('omit') }}"
30+
AccountingStorageHost: "{{ openhpc_slurm_accounting_storage_host }}"
31+
AccountingStoragePort: "{{ openhpc_slurm_accounting_storage_port }}"
32+
AccountingStorageType: "{{ openhpc_slurm_accounting_storage_type }}"
33+
AccountingStorageUser: "{{ openhpc_slurm_accounting_storage_user }}"
34+
JobCompLoc: "{{ openhpc_slurm_job_comp_loc }}"
35+
JobCompType: "{{ openhpc_slurm_job_comp_type }}"
36+
JobAcctGatherFrequency: "{{ openhpc_slurm_job_acct_gather_frequency }}"
37+
JobAcctGatherType: "{{ openhpc_slurm_job_acct_gather_type }}"
38+
SlurmctldSyslogDebug: info
39+
SlurmdSyslogDebug: info
40+
PropagateResourceLimitsExcept: MEMLOCK
41+
Epilog: /etc/slurm/slurm.epilog.clean
42+
ReturnToService: 2
43+
openhpc_cgroup_default_config:
44+
ConstrainCores: "yes"
45+
ConstrainDevices: "yes"
46+
ConstrainRAMSpace: "yes"
47+
ConstrainSwapSpace: "yes"
48+
49+
openhpc_config: {}
50+
openhpc_cgroup_config: {}
1551
openhpc_gres_template: gres.conf.j2
1652
openhpc_cgroup_template: cgroup.conf.j2
17-
openhpc_slurm_configless: "{{ 'enable_configless' in openhpc_config.get('SlurmctldParameters', []) }}"
1853

1954
openhpc_state_save_location: /var/spool/slurm
2055
openhpc_slurmd_spool_dir: /var/spool/slurm
2156
openhpc_slurm_conf_path: /etc/slurm/slurm.conf
2257
openhpc_slurm_conf_template: slurm.conf.j2
23-
openhpc_config_files:
24-
- template:
25-
dest: "{{ openhpc_slurm_conf_path | dirname }}/gres.conf"
26-
src: "{{ openhpc_gres_template }}"
27-
mode: "0600"
28-
owner: slurm
29-
group: slurm
30-
enable: control
3158

3259
# Accounting
3360
openhpc_slurm_accounting_storage_host: "{{ openhpc_slurmdbd_host }}"
@@ -113,12 +140,16 @@ ohpc_default_extra_repos:
113140
gpgcheck: true
114141
gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-8"
115142

116-
# Concatenate extra repo definitions here
117-
ohpc_extra_repos: "{{ ohpc_default_extra_repos[ansible_distribution_major_version] + openhpc_extra_repos }}"
118-
119-
openhpc_munge_key:
143+
openhpc_munge_key_b64:
120144
openhpc_login_only_nodes: ''
121145
openhpc_module_system_install: true # only works for install-ohpc.yml/main.yml
122146

123147
# Auto detection
124148
openhpc_ram_multiplier: 0.95
149+
150+
# Database upgrade
151+
openhpc_slurm_accounting_storage_service: ''
152+
openhpc_slurm_accounting_storage_backup_cmd: ''
153+
openhpc_slurm_accounting_storage_backup_host: "{{ openhpc_slurm_accounting_storage_host }}"
154+
openhpc_slurm_accounting_storage_backup_become: true
155+
openhpc_slurm_accounting_storage_client_package: mysql

files/nodegroup.schema

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
{ "$schema": "https://json-schema.org/draft/2020-12/schema",
2+
"type": "object",
3+
"definitions": {
4+
"gres": {
5+
"type": "array",
6+
"items": {
7+
"type": "object",
8+
"properties": {
9+
"conf": {
10+
"type": "string",
11+
"minLength": 1
12+
},
13+
"file": {
14+
"type": "string",
15+
"minLength": 1
16+
}
17+
},
18+
"required": [
19+
"conf"
20+
]
21+
}
22+
}
23+
},
24+
"properties": {
25+
"name": {
26+
"type": "string",
27+
"minLength": 1
28+
},
29+
"ram_mb": {
30+
"type": "number",
31+
},
32+
"ram_multiplier": {
33+
"type": "number",
34+
},
35+
"features": {
36+
"type": "array",
37+
"items": {
38+
"type": "string"
39+
}
40+
},
41+
"node_params": {
42+
"type": "object",
43+
},
44+
"gres_autodetect": {
45+
"type": "string",
46+
"minLength": 1
47+
},
48+
"gres": {
49+
"$ref": "#/definitions/gres"
50+
}
51+
},
52+
"required": [
53+
"name"
54+
],
55+
"if": {
56+
"properties": {
57+
"gres_autodetect": {
58+
"const": "off"
59+
}
60+
}
61+
},
62+
"then": {
63+
"properties": {
64+
"gres": {
65+
"items": {
66+
"required": [
67+
"file"
68+
]
69+
}
70+
}
71+
}
72+
},
73+
"else": {
74+
"properties": {
75+
"gres": {
76+
"items": {
77+
"not": {
78+
"required": [
79+
"file"
80+
]
81+
}
82+
}
83+
}
84+
}
85+
}
86+
}

filter_plugins/slurm_conf.py

Lines changed: 35 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,6 @@
2222
# Pattern to match a hostname with numerical ending
2323
pattern = re.compile("^(.*\D(?=\d))(\d+)$")
2424

25-
def _get_hostvar(context, var_name, inventory_hostname=None):
26-
if inventory_hostname is None:
27-
namespace = context
28-
else:
29-
if inventory_hostname not in context['hostvars']:
30-
raise errors.AnsibleFilterError(
31-
"Inventory hostname '%s' not in hostvars" % inventory_hostname)
32-
namespace = context["hostvars"][inventory_hostname]
33-
return namespace.get(var_name)
34-
3525
def hostlist_expression(hosts):
3626
""" Group hostnames using Slurm's hostlist expression format.
3727
@@ -91,11 +81,46 @@ def dict2parameters(d):
9181
parts = ['%s=%s' % (k, v) for k, v in d.items()]
9282
return ' '.join(parts)
9383

84+
def config2dict(lines):
85+
""" Convert a sequence of output lines from `scontrol show config` to a dict.
86+
87+
As per man page uppercase keys are derived parameters, mixed case are from
88+
from config files.
89+
90+
The following case-insensitive conversions of values are carried out:
91+
- '(null)' and 'n/a' are converted to None.
92+
- yes and no are converted to True and False respectively
93+
94+
Except for these, values are always strings.
95+
"""
96+
cfg = {}
97+
for line in lines:
98+
if '=' not in line: # ditch blank/info lines
99+
continue
100+
else:
101+
parts = [x.strip() for x in line.split('=', maxsplit=1)] # maxsplit handles '=' in values
102+
if len(parts) != 2:
103+
raise errors.AnsibleFilterError(f'line {line} cannot be split into key=value')
104+
k, v = parts
105+
small_v = v.lower()
106+
if small_v == '(null)':
107+
v = None
108+
elif small_v == 'n/a':
109+
v = None
110+
elif small_v == 'no':
111+
v = False
112+
elif small_v == 'yes':
113+
v = True
114+
cfg[k] = v
115+
return cfg
116+
117+
94118
class FilterModule(object):
95119

96120
def filters(self):
97121
return {
98122
'hostlist_expression': hostlist_expression,
99123
'error': error,
100124
'dict2parameters': dict2parameters,
125+
'config2dict': config2dict,
101126
}

handlers/main.yml

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,4 @@
11
---
2-
# NOTE: We need this running before slurmdbd
3-
- name: Restart Munge service
4-
service:
5-
name: "munge"
6-
state: restarted
7-
when: openhpc_slurm_service_started | bool
82

93
# NOTE: we need this running before slurmctld start
104
- name: Issue slurmdbd restart command
@@ -58,7 +52,15 @@
5852
service:
5953
name: "slurmd"
6054
state: restarted
55+
retries: 5
56+
register: slurmd_restart
57+
until: slurmd_restart is success
58+
delay: 30
6159
when:
6260
- openhpc_slurm_service_started | bool
6361
- openhpc_enable.batch | default(false) | bool
6462
# 2nd condition required as notification happens on controller, which isn't necessarily a compute node
63+
64+
- name: Reload facts
65+
ansible.builtin.setup:
66+
filter: ansible_local

molecule/README.md

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,23 +6,23 @@ Test options in "Other" column flow down through table unless changed.
66

77
Test | # Partitions | Groups in partitions? | Other
88
--- | --- | --- | ---
9-
test1 | 1 | N | 2x compute node, sequential names (default test), config on all nodes
9+
test1 | 1 | N | 2x compute node, sequential names (default test)
1010
test1b | 1 | N | 1x compute node
1111
test1c | 1 | N | 2x compute nodes, nonsequential names
1212
test2 | 2 | N | 4x compute node, sequential names
13-
test3 | 1 | Y | -
13+
test3 | 1 | Y | 4x compute nodes in 2x groups, single partition
1414
test4 | 1 | N | 2x compute node, accounting enabled
15-
test5 | 1 | N | As for #1 but configless
16-
test6 | 1 | N | 0x compute nodes, configless
17-
test7 | 1 | N | 1x compute node, no login node so specified munge key, configless (checks image build should work)
18-
test8 | 1 | N | 2x compute node, 2x login-only nodes, configless
15+
test5 | - | - | [removed, now always configless]
16+
test6 | 1 | N | 0x compute nodes
17+
test7 | 1 | N | [removed, image build should just run install.yml task, this is not expected to work]
18+
test8 | 1 | N | 2x compute node, 2x login-only nodes
1919
test9 | 1 | N | As test8 but uses `--limit=testohpc-control,testohpc-compute-0` and checks login nodes still end up in slurm.conf
20-
test10 | 1 | N | As for #5 but then tries to add an additional node
21-
test11 | 1 | N | As for #5 but then deletes a node (actually changes the partition due to molecule/ansible limitations)
22-
test12 | 1 | N | As for #5 but enabling job completion and testing `sacct -c`
23-
test13 | 1 | N | As for #5 but tests `openhpc_config` variable.
24-
test14 | 1 | N | As for #5 but also tests `extra_nodes` via State=DOWN nodes.
25-
20+
test10 | 1 | N | As for #1 but then tries to add an additional node
21+
test11 | 1 | N | As for #1 but then deletes a node (actually changes the partition due to molecule/ansible limitations)
22+
test12 | 1 | N | As for #1 but enabling job completion and testing `sacct -c`
23+
test13 | 1 | N | As for #1 but tests `openhpc_config` variable.
24+
test14 | - | - | [removed, extra_nodes removed]
25+
test15 | 1 | Y | As for #1 but also tests partitions with different name but with the same NodeName.
2626

2727
# Local Installation & Running
2828

molecule/test1/converge.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
batch: "{{ inventory_hostname in groups['testohpc_compute'] }}"
88
runtime: true
99
openhpc_slurm_control_host: "{{ groups['testohpc_login'] | first }}"
10-
openhpc_slurm_partitions:
10+
openhpc_nodegroups:
1111
- name: "compute"
1212
openhpc_cluster_name: testohpc
1313
tasks:

molecule/test1/molecule.yml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@ platforms:
99
- testohpc_login
1010
command: /sbin/init
1111
tmpfs:
12-
- /run
13-
- /tmp
12+
/run: rw
13+
/tmp: rw
1414
volumes:
1515
- /sys/fs/cgroup:/sys/fs/cgroup:ro
1616
network: net1
@@ -21,8 +21,8 @@ platforms:
2121
- testohpc_compute
2222
command: /sbin/init
2323
tmpfs:
24-
- /run
25-
- /tmp
24+
/run: rw
25+
/tmp: rw
2626
volumes:
2727
- /sys/fs/cgroup:/sys/fs/cgroup:ro
2828
network: net1
@@ -33,8 +33,8 @@ platforms:
3333
- testohpc_compute
3434
command: /sbin/init
3535
tmpfs:
36-
- /run
37-
- /tmp
36+
/run: rw
37+
/tmp: rw
3838
volumes:
3939
- /sys/fs/cgroup:/sys/fs/cgroup:ro
4040
network: net1

0 commit comments

Comments
 (0)