Skip to content

Commit ecbdad4

Browse files
committed
Merge branch 'master' into feat/b64-mungekey
2 parents cc720b8 + a28eba7 commit ecbdad4

File tree

7 files changed

+120
-24
lines changed

7 files changed

+120
-24
lines changed

.github/workflows/ci.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ jobs:
1414
# as the container command is flaky.
1515
# This job builds an image using the upstream rockylinux:9.3 image which ensures
1616
# that the image used for the molecule workflow is always updated.
17-
runs-on: ubuntu-22.04
17+
runs-on: ubuntu-latest
1818
defaults:
1919
run:
2020
working-directory: molecule/images
@@ -36,7 +36,7 @@ jobs:
3636

3737
molecule:
3838
name: Molecule
39-
runs-on: ubuntu-22.04
39+
runs-on: ubuntu-latest
4040
needs: build
4141
strategy:
4242
fail-fast: false
@@ -108,7 +108,7 @@ jobs:
108108

109109
checks:
110110
name: Checks
111-
runs-on: ubuntu-22.04
111+
runs-on: ubuntu-latest
112112
steps:
113113
- name: Check out the codebase.
114114
uses: actions/checkout@v3

README.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,14 @@ partition configuration for each.
107107
[slurm.conf](https://slurm.schedmd.com/slurm.conf.html). Keys are slurm.conf
108108
parameter names and values are lists or strings as appropriate. This can be
109109
used to supplement or override the template defaults. Templated parameters can
110-
also be removed by setting the value to the literal string`'omit'` - note
110+
also be removed by setting the value to the literal string `'omit'` - note
111+
that this is *not the same* as the Ansible `omit` [special variable](https://docs.ansible.com/ansible/latest/reference_appendices/special_variables.html#term-omit).
112+
113+
`openhpc_cgroup_config`: Optional. Mapping of additional parameters and values for
114+
[cgroup.conf](https://slurm.schedmd.com/cgroup.conf.html). Keys are cgroup.conf
115+
parameter names and values are lists or strings as appropriate. This can be
116+
used to supplement or override the template defaults. Templated parameters can
117+
also be removed by setting the value to the literal string `'omit'` - note
111118
that this is *not the same* as the Ansible `omit` [special variable](https://docs.ansible.com/ansible/latest/reference_appendices/special_variables.html#term-omit).
112119

113120
`openhpc_ram_multiplier`: Optional, default `0.95`. Multiplier used in the calculation: `total_memory * openhpc_ram_multiplier` when setting `RealMemory` for the partition in slurm.conf. Can be overridden on a per-partition basis using `openhpc_slurm_partitions.ram_multiplier`. Has no effect if `openhpc_slurm_partitions.ram_mb` is set.

defaults/main.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,14 @@ openhpc_default_config:
4040
PropagateResourceLimitsExcept: MEMLOCK
4141
Epilog: /etc/slurm/slurm.epilog.clean
4242
ReturnToService: 2
43+
openhpc_cgroup_default_config:
44+
ConstrainCores: "yes"
45+
ConstrainDevices: "yes"
46+
ConstrainRAMSpace: "yes"
47+
ConstrainSwapSpace: "yes"
4348

4449
openhpc_config: {}
50+
openhpc_cgroup_config: {}
4551
openhpc_gres_template: gres.conf.j2
4652
openhpc_slurm_configless: "{{ 'enable_configless' in openhpc_config.get('SlurmctldParameters', []) }}"
4753

files/nodegroup.schema

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
{ "$schema": "https://json-schema.org/draft/2020-12/schema",
2+
"type": "object",
3+
"definitions": {
4+
"gres": {
5+
"type": "array",
6+
"items": {
7+
"type": "object",
8+
"properties": {
9+
"conf": {
10+
"type": "string",
11+
"minLength": 1
12+
},
13+
"file": {
14+
"type": "string",
15+
"minLength": 1
16+
}
17+
},
18+
"required": [
19+
"conf"
20+
]
21+
}
22+
}
23+
},
24+
"properties": {
25+
"name": {
26+
"type": "string",
27+
"minLength": 1
28+
},
29+
"ram_mb": {
30+
"type": "number"
31+
},
32+
"ram_multiplier": {
33+
"type": "number"
34+
},
35+
"features": {
36+
"type": "array",
37+
"items": {
38+
"type": "string"
39+
}
40+
},
41+
"node_params": {
42+
"type": "object"
43+
},
44+
"gres_autodetect": {
45+
"type": "string",
46+
"minLength": 1
47+
},
48+
"gres": {
49+
"$ref": "#/definitions/gres"
50+
}
51+
},
52+
"required": [
53+
"name"
54+
],
55+
"if": {
56+
"properties": {
57+
"gres_autodetect": {
58+
"const": "off"
59+
}
60+
}
61+
},
62+
"then": {
63+
"properties": {
64+
"gres": {
65+
"items": {
66+
"required": [
67+
"file"
68+
]
69+
}
70+
}
71+
}
72+
},
73+
"else": {
74+
"properties": {
75+
"gres": {
76+
"items": {
77+
"not": {
78+
"required": [
79+
"file"
80+
]
81+
}
82+
}
83+
}
84+
}
85+
}
86+
}

tasks/runtime.yml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,10 @@
9191
owner: root
9292
group: root
9393
when: openhpc_enable.control | default(false) or not openhpc_slurm_configless | bool
94+
notify:
95+
- Restart slurmctld service
96+
register: ohpc_cgroup_conf
97+
# NB uses restart rather than reload as this is needed in some cases
9498

9599
- name: Remove local tempfile for slurm.conf templating
96100
ansible.builtin.file:
@@ -125,7 +129,7 @@
125129
changed_when: true
126130
when:
127131
- openhpc_slurm_control_host in ansible_play_hosts
128-
- hostvars[openhpc_slurm_control_host].ohpc_slurm_conf.changed or hostvars[openhpc_slurm_control_host].ohpc_gres_conf.changed # noqa no-handler
132+
- hostvars[openhpc_slurm_control_host].ohpc_slurm_conf.changed or hostvars[openhpc_slurm_control_host].ohpc_cgroup_conf.changed or hostvars[openhpc_slurm_control_host].ohpc_gres_conf.changed # noqa no-handler
129133
notify:
130134
- Restart slurmd service
131135

tasks/validate.yml

Lines changed: 7 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -21,23 +21,15 @@
2121
delegate_to: localhost
2222
run_once: true
2323

24-
- name: Check gres configuration when gres_autodetect is set
25-
assert:
26-
that:
27-
- _failure_reasons | selectattr('when', 'truthy') | length == 0
28-
fail_msg: >
29-
Your nodegroup definition must include a single gres dictionary containing a conf key
30-
if gres_autodetect is set. The following nodegroup failed this check: {{ item }}.
31-
Reasons for failure: {{ _failure_reasons | selectattr('when', 'truthy') | map(attribute='msg') | join(', ') }}
24+
- name: Validate openhpc_nodegroups
25+
ansible.utils.validate:
26+
criteria: "{{ lookup('file', 'nodegroup.schema') }}"
27+
engine: 'ansible.utils.jsonschema'
28+
data: "{{ item }}"
3229
vars:
33-
_openhpc_gres_autodetect_groups: "{{ openhpc_nodegroups | selectattr('gres_autodetect', 'defined') | selectattr('gres_autodetect', 'search', '(?!off).*') }}"
34-
_failure_reasons:
35-
- msg: The gres key was a list with more than one item
36-
when: "{{ item.gres | length != 1 }}"
37-
- msg: The gres dictionary does not contain a conf key
38-
when: "{{ item.gres.0.conf is not defined }}"
30+
ansible_jsonschema_draft: '2020-12'
3931
delegate_to: localhost
40-
loop: "{{ _openhpc_gres_autodetect_groups }}"
32+
loop: "{{ openhpc_nodegroups }}"
4133
run_once: true
4234

4335
- name: Fail if partition configuration is outdated

templates/cgroup.conf.j2

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
# See man slurm.conf and man cgroup.conf for further
66
# information on cgroup configuration parameters
77
#--
8-
ConstrainCores=yes
9-
ConstrainDevices=yes
10-
ConstrainRAMSpace=yes
11-
ConstrainSwapSpace=yes
8+
{% for k, v in openhpc_cgroup_default_config | combine(openhpc_cgroup_config) | items %}
9+
{% if v != "omit" %}{# allow removing items by setting key: 'omit' (the literal string, not null) #}
10+
{{ k }}={{ v | join(',') if (v is sequence and v is not string) else v }}
11+
{% endif %}
12+
{% endfor %}

0 commit comments

Comments
 (0)