Skip to content

Commit 38e5fd3

Browse files
committed
PoC of automating partition/nodegroup config
1 parent 611513c commit 38e5fd3

File tree

6 files changed

+17
-35
lines changed

6 files changed

+17
-35
lines changed

environments/.stackhpc/inventory/group_vars/all/z_partitions.yml

Lines changed: 0 additions & 18 deletions
This file was deleted.

environments/common/inventory/group_vars/all/openhpc.yml

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,21 @@ openhpc_slurmdbd_mysql_password: "{{ vault_mysql_slurm_password }}"
1515
openhpc_slurmdbd_mysql_username: slurm
1616
openhpc_slurm_control_host: "{{ groups['control'] | first }}" # avoid using hostvars for compute-init
1717
openhpc_slurmdbd_host: "{{ openhpc_slurm_control_host }}"
18-
openhpc_slurm_partitions:
19-
- name: "compute"
18+
openhpc_rebuild_partition: # not a role var - could actually add more indirection here for things we're expecting to be modified, e.g. groups and maxtime
19+
  name: rebuild
20+
  groups: "{{ cluster_compute_groups }}"
21+
  default: 'NO' # quoted: a bare NO is loaded as boolean false by YAML 1.1 parsers
22+
  maxtime: 30
23+
  partition_params:
24+
    PriorityJobFactor: 65533
25+
    Hidden: 'YES' # quoted so the literal string is kept (bare YES loads as boolean true)
26+
    RootOnly: 'YES'
27+
    DisableRootJobs: 'NO'
28+
    PreemptMode: 'OFF'
29+
    OverSubscribe: EXCLUSIVE
30+
openhpc_nodegroups: "{{ cluster_compute_groups | map('community.general.dict_kv', 'name') | list }}" # create nodegroup ({name: <group>}) for each compute group; `| list` materialises the lazy map() result so it can be reused and concatenated
31+
openhpc_user_partitions: "{{ openhpc_nodegroups }}" # one partition per nodegroup (this is also the role default) - this is the variable environments are expected to override
32+
openhpc_partitions: "{{ openhpc_user_partitions + ([openhpc_rebuild_partition] if groups.get('rebuild', []) | length > 0 else []) }}" # always keep the user partitions; append the rebuild partition only when the rebuild group has hosts (parentheses needed: inline if/else binds looser than +)
2033
openhpc_packages_default:
2134
# system packages
2235
- podman

environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tf

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,3 @@ resource "local_file" "hosts" {
1111
)
1212
filename = "../inventory/hosts.yml"
1313
}
14-
15-
resource "local_file" "partitions" {
16-
content = templatefile("${path.module}/partitions.tpl",
17-
{
18-
"compute_groups": module.compute,
19-
},
20-
)
21-
filename = "../inventory/group_vars/all/partitions.yml" # as all/ is created by skeleton
22-
}

environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tpl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ all:
22
vars:
33
openhpc_cluster_name: ${cluster_name}
44
cluster_domain_suffix: ${cluster_domain_suffix}
5+
cluster_compute_groups: ${jsonencode(keys(compute_groups))}
56

67
control:
78
hosts:

environments/skeleton/{{cookiecutter.environment}}/tofu/partitions.tpl

Lines changed: 0 additions & 5 deletions
This file was deleted.

requirements.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ roles:
44
version: v25.3.2
55
name: stackhpc.nfs
66
- src: https://github.com/stackhpc/ansible-role-openhpc.git
7-
version: v0.28.0
7+
version: feat/nodegroups # TODO: bump to a tagged release before merging - presumably a feature-branch ref (the previous pin was the tag v0.28.0), so installs are not reproducible
88
name: stackhpc.openhpc
99
- src: https://github.com/stackhpc/ansible-node-exporter.git
1010
version: stackhpc

0 commit comments

Comments
 (0)