Skip to content

Commit 2bea51c

Browse files
committed
review suggestions
1 parent b903cdd commit 2bea51c

File tree

8 files changed

+21
-160
lines changed

8 files changed

+21
-160
lines changed

.github/workflows/stackhpc.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,6 @@ jobs:
183183
. venv/bin/activate
184184
. environments/.stackhpc/activate
185185
ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml
186-
ansible all -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
187186
ansible-playbook -v ansible/ci/check_slurm.yml
188187
189188
- name: Check sacct state survived reimage

ansible/roles/compute_init/README.md

Lines changed: 2 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -42,30 +42,8 @@ The following roles/groups are currently fully functional:
4242
node and all compute nodes.
4343
- `openhpc`: all functionality
4444

45-
All of the above are defined in the skeleton cookiecutter config, and are
46-
toggleable via a terraform compute_init autovar file. In the .stackhpc
47-
environment, the compute init roles are set by default to:
48-
- `enable_compute`: This encompasses the openhpc role functionality while being
49-
a global toggle for the entire compute-init script.
50-
- `etc_hosts`
51-
- `nfs`
52-
- `basic_users`
53-
- `eessi`
54-
55-
# CI workflow
56-
57-
The compute node rebuild is tested in CI after the tests for rebuilding the
58-
login and control nodes. The process follows
59-
60-
1. Compute nodes are reimaged:
61-
62-
ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml
63-
64-
2. Ansible-init runs against newly reimaged compute nodes
65-
66-
3. Run sinfo and check nodes have expected slurm state
67-
68-
ansible-playbook -v ansible/ci/check_slurm.yml
45+
The above may be enabled by listing their names in the `compute_init_enable`
46+
property on the terraform `compute` variable.
6947

7048
# Development/debugging
7149

docs/experimental/compute-init.md

Lines changed: 8 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -2,112 +2,17 @@
22

33
See the role README.md
44

5-
# Results/progress
5+
# CI workflow
66

7-
Without any metadata:
7+
The compute node rebuild is tested in CI after the tests for rebuilding the
8+
login and control nodes. The process is as follows:
89

9-
[root@rl9-compute-0 rocky]# systemctl status ansible-init
10-
● ansible-init.service
11-
Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled)
12-
Active: activating (start) since Fri 2024-12-13 20:41:16 UTC; 1min 45s ago
13-
Main PID: 16089 (ansible-init)
14-
Tasks: 8 (limit: 10912)
15-
Memory: 99.5M
16-
CPU: 11.687s
17-
CGroup: /system.slice/ansible-init.service
18-
├─16089 /usr/lib/ansible-init/bin/python /usr/bin/ansible-init
19-
├─16273 /usr/lib/ansible-init/bin/python3.9 /usr/lib/ansible-init/bin/ansible-playbook --connection local --inventory 127.0.0.1, /etc/ansible-init/playbooks/1-compute-init.yml
20-
├─16350 /usr/lib/ansible-init/bin/python3.9 /usr/lib/ansible-init/bin/ansible-playbook --connection local --inventory 127.0.0.1, /etc/ansible-init/playbooks/1-compute-init.yml
21-
├─16361 /bin/sh -c "/usr/bin/python3 /root/.ansible/tmp/ansible-tmp-1734122485.9542894-16350-45936546411977/AnsiballZ_mount.py && sleep 0"
22-
├─16362 /usr/bin/python3 /root/.ansible/tmp/ansible-tmp-1734122485.9542894-16350-45936546411977/AnsiballZ_mount.py
23-
├─16363 /usr/bin/mount /mnt/cluster
24-
└─16364 /sbin/mount.nfs 192.168.10.12:/exports/cluster /mnt/cluster -o ro,sync
10+
1. Compute nodes are reimaged:
2511

26-
Dec 13 20:41:24 rl9-compute-0.rl9.invalid ansible-init[16273]: ok: [127.0.0.1]
27-
Dec 13 20:41:24 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [Report skipping initialization if not compute node] **********************
28-
Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: skipping: [127.0.0.1]
29-
Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [meta] ********************************************************************
30-
Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: skipping: [127.0.0.1]
31-
Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [Ensure the mount directory exists] ***************************************
32-
Dec 13 20:41:25 rl9-compute-0.rl9.invalid python3[16346]: ansible-file Invoked with path=/mnt/cluster state=directory owner=root group=root mode=u=rwX,go= recurse=False force=False follow=True modification_time_format=%Y%m%d%H%M.%S access>
33-
Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: changed: [127.0.0.1]
34-
Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [Mount /mnt/cluster] ******************************************************
35-
Dec 13 20:41:26 rl9-compute-0.rl9.invalid python3[16362]: ansible-mount Invoked with path=/mnt/cluster src=192.168.10.12:/exports/cluster fstype=nfs opts=ro,sync state=mounted boot=True dump=0 passno=0 backup=False fstab=None
36-
[root@rl9-compute-0 rocky]# systemctl status ansible-init
12+
ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml
3713

38-
Added metadata via horizon:
14+
2. Ansible-init runs against newly reimaged compute nodes
3915

40-
compute_groups ["compute"]
16+
3. Run `sinfo` and check that nodes have the expected Slurm state:
4117

42-
43-
OK:
44-
45-
[root@rl9-compute-0 rocky]# systemctl status ansible-init
46-
● ansible-init.service
47-
Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled)
48-
Active: active (exited) since Fri 2024-12-13 20:43:31 UTC; 33s ago
49-
Process: 16089 ExecStart=/usr/bin/ansible-init (code=exited, status=0/SUCCESS)
50-
Main PID: 16089 (code=exited, status=0/SUCCESS)
51-
CPU: 13.003s
52-
53-
Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: ok: [127.0.0.1] => {
54-
Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: "msg": "Skipping compute initialization as cannot mount exports/cluster share"
55-
Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: }
56-
Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [meta] ********************************************************************
57-
Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: PLAY RECAP *********************************************************************
58-
Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: 127.0.0.1 : ok=4 changed=1 unreachable=0 failed=0 skipped=1 rescued=0 ignored=1
59-
Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16089]: [INFO] executing remote playbooks for stage - post
60-
Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16089]: [INFO] writing sentinel file /var/lib/ansible-init.done
61-
Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16089]: [INFO] ansible-init completed successfully
62-
Dec 13 20:43:31 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service.
63-
64-
Now run site.yml, then restart ansible-init again:
65-
66-
67-
[root@rl9-compute-0 rocky]# systemctl status ansible-init
68-
● ansible-init.service
69-
Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled)
70-
Active: active (exited) since Fri 2024-12-13 20:50:10 UTC; 11s ago
71-
Process: 18921 ExecStart=/usr/bin/ansible-init (code=exited, status=0/SUCCESS)
72-
Main PID: 18921 (code=exited, status=0/SUCCESS)
73-
CPU: 8.240s
74-
75-
Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: TASK [Report skipping initialization if cannot mount nfs] **********************
76-
Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: skipping: [127.0.0.1]
77-
Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: TASK [meta] ********************************************************************
78-
Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: skipping: [127.0.0.1]
79-
Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: PLAY RECAP *********************************************************************
80-
Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: 127.0.0.1 : ok=3 changed=1 unreachable=0 failed=0 skipped=2 rescued=0 ignored=0
81-
Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[18921]: [INFO] executing remote playbooks for stage - post
82-
Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[18921]: [INFO] writing sentinel file /var/lib/ansible-init.done
83-
Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[18921]: [INFO] ansible-init completed successfully
84-
Dec 13 20:50:10 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service.
85-
[root@rl9-compute-0 rocky]# ls /mnt/cluster/host
86-
hosts hostvars/
87-
[root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute-
88-
rl9-compute-0/ rl9-compute-1/
89-
[root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute-
90-
rl9-compute-0/ rl9-compute-1/
91-
[root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute-0/
92-
hostvars.yml
93-
94-
This commit - shows that hostvars have loaded:
95-
96-
[root@rl9-compute-0 rocky]# systemctl status ansible-init
97-
● ansible-init.service
98-
Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled)
99-
Active: active (exited) since Fri 2024-12-13 21:06:20 UTC; 5s ago
100-
Process: 27585 ExecStart=/usr/bin/ansible-init (code=exited, status=0/SUCCESS)
101-
Main PID: 27585 (code=exited, status=0/SUCCESS)
102-
CPU: 8.161s
103-
104-
Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: TASK [Demonstrate hostvars have loaded] ****************************************
105-
Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: ok: [127.0.0.1] => {
106-
Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: "prometheus_version": "2.27.0"
107-
Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: }
108-
Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: PLAY RECAP *********************************************************************
109-
Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: 127.0.0.1 : ok=5 changed=0 unreachable=0 failed=0 skipped=2 rescued=0 ignored=0
110-
Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] executing remote playbooks for stage - post
111-
Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] writing sentinel file /var/lib/ansible-init.done
112-
Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] ansible-init completed successfully
113-
Dec 13 21:06:20 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service.
18+
ansible-playbook -v ansible/ci/check_slurm.yml

environments/.stackhpc/terraform/compute_init.auto.tfvars

Lines changed: 0 additions & 7 deletions
This file was deleted.

environments/.stackhpc/terraform/main.tf

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,10 +58,6 @@ variable "k3s_token" {
5858
type = string
5959
}
6060

61-
variable "compute_init_enable" {
62-
type = list(string)
63-
}
64-
6561
data "openstack_images_image_v2" "cluster" {
6662
name = var.cluster_image[var.os_version]
6763
most_recent = true
@@ -78,7 +74,6 @@ module "cluster" {
7874
cluster_image_id = data.openstack_images_image_v2.cluster.id
7975
control_node_flavor = var.control_node_flavor
8076
k3s_token = var.k3s_token
81-
compute_init_enable = var.compute_init_enable
8277

8378
login_nodes = {
8479
login-0: var.other_node_flavor
@@ -87,6 +82,7 @@ module "cluster" {
8782
standard: { # NB: can't call this default!
8883
nodes: ["compute-0", "compute-1"]
8984
flavor: var.other_node_flavor
85+
compute_init_enable: ["compute", "etc_hosts", "nfs", "basic_users", "eessi"]
9086
}
9187
# Example of how to add another partition:
9288
# extra: {

environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,5 @@ module "compute" {
1919
control_address = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0]
2020
security_group_ids = [for o in data.openstack_networking_secgroup_v2.nonlogin: o.id]
2121

22-
compute_init_enable = var.compute_init_enable
22+
compute_init_enable = each.value.compute_init_enable
2323
}

environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -44,18 +44,14 @@ resource "openstack_compute_instance_v2" "compute" {
4444
access_network = true
4545
}
4646

47-
metadata = {
48-
environment_root = var.environment_root
49-
k3s_token = var.k3s_token
50-
control_address = var.control_address
51-
enable_compute = contains(var.compute_init_enable, "compute")
52-
enable_resolv_conf = contains(var.compute_init_enable, "resolv_conf")
53-
enable_etc_hosts = contains(var.compute_init_enable, "etc_hosts")
54-
enable_nfs = contains(var.compute_init_enable, "nfs")
55-
enable_manila = contains(var.compute_init_enable, "manila")
56-
enable_basic_users = contains(var.compute_init_enable, "basic_users")
57-
enable_eessi = contains(var.compute_init_enable, "eessi")
58-
}
47+
metadata = merge(
48+
{
49+
environment_root = var.environment_root
50+
k3s_token = var.k3s_token
51+
control_address = var.control_address
52+
},
53+
{for e in var.compute_init_enable: e => true}
54+
)
5955

6056
user_data = <<-EOF
6157
#cloud-config

environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ variable "compute" {
5252
image_id: Overrides variable cluster_image_id
5353
vnic_type: Overrides variable vnic_type
5454
vnic_profile: Overrides variable vnic_profile
55-
compute_init_enable: Toggles ansible-init rebuild
55+
compute_init_enable: Toggles compute-init rebuild (see compute-init role docs)
5656
EOF
5757
}
5858

@@ -136,10 +136,4 @@ variable "root_volume_size" {
136136
variable "k3s_token" {
137137
description = "K3s cluster authentication token, set automatically by Ansible"
138138
type = string
139-
}
140-
141-
variable "compute_init_enable" {
142-
type = list(string)
143-
description = "Groups to activate for ansible-init compute rebuilds"
144-
default = []
145139
}

0 commit comments

Comments
 (0)