From 984be6ee504cce9a368cd6c277309424043104fb Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Mon, 10 Mar 2025 15:47:01 +0000 Subject: [PATCH 1/6] update docs for compute-init dev --- ansible/roles/compute_init/README.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md index 16f0987e9..e64ea6ffb 100644 --- a/ansible/roles/compute_init/README.md +++ b/ansible/roles/compute_init/README.md @@ -151,7 +151,11 @@ a new image: 3. Add metadata to a compute node e.g. via Horizon to turn on compute-init playbook functionality. -4. Fake an image build to deploy the compute-init playbook: +4. Stop ansible-init from running: + + ansible all -ba "systemctl stop ansible-init" + +5. Fake an image build to deploy the compute-init playbook: ansible-playbook ansible/fatimage.yml --tags compute_init @@ -159,16 +163,13 @@ a new image: in the builder group, which conveniently means any changes made to that play also get picked up. -5. Fake a reimage of compute to run ansible-init and the compute-init playbook: - - On compute node where metadata was added: +6. Fake a reimage of compute to run ansible-init and the updated compute-init playbook: - [root@rl9-compute-0 rocky]# rm -f /var/lib/ansible-init.done && systemctl restart ansible-init - [root@rl9-compute-0 rocky]# systemctl status ansible-init + ansible all -b -m shell -a "rm -f /var/lib/ansible-init.done && systemctl restart ansible-init" Use `systemctl status ansible-init` to view stdout/stderr from Ansible. -Steps 4/5 can be repeated with changes to the compute script. If required, +Steps 4/5/6 can be repeated with changes to the compute script. If required, reimage the compute node(s) first as in step 2 and/or add additional metadata as in step 3. 
From 08c119c7c933c0948adae8fbe8ab27cb3f6a24ba Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Mon, 10 Mar 2025 15:53:38 +0000 Subject: [PATCH 2/6] allow numerical sort of compute-init playbook --- ansible/roles/compute_init/tasks/install.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/compute_init/tasks/install.yml b/ansible/roles/compute_init/tasks/install.yml index cbacb062e..6032eed53 100644 --- a/ansible/roles/compute_init/tasks/install.yml +++ b/ansible/roles/compute_init/tasks/install.yml @@ -60,7 +60,7 @@ - name: Add compute initialisation playbook copy: src: compute-init.yml - dest: /etc/ansible-init/playbooks/1-compute-init.yml + dest: /etc/ansible-init/playbooks/10-compute-init.yml owner: root group: root mode: 0644 From 67a42688eae70925bd9ff8fd069685a7193609a3 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Mon, 10 Mar 2025 15:56:04 +0000 Subject: [PATCH 3/6] optimise copy of directories in compute-init --- ansible/roles/compute_init/tasks/install.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/ansible/roles/compute_init/tasks/install.yml b/ansible/roles/compute_init/tasks/install.yml index 6032eed53..8288b65fe 100644 --- a/ansible/roles/compute_init/tasks/install.yml +++ b/ansible/roles/compute_init/tasks/install.yml @@ -16,12 +16,14 @@ - roles - name: Inject files from roles - copy: + synchronize: src: '{{ item.src }}' dest: '/etc/ansible-init/playbooks/{{ item.dest }}' - owner: root - group: root - mode: 0644 + archive: false + rsync_opts: ["-p", "--chmod=D770,F644", "--owner=root", "--group=root"] + recursive: true + use_ssh_args: true + become: true loop: - src: ../../resolv_conf/templates/resolv.conf.j2 dest: templates/resolv.conf.j2 From 19af5a9633addd813eba44b3a6da1fbc33c66bdd Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 11 Mar 2025 00:07:59 +0000 Subject: [PATCH 4/6] bump images --- environments/.stackhpc/tofu/cluster_image.auto.tfvars.json | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index 3e8293206..f25caa5e7 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250305-1110-534ed276", - "RL9": "openhpc-RL9-250305-1110-534ed276" + "RL8": "openhpc-RL8-250310-1757-82d85eb5", + "RL9": "openhpc-RL9-250310-1756-82d85eb5" } } From d05208bc180832adcb916b60656e5f6e15769bec Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 11 Mar 2025 10:11:05 +0000 Subject: [PATCH 5/6] compute init functionality for new nodes added to existing cluster --- ansible/roles/compute_init/files/compute-init.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 906961c96..bf486f5b2 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -95,6 +95,20 @@ - meta: end_play when: _mount_mnt_cluster.failed + - name: Check if hostvars exist + stat: + path: "/mnt/cluster/hostvars/{{ ansible_hostname }}/hostvars.yml" + register: hostvars_stat + + - block: + - name: Report skipping initialization if host vars does not exist + # meta: end_play produces no output + debug: + msg: "Skipping compute initialization: hostvars does not exist" + + - meta: end_play + when: not hostvars_stat.stat.exists + - name: Load hostvars from NFS # this is higher priority than vars block = normal ansible's hostvars include_vars: From 603de947d93c19165b8f62e5f21c9dad9f970e80 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 11 Mar 2025 11:59:38 +0000 Subject: [PATCH 6/6] bump images --- environments/.stackhpc/tofu/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) 
diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index f25caa5e7..0af12befc 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250310-1757-82d85eb5", - "RL9": "openhpc-RL9-250310-1756-82d85eb5" + "RL8": "openhpc-RL8-250311-1020-d05208bc", + "RL9": "openhpc-RL9-250311-1020-d05208bc" } }