From cccd6c9c74d2362a87e28f000bf1ceb69ebfb680 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Mon, 31 Mar 2025 17:18:19 +0100 Subject: [PATCH 1/2] Compute-Init: wait for cloud-init before NFS mount We are seeing issues where compute-init hits: TASK [Check if hostvars exist] FAILED! => {"changed": false, "msg": "Permission denied"} We have found we are ignoring errors on the mount. It's possible the mount will fail if the host networking has not been set up. Let's wait to make sure we can talk to NFS before attempting the NFS mount, mostly checking because the host networking stack might not yet be set up correctly. We could do "cloud-init status --wait" and block on cloud-init having finished; however, we don't really depend on all parts of cloud-init being complete. Equally, we could think about ansible-init systemd unit file depending on cloud-init or the network being available, but there are cases where we do not want that. --- ansible/roles/compute_init/files/compute-init.yml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index b09bd7f3b..25af01154 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -61,7 +61,13 @@ owner: slurm group: root mode: u=rX,g=rwX,o= - + + - name: Wait for NFS to reachable (checks host network up) + ansible.builtin.wait_for: + port: 2049 + host: '{{ server_node_ip }}' + timeout: 120 + - name: Mount /mnt/cluster mount: path: /mnt/cluster @@ -70,8 +76,6 @@ opts: ro,sync state: mounted register: _mount_mnt_cluster - ignore_errors: true - # TODO: add some retries here? 
- block: - name: Report skipping initialization if cannot mount nfs From 07cebd1a56eccd3a9f1df82f57fa818c9579b76e Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Mon, 31 Mar 2025 18:05:23 +0100 Subject: [PATCH 2/2] bump images --- environments/.stackhpc/tofu/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index 7e213c00c..01f61fa0a 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250326-1048-3e132168", - "RL9": "openhpc-RL9-250326-1049-3e132168" + "RL8": "openhpc-RL8-250331-1627-cccd6c9c", + "RL9": "openhpc-RL9-250331-1627-cccd6c9c" } }