Skip to content

Commit 5a78679

Browse files
authored
Merge pull request #403 from ComputeCanada/cloud-failure-handling
Improve cloud-init failure handling
2 parents 6cf9ba5 + a8b4109 commit 5a78679

3 files changed

Lines changed: 66 additions & 27 deletions

File tree

common/configuration/puppet.yaml.tftpl

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ users:
2929
%{ endfor ~}
3030

3131
runcmd:
32+
- "flag_cloud_init_failed () { echo \"$${BASH_COMMAND}\" >> /run/cloud-init-failed; }"
33+
- trap 'flag_cloud_init_failed' ERR
3234
- chmod 755 /etc # avoid issue with Rocky 9.4
3335
- test ! -d /${sudoer_username} && userdel -f -r ${sudoer_username} && cloud-init clean -r
3436
%{ if cloud_provider != "incus" }
@@ -47,12 +49,13 @@ runcmd:
4749
- |
4850
if ! test -f /etc/magic-castle-release; then
4951
# Enable fastest mirror for distribution using dnf package manager
52+
dnf -y install dnf-plugins-core
5053
dnf config-manager --setopt=fastestmirror=True --save
5154
# If the image does not have openssh-server installed but sshd_config still exists,
5255
# installing the new RPM will not overwrite the file and depending on the file
5356
# content it might be catastrophic (some sshd_config are empty, some miss essential lines).
5457
# Therefore when openssh-server is not installed, we remove sshd_config before installing it.
55-
"[ -z $(rpm -qa openssh-server) ] && rm -f /etc/ssh/sshd_config"
58+
[ -z "$(rpm -qa openssh-server)" ] && rm -f /etc/ssh/sshd_config
5659
dnf -y install openssh openssh-server rsync
5760
echo -e "Include /etc/ssh/sshd_config.d/50-authenticationmethods.conf" >> /etc/ssh/sshd_config
5861
sed -i '/HostKey \/etc\/ssh\/ssh_host_ecdsa_key/ s/^#*/#/' /etc/ssh/sshd_config
@@ -73,9 +76,11 @@ runcmd:
7376
dnf -y install openvox-agent-8.23.1
7477
install -m 700 /dev/null /opt/puppetlabs/bin/postrun
7578
# kernel configuration
79+
%{ if cloud_provider != "incus" ~}
7680
systemctl disable kdump
7781
grubby --update-kernel=ALL --args="rd.driver.blacklist=nouveau nouveau.modeset=0 crashkernel=0M"
7882
grub2-mkconfig -o /boot/grub2/grub.cfg
83+
%{ endif ~}
7984
fi
8085
%{ if contains(tags, "puppet") }
8186
# Install puppetserver
@@ -156,6 +161,7 @@ runcmd:
156161
# If the current image has already been configured with Magic Castle Puppet environment,
157162
# we can start puppet and skip reboot, reducing the delay for bringing the node up.
158163
- test -f /etc/magic-castle-release && systemctl start puppet || true
164+
- test -f /run/cloud-init-failed && echo 'WARNING - some steps cloud-init runcmd failed, listed in /run/cloud-init-failed. Manual fixing and rebooting required. ' | tee /etc/motd || true
159165

160166
write_files:
161167
# If the ip addresses of the puppet servers are not known in advance, we cannot restrict the ssh connection to them.
@@ -172,7 +178,8 @@ write_files:
172178
facts : {
173179
blocklist : [
174180
"EC2", "az_metadata", "cloud.provider", "hypervisors"
175-
%{ if cloud_provider != "gcp" },"GCE",%{ endif }
181+
%{ if cloud_provider != "gcp" },"GCE"%{ endif }
182+
%{ if cloud_provider == "incus" },"kmods"%{ endif }
176183
],
177184
}
178185
path: /etc/puppetlabs/facter/facter.conf
@@ -255,7 +262,7 @@ output: { all: "| tee -a /var/log/cloud-init-output.log" }
255262
power_state:
256263
delay: now
257264
mode: reboot
258-
condition: test ! -f /etc/magic-castle-release
265+
condition: test ! -f /etc/magic-castle-release && test ! -f /run/cloud-init-failed
259266
260267
# Configure owner of /var/log/cloud-init.log
261268
syslog_fix_perms: root:systemd-journal

common/variables.tf

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,10 @@ variable "config_git_url" {
8282
variable "config_version" {
8383
type = string
8484
description = "Tag, branch, or commit that specifies which Puppet configuration revision is to be used"
85+
validation {
86+
condition = length(var.config_version) >= 1
87+
error_message = "The config_version variable cannot be an empty string. It must match a commit hash, a tag or a branch."
88+
}
8589
}
8690

8791
variable "hieradata" {

docs/README.md

Lines changed: 52 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -335,10 +335,14 @@ destroy the cluster or change it manually on the Puppet server.
335335

336336
Since Magic Castle configuration is managed with git, it is possible to specify
337337
which version of the configuration you wish to use. Typically, it will match the
338-
version number of the release you have downloaded (i.e: `9.3`).
338+
version number of the release you have downloaded (e.g., `15.1.0`).
339339

340340
**Requirement**: Must refer to a git commit, tag or branch existing
341-
in the git repository pointed by `config_git_url`.
341+
in the git repository pointed to by `config_git_url`. It cannot be an empty string.
342+
343+
**Warning**: The validity of the string as a git reference is not verified. In the
344+
event it is invalid, Magic Castle defaults to using the latest release tag available
345+
and logs a warning in the puppet server message of the day (`/etc/motd`).
342346

343347
**Post build modification effect**: none. To change the Puppet configuration version,
344348
destroy the cluster or change it manually on the Puppet server.
@@ -617,7 +621,7 @@ available models per region
617621
618622
##### Incus
619623
620-
- `target`: name of the [specific cluster member](https://linuxcontainers.org/incus/docs/main/howto/cluster_manage_instance/#launch-an-instance-on-a-specific-cluster-member) to deploy the instance. **Only use with Incus cluster.**
624+
- `target`: name of the [specific cluster member](https://linuxcontainers.org/incus/docs/main/howto/cluster_manage_instance/#launch-an-instance-on-a-specific-cluster-member) to deploy the instance. **Only use with Incus cluster.**
621625
622626
#### 4.7.3 Post build modification effect
623627
@@ -1383,35 +1387,59 @@ for more information.
13831387
13841388
## 8. Deployment
13851389
1386-
To create the resources defined by your main, enter the following command
1387-
```
1390+
To create the resources defined in your Terraform configuration, run:
1391+
1392+
```bash
13881393
terraform apply
13891394
```
13901395
1391-
The command will produce the same output as the `plan` command, but after
1392-
the output it will ask for a confirmation to perform the proposed actions.
1393-
Enter `yes`.
1396+
This command will first display the execution plan (equivalent to `terraform plan`) and then prompt you to confirm the proposed actions. Type `yes` to proceed.
1397+
1398+
Terraform will then create the infrastructure resources defined in the configuration. This step typically takes a few minutes. Once completed, Terraform will output:
1399+
1400+
- Guest account usernames and passwords
1401+
- The sudo-enabled username
1402+
- The floating IP address of the login node
1403+
1404+
### Important: Cluster Readiness
1405+
1406+
Although Terraform reports completion once the connection information is displayed,
1407+
**the cluster is not immediately ready for use**.
1408+
1409+
Instance creation is only the first phase of the cluster build. A second, automated configuration phase follows, during which Magic Castle installs and configures core components such as
1410+
user accounts, FreeIPA, Slurm, and JupyterHub.
13941411
1395-
Terraform will then proceed to create the resources defined by the
1396-
configuration file. It should take a few minutes. Once the creation process
1397-
is completed, Terraform will output the guest account usernames and password,
1398-
the sudoer username and the floating ip of the login
1399-
node.
1412+
This configuration phase typically takes **approximately 15 minutes** after the instances are created.
14001413
1401-
**Warning**: although the instance creation process is finished once Terraform
1402-
outputs the connection information, you will not be able to
1403-
connect and use the cluster immediately. The instance creation is only the
1404-
first phase of the cluster-building process. The configuration: the
1405-
creation of the user accounts, installation of FreeIPA, Slurm, configuration
1406-
of JupyterHub, etc.; takes around 15 minutes after the instances are created.
1414+
### Instance Configuration Process
1415+
1416+
Each instance goes through a two-stage configuration process:
1417+
1418+
1. **cloud-init**
1419+
- Upgrades operating system packages
1420+
- Installs Puppet
1421+
2. **Puppet**
1422+
- Installs and configures software based on the instance role, as defined by instance tags (e.g. `node`)
1423+
1424+
#### Logs and Troubleshooting
1425+
1426+
Logs for each stage are available at:
1427+
1428+
1. **cloud-init**: `/var/log/cloud-init-output.log`
1429+
2. **Puppet**: `journalctl -u puppet`
1430+
1431+
If an error occurs during the first (cloud-init) stage, a warning is displayed in the instance
1432+
message of the day (`/etc/motd`). The failed commands are recorded in:
1433+
1434+
```
1435+
/run/cloud-init-failed
1436+
```
14071437
1408-
Once it is booted, you can follow an instance configuration process by looking at:
1438+
Because successful completion of the first stage is required for the second stage to proceed, the configuration process halts if cloud-init fails.
14091439
1410-
* `/var/log/cloud-init-output.log`
1411-
* `journalctl -u puppet`
1440+
You may resume the configuration by manually re-running the failed commands listed in `/run/cloud-init-failed` once the underlying issue has been resolved.
14121441
1413-
If unexpected problems occur during configuration, you can provide these
1414-
logs to the authors of Magic Castle to help you debug.
1442+
Failures during the first stage are rare and are most often caused by external dependencies, such as temporary unavailability of GitHub or package repositories.
14151443
14161444
### 8.1 Deployment Customization
14171445

0 commit comments

Comments
 (0)