Skip to content

Commit 9876739

Browse files
committed
fix: verify rescue in stateInstallTalos before installing
Root cause of repeated installer failures: SSH was reachable, but the server was running Debian rather than the rescue system. stateCheckRescueActive already had rescue detection, but stateInstallTalos did not — it assumed that a reachable SSH port meant the server was in rescue. Fix: before installing, verify that the hostname is "rescue" or that /etc/hetzner-build exists. If the system is not rescue, fix the EFI boot order (delete non-PXE entries), reboot, and re-activate rescue; the server will then PXE boot into rescue on the next cycle. Affected servers: node3, node5, node9, node1, storage-7 — all servers with a pre-existing OS (Debian from cephadm or old Talos).
1 parent ce44a3e commit 9876739

File tree

1 file changed

+38
-0
lines changed

1 file changed

+38
-0
lines changed

controllers/hetznerrobotmachine_controller.go

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -520,6 +520,44 @@ func (r *HetznerRobotMachineReconciler) stateInstallTalos(
520520
return ctrl.Result{RequeueAfter: requeueAfterShort}, nil
521521
}
522522

523+
// Verify the system is actually rescue before installing.
524+
// A pre-existing OS (Debian from cephadm, old Talos, etc.) has SSH on port 22
525+
// but is NOT rescue — installing Talos on a running OS fails silently (exit 1).
526+
{
527+
privateKey, keyErr := r.getSSHPrivateKey(ctx, hrc)
528+
if keyErr == nil {
529+
verifyClient := sshrescue.New(serverIP, privateKey)
530+
if connErr := verifyClient.Connect(); connErr == nil {
531+
out, _ := verifyClient.Run("([ \"$(hostname)\" = \"rescue\" ] || test -f /etc/hetzner-build) && echo RESCUE || echo NOT_RESCUE")
532+
verifyClient.Close()
533+
if strings.TrimSpace(out) != "RESCUE" {
534+
logger.Info("SSH reachable but NOT rescue (existing OS), fixing EFI and rebooting into rescue",
535+
"ip", serverIP, "hostname", strings.TrimSpace(out))
536+
// Fix EFI boot order — delete non-PXE entries so next boot goes to PXE rescue
537+
fixClient := sshrescue.New(serverIP, privateKey)
538+
if fixErr := fixClient.Connect(); fixErr == nil {
539+
_, _ = fixClient.Run(`
540+
if command -v efibootmgr > /dev/null 2>&1; then
541+
mount -o remount,rw /sys/firmware/efi/efivars 2>/dev/null || \
542+
mount -t efivarfs efivarfs /sys/firmware/efi/efivars 2>/dev/null || true
543+
for entry in $(efibootmgr 2>/dev/null | grep '^Boot[0-9A-Fa-f]' | grep -iv 'pxe\|network\|ipv4\|ipv6' | grep -o '^Boot[0-9A-Fa-f]*' | sed 's/Boot//'); do
544+
efibootmgr -b "$entry" -B 2>/dev/null
545+
done
546+
fi
547+
nohup bash -c 'sleep 1 && reboot' &>/dev/null &
548+
`)
549+
fixClient.Close()
550+
}
551+
// Re-activate rescue for the PXE boot
552+
sshFingerprint, _ := r.getSSHKeyFingerprint(ctx, hrc)
553+
_, _ = robotClient.ActivateRescue(ctx, serverID, sshFingerprint)
554+
hrm.Status.ProvisioningState = infrav1.StateActivatingRescue
555+
return ctrl.Result{RequeueAfter: 90 * time.Second}, nil
556+
}
557+
}
558+
}
559+
}
560+
523561
logger.Info("Installing Talos via rescue SSH", "ip", serverIP)
524562

525563
// Get private key

0 commit comments

Comments
 (0)