Skip to content

Commit c09fb2e

Browse files
committed
🌱 Before using the ssh Rescue Client, ensure the rescue system is active.
1 parent 95b628c commit c09fb2e

File tree

2 files changed

+58
-316
lines changed

2 files changed

+58
-316
lines changed

pkg/services/baremetal/host/host.go

Lines changed: 52 additions & 74 deletions
Original file line number | Diff line number | Diff line change
@@ -592,46 +592,18 @@ func (s *Service) actionRegistering(_ context.Context) actionResult {
592592
}
593593
sshClient := s.scope.SSHClientFactory.NewClient(in)
594594

595-
// Check hostname with sshClient
596-
out := sshClient.GetHostName()
597-
hostName := trimLineBreak(out.StdOut)
598-
if hostName != rescue {
599-
// give the reboot some time until it takes effect
600-
if s.hasJustRebooted() {
601-
return actionContinue{delay: 2 * time.Second}
602-
}
603-
604-
isSSHTimeoutError, isSSHConnectionRefusedError, err := s.analyzeSSHOutputRegistering(out)
605-
if err != nil {
606-
// This can happen if the bare-metal server was taken by another mgt-cluster.
607-
// Check in https://robot.hetzner.com/server for the "History" of the server.
608-
return actionError{err: fmt.Errorf("failed to handle incomplete boot - registering: %w", err)}
609-
}
610-
611-
failed, err := s.handleIncompleteBoot(true, isSSHTimeoutError, isSSHConnectionRefusedError)
612-
if failed {
613-
return s.recordActionFailure(infrav1.PermanentError, err.Error())
614-
}
615-
if err != nil {
616-
return actionError{err: fmt.Errorf(errMsgFailedHandlingIncompleteBoot, err)}
617-
}
618-
timeSinceReboot := "unknown"
619-
if s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated != nil {
620-
timeSinceReboot = time.Since(s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated.Time).String()
621-
}
622-
623-
s.scope.Logger.Info("Could not reach rescue system. Will retry some seconds later.", "out", out.String(), "hostName", hostName,
624-
"isSSHTimeoutError", isSSHTimeoutError, "isSSHConnectionRefusedError", isSSHConnectionRefusedError, "timeSinceReboot", timeSinceReboot)
625-
return actionContinue{delay: 10 * time.Second}
595+
ok, res := s.validateRescueSystemIsActive(sshClient)
596+
if !ok {
597+
return res
626598
}
627-
628599
output := sshClient.GetHardwareDetailsDebug()
629600
if output.Err != nil {
630601
return actionError{err: fmt.Errorf("failed to obtain hardware for debugging: %w", output.Err)}
631602
}
603+
632604
msg := fmt.Sprintf("%s\n\n", output.StdOut)
633-
if out.StdErr != "" {
634-
msg += fmt.Sprintf("stderr:\n%s\n\n", out.StdErr)
605+
if output.StdErr != "" {
606+
msg += fmt.Sprintf("stderr:\n%s\n\n", output.StdErr)
635607
}
636608
record.Eventf(s.scope.HetznerBareMetalHost, "GetHardwareDetails", msg)
637609

@@ -711,6 +683,42 @@ func (s *Service) actionRegistering(_ context.Context) actionResult {
711683
return actionComplete{}
712684
}
713685

686+
// validateRescueSystemIsActive asks the host for its name via sshClient and
// checks that it equals the rescue constant.
//
// It returns ok=true (together with an empty actionContinue) when the host name
// matches. Otherwise ok is false and ar is the action result for the caller to
// return, decided in this order:
//   - actionContinue with a 2s delay if s.hasJustRebooted() reports true,
//   - actionError if analyzeSSHOutputRegistering returns an error,
//   - the result of recordActionFailure (PermanentError) if handleIncompleteBoot
//     reports failure,
//   - actionError if handleIncompleteBoot returns an error without failure,
//   - actionContinue with a 10s delay in every remaining case.
func (s *Service) validateRescueSystemIsActive(sshClient sshclient.Client) (ok bool, ar actionResult) {
687+
// Check hostname with sshClient
688+
out := sshClient.GetHostName()
689+
hostName := trimLineBreak(out.StdOut)
690+
if hostName == rescue {
691+
return true, actionContinue{}
692+
}
693+
// give the reboot some time until it takes effect
694+
if s.hasJustRebooted() {
695+
return false, actionContinue{delay: 2 * time.Second}
696+
}
697+
698+
// Classify the SSH output of the host name query into timeout / connection refused.
isSSHTimeoutError, isSSHConnectionRefusedError, err := s.analyzeSSHOutputRegistering(out)
699+
if err != nil {
700+
// This can happen if the bare-metal server was taken by another mgt-cluster.
701+
// Check in https://robot.hetzner.com/server for the "History" of the server.
702+
return false, actionError{err: fmt.Errorf("failed to handle incomplete boot - registering: %w", err)}
703+
}
704+
705+
failed, err := s.handleIncompleteBoot(true, isSSHTimeoutError, isSSHConnectionRefusedError)
706+
// NOTE(review): err.Error() assumes handleIncompleteBoot always returns a
// non-nil err when failed is true; otherwise this dereferences nil — confirm.
if failed {
707+
return false, s.recordActionFailure(infrav1.PermanentError, err.Error())
708+
}
709+
if err != nil {
710+
return false, actionError{err: fmt.Errorf(errMsgFailedHandlingIncompleteBoot, err)}
711+
}
712+
// timeSinceReboot is only used in the log line below; it stays "unknown"
// when LastUpdated is not set.
timeSinceReboot := "unknown"
713+
if s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated != nil {
714+
timeSinceReboot = time.Since(s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated.Time).String()
715+
}
716+
717+
s.scope.Logger.Info("Could not reach rescue system. Will retry some seconds later.", "out", out.String(), "hostName", hostName,
718+
"isSSHTimeoutError", isSSHTimeoutError, "isSSHConnectionRefusedError", isSSHConnectionRefusedError, "timeSinceReboot", timeSinceReboot)
719+
return false, actionContinue{delay: 10 * time.Second}
720+
}
721+
714722
func validateRootDeviceWwnsAreSubsetOfExistingWwns(rootDeviceHints *infrav1.RootDeviceHints, storageDevices []infrav1.Storage) error {
715723
knownWWNs := make([]string, 0, len(storageDevices))
716724
for _, sd := range storageDevices {
@@ -1097,6 +1105,11 @@ func (s *Service) actionPreProvisioning(ctx context.Context) actionResult {
10971105
}
10981106
sshClient := s.scope.SSHClientFactory.NewClient(in)
10991107

1108+
ok, res := s.validateRescueSystemIsActive(sshClient)
1109+
if !ok {
1110+
return res
1111+
}
1112+
11001113
exitStatus, output, err := sshClient.ExecutePreProvisionCommand(ctx, s.scope.PreProvisionCommand)
11011114
if err != nil {
11021115
return actionError{err: fmt.Errorf("failed to execute pre-provision command: %w", err)}
@@ -1128,6 +1141,11 @@ func (s *Service) actionImageInstalling(ctx context.Context) actionResult {
11281141
}
11291142
sshClient := s.scope.SSHClientFactory.NewClient(in)
11301143

1144+
ok, res := s.validateRescueSystemIsActive(sshClient)
1145+
if !ok {
1146+
return res
1147+
}
1148+
11311149
state, err := sshClient.GetInstallImageState()
11321150
if err != nil {
11331151
return actionError{err: fmt.Errorf("failed to get state of installimage processes: %w", err)}
@@ -1450,46 +1468,6 @@ func getDeviceNames(wwn []string, storageDevices []infrav1.Storage) []string {
14501468
return deviceNames
14511469
}
14521470

1453-
// analyzeSSHOutputInstallImage classifies the output of an SSH command (the
// error messages indicate it is the host name query).
//
// When out.Err is set:
//   - timeout errors yield isTimeout=true and no error,
//   - authentication failures yield whatever handleAuthenticationFailed returns
//     (wrapped together with the original SSH error on the first call),
//   - connection-refused errors yield isConnectionRefused set to the result of
//     verifyConnectionRefused and no error,
//   - any other error is returned wrapped.
//
// When out.Err is nil, a non-empty StdErr is returned as an error wrapping
// errSSHGetHostname, an empty host name returns errEmptyHostName, the rescue
// host name returns no error, and any other host name returns an error
// wrapping errUnexpectedHostName.
func analyzeSSHOutputInstallImage(out sshclient.Output, sshClient sshclient.Client, port int) (isTimeout, isConnectionRefused bool, reterr error) {
1454-
// check err
1455-
if out.Err != nil {
1456-
switch {
1457-
case os.IsTimeout(out.Err) || sshclient.IsTimeoutError(out.Err):
1458-
isTimeout = true
1459-
return isTimeout, false, nil
1460-
case sshclient.IsAuthenticationFailedError(out.Err):
1461-
if err := handleAuthenticationFailed(sshClient, port); err != nil {
1462-
return false, false, fmt.Errorf("original ssh error: %w. err: %w", out.Err, err)
1463-
}
1464-
// NOTE(review): handleAuthenticationFailed is invoked a second time here after
// the first call returned nil; the second call looks redundant — confirm
// whether a plain nil return was intended.
return false, false, handleAuthenticationFailed(sshClient, port)
1465-
case sshclient.IsConnectionRefusedError(out.Err):
1466-
return false, verifyConnectionRefused(sshClient, port), nil
1467-
}
1468-
1469-
// None of the known SSH error classes matched.
return false, false, fmt.Errorf("unhandled ssh error while getting hostname: %w", out.Err)
1470-
}
1471-
1472-
// check stderr
1473-
if out.StdErr != "" {
1474-
// This is an unexpected error
1475-
return false, false, fmt.Errorf("%w: StdErr: %s", errSSHGetHostname, out.StdErr)
1476-
}
1477-
1478-
// check stdout
1479-
hostname := trimLineBreak(out.StdOut)
1480-
switch hostname {
1481-
case "":
1482-
// Hostname should not be empty. This is unexpected.
1483-
return false, false, errEmptyHostName
1484-
case rescue: // We are in wrong boot, nothing has to be done to trigger reboot
1485-
return false, false, nil
1486-
}
1487-
1488-
// We are in the case that hostName != rescue && StdOut != hostName
1489-
// This is unexpected
1490-
return false, false, fmt.Errorf("%w: %s", errUnexpectedHostName, hostname)
1491-
}
1492-
14931471
func handleAuthenticationFailed(sshClient sshclient.Client, port int) error {
14941472
// Check whether we are in the wrong system in the case that rescue and os system might be running on the same port.
14951473
if port == rescuePort {

0 commit comments

Comments
 (0)