Skip to content
Merged
Changes from 2 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
c09fb2e
:seedling: Before using the ssh Rescue Client, ensure the rescue syst…
guettli May 7, 2025
78ef843
revert remove of unused func.
guettli May 7, 2025
685a42f
Merge remote-tracking branch 'origin/main' into tg/check-hostname-bef…
guettli May 8, 2025
4c2c41f
fix e2e tests.
guettli May 8, 2025
4c515a7
log reboots, and retry if hostname does not match.
guettli May 8, 2025
ae4d56d
:seedling: Show conflicts in SaveHostAndReturn in logs
guettli May 9, 2025
fe40256
Merge branch 'tg/show-conflicts-in-SaveHostAndReturn-in-logs' into tg…
guettli May 9, 2025
75a4f40
fix tests (add ctx)
guettli May 9, 2025
b0d0078
use uncached APIReader to avoid cache issues
guettli May 9, 2025
2cf266d
more debug output.
guettli May 9, 2025
d48f03e
log every reconcile.
guettli May 9, 2025
ce03621
logs are not visible. Strange, try print.
guettli May 9, 2025
d9b837d
found "bug": hostname changes during "installimage".
guettli May 9, 2025
5e2029f
typo.
guettli May 9, 2025
e7f1d54
Merge remote-tracking branch 'origin/main' into tg/check-hostname-bef…
guettli May 9, 2025
182da6b
Merge remote-tracking branch 'origin/main' into tg/check-hostname-bef…
guettli May 13, 2025
0979533
set permanent error, if hostname is unexpected (not rescue system).
guettli May 13, 2025
023af57
added HetznerCluster to log values again.
guettli May 13, 2025
b8ba0d3
fixed typo.
guettli May 13, 2025
3e6fd3e
inlined function which gets used only once.
guettli May 13, 2025
951c144
add log of HetznerBareMetalMachine again.
guettli May 13, 2025
cb977e6
git restore -s main controllers/hetznerbaremetalhost_controller.go
guettli May 14, 2025
c74dd1f
remove setting permanent error.
guettli May 14, 2025
e060a31
Merge remote-tracking branch 'origin/main' into tg/check-hostname-bef…
guettli May 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 52 additions & 34 deletions pkg/services/baremetal/host/host.go
Original file line number Diff line number Diff line change
Expand Up @@ -592,46 +592,18 @@ func (s *Service) actionRegistering(_ context.Context) actionResult {
}
sshClient := s.scope.SSHClientFactory.NewClient(in)

// Check hostname with sshClient
out := sshClient.GetHostName()
hostName := trimLineBreak(out.StdOut)
if hostName != rescue {
// give the reboot some time until it takes effect
if s.hasJustRebooted() {
return actionContinue{delay: 2 * time.Second}
}

isSSHTimeoutError, isSSHConnectionRefusedError, err := s.analyzeSSHOutputRegistering(out)
if err != nil {
// This can happen if the bare-metal server was taken by another mgt-cluster.
// Check in https://robot.hetzner.com/server for the "History" of the server.
return actionError{err: fmt.Errorf("failed to handle incomplete boot - registering: %w", err)}
}

failed, err := s.handleIncompleteBoot(true, isSSHTimeoutError, isSSHConnectionRefusedError)
if failed {
return s.recordActionFailure(infrav1.PermanentError, err.Error())
}
if err != nil {
return actionError{err: fmt.Errorf(errMsgFailedHandlingIncompleteBoot, err)}
}
timeSinceReboot := "unknown"
if s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated != nil {
timeSinceReboot = time.Since(s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated.Time).String()
}

s.scope.Logger.Info("Could not reach rescue system. Will retry some seconds later.", "out", out.String(), "hostName", hostName,
"isSSHTimeoutError", isSSHTimeoutError, "isSSHConnectionRefusedError", isSSHConnectionRefusedError, "timeSinceReboot", timeSinceReboot)
return actionContinue{delay: 10 * time.Second}
ok, res := s.validateRescueSystemIsActive(sshClient)
if !ok {
return res
}

output := sshClient.GetHardwareDetailsDebug()
if output.Err != nil {
return actionError{err: fmt.Errorf("failed to obtain hardware for debugging: %w", output.Err)}
}

msg := fmt.Sprintf("%s\n\n", output.StdOut)
if out.StdErr != "" {
msg += fmt.Sprintf("stderr:\n%s\n\n", out.StdErr)
if output.StdErr != "" {
msg += fmt.Sprintf("stderr:\n%s\n\n", output.StdErr)
}
record.Eventf(s.scope.HetznerBareMetalHost, "GetHardwareDetails", msg)

Expand Down Expand Up @@ -711,6 +683,42 @@ func (s *Service) actionRegistering(_ context.Context) actionResult {
return actionComplete{}
}

// validateRescueSystemIsActive asks the host for its hostname via sshClient and
// compares it with the rescue hostname.
//
// If the hostname matches, it returns ok == true together with an empty
// actionContinue. Otherwise it returns ok == false and the actionResult the
// caller is expected to return as-is: a short retry right after a reboot, an
// error if the SSH output cannot be analyzed or the incomplete boot cannot be
// handled, a recorded permanent failure, or a longer retry.
func (s *Service) validateRescueSystemIsActive(sshClient sshclient.Client) (ok bool, ar actionResult) {
	hostNameOutput := sshClient.GetHostName()
	reportedHostName := trimLineBreak(hostNameOutput.StdOut)
	if reportedHostName == rescue {
		return true, actionContinue{}
	}

	// A reboot needs a moment to take effect, so retry quickly in that case.
	if s.hasJustRebooted() {
		return false, actionContinue{delay: 2 * time.Second}
	}

	timedOut, connRefused, analyzeErr := s.analyzeSSHOutputRegistering(hostNameOutput)
	if analyzeErr != nil {
		// This can happen if the bare-metal server was taken by another mgt-cluster.
		// Check in https://robot.hetzner.com/server for the "History" of the server.
		return false, actionError{err: fmt.Errorf("failed to handle incomplete boot - registering: %w", analyzeErr)}
	}

	bootFailed, bootErr := s.handleIncompleteBoot(true, timedOut, connRefused)
	switch {
	case bootFailed:
		return false, s.recordActionFailure(infrav1.PermanentError, bootErr.Error())
	case bootErr != nil:
		return false, actionError{err: fmt.Errorf(errMsgFailedHandlingIncompleteBoot, bootErr)}
	}

	// Only used for the log line below; stays "unknown" when LastUpdated is unset.
	sinceReboot := "unknown"
	if lastUpdated := s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated; lastUpdated != nil {
		sinceReboot = time.Since(lastUpdated.Time).String()
	}

	s.scope.Logger.Info("Could not reach rescue system. Will retry some seconds later.",
		"out", hostNameOutput.String(),
		"hostName", reportedHostName,
		"isSSHTimeoutError", timedOut,
		"isSSHConnectionRefusedError", connRefused,
		"timeSinceReboot", sinceReboot,
	)
	return false, actionContinue{delay: 10 * time.Second}
}

func validateRootDeviceWwnsAreSubsetOfExistingWwns(rootDeviceHints *infrav1.RootDeviceHints, storageDevices []infrav1.Storage) error {
knownWWNs := make([]string, 0, len(storageDevices))
for _, sd := range storageDevices {
Expand Down Expand Up @@ -1097,6 +1105,11 @@ func (s *Service) actionPreProvisioning(ctx context.Context) actionResult {
}
sshClient := s.scope.SSHClientFactory.NewClient(in)

ok, res := s.validateRescueSystemIsActive(sshClient)
if !ok {
return res
}

exitStatus, output, err := sshClient.ExecutePreProvisionCommand(ctx, s.scope.PreProvisionCommand)
if err != nil {
return actionError{err: fmt.Errorf("failed to execute pre-provision command: %w", err)}
Expand Down Expand Up @@ -1128,6 +1141,11 @@ func (s *Service) actionImageInstalling(ctx context.Context) actionResult {
}
sshClient := s.scope.SSHClientFactory.NewClient(in)

ok, res := s.validateRescueSystemIsActive(sshClient)
if !ok {
return res
}

state, err := sshClient.GetInstallImageState()
if err != nil {
return actionError{err: fmt.Errorf("failed to get state of installimage processes: %w", err)}
Expand Down
Loading