From c09fb2ee49e0596bc3ab3d14ac559f989f187dd6 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Wed, 7 May 2025 11:21:56 +0200 Subject: [PATCH 01/19] :seedling: Before using the ssh Rescue Client, ensure the rescue system is active. --- pkg/services/baremetal/host/host.go | 126 +++++------- pkg/services/baremetal/host/host_test.go | 248 +---------------------- 2 files changed, 58 insertions(+), 316 deletions(-) diff --git a/pkg/services/baremetal/host/host.go b/pkg/services/baremetal/host/host.go index dd6dc8ab1..279e496ca 100644 --- a/pkg/services/baremetal/host/host.go +++ b/pkg/services/baremetal/host/host.go @@ -592,46 +592,18 @@ func (s *Service) actionRegistering(_ context.Context) actionResult { } sshClient := s.scope.SSHClientFactory.NewClient(in) - // Check hostname with sshClient - out := sshClient.GetHostName() - hostName := trimLineBreak(out.StdOut) - if hostName != rescue { - // give the reboot some time until it takes effect - if s.hasJustRebooted() { - return actionContinue{delay: 2 * time.Second} - } - - isSSHTimeoutError, isSSHConnectionRefusedError, err := s.analyzeSSHOutputRegistering(out) - if err != nil { - // This can happen if the bare-metal server was taken by another mgt-cluster. - // Check in https://robot.hetzner.com/server for the "History" of the server. - return actionError{err: fmt.Errorf("failed to handle incomplete boot - registering: %w", err)} - } - - failed, err := s.handleIncompleteBoot(true, isSSHTimeoutError, isSSHConnectionRefusedError) - if failed { - return s.recordActionFailure(infrav1.PermanentError, err.Error()) - } - if err != nil { - return actionError{err: fmt.Errorf(errMsgFailedHandlingIncompleteBoot, err)} - } - timeSinceReboot := "unknown" - if s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated != nil { - timeSinceReboot = time.Since(s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated.Time).String() - } - - s.scope.Logger.Info("Could not reach rescue system. Will retry some seconds later.", "out", out.String(), "hostName", hostName, - "isSSHTimeoutError", isSSHTimeoutError, "isSSHConnectionRefusedError", isSSHConnectionRefusedError, "timeSinceReboot", timeSinceReboot) - return actionContinue{delay: 10 * time.Second} + ok, res := s.validateRescueSystemIsActive(sshClient) + if !ok { + return res } - output := sshClient.GetHardwareDetailsDebug() if output.Err != nil { return actionError{err: fmt.Errorf("failed to obtain hardware for debugging: %w", output.Err)} } + msg := fmt.Sprintf("%s\n\n", output.StdOut) - if out.StdErr != "" { - msg += fmt.Sprintf("stderr:\n%s\n\n", out.StdErr) + if output.StdErr != "" { + msg += fmt.Sprintf("stderr:\n%s\n\n", output.StdErr) } record.Eventf(s.scope.HetznerBareMetalHost, "GetHardwareDetails", msg) @@ -711,6 +683,42 @@ func (s *Service) actionRegistering(_ context.Context) actionResult { return actionComplete{} } +func (s *Service) validateRescueSystemIsActive(sshClient sshclient.Client) (ok bool, ar actionResult) { + // Check hostname with sshClient + out := sshClient.GetHostName() + hostName := trimLineBreak(out.StdOut) + if hostName == rescue { + return true, actionContinue{} + } + // give the reboot some time until it takes effect + if s.hasJustRebooted() { + return false, actionContinue{delay: 2 * time.Second} + } + + isSSHTimeoutError, isSSHConnectionRefusedError, err := s.analyzeSSHOutputRegistering(out) + if err != nil { + // This can happen if the bare-metal server was taken by another mgt-cluster. + // Check in https://robot.hetzner.com/server for the "History" of the server. + return false, actionError{err: fmt.Errorf("failed to handle incomplete boot - registering: %w", err)} + } + + failed, err := s.handleIncompleteBoot(true, isSSHTimeoutError, isSSHConnectionRefusedError) + if failed { + return false, s.recordActionFailure(infrav1.PermanentError, err.Error()) + } + if err != nil { + return false, actionError{err: fmt.Errorf(errMsgFailedHandlingIncompleteBoot, err)} + } + timeSinceReboot := "unknown" + if s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated != nil { + timeSinceReboot = time.Since(s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated.Time).String() + } + + s.scope.Logger.Info("Could not reach rescue system. Will retry some seconds later.", "out", out.String(), "hostName", hostName, + "isSSHTimeoutError", isSSHTimeoutError, "isSSHConnectionRefusedError", isSSHConnectionRefusedError, "timeSinceReboot", timeSinceReboot) + return false, actionContinue{delay: 10 * time.Second} +} + func validateRootDeviceWwnsAreSubsetOfExistingWwns(rootDeviceHints *infrav1.RootDeviceHints, storageDevices []infrav1.Storage) error { knownWWNs := make([]string, 0, len(storageDevices)) for _, sd := range storageDevices { @@ -1097,6 +1105,11 @@ func (s *Service) actionPreProvisioning(ctx context.Context) actionResult { } sshClient := s.scope.SSHClientFactory.NewClient(in) + ok, res := s.validateRescueSystemIsActive(sshClient) + if !ok { + return res + } + exitStatus, output, err := sshClient.ExecutePreProvisionCommand(ctx, s.scope.PreProvisionCommand) if err != nil { return actionError{err: fmt.Errorf("failed to execute pre-provision command: %w", err)} @@ -1128,6 +1141,11 @@ func (s *Service) actionImageInstalling(ctx context.Context) actionResult { } sshClient := s.scope.SSHClientFactory.NewClient(in) + ok, res := s.validateRescueSystemIsActive(sshClient) + if !ok { + return res + } + state, err := sshClient.GetInstallImageState() if err != nil { return actionError{err: fmt.Errorf("failed to get state of installimage processes: %w", err)} @@ -1450,46 +1468,6 @@ func getDeviceNames(wwn []string, storageDevices []infrav1.Storage) []string { return deviceNames } -func analyzeSSHOutputInstallImage(out sshclient.Output, sshClient sshclient.Client, port int) (isTimeout, isConnectionRefused bool, reterr error) { - // check err - if out.Err != nil { - switch { - case os.IsTimeout(out.Err) || sshclient.IsTimeoutError(out.Err): - isTimeout = true - return isTimeout, false, nil - case sshclient.IsAuthenticationFailedError(out.Err): - if err := handleAuthenticationFailed(sshClient, port); err != nil { - return false, false, fmt.Errorf("original ssh error: %w. err: %w", out.Err, err) - } - return false, false, handleAuthenticationFailed(sshClient, port) - case sshclient.IsConnectionRefusedError(out.Err): - return false, verifyConnectionRefused(sshClient, port), nil - } - - return false, false, fmt.Errorf("unhandled ssh error while getting hostname: %w", out.Err) - } - - // check stderr - if out.StdErr != "" { - // This is an unexpected error - return false, false, fmt.Errorf("%w: StdErr: %s", errSSHGetHostname, out.StdErr) - } - - // check stdout - hostname := trimLineBreak(out.StdOut) - switch hostname { - case "": - // Hostname should not be empty. This is unexpected. - return false, false, errEmptyHostName - case rescue: // We are in wrong boot, nothing has to be done to trigger reboot - return false, false, nil - } - - // We are in the case that hostName != rescue && StdOut != hostName - // This is unexpected - return false, false, fmt.Errorf("%w: %s", errUnexpectedHostName, hostname) -} - func handleAuthenticationFailed(sshClient sshclient.Client, port int) error { // Check whether we are in the wrong system in the case that rescue and os system might be running on the same port. if port == rescuePort { diff --git a/pkg/services/baremetal/host/host_test.go b/pkg/services/baremetal/host/host_test.go index 4fb8021c2..be1c26460 100644 --- a/pkg/services/baremetal/host/host_test.go +++ b/pkg/services/baremetal/host/host_test.go @@ -725,66 +725,7 @@ var _ = Describe("analyzeSSHOutputInstallImage", func() { expectedErrMessage string } - DescribeTable("analyzeSSHOutputInstallImage - out.Err", - func(tc testCaseAnalyzeSSHOutputInstallImageOutErr) { - host := helpers.BareMetalHost( - "test-host", - "default", - ) - - robotMock := robotmock.Client{} - robotMock.On("GetBootRescue", mock.Anything).Return(&models.Rescue{Active: tc.rescueActive}, nil) - - service := newTestService(host, &robotMock, nil, nil, nil) - - isTimeout, isConnectionRefused, err := service.analyzeSSHOutputRegistering(sshclient.Output{Err: tc.err}) - Expect(isTimeout).To(Equal(tc.expectedIsTimeout)) - Expect(isConnectionRefused).To(Equal(tc.expectedIsConnectionRefused)) - if tc.expectedErrMessage != "" { - Expect(err).To(Not(BeNil())) - Expect(err.Error()).To(ContainSubstring(tc.expectedErrMessage)) - } else { - Expect(err).To(BeNil()) - } - }, - Entry("timeout error", testCaseAnalyzeSSHOutputInstallImageOutErr{ - err: timeout, - rescueActive: true, - expectedIsTimeout: true, - expectedIsConnectionRefused: false, - expectedErrMessage: "", - }), - Entry("authenticationFailed error, rescue active", testCaseAnalyzeSSHOutputInstallImageOutErr{ - err: sshclient.ErrAuthenticationFailed, - rescueActive: true, - expectedIsTimeout: false, - expectedIsConnectionRefused: false, - expectedErrMessage: "", - }), - Entry("authenticationFailed error, rescue not active", testCaseAnalyzeSSHOutputInstallImageOutErr{ - err: sshclient.ErrAuthenticationFailed, - rescueActive: false, - expectedIsTimeout: false, - expectedIsConnectionRefused: false, - expectedErrMessage: "wrong ssh key", - }), - Entry("connectionRefused error, rescue active", testCaseAnalyzeSSHOutputInstallImageOutErr{ - err: sshclient.ErrConnectionRefused, - rescueActive: true, - expectedIsTimeout: false, - expectedIsConnectionRefused: true, - expectedErrMessage: "", - }), - Entry("connectionRefused error, rescue not active", testCaseAnalyzeSSHOutputInstallImageOutErr{ - err: sshclient.ErrConnectionRefused, - rescueActive: false, - expectedIsTimeout: false, - expectedIsConnectionRefused: true, - expectedErrMessage: "", - }), - ) - - type testCaseAnalyzeSSHOutputInstallImageStdErr struct { + type testCaseAnalyzeSSHOutputRegistering struct { hasNilErr bool stdErr string hostName string @@ -792,7 +733,7 @@ var _ = Describe("analyzeSSHOutputInstallImage", func() { } DescribeTable("analyzeSSHOutputRegistering - toggle stdErr and hostName", - func(tc testCaseAnalyzeSSHOutputInstallImageStdErr) { + func(tc testCaseAnalyzeSSHOutputRegistering) { var err error if !tc.hasNilErr { err = errTest @@ -824,25 +765,25 @@ var _ = Describe("analyzeSSHOutputInstallImage", func() { Expect(err).To(BeNil()) } }, - Entry("stderr not empty", testCaseAnalyzeSSHOutputInstallImageStdErr{ + Entry("stderr not empty", testCaseAnalyzeSSHOutputRegistering{ hasNilErr: true, stdErr: "command failed", hostName: "hostName", expectedErrMessage: "failed to get hostname via ssh: StdErr:", }), - Entry("stderr not empty - err != nil", testCaseAnalyzeSSHOutputInstallImageStdErr{ + Entry("stderr not empty - err != nil", testCaseAnalyzeSSHOutputRegistering{ hasNilErr: false, stdErr: "command failed", hostName: "", expectedErrMessage: "unhandled ssh error while getting hostname", }), - Entry("stderr not empty - wrong hostName", testCaseAnalyzeSSHOutputInstallImageStdErr{ + Entry("stderr not empty - wrong hostName", testCaseAnalyzeSSHOutputRegistering{ hasNilErr: true, stdErr: "command failed", hostName: "", expectedErrMessage: "failed to get hostname via ssh: StdErr:", }), - Entry("stderr empty - wrong hostName", testCaseAnalyzeSSHOutputInstallImageStdErr{ + Entry("stderr empty - wrong hostName", testCaseAnalyzeSSHOutputRegistering{ hasNilErr: true, stdErr: "", hostName: "", @@ -861,83 +802,6 @@ var _ = Describe("analyzeSSHOutputInstallImage", func() { expectedErrMessage string } - DescribeTable("analyzeSSHOutputInstallImage - out.Err", - func(tc testCaseAnalyzeSSHOutputInstallImageOutErr) { - sshMock := &sshmock.Client{} - var errFromGetHostName error - if !tc.errFromGetHostNameNil { - errFromGetHostName = errTest - } - sshMock.On("GetHostName").Return(sshclient.Output{Err: errFromGetHostName}) - - isTimeout, isConnectionRefused, err := analyzeSSHOutputInstallImage(sshclient.Output{Err: tc.err}, sshMock, tc.port) - Expect(isTimeout).To(Equal(tc.expectedIsTimeout)) - Expect(isConnectionRefused).To(Equal(tc.expectedIsConnectionRefused)) - if tc.expectedErrMessage != "" { - Expect(err).To(Not(BeNil())) - Expect(err.Error()).To(ContainSubstring(tc.expectedErrMessage)) - } else { - Expect(err).To(BeNil()) - } - }, - Entry("timeout error", testCaseAnalyzeSSHOutputInstallImageOutErr{ - err: timeout, - errFromGetHostNameNil: true, - port: 22, - expectedIsTimeout: true, - expectedIsConnectionRefused: false, - expectedErrMessage: "", - }), - Entry("authenticationFailed error, port 22, no hostName error", testCaseAnalyzeSSHOutputInstallImageOutErr{ - err: sshclient.ErrAuthenticationFailed, - errFromGetHostNameNil: true, - port: 22, - expectedIsTimeout: false, - expectedIsConnectionRefused: false, - expectedErrMessage: "", - }), - Entry("authenticationFailed error, port 22, hostName error", testCaseAnalyzeSSHOutputInstallImageOutErr{ - err: sshclient.ErrAuthenticationFailed, - errFromGetHostNameNil: false, - port: 22, - expectedIsTimeout: false, - expectedIsConnectionRefused: false, - expectedErrMessage: "wrong ssh key", - }), - Entry("authenticationFailed error, port != 22", testCaseAnalyzeSSHOutputInstallImageOutErr{ - err: sshclient.ErrAuthenticationFailed, - errFromGetHostNameNil: true, - port: 23, - expectedIsTimeout: false, - expectedIsConnectionRefused: false, - expectedErrMessage: "wrong ssh key", - }), - Entry("connectionRefused error, port 22", testCaseAnalyzeSSHOutputInstallImageOutErr{ - err: sshclient.ErrConnectionRefused, - errFromGetHostNameNil: true, - port: 22, - expectedIsTimeout: false, - expectedIsConnectionRefused: true, - expectedErrMessage: "", - }), - Entry("connectionRefused error, port != 22, hostname error", testCaseAnalyzeSSHOutputInstallImageOutErr{ - err: sshclient.ErrConnectionRefused, - errFromGetHostNameNil: false, - port: 23, - expectedIsTimeout: false, - expectedIsConnectionRefused: true, - expectedErrMessage: "", - }), - Entry("connectionRefused error, port != 22, no hostname error", testCaseAnalyzeSSHOutputInstallImageOutErr{ - err: sshclient.ErrConnectionRefused, - errFromGetHostNameNil: true, - port: 23, - expectedIsTimeout: false, - expectedIsConnectionRefused: false, - expectedErrMessage: "", - }), - ) - type testCaseAnalyzeSSHOutputInstallImageStdErr struct { hasNilErr bool stdErr string @@ -945,112 +809,12 @@ var _ = Describe("analyzeSSHOutputInstallImage", func() { expectedErrMessage string } - DescribeTable("analyzeSSHOutputInstallImage - StdErr not empty", - func(tc testCaseAnalyzeSSHOutputInstallImageStdErr) { - var err error - if !tc.hasNilErr { - err = errTest - } - hostName := "rescue" - if tc.hasWrongHostName { - hostName = "wrongHostName" - } - - out := sshclient.Output{ - StdOut: hostName, - StdErr: tc.stdErr, - Err: err, - } - isTimeout, isConnectionRefused, err := analyzeSSHOutputInstallImage(out, nil, 22) - Expect(isTimeout).To(Equal(false)) - Expect(isConnectionRefused).To(Equal(false)) - if tc.expectedErrMessage != "" { - Expect(err).To(Not(BeNil())) - Expect(err.Error()).To(ContainSubstring(tc.expectedErrMessage)) - } else { - Expect(err).To(BeNil()) - } - }, - Entry("stderr not empty", testCaseAnalyzeSSHOutputInstallImageStdErr{ - hasNilErr: true, - stdErr: "command failed", - hasWrongHostName: false, - expectedErrMessage: "failed to get hostname via ssh: StdErr:", - }), - Entry("stderr not empty - err != nil", testCaseAnalyzeSSHOutputInstallImageStdErr{ - hasNilErr: false, - stdErr: "command failed", - hasWrongHostName: false, - expectedErrMessage: "unhandled ssh error while getting hostname", - }), - Entry("stderr not empty - wrong hostName", testCaseAnalyzeSSHOutputInstallImageStdErr{ - hasNilErr: true, - stdErr: "command failed", - hasWrongHostName: true, - expectedErrMessage: "failed to get hostname via ssh: StdErr:", - }), - ) - type testCaseAnalyzeSSHOutputInstallImageWrongHostname struct { hasNilErr bool stdErr string hostName string expectedErrMessage string } - - DescribeTable("analyzeSSHOutputInstallImage - wrong hostName", - func(tc testCaseAnalyzeSSHOutputInstallImageWrongHostname) { - var err error - if !tc.hasNilErr { - err = errTest - } - - out := sshclient.Output{ - StdOut: tc.hostName, - StdErr: tc.stdErr, - Err: err, - } - isTimeout, isConnectionRefused, err := analyzeSSHOutputInstallImage(out, nil, 22) - Expect(isTimeout).To(Equal(false)) - Expect(isConnectionRefused).To(Equal(false)) - if tc.expectedErrMessage != "" { - Expect(err).To(Not(BeNil())) - Expect(err.Error()).To(ContainSubstring(tc.expectedErrMessage)) - } else { - Expect(err).To(BeNil()) - } - }, - Entry("empty hostname", testCaseAnalyzeSSHOutputInstallImageWrongHostname{ - hasNilErr: true, - stdErr: "", - hostName: "", - expectedErrMessage: "hostname is empty", - }), - Entry("empty hostname - err not empty", testCaseAnalyzeSSHOutputInstallImageWrongHostname{ - hasNilErr: false, - stdErr: "", - hostName: "", - expectedErrMessage: "unhandled ssh error while getting hostname", - }), - Entry("empty hostname stderr not empty", testCaseAnalyzeSSHOutputInstallImageWrongHostname{ - hasNilErr: true, - stdErr: "command failed", - hostName: "", - expectedErrMessage: "failed to get hostname via ssh: StdErr:", - }), - Entry("hostname == rescue", testCaseAnalyzeSSHOutputInstallImageWrongHostname{ - hasNilErr: true, - stdErr: "", - hostName: "rescue", - expectedErrMessage: "", - }), - Entry("hostname == otherHostName", testCaseAnalyzeSSHOutputInstallImageWrongHostname{ - hasNilErr: true, - stdErr: "", - hostName: "otherHostName", - expectedErrMessage: "unexpected hostname", - }), - ) }) var _ = Describe("analyzeSSHOutputProvisioned", func() { From 78ef8431cc738b1e808db0ecac4263f2201a7031 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Wed, 7 May 2025 14:27:38 +0200 Subject: [PATCH 02/19] revert remove of unused func. --- pkg/services/baremetal/host/host.go | 40 ++++ pkg/services/baremetal/host/host_test.go | 248 ++++++++++++++++++++++- 2 files changed, 282 insertions(+), 6 deletions(-) diff --git a/pkg/services/baremetal/host/host.go b/pkg/services/baremetal/host/host.go index 279e496ca..fd847f30d 100644 --- a/pkg/services/baremetal/host/host.go +++ b/pkg/services/baremetal/host/host.go @@ -1468,6 +1468,46 @@ func getDeviceNames(wwn []string, storageDevices []infrav1.Storage) []string { return deviceNames } +func analyzeSSHOutputInstallImage(out sshclient.Output, sshClient sshclient.Client, port int) (isTimeout, isConnectionRefused bool, reterr error) { + // check err + if out.Err != nil { + switch { + case os.IsTimeout(out.Err) || sshclient.IsTimeoutError(out.Err): + isTimeout = true + return isTimeout, false, nil + case sshclient.IsAuthenticationFailedError(out.Err): + if err := handleAuthenticationFailed(sshClient, port); err != nil { + return false, false, fmt.Errorf("original ssh error: %w. err: %w", out.Err, err) + } + return false, false, handleAuthenticationFailed(sshClient, port) + case sshclient.IsConnectionRefusedError(out.Err): + return false, verifyConnectionRefused(sshClient, port), nil + } + + return false, false, fmt.Errorf("unhandled ssh error while getting hostname: %w", out.Err) + } + + // check stderr + if out.StdErr != "" { + // This is an unexpected error + return false, false, fmt.Errorf("%w: StdErr: %s", errSSHGetHostname, out.StdErr) + } + + // check stdout + hostname := trimLineBreak(out.StdOut) + switch hostname { + case "": + // Hostname should not be empty. This is unexpected. + return false, false, errEmptyHostName + case rescue: // We are in wrong boot, nothing has to be done to trigger reboot + return false, false, nil + } + + // We are in the case that hostName != rescue && StdOut != hostName + // This is unexpected + return false, false, fmt.Errorf("%w: %s", errUnexpectedHostName, hostname) +} + func handleAuthenticationFailed(sshClient sshclient.Client, port int) error { // Check whether we are in the wrong system in the case that rescue and os system might be running on the same port. if port == rescuePort { diff --git a/pkg/services/baremetal/host/host_test.go b/pkg/services/baremetal/host/host_test.go index be1c26460..4fb8021c2 100644 --- a/pkg/services/baremetal/host/host_test.go +++ b/pkg/services/baremetal/host/host_test.go @@ -725,7 +725,66 @@ var _ = Describe("analyzeSSHOutputInstallImage", func() { expectedErrMessage string } - type testCaseAnalyzeSSHOutputRegistering struct { + DescribeTable("analyzeSSHOutputInstallImage - out.Err", + func(tc testCaseAnalyzeSSHOutputInstallImageOutErr) { + host := helpers.BareMetalHost( + "test-host", + "default", + ) + + robotMock := robotmock.Client{} + robotMock.On("GetBootRescue", mock.Anything).Return(&models.Rescue{Active: tc.rescueActive}, nil) + + service := newTestService(host, &robotMock, nil, nil, nil) + + isTimeout, isConnectionRefused, err := service.analyzeSSHOutputRegistering(sshclient.Output{Err: tc.err}) + Expect(isTimeout).To(Equal(tc.expectedIsTimeout)) + Expect(isConnectionRefused).To(Equal(tc.expectedIsConnectionRefused)) + if tc.expectedErrMessage != "" { + Expect(err).To(Not(BeNil())) + Expect(err.Error()).To(ContainSubstring(tc.expectedErrMessage)) + } else { + Expect(err).To(BeNil()) + } + }, + Entry("timeout error", testCaseAnalyzeSSHOutputInstallImageOutErr{ + err: timeout, + rescueActive: true, + expectedIsTimeout: true, + expectedIsConnectionRefused: false, + expectedErrMessage: "", + }), + Entry("authenticationFailed error, rescue active", testCaseAnalyzeSSHOutputInstallImageOutErr{ + err: sshclient.ErrAuthenticationFailed, + rescueActive: true, + expectedIsTimeout: false, + expectedIsConnectionRefused: false, + expectedErrMessage: "", + }), + Entry("authenticationFailed error, rescue not active", testCaseAnalyzeSSHOutputInstallImageOutErr{ + err: sshclient.ErrAuthenticationFailed, + rescueActive: false, + expectedIsTimeout: false, + expectedIsConnectionRefused: false, + expectedErrMessage: "wrong ssh key", + }), + Entry("connectionRefused error, rescue active", testCaseAnalyzeSSHOutputInstallImageOutErr{ + err: sshclient.ErrConnectionRefused, + rescueActive: true, + expectedIsTimeout: false, + expectedIsConnectionRefused: true, + expectedErrMessage: "", + }), + Entry("connectionRefused error, rescue not active", testCaseAnalyzeSSHOutputInstallImageOutErr{ + err: sshclient.ErrConnectionRefused, + rescueActive: false, + expectedIsTimeout: false, + expectedIsConnectionRefused: true, + expectedErrMessage: "", + }), + ) + + type testCaseAnalyzeSSHOutputInstallImageStdErr struct { hasNilErr bool stdErr string hostName string @@ -733,7 +792,7 @@ var _ = Describe("analyzeSSHOutputInstallImage", func() { } DescribeTable("analyzeSSHOutputRegistering - toggle stdErr and hostName", - func(tc testCaseAnalyzeSSHOutputRegistering) { + func(tc testCaseAnalyzeSSHOutputInstallImageStdErr) { var err error if !tc.hasNilErr { err = errTest @@ -765,25 +824,25 @@ var _ = Describe("analyzeSSHOutputInstallImage", func() { Expect(err).To(BeNil()) } }, - Entry("stderr not empty", testCaseAnalyzeSSHOutputRegistering{ + Entry("stderr not empty", testCaseAnalyzeSSHOutputInstallImageStdErr{ hasNilErr: true, stdErr: "command failed", hostName: "hostName", expectedErrMessage: "failed to get hostname via ssh: StdErr:", }), - Entry("stderr not empty - err != nil", testCaseAnalyzeSSHOutputRegistering{ + Entry("stderr not empty - err != nil", testCaseAnalyzeSSHOutputInstallImageStdErr{ hasNilErr: false, stdErr: "command failed", hostName: "", expectedErrMessage: "unhandled ssh error while getting hostname", }), - Entry("stderr not empty - wrong hostName", testCaseAnalyzeSSHOutputRegistering{ + Entry("stderr not empty - wrong hostName", testCaseAnalyzeSSHOutputInstallImageStdErr{ hasNilErr: true, stdErr: "command failed", hostName: "", expectedErrMessage: "failed to get hostname via ssh: StdErr:", }), - Entry("stderr empty - wrong hostName", testCaseAnalyzeSSHOutputRegistering{ + Entry("stderr empty - wrong hostName", testCaseAnalyzeSSHOutputInstallImageStdErr{ hasNilErr: true, stdErr: "", hostName: "", @@ -802,6 +861,83 @@ var _ = Describe("analyzeSSHOutputInstallImage", func() { expectedErrMessage string } + DescribeTable("analyzeSSHOutputInstallImage - out.Err", + func(tc testCaseAnalyzeSSHOutputInstallImageOutErr) { + sshMock := &sshmock.Client{} + var errFromGetHostName error + if !tc.errFromGetHostNameNil { + errFromGetHostName = errTest + } + sshMock.On("GetHostName").Return(sshclient.Output{Err: errFromGetHostName}) + + isTimeout, isConnectionRefused, err := analyzeSSHOutputInstallImage(sshclient.Output{Err: tc.err}, sshMock, tc.port) + Expect(isTimeout).To(Equal(tc.expectedIsTimeout)) + Expect(isConnectionRefused).To(Equal(tc.expectedIsConnectionRefused)) + if tc.expectedErrMessage != "" { + Expect(err).To(Not(BeNil())) + Expect(err.Error()).To(ContainSubstring(tc.expectedErrMessage)) + } else { + Expect(err).To(BeNil()) + } + }, + Entry("timeout error", testCaseAnalyzeSSHOutputInstallImageOutErr{ + err: timeout, + errFromGetHostNameNil: true, + port: 22, + expectedIsTimeout: true, + expectedIsConnectionRefused: false, + expectedErrMessage: "", + }), + Entry("authenticationFailed error, port 22, no hostName error", testCaseAnalyzeSSHOutputInstallImageOutErr{ + err: sshclient.ErrAuthenticationFailed, + errFromGetHostNameNil: true, + port: 22, + expectedIsTimeout: false, + expectedIsConnectionRefused: false, + expectedErrMessage: "", + }), + Entry("authenticationFailed error, port 22, hostName error", testCaseAnalyzeSSHOutputInstallImageOutErr{ + err: sshclient.ErrAuthenticationFailed, + errFromGetHostNameNil: false, + port: 22, + expectedIsTimeout: false, + expectedIsConnectionRefused: false, + expectedErrMessage: "wrong ssh key", + }), + Entry("authenticationFailed error, port != 22", testCaseAnalyzeSSHOutputInstallImageOutErr{ + err: sshclient.ErrAuthenticationFailed, + errFromGetHostNameNil: true, + port: 23, + expectedIsTimeout: false, + expectedIsConnectionRefused: false, + expectedErrMessage: "wrong ssh key", + }), + Entry("connectionRefused error, port 22", testCaseAnalyzeSSHOutputInstallImageOutErr{ + err: sshclient.ErrConnectionRefused, + errFromGetHostNameNil: true, + port: 22, + expectedIsTimeout: false, + expectedIsConnectionRefused: true, + expectedErrMessage: "", + }), + Entry("connectionRefused error, port != 22, hostname error", testCaseAnalyzeSSHOutputInstallImageOutErr{ + err: sshclient.ErrConnectionRefused, + errFromGetHostNameNil: false, + port: 23, + expectedIsTimeout: false, + expectedIsConnectionRefused: true, + expectedErrMessage: "", + }), + Entry("connectionRefused error, port != 22, no hostname error", testCaseAnalyzeSSHOutputInstallImageOutErr{ + err: sshclient.ErrConnectionRefused, + errFromGetHostNameNil: true, + port: 23, + expectedIsTimeout: false, + expectedIsConnectionRefused: false, + expectedErrMessage: "", + }), + ) + type testCaseAnalyzeSSHOutputInstallImageStdErr struct { hasNilErr bool stdErr string @@ -809,12 +945,112 @@ var _ = Describe("analyzeSSHOutputInstallImage", func() { expectedErrMessage string } + DescribeTable("analyzeSSHOutputInstallImage - StdErr not empty", + func(tc testCaseAnalyzeSSHOutputInstallImageStdErr) { + var err error + if !tc.hasNilErr { + err = errTest + } + hostName := "rescue" + if tc.hasWrongHostName { + hostName = "wrongHostName" + } + + out := sshclient.Output{ + StdOut: hostName, + StdErr: tc.stdErr, + Err: err, + } + isTimeout, isConnectionRefused, err := analyzeSSHOutputInstallImage(out, nil, 22) + Expect(isTimeout).To(Equal(false)) + Expect(isConnectionRefused).To(Equal(false)) + if tc.expectedErrMessage != "" { + Expect(err).To(Not(BeNil())) + Expect(err.Error()).To(ContainSubstring(tc.expectedErrMessage)) + } else { + Expect(err).To(BeNil()) + } + }, + Entry("stderr not empty", testCaseAnalyzeSSHOutputInstallImageStdErr{ + hasNilErr: true, + stdErr: "command failed", + hasWrongHostName: false, + expectedErrMessage: "failed to get hostname via ssh: StdErr:", + }), + Entry("stderr not empty - err != nil", testCaseAnalyzeSSHOutputInstallImageStdErr{ + hasNilErr: false, + stdErr: "command failed", + hasWrongHostName: false, + expectedErrMessage: "unhandled ssh error while getting hostname", + }), + Entry("stderr not empty - wrong hostName", testCaseAnalyzeSSHOutputInstallImageStdErr{ + hasNilErr: true, + stdErr: "command failed", + hasWrongHostName: true, + expectedErrMessage: "failed to get hostname via ssh: StdErr:", + }), + ) + type testCaseAnalyzeSSHOutputInstallImageWrongHostname struct { hasNilErr bool stdErr string hostName string expectedErrMessage string } + + DescribeTable("analyzeSSHOutputInstallImage - wrong hostName", + func(tc testCaseAnalyzeSSHOutputInstallImageWrongHostname) { + var err error + if !tc.hasNilErr { + err = errTest + } + + out := sshclient.Output{ + StdOut: tc.hostName, + StdErr: tc.stdErr, + Err: err, + } + isTimeout, isConnectionRefused, err := analyzeSSHOutputInstallImage(out, nil, 22) + Expect(isTimeout).To(Equal(false)) + Expect(isConnectionRefused).To(Equal(false)) + if tc.expectedErrMessage != "" { + Expect(err).To(Not(BeNil())) + Expect(err.Error()).To(ContainSubstring(tc.expectedErrMessage)) + } else { + Expect(err).To(BeNil()) + } + }, + Entry("empty hostname", testCaseAnalyzeSSHOutputInstallImageWrongHostname{ + hasNilErr: true, + stdErr: "", + hostName: "", + expectedErrMessage: "hostname is empty", + }), + Entry("empty hostname - err not empty", testCaseAnalyzeSSHOutputInstallImageWrongHostname{ + hasNilErr: false, + stdErr: "", + hostName: "", + expectedErrMessage: "unhandled ssh error while getting hostname", + }), + Entry("empty hostname stderr not empty", testCaseAnalyzeSSHOutputInstallImageWrongHostname{ + hasNilErr: true, + stdErr: "command failed", + hostName: "", + expectedErrMessage: "failed to get hostname via ssh: StdErr:", + }), + Entry("hostname == rescue", testCaseAnalyzeSSHOutputInstallImageWrongHostname{ + hasNilErr: true, + stdErr: "", + hostName: "rescue", + expectedErrMessage: "", + }), + Entry("hostname == otherHostName", testCaseAnalyzeSSHOutputInstallImageWrongHostname{ + hasNilErr: true, + stdErr: "", + hostName: "otherHostName", + expectedErrMessage: "unexpected hostname", + }), + ) }) var _ = Describe("analyzeSSHOutputProvisioned", func() { From 4c2c41fb5c96e5711e78d7e559dc1cbf7f9f13c8 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Thu, 8 May 2025 17:16:00 +0200 Subject: [PATCH 03/19] fix e2e tests. --- pkg/services/baremetal/host/host.go | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/pkg/services/baremetal/host/host.go b/pkg/services/baremetal/host/host.go index fd847f30d..0fb2d9fe9 100644 --- a/pkg/services/baremetal/host/host.go +++ b/pkg/services/baremetal/host/host.go @@ -354,6 +354,9 @@ func (s *Service) ensureSSHKey(sshSecretRef infrav1.SSHSecretRef, sshSecret *cor return sshKey, actionComplete{} } +// handleIncompleteBoot checks if the reboot was successful. +// If it was not successful, it tries other reboot methods. +// Order: SSH -> Software -> Hardware. func (s *Service) handleIncompleteBoot(isRebootIntoRescue, isTimeout, isConnectionRefused bool) (failed bool, err error) { // Connection refused error might be a sign that the ssh port is wrong - but might also come // right after a reboot and is expected then. Therefore, we wait for some time and if the @@ -713,7 +716,10 @@ func (s *Service) validateRescueSystemIsActive(sshClient sshclient.Client) (ok b if s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated != nil { timeSinceReboot = time.Since(s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated.Time).String() } - + if !isSSHTimeoutError && !isSSHConnectionRefusedError { + s.scope.Logger.Info("Expected the rescue system, but got different hostname", "hostName", hostName, "timeSinceReboot", timeSinceReboot) + return false, actionContinue{delay: 10 * time.Second} + } s.scope.Logger.Info("Could not reach rescue system. Will retry some seconds later.", "out", out.String(), "hostName", hostName, "isSSHTimeoutError", isSSHTimeoutError, "isSSHConnectionRefusedError", isSSHConnectionRefusedError, "timeSinceReboot", timeSinceReboot) return false, actionContinue{delay: 10 * time.Second} @@ -1105,9 +1111,12 @@ func (s *Service) actionPreProvisioning(ctx context.Context) actionResult { } sshClient := s.scope.SSHClientFactory.NewClient(in) - ok, res := s.validateRescueSystemIsActive(sshClient) - if !ok { - return res + out := sshClient.GetHostName() + hostName := trimLineBreak(out.StdOut) + if hostName != rescue { + // This is unexpected. We should be in rescue mode. + return actionError{err: fmt.Errorf("expected rescue system, but got %q: %s", hostName, + out.String())} } exitStatus, output, err := sshClient.ExecutePreProvisionCommand(ctx, s.scope.PreProvisionCommand) @@ -1141,9 +1150,12 @@ func (s *Service) actionImageInstalling(ctx context.Context) actionResult { } sshClient := s.scope.SSHClientFactory.NewClient(in) - ok, res := s.validateRescueSystemIsActive(sshClient) - if !ok { - return res + out := sshClient.GetHostName() + hostName := trimLineBreak(out.StdOut) + if hostName != rescue { + // This is unexpected. We should be in rescue mode. + return actionError{err: fmt.Errorf("expected rescue system, but got %q: %s", hostName, + out.String())} } state, err := sshClient.GetInstallImageState() From 4c515a7c266994f62f51c4593fae68208a0ca6a5 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Thu, 8 May 2025 20:05:09 +0200 Subject: [PATCH 04/19] log reboots, and retry if hostname does not match. --- main.go | 2 +- pkg/services/baremetal/host/host.go | 87 +++++++++++++++-------------- 2 files changed, 46 insertions(+), 43 deletions(-) diff --git a/main.go b/main.go index 5141f9032..a309c1a75 100644 --- a/main.go +++ b/main.go @@ -259,7 +259,7 @@ func main() { os.Exit(1) } - setupLog.Info("starting manager", "version", caphversion.Get().String()) + setupLog.Info("starting manager", "version", caphversion.Get().String(), "args", os.Args) if err := mgr.Start(ctx); err != nil { setupLog.Error(err, "problem running manager") os.Exit(1) diff --git a/pkg/services/baremetal/host/host.go b/pkg/services/baremetal/host/host.go index 0fb2d9fe9..42c2de1aa 100644 --- a/pkg/services/baremetal/host/host.go +++ b/pkg/services/baremetal/host/host.go @@ -170,7 +170,7 @@ func SaveHostAndReturn(ctx context.Context, cl client.Client, host *infrav1.Hetz // previous: None // next: Registering -func (s *Service) actionPreparing(_ context.Context) actionResult { +func (s *Service) actionPreparing(ctx context.Context) actionResult { markProvisionPending(s.scope.HetznerBareMetalHost, infrav1.StatePreparing) server, err := s.scope.RobotClient.GetBMServer(s.scope.HetznerBareMetalHost.Spec.ServerID) @@ -252,7 +252,7 @@ func (s *Service) actionPreparing(_ context.Context) actionResult { return actionError{err: fmt.Errorf("failed to reboot server via ssh (actionPreparing): %w", err)} } msg := "Rebooting into rescue mode." - createSSHRebootEvent(s.scope.HetznerBareMetalHost, msg) + createSSHRebootEvent(ctx, s.scope.HetznerBareMetalHost, msg) // we immediately set an error message in the host status to track the reboot we just performed s.scope.HetznerBareMetalHost.SetError(infrav1.ErrorTypeSSHRebootTriggered, fmt.Sprintf("Phase %s, reboot via ssh: %s", s.scope.HetznerBareMetalHost.Spec.Status.ProvisioningState, msg)) @@ -267,7 +267,7 @@ func (s *Service) actionPreparing(_ context.Context) actionResult { return actionError{err: fmt.Errorf(errMsgFailedReboot, err)} } - msg := createRebootEvent(s.scope.HetznerBareMetalHost, rebootType, "Reboot into rescue system.") + msg := createRebootEvent(ctx, s.scope.HetznerBareMetalHost, rebootType, "Reboot into rescue system.") // we immediately set an error message in the host status to track the reboot we just performed. // This is not a real error. Sooner or later we should track the reboots differently. s.scope.HetznerBareMetalHost.SetError(errorType, msg) @@ -357,7 +357,7 @@ func (s *Service) ensureSSHKey(sshSecretRef infrav1.SSHSecretRef, sshSecret *cor // handleIncompleteBoot checks if the reboot was successful. // If it was not successful, it tries other reboot methods. // Order: SSH -> Software -> Hardware. -func (s *Service) handleIncompleteBoot(isRebootIntoRescue, isTimeout, isConnectionRefused bool) (failed bool, err error) { +func (s *Service) handleIncompleteBoot(ctx context.Context, isRebootIntoRescue, isTimeout, isConnectionRefused bool) (failed bool, err error) { // Connection refused error might be a sign that the ssh port is wrong - but might also come // right after a reboot and is expected then. Therefore, we wait for some time and if the // error keeps coming, we give an error. @@ -407,22 +407,22 @@ func (s *Service) handleIncompleteBoot(isRebootIntoRescue, isTimeout, isConnecti // We did not get an error with ssh - but also not the expected hostname. Therefore, // the (ssh) reboot did not start. We trigger an API reboot instead. - return false, s.handleErrorTypeSSHRebootFailed(isTimeout, isRebootIntoRescue) + return false, s.handleErrorTypeSSHRebootFailed(ctx, isTimeout, isRebootIntoRescue) case infrav1.ErrorTypeSSHRebootTriggered: - return false, s.handleErrorTypeSSHRebootFailed(isTimeout, isRebootIntoRescue) + return false, s.handleErrorTypeSSHRebootFailed(ctx, isTimeout, isRebootIntoRescue) case infrav1.ErrorTypeSoftwareRebootTriggered: - return false, s.handleErrorTypeSoftwareRebootFailed(isTimeout, isRebootIntoRescue) + return false, s.handleErrorTypeSoftwareRebootFailed(ctx, isTimeout, isRebootIntoRescue) case infrav1.ErrorTypeHardwareRebootTriggered: - return s.handleErrorTypeHardwareRebootFailed(isTimeout, isRebootIntoRescue) + return s.handleErrorTypeHardwareRebootFailed(ctx, isTimeout, isRebootIntoRescue) } return false, fmt.Errorf("%w: %s", errUnexpectedErrorType, s.scope.HetznerBareMetalHost.Spec.Status.ErrorType) } -func (s *Service) handleErrorTypeSSHRebootFailed(isSSHTimeoutError, wantsRescue bool) error { +func (s *Service) handleErrorTypeSSHRebootFailed(ctx context.Context, isSSHTimeoutError, wantsRescue bool) error { // If it is not a timeout error, then the ssh command (get hostname) worked, but didn't give us the // right hostname. This means that the server has not been rebooted and we need to escalate. // If we got a timeout error from ssh, it means that the server has not yet finished rebooting. @@ -448,7 +448,7 @@ func (s *Service) handleErrorTypeSSHRebootFailed(isSSHTimeoutError, wantsRescue } msg := fmt.Sprintf("Reboot via ssh into %s failed. Now using rebootType %q.", rebootInto, rebootType) - msg = createRebootEvent(s.scope.HetznerBareMetalHost, rebootType, msg) + msg = createRebootEvent(ctx, s.scope.HetznerBareMetalHost, rebootType, msg) // we immediately set an error message in the host status to track the reboot we just performed s.scope.HetznerBareMetalHost.SetError(errorType, msg) } @@ -472,7 +472,7 @@ func rebootAndErrorTypeAfterTimeout(host *infrav1.HetznerBareMetalHost) (infrav1 return rebootType, errorType } -func (s *Service) handleErrorTypeSoftwareRebootFailed(isSSHTimeoutError, wantsRescue bool) error { +func (s *Service) handleErrorTypeSoftwareRebootFailed(ctx context.Context, isSSHTimeoutError, wantsRescue bool) error { rebootInto := "node" if wantsRescue { rebootInto = "rescue mode" @@ -495,7 +495,7 @@ func (s *Service) handleErrorTypeSoftwareRebootFailed(isSSHTimeoutError, wantsRe } msg := fmt.Sprintf("Reboot via type 'software' into %s failed. Now using rebootType %q.", rebootInto, infrav1.RebootTypeHardware) - msg = createRebootEvent(s.scope.HetznerBareMetalHost, infrav1.RebootTypeHardware, msg) + msg = createRebootEvent(ctx, s.scope.HetznerBareMetalHost, infrav1.RebootTypeHardware, msg) // we immediately set an error message in the host status to track the reboot we just performed s.scope.HetznerBareMetalHost.SetError(infrav1.ErrorTypeHardwareRebootTriggered, msg) } @@ -504,7 +504,7 @@ func (s *Service) handleErrorTypeSoftwareRebootFailed(isSSHTimeoutError, wantsRe } // handleErrorTypeHardwareRebootFailed deals with hardware reboot failed cases and returns whether we should fail the process. -func (s *Service) handleErrorTypeHardwareRebootFailed(isSSHTimeoutError, wantsRescue bool) (bool, error) { +func (s *Service) handleErrorTypeHardwareRebootFailed(ctx context.Context, isSSHTimeoutError, wantsRescue bool) (bool, error) { rebootInto := "node" if wantsRescue { rebootInto = "rescue mode" @@ -532,7 +532,7 @@ func (s *Service) handleErrorTypeHardwareRebootFailed(isSSHTimeoutError, wantsRe } msg := fmt.Sprintf("Reboot via ssh into %s failed. Now using rebootType %q.", rebootInto, infrav1.RebootTypeHardware) - createRebootEvent(s.scope.HetznerBareMetalHost, infrav1.RebootTypeHardware, msg) + createRebootEvent(ctx, s.scope.HetznerBareMetalHost, infrav1.RebootTypeHardware, msg) } // if hardware reboots time out, we should fail @@ -584,7 +584,7 @@ func (s *Service) ensureRescueMode() error { // previous: Preparing // next: PreProvisioning -func (s *Service) actionRegistering(_ context.Context) actionResult { +func (s *Service) actionRegistering(ctx context.Context) actionResult { markProvisionPending(s.scope.HetznerBareMetalHost, infrav1.StateRegistering) creds := sshclient.CredentialsFromSecret(s.scope.RescueSSHSecret, s.scope.HetznerCluster.Spec.SSHKeys.RobotRescueSecretRef) @@ -595,7 +595,7 @@ func (s *Service) actionRegistering(_ context.Context) actionResult { } sshClient := s.scope.SSHClientFactory.NewClient(in) - ok, res := s.validateRescueSystemIsActive(sshClient) + ok, res := s.validateRescueSystemIsActive(ctx, sshClient) if !ok { return res } @@ -686,7 +686,7 @@ func (s *Service) actionRegistering(_ context.Context) actionResult { return actionComplete{} } -func (s *Service) validateRescueSystemIsActive(sshClient sshclient.Client) (ok bool, ar actionResult) { +func (s *Service) validateRescueSystemIsActive(ctx context.Context, sshClient sshclient.Client) (ok bool, ar actionResult) { // Check hostname with sshClient out := sshClient.GetHostName() hostName := trimLineBreak(out.StdOut) @@ -705,7 +705,7 @@ func (s *Service) validateRescueSystemIsActive(sshClient sshclient.Client) (ok b return false, actionError{err: fmt.Errorf("failed to handle incomplete boot - registering: %w", err)} } - failed, err := s.handleIncompleteBoot(true, isSSHTimeoutError, isSSHConnectionRefusedError) + failed, err := s.handleIncompleteBoot(ctx, true, isSSHTimeoutError, isSSHConnectionRefusedError) if failed { return false, s.recordActionFailure(infrav1.PermanentError, err.Error()) } @@ -1115,8 +1115,9 @@ func (s *Service) actionPreProvisioning(ctx context.Context) actionResult { hostName := trimLineBreak(out.StdOut) if hostName != rescue { // This is unexpected. We should be in rescue mode. - return actionError{err: fmt.Errorf("expected rescue system, but got %q: %s", hostName, - out.String())} + // Might happen when Reconcile reads stale data from the cache. + ctrl.LoggerFrom(ctx).Info("pre-provision: expected rescue system, but found different hostname", "hostname", hostName, "sshOutput", out.String()) + return actionContinue{delay: 10 * time.Second} } exitStatus, output, err := sshClient.ExecutePreProvisionCommand(ctx, s.scope.PreProvisionCommand) @@ -1154,8 +1155,9 @@ func (s *Service) actionImageInstalling(ctx context.Context) actionResult { hostName := trimLineBreak(out.StdOut) if hostName != rescue { // This is unexpected. We should be in rescue mode. - return actionError{err: fmt.Errorf("expected rescue system, but got %q: %s", hostName, - out.String())} + // Might happen when Reconcile reads stale data from the cache. + ctrl.LoggerFrom(ctx).Info("pre-provision: expected rescue system, but found different hostname", "hostname", hostName, "sshOutput", out.String()) + return actionContinue{delay: 10 * time.Second} } state, err := sshClient.GetInstallImageState() @@ -1371,7 +1373,7 @@ echo %q return actionContinue{delay: 10 * time.Second} } -func (s *Service) actionImageInstallingFinished(_ context.Context, sshClient sshclient.Client) actionResult { +func (s *Service) actionImageInstallingFinished(ctx context.Context, sshClient sshclient.Client) actionResult { output, err := sshClient.GetResultOfInstallImage() if err != nil { return actionError{ @@ -1400,7 +1402,7 @@ func (s *Service) actionImageInstallingFinished(_ context.Context, sshClient ssh record.Warn(s.scope.HetznerBareMetalHost, "RebootFailed", err.Error()) return actionError{err: err} } - createSSHRebootEvent(s.scope.HetznerBareMetalHost, "machine image and cloud-init data got installed") + createSSHRebootEvent(ctx, s.scope.HetznerBareMetalHost, "machine image and cloud-init data got installed") s.scope.Logger.Info("RebootAfterInstallimageSucceeded", "stdout", out.StdOut, "stderr", out.StdErr) @@ -1543,7 +1545,7 @@ func verifyConnectionRefused(sshClient sshclient.Client, port int) bool { return true } -func (s *Service) actionEnsureProvisioned(_ context.Context) (ar actionResult) { +func (s *Service) actionEnsureProvisioned(ctx context.Context) (ar actionResult) { markProvisionPending(s.scope.HetznerBareMetalHost, infrav1.StateEnsureProvisioned) sshClient := s.scope.SSHClientFactory.NewClient(sshclient.Input{ PrivateKey: sshclient.CredentialsFromSecret(s.scope.OSSSHSecret, s.scope.HetznerBareMetalHost.Spec.Status.SSHSpec.SecretRef).PrivateKey, @@ -1574,13 +1576,13 @@ func (s *Service) actionEnsureProvisioned(_ context.Context) (ar actionResult) { } // A connection failed error could mean that cloud init is still running (if cloudInit introduces a new port) if isSSHConnectionRefusedError { - if actionRes := s.handleConnectionRefused(); actionRes != nil { + if actionRes := s.handleConnectionRefused(ctx); actionRes != nil { s.scope.Logger.Info("ensureProvisioned: ConnectionRefused", "actionResult", actionRes) return actionRes } } - failed, err := s.handleIncompleteBoot(false, isTimeout, isSSHConnectionRefusedError) + failed, err := s.handleIncompleteBoot(ctx, false, isTimeout, isSSHConnectionRefusedError) if failed { return s.recordActionFailure(infrav1.ProvisioningError, err.Error()) } @@ -1625,7 +1627,7 @@ func (s *Service) actionEnsureProvisioned(_ context.Context) (ar actionResult) { } // Check the status of cloud init - actResult, msg, _ := s.checkCloudInitStatus(sshClient) + actResult, msg, _ := s.checkCloudInitStatus(ctx, sshClient) if _, complete := actResult.(actionComplete); !complete { record.Event(s.scope.HetznerBareMetalHost, "CloudInitStillRunning", msg) return createEventWithCloudInitOutput(actResult) @@ -1635,7 +1637,7 @@ func (s *Service) actionEnsureProvisioned(_ context.Context) (ar actionResult) { // Check this only when the port did not change. Because if it did, then we can already confirm at this point // that the change worked and the new port is usable. This is a strong enough indication for us to assume cloud init worked. if s.scope.HetznerBareMetalHost.Spec.Status.SSHSpec.PortAfterInstallImage == s.scope.HetznerBareMetalHost.Spec.Status.SSHSpec.PortAfterCloudInit { - actResult = s.handleCloudInitNotStarted() + actResult = s.handleCloudInitNotStarted(ctx) if _, complete := actResult.(actionComplete); !complete { s.scope.Logger.Info("ensureProvisioned: handleCloudInitNotStarted", "actResult", actResult) return createEventWithCloudInitOutput(actResult) @@ -1650,7 +1652,7 @@ func (s *Service) actionEnsureProvisioned(_ context.Context) (ar actionResult) { // handleConnectionRefused checks cloud init status via ssh to the old ssh port if the new ssh port // gave a connection refused error. -func (s *Service) handleConnectionRefused() actionResult { +func (s *Service) handleConnectionRefused(ctx context.Context) actionResult { // Nothing to do if ports didn't change. if s.scope.HetznerBareMetalHost.Spec.Status.SSHSpec.PortAfterInstallImage == s.scope.HetznerBareMetalHost.Spec.Status.SSHSpec.PortAfterCloudInit { return nil @@ -1660,7 +1662,7 @@ func (s *Service) handleConnectionRefused() actionResult { Port: s.scope.HetznerBareMetalHost.Spec.Status.SSHSpec.PortAfterInstallImage, IP: s.scope.HetznerBareMetalHost.Spec.Status.GetIPAddress(), }) - actResult, _, err := s.checkCloudInitStatus(oldSSHClient) + actResult, _, err := s.checkCloudInitStatus(ctx, oldSSHClient) // If this ssh client also gives an error, then we go back to analyzing the error of the first ssh call // This happens in the statement below this one. if err == nil { @@ -1668,7 +1670,7 @@ func (s *Service) handleConnectionRefused() actionResult { // then we will soon reboot and be able to access the server via the new port if _, complete := actResult.(actionComplete); complete { // Check whether cloud init did not run successfully even though it shows "done" - actResult := s.handleCloudInitNotStarted() + actResult := s.handleCloudInitNotStarted(ctx) if _, complete := actResult.(actionComplete); complete { return actionContinue{delay: 10 * time.Second} } @@ -1681,7 +1683,7 @@ func (s *Service) handleConnectionRefused() actionResult { return nil } -func (s *Service) checkCloudInitStatus(sshClient sshclient.Client) (actionResult, string, error) { +func (s *Service) checkCloudInitStatus(ctx context.Context, sshClient sshclient.Client) (actionResult, string, error) { out := sshClient.CloudInitStatus() // This error is interesting for further logic and might happen because of the fact that the sshClient has the wrong port if out.Err != nil { @@ -1700,7 +1702,7 @@ func (s *Service) checkCloudInitStatus(sshClient sshclient.Client) (actionResult if err := handleSSHError(out); err != nil { return actionError{err: fmt.Errorf("failed to reboot (%s): %w", msg, err)}, "", nil } - createSSHRebootEvent(s.scope.HetznerBareMetalHost, msg) + createSSHRebootEvent(ctx, s.scope.HetznerBareMetalHost, msg) s.scope.HetznerBareMetalHost.SetError(infrav1.ErrorTypeSSHRebootTriggered, "ssh reboot just triggered") record.Warn(s.scope.HetznerBareMetalHost, "SSHRebootAfterCloudInitStatusDisabled", msg) return actionContinue{delay: 5 * time.Second}, "cloud-init was disabled. Triggered a reboot again", nil @@ -1719,7 +1721,7 @@ func (s *Service) checkCloudInitStatus(sshClient sshclient.Client) (actionResult } } -func (s *Service) handleCloudInitNotStarted() actionResult { +func (s *Service) handleCloudInitNotStarted(ctx context.Context) actionResult { // Check whether cloud init really was successfully. Sigterm causes problems there. oldSSHClient := s.scope.SSHClientFactory.NewClient(sshclient.Input{ PrivateKey: sshclient.CredentialsFromSecret(s.scope.OSSSHSecret, s.scope.HetznerBareMetalHost.Spec.Status.SSHSpec.SecretRef).PrivateKey, @@ -1745,7 +1747,7 @@ func (s *Service) handleCloudInitNotStarted() actionResult { if err := handleSSHError(out); err != nil { return actionError{err: fmt.Errorf("failed to reboot (handleCloudInitNotStarted): %w", err)} } - createSSHRebootEvent(s.scope.HetznerBareMetalHost, "machine image and cloud-init data got installed") + createSSHRebootEvent(ctx, s.scope.HetznerBareMetalHost, "machine image and cloud-init data got installed") record.Eventf(s.scope.HetznerBareMetalHost, "SSHRebootAfterCloudInitSigTermFound", "rebooted via ssh after cloud init logs contained sigterm: %s", trimLineBreak(out.StdOut)) return actionContinue{delay: 10 * time.Second} @@ -1792,7 +1794,7 @@ func analyzeSSHOutputProvisioned(out sshclient.Output) (isTimeout, isConnectionR // previous: EnsureProvisioned // next: Stays in Provisioned (final state) -func (s *Service) actionProvisioned(_ context.Context) actionResult { +func (s *Service) actionProvisioned(ctx context.Context) actionResult { // set host to provisioned conditions.MarkTrue(s.scope.HetznerBareMetalHost, infrav1.ProvisionSucceededCondition) @@ -1832,7 +1834,7 @@ func (s *Service) actionProvisioned(_ context.Context) actionResult { } return actionError{err: fmt.Errorf("failed to handle incomplete boot - provisioning: %w", err)} } - failed, err := s.handleIncompleteBoot(false, isTimeout, isSSHConnectionRefusedError) + failed, err := s.handleIncompleteBoot(ctx, false, isTimeout, isSSHConnectionRefusedError) if failed { return s.recordActionFailure(infrav1.PermanentError, err.Error()) } @@ -1847,7 +1849,7 @@ func (s *Service) actionProvisioned(_ context.Context) actionResult { return actionError{err: err} } - createSSHRebootEvent(s.scope.HetznerBareMetalHost, "Rebooting because annotation was set") + createSSHRebootEvent(ctx, s.scope.HetznerBareMetalHost, "Rebooting because annotation was set") s.scope.HetznerBareMetalHost.Spec.Status.Rebooted = true return actionContinue{delay: 10 * time.Second} } @@ -1935,16 +1937,17 @@ func markProvisionPending(host *infrav1.HetznerBareMetalHost, state infrav1.Prov ) } -func createSSHRebootEvent(host *infrav1.HetznerBareMetalHost, msg string) { - createRebootEvent(host, infrav1.RebootTypeSSH, msg) +func createSSHRebootEvent(ctx context.Context, host *infrav1.HetznerBareMetalHost, msg string) { + createRebootEvent(ctx, host, infrav1.RebootTypeSSH, msg) } -func createRebootEvent(host *infrav1.HetznerBareMetalHost, rebootType infrav1.RebootType, msg string) string { +func createRebootEvent(ctx context.Context, host *infrav1.HetznerBareMetalHost, rebootType infrav1.RebootType, msg string) string { verboseRebootType := infrav1.VerboseRebootType(rebootType) reason := fmt.Sprintf("RebootBMServerVia%sProvisioningState%s", verboseRebootType, strcase.UpperCamelCase(string(host.Spec.Status.ProvisioningState))) msg = fmt.Sprintf("Phase %s, reboot via %s: %s", host.Spec.Status.ProvisioningState, verboseRebootType, msg) record.Eventf(host, reason, msg) + ctrl.LoggerFrom(ctx).Info(msg, "host", host.Name) return msg } From ae4d56db6bbfa7b98a6d268cb8a3039ceb3dfa48 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Fri, 9 May 2025 09:34:00 +0200 Subject: [PATCH 05/19] :seedling: Show conflicts in SaveHostAndReturn in logs --- pkg/services/baremetal/host/host.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/services/baremetal/host/host.go b/pkg/services/baremetal/host/host.go index dd6dc8ab1..d9c1dce60 100644 --- a/pkg/services/baremetal/host/host.go +++ b/pkg/services/baremetal/host/host.go @@ -160,7 +160,7 @@ func SaveHostAndReturn(ctx context.Context, cl client.Client, host *infrav1.Hetz if err := cl.Update(ctx, host); err != nil { if apierrors.IsConflict(err) { log := ctrl.LoggerFrom(ctx) - log.V(1).Info("conflict error. Retrying", "err", err) + log.Info("conflict error. Retrying", "err", err) return reconcile.Result{Requeue: true}, nil } return reconcile.Result{}, fmt.Errorf("failed to update host object: %w", err) From 75a4f40b75bf0daaa2628d8e95b82c832609939e Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Fri, 9 May 2025 09:55:43 +0200 Subject: [PATCH 06/19] fix tests (add ctx) --- pkg/services/baremetal/host/host_test.go | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/pkg/services/baremetal/host/host_test.go b/pkg/services/baremetal/host/host_test.go index 4fb8021c2..0303a7fcf 100644 --- a/pkg/services/baremetal/host/host_test.go +++ b/pkg/services/baremetal/host/host_test.go @@ -259,12 +259,12 @@ var _ = Describe("handleIncompleteBoot", func() { helpers.WithError(tc.hostErrorType, "", 1, metav1.Now()), ) service := newTestService(host, &robotMock, nil, nil, nil) - + ctx := context.Background() if tc.expectedReturnError == nil { - _, err := service.handleIncompleteBoot(tc.isRebootIntoRescue, tc.isTimeOut, tc.isConnectionRefused) + _, err := service.handleIncompleteBoot(ctx, tc.isRebootIntoRescue, tc.isTimeOut, tc.isConnectionRefused) Expect(err).To(Succeed()) } else { - _, err := service.handleIncompleteBoot(tc.isRebootIntoRescue, tc.isTimeOut, tc.isConnectionRefused) + _, err := service.handleIncompleteBoot(ctx, tc.isRebootIntoRescue, tc.isTimeOut, tc.isConnectionRefused) Expect(err).Should(Equal(tc.expectedReturnError)) } @@ -352,8 +352,8 @@ var _ = Describe("handleIncompleteBoot", func() { helpers.WithRebootTypes(tc.rebootTypes), ) service := newTestService(host, &robotMock, nil, nil, nil) - - _, err := service.handleIncompleteBoot(true, tc.isTimeOut, tc.isConnectionRefused) + ctx := context.Background() + _, err := service.handleIncompleteBoot(ctx, true, tc.isTimeOut, tc.isConnectionRefused) Expect(err).To(Succeed()) Expect(host.Spec.Status.ErrorType).To(Equal(tc.expectedHostErrorType)) if tc.expectedRebootType != infrav1.RebootType("") { @@ -439,7 +439,8 @@ var _ = Describe("handleIncompleteBoot", func() { ) service := newTestService(host, &robotMock, nil, nil, nil) - _, err := service.handleIncompleteBoot(true, true, false) + ctx := context.Background() + _, err := service.handleIncompleteBoot(ctx, true, true, false) Expect(err).To(Succeed()) Expect(host.Spec.Status.ErrorType).To(Equal(tc.expectedHostErrorType)) if tc.expectedRebootType != infrav1.RebootType("") { @@ -485,7 +486,8 @@ var _ = Describe("handleIncompleteBoot", func() { ) service := newTestService(host, &robotMock, nil, nil, nil) - failed, err := service.handleIncompleteBoot(true, false, true) + ctx := context.Background() + failed, err := service.handleIncompleteBoot(ctx, true, false, true) Expect(err).ToNot(BeNil()) Expect(failed).To(BeTrue()) Expect(host.Spec.Status.ErrorType).To(Equal(infrav1.ErrorTypeConnectionError)) @@ -510,7 +512,8 @@ var _ = Describe("handleIncompleteBoot", func() { ) service := newTestService(host, &robotMock, nil, nil, nil) - _, err := service.handleIncompleteBoot(true, true, false) + ctx := context.Background() + _, err := service.handleIncompleteBoot(ctx, true, true, false) Expect(err).ToNot(Succeed()) Expect(host.Spec.Status.ErrorType).To(Equal(infrav1.ErrorTypeHardwareRebootTriggered)) Expect(robotMock.AssertNotCalled(GinkgoT(), "RebootBMServer", mock.Anything, mock.Anything)).To(BeTrue()) @@ -545,11 +548,12 @@ var _ = Describe("handleIncompleteBoot", func() { ) service := newTestService(host, &robotMock, nil, nil, nil) + ctx := context.Background() if tc.expectedReturnError == nil { - _, err := service.handleIncompleteBoot(tc.isRebootIntoRescue, false, false) + _, err := service.handleIncompleteBoot(ctx, tc.isRebootIntoRescue, false, false) Expect(err).To(Succeed()) } else { - _, err := service.handleIncompleteBoot(tc.isRebootIntoRescue, false, false) + _, err := service.handleIncompleteBoot(ctx, tc.isRebootIntoRescue, false, false) Expect(err).Should(Equal(tc.expectedReturnError)) } Expect(host.Spec.Status.ErrorType).To(Equal(tc.expectedHostErrorType)) From b0d0078c05c2f4476a8a736b1c0f9ecf3d52cefe Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Fri, 9 May 2025 10:36:07 +0200 Subject: [PATCH 07/19] use uncached APIReader to avoid cache issues --- controllers/hetznerbaremetalhost_controller.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/controllers/hetznerbaremetalhost_controller.go b/controllers/hetznerbaremetalhost_controller.go index 13ccfd8f4..73640ab62 100644 --- a/controllers/hetznerbaremetalhost_controller.go +++ b/controllers/hetznerbaremetalhost_controller.go @@ -79,7 +79,8 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl // Fetch the Hetzner bare metal host instance. bmHost := &infrav1.HetznerBareMetalHost{} - err := r.Get(ctx, req.NamespacedName, bmHost) + // Use uncached APIReader to avoid cache issues. + err := r.APIReader.Get(ctx, req.NamespacedName, bmHost) if err != nil { if apierrors.IsNotFound(err) { return reconcile.Result{}, nil From 2cf266d86477fd7d21fb500345880c6692c53572 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Fri, 9 May 2025 11:55:37 +0200 Subject: [PATCH 08/19] more debug output. --- .../hetznerbaremetalhost_controller.go | 27 ++++++++----------- pkg/services/baremetal/host/host.go | 3 ++- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/controllers/hetznerbaremetalhost_controller.go b/controllers/hetznerbaremetalhost_controller.go index 73640ab62..6681f6ab4 100644 --- a/controllers/hetznerbaremetalhost_controller.go +++ b/controllers/hetznerbaremetalhost_controller.go @@ -26,7 +26,6 @@ import ( corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" - "k8s.io/klog/v2" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/conditions" @@ -69,13 +68,6 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl log := ctrl.LoggerFrom(ctx) start := time.Now() - defer func() { - // check duration of reconcile. Warn if it took too long. - duration := time.Since(start) - if duration > 15*time.Second { - log.Info("Reconcile took too long", "duration", duration, "res", res, "reterr", reterr) - } - }() // Fetch the Hetzner bare metal host instance. bmHost := &infrav1.HetznerBareMetalHost{} @@ -88,10 +80,17 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl return reconcile.Result{}, err } + log = log.WithValues("state", bmHost.Spec.Status.ProvisioningState) + initialProvisioningState := bmHost.Spec.Status.ProvisioningState defer func() { + // check duration of reconcile. Warn if it took too long. + duration := time.Since(start) + if duration > 10*time.Second { + log.Info("Reconcile took too long", "duration", duration, "res", res, "reterr", reterr) + } if initialProvisioningState != bmHost.Spec.Status.ProvisioningState { - log.Info("Provisioning state changed", "from", initialProvisioningState, "to", bmHost.Spec.Status.ProvisioningState) + log.Info("Provisioning state changed", "newState", bmHost.Spec.Status.ProvisioningState, "duration", duration) } }() @@ -141,21 +140,18 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl } if err := r.Client.Get(ctx, hetznerClusterName, hetznerCluster); err != nil { if apierrors.IsNotFound(err) { + log.Info("HetznerCluster not found. Requeueing") return reconcile.Result{RequeueAfter: 10 * time.Second}, nil } return reconcile.Result{}, fmt.Errorf("failed to get HetznerCluster: %w", err) } - log = log.WithValues("HetznerCluster", klog.KObj(hetznerCluster)) - // Fetch the Cluster. cluster, err := util.GetClusterFromMetadata(ctx, r.Client, hetznerCluster.ObjectMeta) if err != nil { return reconcile.Result{}, fmt.Errorf("failed to get Cluster: %w", err) } - log = log.WithValues("Cluster", klog.KObj(cluster)) - hetznerBareMetalMachine := &infrav1.HetznerBareMetalMachine{} if bmHost.Spec.ConsumerRef != nil { @@ -169,8 +165,6 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl } } - log = log.WithValues("HetznerBareMetalMachine", klog.KObj(hetznerBareMetalMachine)) - ctx = ctrl.LoggerInto(ctx, log) // Get Hetzner robot api credentials @@ -246,7 +240,8 @@ func (r *HetznerBareMetalHostReconciler) reconcileSelectedStates(ctx context.Con return reconcile.Result{}, fmt.Errorf("Update() failed after setting ProvisioningState: %w", err) } } - + log := ctrl.LoggerFrom(ctx) + log.Info("HetznerBareMetalHost is in StateNone. Requeueing", "needsUpdate", needsUpdate) return ctrl.Result{RequeueAfter: 10 * time.Second}, nil // Handle StateDeleting diff --git a/pkg/services/baremetal/host/host.go b/pkg/services/baremetal/host/host.go index 1e3f90a99..07c79fe57 100644 --- a/pkg/services/baremetal/host/host.go +++ b/pkg/services/baremetal/host/host.go @@ -1156,7 +1156,7 @@ func (s *Service) actionImageInstalling(ctx context.Context) actionResult { if hostName != rescue { // This is unexpected. We should be in rescue mode. // Might happen when Reconcile reads stale data from the cache. - ctrl.LoggerFrom(ctx).Info("pre-provision: expected rescue system, but found different hostname", "hostname", hostName, "sshOutput", out.String()) + ctrl.LoggerFrom(ctx).Info("image-installing: expected rescue system, but found different hostname", "hostname", hostName, "sshOutput", out.String()) return actionContinue{delay: 10 * time.Second} } @@ -1370,6 +1370,7 @@ echo %q record.Warnf(s.scope.HetznerBareMetalHost, "ExecuteInstallImageFailed", out.String()) return actionError{err: fmt.Errorf("failed to execute installimage: %w", out.Err)} } + s.scope.Logger.Info("ExecuteInstallImage started successfully", "out", out.String()) return actionContinue{delay: 10 * time.Second} } From d48f03e4a61e781cd4549f2d7f7b1a179771c8ed Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Fri, 9 May 2025 11:57:44 +0200 Subject: [PATCH 09/19] log every reconcile. --- controllers/hetznerbaremetalhost_controller.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/controllers/hetznerbaremetalhost_controller.go b/controllers/hetznerbaremetalhost_controller.go index 6681f6ab4..bd6fc60f6 100644 --- a/controllers/hetznerbaremetalhost_controller.go +++ b/controllers/hetznerbaremetalhost_controller.go @@ -82,6 +82,8 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl log = log.WithValues("state", bmHost.Spec.Status.ProvisioningState) + log.Info("Reconciling HetznerBareMetalHost") // TODO: Remove this log line after debugging + initialProvisioningState := bmHost.Spec.Status.ProvisioningState defer func() { // check duration of reconcile. Warn if it took too long. From ce0362113558111009467371587c11cdad750678 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Fri, 9 May 2025 14:35:55 +0200 Subject: [PATCH 10/19] logs are not visible. Strange, try print. --- controllers/hetznerbaremetalhost_controller.go | 3 +-- pkg/services/baremetal/host/host.go | 4 +++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/controllers/hetznerbaremetalhost_controller.go b/controllers/hetznerbaremetalhost_controller.go index bd6fc60f6..0f588c97d 100644 --- a/controllers/hetznerbaremetalhost_controller.go +++ b/controllers/hetznerbaremetalhost_controller.go @@ -81,6 +81,7 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl } log = log.WithValues("state", bmHost.Spec.Status.ProvisioningState) + ctx = ctrl.LoggerInto(ctx, log) log.Info("Reconciling HetznerBareMetalHost") // TODO: Remove this log line after debugging @@ -167,8 +168,6 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl } } - ctx = ctrl.LoggerInto(ctx, log) - // Get Hetzner robot api credentials secretManager := secretutil.NewSecretManager(log, r.Client, r.APIReader) robotCreds, err := getAndValidateRobotCredentials(ctx, req.Namespace, hetznerCluster, secretManager) diff --git a/pkg/services/baremetal/host/host.go b/pkg/services/baremetal/host/host.go index 07c79fe57..71340aa0b 100644 --- a/pkg/services/baremetal/host/host.go +++ b/pkg/services/baremetal/host/host.go @@ -1949,6 +1949,8 @@ func createRebootEvent(ctx context.Context, host *infrav1.HetznerBareMetalHost, strcase.UpperCamelCase(string(host.Spec.Status.ProvisioningState))) msg = fmt.Sprintf("Phase %s, reboot via %s: %s", host.Spec.Status.ProvisioningState, verboseRebootType, msg) record.Eventf(host, reason, msg) - ctrl.LoggerFrom(ctx).Info(msg, "host", host.Name) + logger := ctrl.LoggerFrom(ctx) + fmt.Printf("createRooooooooootEvent %q %v %v\n", msg, logger.Enabled(), logger.GetV()) + logger.Info(msg, "host", host.Name) return msg } From d9b837de7405d884ea2df567a0d797fac0e7bc26 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Fri, 9 May 2025 15:29:57 +0200 Subject: [PATCH 11/19] found "bug": hostname changes during "installimage". --- controllers/hetznerbaremetalhost_controller.go | 6 +----- pkg/services/baremetal/host/host.go | 15 ++++++++------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/controllers/hetznerbaremetalhost_controller.go b/controllers/hetznerbaremetalhost_controller.go index 0f588c97d..bd28a261d 100644 --- a/controllers/hetznerbaremetalhost_controller.go +++ b/controllers/hetznerbaremetalhost_controller.go @@ -72,7 +72,7 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl // Fetch the Hetzner bare metal host instance. bmHost := &infrav1.HetznerBareMetalHost{} // Use uncached APIReader to avoid cache issues. - err := r.APIReader.Get(ctx, req.NamespacedName, bmHost) + err := r.Get(ctx, req.NamespacedName, bmHost) if err != nil { if apierrors.IsNotFound(err) { return reconcile.Result{}, nil @@ -83,8 +83,6 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl log = log.WithValues("state", bmHost.Spec.Status.ProvisioningState) ctx = ctrl.LoggerInto(ctx, log) - log.Info("Reconciling HetznerBareMetalHost") // TODO: Remove this log line after debugging - initialProvisioningState := bmHost.Spec.Status.ProvisioningState defer func() { // check duration of reconcile. Warn if it took too long. @@ -241,8 +239,6 @@ func (r *HetznerBareMetalHostReconciler) reconcileSelectedStates(ctx context.Con return reconcile.Result{}, fmt.Errorf("Update() failed after setting ProvisioningState: %w", err) } } - log := ctrl.LoggerFrom(ctx) - log.Info("HetznerBareMetalHost is in StateNone. Requeueing", "needsUpdate", needsUpdate) return ctrl.Result{RequeueAfter: 10 * time.Second}, nil // Handle StateDeleting diff --git a/pkg/services/baremetal/host/host.go b/pkg/services/baremetal/host/host.go index 71340aa0b..7164c3cd0 100644 --- a/pkg/services/baremetal/host/host.go +++ b/pkg/services/baremetal/host/host.go @@ -1115,7 +1115,6 @@ func (s *Service) actionPreProvisioning(ctx context.Context) actionResult { hostName := trimLineBreak(out.StdOut) if hostName != rescue { // This is unexpected. We should be in rescue mode. - // Might happen when Reconcile reads stale data from the cache. ctrl.LoggerFrom(ctx).Info("pre-provision: expected rescue system, but found different hostname", "hostname", hostName, "sshOutput", out.String()) return actionContinue{delay: 10 * time.Second} } @@ -1153,10 +1152,13 @@ func (s *Service) actionImageInstalling(ctx context.Context) actionResult { out := sshClient.GetHostName() hostName := trimLineBreak(out.StdOut) - if hostName != rescue { - // This is unexpected. We should be in rescue mode. - // Might happen when Reconcile reads stale data from the cache. - ctrl.LoggerFrom(ctx).Info("image-installing: expected rescue system, but found different hostname", "hostname", hostName, "sshOutput", out.String()) + realHostName := s.scope.Hostname() + if hostName != rescue || hostName != realHostName { + // During InstallImage the hostname changes from "rescue" to the realHostName. + // If it is not one of these two, then this is unexpected. + ctrl.LoggerFrom(ctx).Info( + fmt.Sprintf( + "image-installing: expected hostname %q or %q system, but found different output", rescue, realHostName), "sshOutput", out.String()) return actionContinue{delay: 10 * time.Second} } @@ -1950,7 +1952,6 @@ func createRebootEvent(ctx context.Context, host *infrav1.HetznerBareMetalHost, msg = fmt.Sprintf("Phase %s, reboot via %s: %s", host.Spec.Status.ProvisioningState, verboseRebootType, msg) record.Eventf(host, reason, msg) logger := ctrl.LoggerFrom(ctx) - fmt.Printf("createRooooooooootEvent %q %v %v\n", msg, logger.Enabled(), logger.GetV()) - logger.Info(msg, "host", host.Name) + logger.Info(msg, "reason", reason, "host", host.Name) return msg } From 5e2029f9752a783fa1a05f13755dfa5a9497b340 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Fri, 9 May 2025 15:57:43 +0200 Subject: [PATCH 12/19] typo. --- pkg/services/baremetal/host/host.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/services/baremetal/host/host.go b/pkg/services/baremetal/host/host.go index 7164c3cd0..411b6057b 100644 --- a/pkg/services/baremetal/host/host.go +++ b/pkg/services/baremetal/host/host.go @@ -1153,7 +1153,7 @@ func (s *Service) actionImageInstalling(ctx context.Context) actionResult { out := sshClient.GetHostName() hostName := trimLineBreak(out.StdOut) realHostName := s.scope.Hostname() - if hostName != rescue || hostName != realHostName { + if hostName != rescue && hostName != realHostName { // During InstallImage the hostname changes from "rescue" to the realHostName. // If it is not one of these two, then this is unexpected. ctrl.LoggerFrom(ctx).Info( From 097953392dd47e9f013eae10ead54922a09bd897 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Tue, 13 May 2025 11:27:01 +0200 Subject: [PATCH 13/19] set permanent error, if hostname is unexpected (not rescue system). --- .gitignore | 1 + .../baremetal/client/ssh/ssh_client.go | 6 ++-- pkg/services/baremetal/host/host.go | 30 +++++++++++++++---- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index 00fa7d365..509094797 100644 --- a/.gitignore +++ b/.gitignore @@ -79,5 +79,6 @@ watchall-output* /*.kubeconfig /*.log +/conditions*.txt /.config diff --git a/pkg/services/baremetal/client/ssh/ssh_client.go b/pkg/services/baremetal/client/ssh/ssh_client.go index 9a6f5a041..def9b5c4e 100644 --- a/pkg/services/baremetal/client/ssh/ssh_client.go +++ b/pkg/services/baremetal/client/ssh/ssh_client.go @@ -430,13 +430,15 @@ func (c *sshClient) GetInstallImageState() (InstallImageState, error) { } out = c.runSSH(`[ -e /root/installimage-wrapper.sh.log ]`) - exists, err := out.ExitStatus() + exitStatus, err := out.ExitStatus() if err != nil { return "", fmt.Errorf("failed to check if installimage-wrapper.sh.log exists: %w", err) } - if exists == 0 { + if exitStatus == 0 { + // above log file exists, but installimage is not running: finished. return InstallImageStateFinished, nil } + // installimage is not running and the log file does not exist: not started yet. return InstallImageStateNotStartedYet, nil } diff --git a/pkg/services/baremetal/host/host.go b/pkg/services/baremetal/host/host.go index 411b6057b..01ca5055e 100644 --- a/pkg/services/baremetal/host/host.go +++ b/pkg/services/baremetal/host/host.go @@ -1112,11 +1112,20 @@ func (s *Service) actionPreProvisioning(ctx context.Context) actionResult { sshClient := s.scope.SSHClientFactory.NewClient(in) out := sshClient.GetHostName() + if out.Err != nil || out.StdErr != "" { + ctrl.LoggerFrom(ctx).Info("pre-provision: rescue sytem not reachable. Will try again", + "sshOutput", out.String()) + return actionContinue{delay: 10 * time.Second} + } + hostName := trimLineBreak(out.StdOut) if hostName != rescue { // This is unexpected. We should be in rescue mode. - ctrl.LoggerFrom(ctx).Info("pre-provision: expected rescue system, but found different hostname", "hostname", hostName, "sshOutput", out.String()) - return actionContinue{delay: 10 * time.Second} + msg := fmt.Sprintf("expected rescue system, but found different hostname %q", hostName) + record.Warnf(s.scope.HetznerBareMetalHost, "PreProvisioningFailed", msg) + ctrl.LoggerFrom(ctx).Error(errors.New("PreProvisioningFailed"), msg) + s.scope.HetznerBareMetalHost.SetError(infrav1.PermanentError, msg) + return actionStop{} } exitStatus, output, err := sshClient.ExecutePreProvisionCommand(ctx, s.scope.PreProvisionCommand) @@ -1151,15 +1160,24 @@ func (s *Service) actionImageInstalling(ctx context.Context) actionResult { sshClient := s.scope.SSHClientFactory.NewClient(in) out := sshClient.GetHostName() + if out.Err != nil || out.StdErr != "" { + ctrl.LoggerFrom(ctx).Info("image-installing: rescue sytem not reachable. Will try again", + "sshOutput", out.String()) + return actionContinue{delay: 10 * time.Second} + } + hostName := trimLineBreak(out.StdOut) realHostName := s.scope.Hostname() if hostName != rescue && hostName != realHostName { // During InstallImage the hostname changes from "rescue" to the realHostName. // If it is not one of these two, then this is unexpected. - ctrl.LoggerFrom(ctx).Info( - fmt.Sprintf( - "image-installing: expected hostname %q or %q system, but found different output", rescue, realHostName), "sshOutput", out.String()) - return actionContinue{delay: 10 * time.Second} + // This is unexpected. We should be in rescue mode. + msg := fmt.Sprintf("expected rescue system (%q or %q), but found different hostname %q", + rescue, realHostName, hostName) + record.Warnf(s.scope.HetznerBareMetalHost, "ImageInstallingFailed", msg) + ctrl.LoggerFrom(ctx).Error(errors.New("ImageInstallingFailed"), msg) + s.scope.HetznerBareMetalHost.SetError(infrav1.PermanentError, msg) + return actionStop{} } state, err := sshClient.GetInstallImageState() From 023af57e416975d809621622b8a6a66afd638e03 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Tue, 13 May 2025 11:36:19 +0200 Subject: [PATCH 14/19] added HetznerCluster to log values again. --- controllers/hetznerbaremetalhost_controller.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/controllers/hetznerbaremetalhost_controller.go b/controllers/hetznerbaremetalhost_controller.go index bd28a261d..b50fb445d 100644 --- a/controllers/hetznerbaremetalhost_controller.go +++ b/controllers/hetznerbaremetalhost_controller.go @@ -26,6 +26,7 @@ import ( corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" + "k8s.io/klog/v2" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/conditions" @@ -71,7 +72,6 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl // Fetch the Hetzner bare metal host instance. bmHost := &infrav1.HetznerBareMetalHost{} - // Use uncached APIReader to avoid cache issues. err := r.Get(ctx, req.NamespacedName, bmHost) if err != nil { if apierrors.IsNotFound(err) { @@ -147,6 +147,8 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl return reconcile.Result{}, fmt.Errorf("failed to get HetznerCluster: %w", err) } + log = log.WithValues("HetznerCluster", klog.KObj(hetznerCluster)) + // Fetch the Cluster. cluster, err := util.GetClusterFromMetadata(ctx, r.Client, hetznerCluster.ObjectMeta) if err != nil { @@ -239,6 +241,7 @@ func (r *HetznerBareMetalHostReconciler) reconcileSelectedStates(ctx context.Con return reconcile.Result{}, fmt.Errorf("Update() failed after setting ProvisioningState: %w", err) } } + return ctrl.Result{RequeueAfter: 10 * time.Second}, nil // Handle StateDeleting From b8ba0d35828b4908dd38ba2cab444452b739042a Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Tue, 13 May 2025 11:38:26 +0200 Subject: [PATCH 15/19] fixed typo. --- pkg/services/baremetal/host/host.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/services/baremetal/host/host.go b/pkg/services/baremetal/host/host.go index 01ca5055e..424cb252e 100644 --- a/pkg/services/baremetal/host/host.go +++ b/pkg/services/baremetal/host/host.go @@ -1113,7 +1113,7 @@ func (s *Service) actionPreProvisioning(ctx context.Context) actionResult { out := sshClient.GetHostName() if out.Err != nil || out.StdErr != "" { - ctrl.LoggerFrom(ctx).Info("pre-provision: rescue sytem not reachable. Will try again", + ctrl.LoggerFrom(ctx).Info("pre-provision: rescue system not reachable. Will try again", "sshOutput", out.String()) return actionContinue{delay: 10 * time.Second} } @@ -1161,7 +1161,7 @@ func (s *Service) actionImageInstalling(ctx context.Context) actionResult { out := sshClient.GetHostName() if out.Err != nil || out.StdErr != "" { - ctrl.LoggerFrom(ctx).Info("image-installing: rescue sytem not reachable. Will try again", + ctrl.LoggerFrom(ctx).Info("image-installing: rescue system not reachable. Will try again", "sshOutput", out.String()) return actionContinue{delay: 10 * time.Second} } From 3e6fd3e452891b46835a77a356b8574ab6660f56 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Tue, 13 May 2025 14:23:45 +0200 Subject: [PATCH 16/19] inlined function which gets used only once. --- pkg/services/baremetal/host/host.go | 84 ++++++++++++++--------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/pkg/services/baremetal/host/host.go b/pkg/services/baremetal/host/host.go index 424cb252e..42d359b2d 100644 --- a/pkg/services/baremetal/host/host.go +++ b/pkg/services/baremetal/host/host.go @@ -595,10 +595,49 @@ func (s *Service) actionRegistering(ctx context.Context) actionResult { } sshClient := s.scope.SSHClientFactory.NewClient(in) - ok, res := s.validateRescueSystemIsActive(ctx, sshClient) - if !ok { - return res + // Check hostname with sshClient + out := sshClient.GetHostName() + hostName := trimLineBreak(out.StdOut) + + if hostName != rescue { + // give the reboot some time until it takes effect + if s.hasJustRebooted() { + return actionContinue{delay: 2 * time.Second} + } + + isSSHTimeoutError, isSSHConnectionRefusedError, err := s.analyzeSSHOutputRegistering(out) + if err != nil { + // This can happen if the bare-metal server was taken by another mgt-cluster. + // Check in https://robot.hetzner.com/server for the "History" of the server. + return actionError{err: fmt.Errorf("failed to handle incomplete boot - registering: %w", err)} + } + + failed, err := s.handleIncompleteBoot(ctx, true, isSSHTimeoutError, isSSHConnectionRefusedError) + if failed { + return s.recordActionFailure(infrav1.PermanentError, err.Error()) + } + if err != nil { + return actionError{err: fmt.Errorf(errMsgFailedHandlingIncompleteBoot, err)} + } + + if !isSSHTimeoutError && !isSSHConnectionRefusedError { + msg := fmt.Sprintf("expected rescue system, but found different hostname %q", hostName) + record.Warn(s.scope.HetznerBareMetalHost, "RegisteringFailed", msg) + ctrl.LoggerFrom(ctx).Error(errors.New("RegisteringFailed"), msg) + s.scope.HetznerBareMetalHost.SetError(infrav1.PermanentError, msg) + return actionStop{} + } + + timeSinceReboot := "unknown" + if s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated != nil { + timeSinceReboot = time.Since(s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated.Time).String() + } + + s.scope.Logger.Info("Could not reach rescue system. Will retry some seconds later.", "out", out.String(), "hostName", hostName, + "isSSHTimeoutError", isSSHTimeoutError, "isSSHConnectionRefusedError", isSSHConnectionRefusedError, "timeSinceReboot", timeSinceReboot) + return actionContinue{delay: 10 * time.Second} } + output := sshClient.GetHardwareDetailsDebug() if output.Err != nil { return actionError{err: fmt.Errorf("failed to obtain hardware for debugging: %w", output.Err)} @@ -686,45 +725,6 @@ func (s *Service) actionRegistering(ctx context.Context) actionResult { return actionComplete{} } -func (s *Service) validateRescueSystemIsActive(ctx context.Context, sshClient sshclient.Client) (ok bool, ar actionResult) { - // Check hostname with sshClient - out := sshClient.GetHostName() - hostName := trimLineBreak(out.StdOut) - if hostName == rescue { - return true, actionContinue{} - } - // give the reboot some time until it takes effect - if s.hasJustRebooted() { - return false, actionContinue{delay: 2 * time.Second} - } - - isSSHTimeoutError, isSSHConnectionRefusedError, err := s.analyzeSSHOutputRegistering(out) - if err != nil { - // This can happen if the bare-metal server was taken by another mgt-cluster. - // Check in https://robot.hetzner.com/server for the "History" of the server. - return false, actionError{err: fmt.Errorf("failed to handle incomplete boot - registering: %w", err)} - } - - failed, err := s.handleIncompleteBoot(ctx, true, isSSHTimeoutError, isSSHConnectionRefusedError) - if failed { - return false, s.recordActionFailure(infrav1.PermanentError, err.Error()) - } - if err != nil { - return false, actionError{err: fmt.Errorf(errMsgFailedHandlingIncompleteBoot, err)} - } - timeSinceReboot := "unknown" - if s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated != nil { - timeSinceReboot = time.Since(s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated.Time).String() - } - if !isSSHTimeoutError && !isSSHConnectionRefusedError { - s.scope.Logger.Info("Expected the rescue system, but got different hostname", "hostName", hostName, "timeSinceReboot", timeSinceReboot) - return false, actionContinue{delay: 10 * time.Second} - } - s.scope.Logger.Info("Could not reach rescue system. Will retry some seconds later.", "out", out.String(), "hostName", hostName, - "isSSHTimeoutError", isSSHTimeoutError, "isSSHConnectionRefusedError", isSSHConnectionRefusedError, "timeSinceReboot", timeSinceReboot) - return false, actionContinue{delay: 10 * time.Second} -} - func validateRootDeviceWwnsAreSubsetOfExistingWwns(rootDeviceHints *infrav1.RootDeviceHints, storageDevices []infrav1.Storage) error { knownWWNs := make([]string, 0, len(storageDevices)) for _, sd := range storageDevices { From 951c1444aac9a5dba2fda3dd37c6ebbdec366e78 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Tue, 13 May 2025 14:26:53 +0200 Subject: [PATCH 17/19] add log of HetznerBareMetalMachine again. --- controllers/hetznerbaremetalhost_controller.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/controllers/hetznerbaremetalhost_controller.go b/controllers/hetznerbaremetalhost_controller.go index b50fb445d..a3348522f 100644 --- a/controllers/hetznerbaremetalhost_controller.go +++ b/controllers/hetznerbaremetalhost_controller.go @@ -168,6 +168,8 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl } } + log = log.WithValues("HetznerBareMetalMachine", klog.KObj(hetznerBareMetalMachine)) + // Get Hetzner robot api credentials secretManager := secretutil.NewSecretManager(log, r.Client, r.APIReader) robotCreds, err := getAndValidateRobotCredentials(ctx, req.Namespace, hetznerCluster, secretManager) From cb977e6f0506890e6e14516f181095005542141e Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Wed, 14 May 2025 09:34:39 +0200 Subject: [PATCH 18/19] git restore -s main controllers/hetznerbaremetalhost_controller.go --- .../hetznerbaremetalhost_controller.go | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/controllers/hetznerbaremetalhost_controller.go b/controllers/hetznerbaremetalhost_controller.go index a3348522f..13ccfd8f4 100644 --- a/controllers/hetznerbaremetalhost_controller.go +++ b/controllers/hetznerbaremetalhost_controller.go @@ -69,6 +69,13 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl log := ctrl.LoggerFrom(ctx) start := time.Now() + defer func() { + // check duration of reconcile. Warn if it took too long. + duration := time.Since(start) + if duration > 15*time.Second { + log.Info("Reconcile took too long", "duration", duration, "res", res, "reterr", reterr) + } + }() // Fetch the Hetzner bare metal host instance. bmHost := &infrav1.HetznerBareMetalHost{} @@ -80,18 +87,10 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl return reconcile.Result{}, err } - log = log.WithValues("state", bmHost.Spec.Status.ProvisioningState) - ctx = ctrl.LoggerInto(ctx, log) - initialProvisioningState := bmHost.Spec.Status.ProvisioningState defer func() { - // check duration of reconcile. Warn if it took too long. - duration := time.Since(start) - if duration > 10*time.Second { - log.Info("Reconcile took too long", "duration", duration, "res", res, "reterr", reterr) - } if initialProvisioningState != bmHost.Spec.Status.ProvisioningState { - log.Info("Provisioning state changed", "newState", bmHost.Spec.Status.ProvisioningState, "duration", duration) + log.Info("Provisioning state changed", "from", initialProvisioningState, "to", bmHost.Spec.Status.ProvisioningState) } }() @@ -141,7 +140,6 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl } if err := r.Client.Get(ctx, hetznerClusterName, hetznerCluster); err != nil { if apierrors.IsNotFound(err) { - log.Info("HetznerCluster not found. Requeueing") return reconcile.Result{RequeueAfter: 10 * time.Second}, nil } return reconcile.Result{}, fmt.Errorf("failed to get HetznerCluster: %w", err) @@ -155,6 +153,8 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl return reconcile.Result{}, fmt.Errorf("failed to get Cluster: %w", err) } + log = log.WithValues("Cluster", klog.KObj(cluster)) + hetznerBareMetalMachine := &infrav1.HetznerBareMetalMachine{} if bmHost.Spec.ConsumerRef != nil { @@ -170,6 +170,8 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl log = log.WithValues("HetznerBareMetalMachine", klog.KObj(hetznerBareMetalMachine)) + ctx = ctrl.LoggerInto(ctx, log) + // Get Hetzner robot api credentials secretManager := secretutil.NewSecretManager(log, r.Client, r.APIReader) robotCreds, err := getAndValidateRobotCredentials(ctx, req.Namespace, hetznerCluster, secretManager) From c74dd1f315426858e9de545f228e5f47db3b7099 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Wed, 14 May 2025 09:57:59 +0200 Subject: [PATCH 19/19] remove setting permanent error. --- pkg/services/baremetal/host/host.go | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pkg/services/baremetal/host/host.go b/pkg/services/baremetal/host/host.go index 42d359b2d..1af890af3 100644 --- a/pkg/services/baremetal/host/host.go +++ b/pkg/services/baremetal/host/host.go @@ -620,14 +620,6 @@ func (s *Service) actionRegistering(ctx context.Context) actionResult { return actionError{err: fmt.Errorf(errMsgFailedHandlingIncompleteBoot, err)} } - if !isSSHTimeoutError && !isSSHConnectionRefusedError { - msg := fmt.Sprintf("expected rescue system, but found different hostname %q", hostName) - record.Warn(s.scope.HetznerBareMetalHost, "RegisteringFailed", msg) - ctrl.LoggerFrom(ctx).Error(errors.New("RegisteringFailed"), msg) - s.scope.HetznerBareMetalHost.SetError(infrav1.PermanentError, msg) - return actionStop{} - } - timeSinceReboot := "unknown" if s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated != nil { timeSinceReboot = time.Since(s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated.Time).String()