@@ -592,46 +592,18 @@ func (s *Service) actionRegistering(_ context.Context) actionResult {
592
592
}
593
593
sshClient := s .scope .SSHClientFactory .NewClient (in )
594
594
595
- // Check hostname with sshClient
596
- out := sshClient .GetHostName ()
597
- hostName := trimLineBreak (out .StdOut )
598
- if hostName != rescue {
599
- // give the reboot some time until it takes effect
600
- if s .hasJustRebooted () {
601
- return actionContinue {delay : 2 * time .Second }
602
- }
603
-
604
- isSSHTimeoutError , isSSHConnectionRefusedError , err := s .analyzeSSHOutputRegistering (out )
605
- if err != nil {
606
- // This can happen if the bare-metal server was taken by another mgt-cluster.
607
- // Check in https://robot.hetzner.com/server for the "History" of the server.
608
- return actionError {err : fmt .Errorf ("failed to handle incomplete boot - registering: %w" , err )}
609
- }
610
-
611
- failed , err := s .handleIncompleteBoot (true , isSSHTimeoutError , isSSHConnectionRefusedError )
612
- if failed {
613
- return s .recordActionFailure (infrav1 .PermanentError , err .Error ())
614
- }
615
- if err != nil {
616
- return actionError {err : fmt .Errorf (errMsgFailedHandlingIncompleteBoot , err )}
617
- }
618
- timeSinceReboot := "unknown"
619
- if s .scope .HetznerBareMetalHost .Spec .Status .LastUpdated != nil {
620
- timeSinceReboot = time .Since (s .scope .HetznerBareMetalHost .Spec .Status .LastUpdated .Time ).String ()
621
- }
622
-
623
- s .scope .Logger .Info ("Could not reach rescue system. Will retry some seconds later." , "out" , out .String (), "hostName" , hostName ,
624
- "isSSHTimeoutError" , isSSHTimeoutError , "isSSHConnectionRefusedError" , isSSHConnectionRefusedError , "timeSinceReboot" , timeSinceReboot )
625
- return actionContinue {delay : 10 * time .Second }
595
+ ok , res := s .validateRescueSystemIsActive (sshClient )
596
+ if ! ok {
597
+ return res
626
598
}
627
-
628
599
output := sshClient .GetHardwareDetailsDebug ()
629
600
if output .Err != nil {
630
601
return actionError {err : fmt .Errorf ("failed to obtain hardware for debugging: %w" , output .Err )}
631
602
}
603
+
632
604
msg := fmt .Sprintf ("%s\n \n " , output .StdOut )
633
- if out .StdErr != "" {
634
- msg += fmt .Sprintf ("stderr:\n %s\n \n " , out .StdErr )
605
+ if output .StdErr != "" {
606
+ msg += fmt .Sprintf ("stderr:\n %s\n \n " , output .StdErr )
635
607
}
636
608
record .Eventf (s .scope .HetznerBareMetalHost , "GetHardwareDetails" , msg )
637
609
@@ -711,6 +683,42 @@ func (s *Service) actionRegistering(_ context.Context) actionResult {
711
683
return actionComplete {}
712
684
}
713
685
686
+ func (s * Service ) validateRescueSystemIsActive (sshClient sshclient.Client ) (ok bool , ar actionResult ) {
687
+ // Check hostname with sshClient
688
+ out := sshClient .GetHostName ()
689
+ hostName := trimLineBreak (out .StdOut )
690
+ if hostName == rescue {
691
+ return true , actionContinue {}
692
+ }
693
+ // give the reboot some time until it takes effect
694
+ if s .hasJustRebooted () {
695
+ return false , actionContinue {delay : 2 * time .Second }
696
+ }
697
+
698
+ isSSHTimeoutError , isSSHConnectionRefusedError , err := s .analyzeSSHOutputRegistering (out )
699
+ if err != nil {
700
+ // This can happen if the bare-metal server was taken by another mgt-cluster.
701
+ // Check in https://robot.hetzner.com/server for the "History" of the server.
702
+ return false , actionError {err : fmt .Errorf ("failed to handle incomplete boot - registering: %w" , err )}
703
+ }
704
+
705
+ failed , err := s .handleIncompleteBoot (true , isSSHTimeoutError , isSSHConnectionRefusedError )
706
+ if failed {
707
+ return false , s .recordActionFailure (infrav1 .PermanentError , err .Error ())
708
+ }
709
+ if err != nil {
710
+ return false , actionError {err : fmt .Errorf (errMsgFailedHandlingIncompleteBoot , err )}
711
+ }
712
+ timeSinceReboot := "unknown"
713
+ if s .scope .HetznerBareMetalHost .Spec .Status .LastUpdated != nil {
714
+ timeSinceReboot = time .Since (s .scope .HetznerBareMetalHost .Spec .Status .LastUpdated .Time ).String ()
715
+ }
716
+
717
+ s .scope .Logger .Info ("Could not reach rescue system. Will retry some seconds later." , "out" , out .String (), "hostName" , hostName ,
718
+ "isSSHTimeoutError" , isSSHTimeoutError , "isSSHConnectionRefusedError" , isSSHConnectionRefusedError , "timeSinceReboot" , timeSinceReboot )
719
+ return false , actionContinue {delay : 10 * time .Second }
720
+ }
721
+
714
722
func validateRootDeviceWwnsAreSubsetOfExistingWwns (rootDeviceHints * infrav1.RootDeviceHints , storageDevices []infrav1.Storage ) error {
715
723
knownWWNs := make ([]string , 0 , len (storageDevices ))
716
724
for _ , sd := range storageDevices {
@@ -1097,6 +1105,11 @@ func (s *Service) actionPreProvisioning(ctx context.Context) actionResult {
1097
1105
}
1098
1106
sshClient := s .scope .SSHClientFactory .NewClient (in )
1099
1107
1108
+ ok , res := s .validateRescueSystemIsActive (sshClient )
1109
+ if ! ok {
1110
+ return res
1111
+ }
1112
+
1100
1113
exitStatus , output , err := sshClient .ExecutePreProvisionCommand (ctx , s .scope .PreProvisionCommand )
1101
1114
if err != nil {
1102
1115
return actionError {err : fmt .Errorf ("failed to execute pre-provision command: %w" , err )}
@@ -1128,6 +1141,11 @@ func (s *Service) actionImageInstalling(ctx context.Context) actionResult {
1128
1141
}
1129
1142
sshClient := s .scope .SSHClientFactory .NewClient (in )
1130
1143
1144
+ ok , res := s .validateRescueSystemIsActive (sshClient )
1145
+ if ! ok {
1146
+ return res
1147
+ }
1148
+
1131
1149
state , err := sshClient .GetInstallImageState ()
1132
1150
if err != nil {
1133
1151
return actionError {err : fmt .Errorf ("failed to get state of installimage processes: %w" , err )}
@@ -1450,46 +1468,6 @@ func getDeviceNames(wwn []string, storageDevices []infrav1.Storage) []string {
1450
1468
return deviceNames
1451
1469
}
1452
1470
1453
- func analyzeSSHOutputInstallImage (out sshclient.Output , sshClient sshclient.Client , port int ) (isTimeout , isConnectionRefused bool , reterr error ) {
1454
- // check err
1455
- if out .Err != nil {
1456
- switch {
1457
- case os .IsTimeout (out .Err ) || sshclient .IsTimeoutError (out .Err ):
1458
- isTimeout = true
1459
- return isTimeout , false , nil
1460
- case sshclient .IsAuthenticationFailedError (out .Err ):
1461
- if err := handleAuthenticationFailed (sshClient , port ); err != nil {
1462
- return false , false , fmt .Errorf ("original ssh error: %w. err: %w" , out .Err , err )
1463
- }
1464
- return false , false , handleAuthenticationFailed (sshClient , port )
1465
- case sshclient .IsConnectionRefusedError (out .Err ):
1466
- return false , verifyConnectionRefused (sshClient , port ), nil
1467
- }
1468
-
1469
- return false , false , fmt .Errorf ("unhandled ssh error while getting hostname: %w" , out .Err )
1470
- }
1471
-
1472
- // check stderr
1473
- if out .StdErr != "" {
1474
- // This is an unexpected error
1475
- return false , false , fmt .Errorf ("%w: StdErr: %s" , errSSHGetHostname , out .StdErr )
1476
- }
1477
-
1478
- // check stdout
1479
- hostname := trimLineBreak (out .StdOut )
1480
- switch hostname {
1481
- case "" :
1482
- // Hostname should not be empty. This is unexpected.
1483
- return false , false , errEmptyHostName
1484
- case rescue : // We are in wrong boot, nothing has to be done to trigger reboot
1485
- return false , false , nil
1486
- }
1487
-
1488
- // We are in the case that hostName != rescue && StdOut != hostName
1489
- // This is unexpected
1490
- return false , false , fmt .Errorf ("%w: %s" , errUnexpectedHostName , hostname )
1491
- }
1492
-
1493
1471
func handleAuthenticationFailed (sshClient sshclient.Client , port int ) error {
1494
1472
// Check whether we are in the wrong system in the case that rescue and os system might be running on the same port.
1495
1473
if port == rescuePort {
0 commit comments