From 3d01d2dbbf07e942d8d611343b3dd3be8bf71bb4 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Mon, 17 Nov 2025 12:25:34 +0100 Subject: [PATCH 01/22] :seedling: Remove usage of FailureReason and FailureMessage (baremetal) > Deprecated: This field is deprecated and is > going to be removed when support for v1beta1 > will be dropped. Please see > https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20240916-improve-status-in-CAPI-resources.md > for more details. --- api/v1beta1/hetznerbaremetalhost_types.go | 5 ++- api/v1beta1/hetznerbaremetalmachine_types.go | 13 +++--- .../hetznerbaremetalmachine_types_test.go | 31 +------------- api/v1beta1/zz_generated.deepcopy.go | 4 +- ...luster.x-k8s.io_hetznerbaremetalhosts.yaml | 5 ++- ...ter.x-k8s.io_hetznerbaremetalmachines.yaml | 6 ++- .../hetznerbaremetalhost_controller.go | 11 +++++ .../hetznerbaremetalmachine_controller.go | 7 ++++ ...hetznerbaremetalmachine_controller_test.go | 19 ++++++--- pkg/scope/baremetalmachine.go | 26 ++++++++++++ pkg/services/baremetal/baremetal/baremetal.go | 42 +++++++++---------- 11 files changed, 96 insertions(+), 73 deletions(-) diff --git a/api/v1beta1/hetznerbaremetalhost_types.go b/api/v1beta1/hetznerbaremetalhost_types.go index 95c40ab17..b8b5c2349 100644 --- a/api/v1beta1/hetznerbaremetalhost_types.go +++ b/api/v1beta1/hetznerbaremetalhost_types.go @@ -230,8 +230,9 @@ type HetznerBareMetalHostSpec struct { // +optional ConsumerRef *corev1.ObjectReference `json:"consumerRef,omitempty"` - // MaintenanceMode indicates that a machine is supposed to be deprovisioned - // and won't be selected by any Hetzner bare metal machine. + // MaintenanceMode indicates that a machine is supposed to be deprovisioned. The CAPI Machine + // will get the cluster.x-k8s.io/remediate-machine annotation, and CAPI will deprovision the + // machine. Accordingly new the host won't be selected by any Hetzner bare metal machine. MaintenanceMode *bool `json:"maintenanceMode,omitempty"` // Description is a human-entered text used to help identify the host. diff --git a/api/v1beta1/hetznerbaremetalmachine_types.go b/api/v1beta1/hetznerbaremetalmachine_types.go index a671c9806..f8d6bb81a 100644 --- a/api/v1beta1/hetznerbaremetalmachine_types.go +++ b/api/v1beta1/hetznerbaremetalmachine_types.go @@ -25,7 +25,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/selection" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" - capierrors "sigs.k8s.io/cluster-api/errors" //nolint:staticcheck // we will handle that, when we update to capi v1.11 + //nolint:staticcheck // we will handle that, when we update to capi v1.11 ) const ( @@ -298,8 +298,11 @@ type HetznerBareMetalMachineStatus struct { LastUpdated *metav1.Time `json:"lastUpdated,omitempty"` // FailureReason will be set in the event that there is a terminal problem. + // + // Deprecated: This field is deprecated and is going to be removed when support for v1beta1 will be dropped. Please see https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20240916-improve-status-in-CAPI-resources.md for more details. + // // +optional - FailureReason *capierrors.MachineStatusError `json:"failureReason,omitempty"` + FailureReason *string `json:"failureReason,omitempty"` // FailureMessage will be set in the event that there is a terminal problem. // +optional @@ -357,12 +360,6 @@ func (hbmm *HetznerBareMetalMachine) SetConditions(conditions clusterv1.Conditio hbmm.Status.Conditions = conditions } -// SetFailure sets a failure reason and message. -func (hbmm *HetznerBareMetalMachine) SetFailure(reason capierrors.MachineStatusError, message string) { - hbmm.Status.FailureReason = &reason - hbmm.Status.FailureMessage = &message -} - // GetImageSuffix tests whether the suffix is known and outputs it if yes. Otherwise it returns an error. func GetImageSuffix(url string) (string, error) { if strings.HasPrefix(url, "oci://") { diff --git a/api/v1beta1/hetznerbaremetalmachine_types_test.go b/api/v1beta1/hetznerbaremetalmachine_types_test.go index b00934ea2..0beec94db 100644 --- a/api/v1beta1/hetznerbaremetalmachine_types_test.go +++ b/api/v1beta1/hetznerbaremetalmachine_types_test.go @@ -23,7 +23,7 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" "github.com/stretchr/testify/require" - capierrors "sigs.k8s.io/cluster-api/errors" //nolint:staticcheck // we will handle that, when we update to capi v1.11 + //nolint:staticcheck // we will handle that, when we update to capi v1.11 ) var _ = Describe("Test Image.GetDetails", func() { @@ -177,35 +177,6 @@ var _ = Describe("Test GetImageSuffix", func() { ) }) -var _ = Describe("Test SetFailure", func() { - bmMachine := HetznerBareMetalMachine{} - newFailureMessage := "bad error" - newFailureReason := capierrors.CreateMachineError - - It("sets new failure on the machine with existing failure", func() { - failureMessage := "first message" - failureReason := capierrors.MachineStatusError("first error") - bmMachine.Status.FailureMessage = &failureMessage - bmMachine.Status.FailureReason = &failureReason - - bmMachine.SetFailure(newFailureReason, newFailureMessage) - - Expect(bmMachine.Status.FailureMessage).ToNot(BeNil()) - Expect(bmMachine.Status.FailureReason).ToNot(BeNil()) - Expect(*bmMachine.Status.FailureMessage).To(Equal(newFailureMessage)) - Expect(*bmMachine.Status.FailureReason).To(Equal(newFailureReason)) - }) - - It("sets new failure on the machine without existing failure", func() { - bmMachine.SetFailure(newFailureReason, newFailureMessage) - - Expect(bmMachine.Status.FailureMessage).ToNot(BeNil()) - Expect(bmMachine.Status.FailureReason).ToNot(BeNil()) - Expect(*bmMachine.Status.FailureMessage).To(Equal(newFailureMessage)) - Expect(*bmMachine.Status.FailureReason).To(Equal(newFailureReason)) - }) -}) - var _ = Describe("Test HasHostAnnotation", func() { type testCaseHasHostAnnotation struct { annotations map[string]string diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go index ab12dee41..55127476c 100644 --- a/api/v1beta1/zz_generated.deepcopy.go +++ b/api/v1beta1/zz_generated.deepcopy.go @@ -22,7 +22,7 @@ package v1beta1 import ( "github.com/hetznercloud/hcloud-go/v2/hcloud" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" apiv1beta1 "sigs.k8s.io/cluster-api/api/v1beta1" @@ -896,7 +896,7 @@ func (in *HetznerBareMetalMachineStatus) DeepCopyInto(out *HetznerBareMetalMachi } if in.FailureReason != nil { in, out := &in.FailureReason, &out.FailureReason - *out = new(errors.MachineStatusError) + *out = new(string) **out = **in } if in.FailureMessage != nil { diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_hetznerbaremetalhosts.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_hetznerbaremetalhosts.yaml index e347061c3..2d3500414 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_hetznerbaremetalhosts.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_hetznerbaremetalhosts.yaml @@ -133,8 +133,9 @@ spec: type: string maintenanceMode: description: |- - MaintenanceMode indicates that a machine is supposed to be deprovisioned - and won't be selected by any Hetzner bare metal machine. + MaintenanceMode indicates that a machine is supposed to be deprovisioned. The CAPI Machine + will get the cluster.x-k8s.io/remediate-machine annotation, and CAPI will deprovision the + machine. Accordingly new the host won't be selected by any Hetzner bare metal machine. type: boolean rootDeviceHints: description: |- diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_hetznerbaremetalmachines.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_hetznerbaremetalmachines.yaml index 9d7cfe261..9ecd6cfe5 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_hetznerbaremetalmachines.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_hetznerbaremetalmachines.yaml @@ -399,8 +399,10 @@ spec: a terminal problem. type: string failureReason: - description: FailureReason will be set in the event that there is - a terminal problem. + description: |- + FailureReason will be set in the event that there is a terminal problem. + + Deprecated: This field is deprecated and is going to be removed when support for v1beta1 will be dropped. Please see https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20240916-improve-status-in-CAPI-resources.md for more details. type: string lastUpdated: description: LastUpdated identifies when this status was last observed. diff --git a/controllers/hetznerbaremetalhost_controller.go b/controllers/hetznerbaremetalhost_controller.go index 74d2ca0d3..e47fffba6 100644 --- a/controllers/hetznerbaremetalhost_controller.go +++ b/controllers/hetznerbaremetalhost_controller.go @@ -227,7 +227,18 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl log = log.WithValues("HetznerBareMetalMachine", klog.KObj(hetznerBareMetalMachine)) + machine, err := util.GetOwnerMachine(ctx, r.Client, hetznerBareMetalMachine.ObjectMeta) ctx = ctrl.LoggerInto(ctx, log) + if err != nil { + return reconcile.Result{}, fmt.Errorf("failed to get owner machine: %w", err) + } + + _, exits := machine.Annotations[clusterv1.RemediateMachineAnnotation] + if exits { + // The hbmm of this hsot will be deleted soon. Do no reconcile it. + log.Info("CAPI Machine has RemediateMachineAnnotation. Not reconciling this machine.") + return reconcile.Result{}, nil + } // Get Hetzner robot api credentials secretManager := secretutil.NewSecretManager(log, r.Client, r.APIReader) diff --git a/controllers/hetznerbaremetalmachine_controller.go b/controllers/hetznerbaremetalmachine_controller.go index 084ad06fa..3dfafa09a 100644 --- a/controllers/hetznerbaremetalmachine_controller.go +++ b/controllers/hetznerbaremetalmachine_controller.go @@ -159,6 +159,13 @@ func (r *HetznerBareMetalMachineReconciler) Reconcile(ctx context.Context, req r return r.reconcileDelete(ctx, machineScope) } + _, exist := machine.Annotations[clusterv1.RemediateMachineAnnotation] + if exist { + // This hbmm will be deleted soon. Do no reconcile it. + log.Info("CAPI Machine has RemediateMachineAnnotation. Not reconciling this machine.") + return reconcile.Result{}, nil + } + return r.reconcileNormal(ctx, machineScope) } diff --git a/controllers/hetznerbaremetalmachine_controller_test.go b/controllers/hetznerbaremetalmachine_controller_test.go index 8dfc1afa9..0421b96c9 100644 --- a/controllers/hetznerbaremetalmachine_controller_test.go +++ b/controllers/hetznerbaremetalmachine_controller_test.go @@ -18,6 +18,7 @@ package controllers import ( "context" + "fmt" "testing" "time" @@ -36,13 +37,13 @@ import ( utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/utils/ptr" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/patch" "sigs.k8s.io/controller-runtime/pkg/client" fakeclient "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/controller-runtime/pkg/reconcile" infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1" - "github.com/syself/cluster-api-provider-hetzner/pkg/services/baremetal/baremetal" robotmock "github.com/syself/cluster-api-provider-hetzner/pkg/services/baremetal/client/mocks/robot" sshmock "github.com/syself/cluster-api-provider-hetzner/pkg/services/baremetal/client/mocks/ssh" sshclient "github.com/syself/cluster-api-provider-hetzner/pkg/services/baremetal/client/ssh" @@ -433,12 +434,20 @@ var _ = Describe("HetznerBareMetalMachineReconciler", func() { By("checking that failure message is set on machine") - Eventually(func() bool { + Eventually(func() error { if err := testEnv.Get(ctx, key, bmMachine); err != nil { - return false + return err } - return bmMachine.Status.FailureMessage != nil && *bmMachine.Status.FailureMessage == baremetal.FailureMessageMaintenanceMode - }, timeout).Should(BeTrue()) + capiMachine, err := util.GetOwnerMachine(ctx, testEnv, bmMachine.ObjectMeta) + if err != nil { + return err + } + _, exists := capiMachine.Annotations[clusterv1.RemediateMachineAnnotation] + if !exists { + return fmt.Errorf("RemediateMachineAnnotation not set on capi machine") + } + return nil + }, timeout).Should(Succeed()) }) It("checks the hetznerBareMetalMachine status running phase", func() { diff --git a/pkg/scope/baremetalmachine.go b/pkg/scope/baremetalmachine.go index 8b8ebc354..2bd8a781f 100644 --- a/pkg/scope/baremetalmachine.go +++ b/pkg/scope/baremetalmachine.go @@ -25,6 +25,7 @@ import ( "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/conditions" "sigs.k8s.io/cluster-api/util/patch" + "sigs.k8s.io/cluster-api/util/record" "sigs.k8s.io/controller-runtime/pkg/client" infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1" @@ -123,3 +124,28 @@ func (m *BareMetalMachineScope) IsControlPlane() bool { func (m *BareMetalMachineScope) IsBootstrapReady() bool { return m.Machine.Spec.Bootstrap.DataSecretName != nil } + +// SetErrorAndRemediate sets "cluster.x-k8s.io/remediate-machine" annotation on the corresponding +// CAPI machine. CAPI will remediate that machine. Additionally, an event of type Warning will be +// created. +func (m *BareMetalMachineScope) SetErrorAndRemediate(ctx context.Context, message string) error { + obj := m.Machine + + // Create a patch base + patch := client.MergeFrom(obj.DeepCopy()) + + // Modify only annotations on the in-memory copy + if obj.Annotations == nil { + obj.Annotations = map[string]string{} + } + obj.Annotations[clusterv1.RemediateMachineAnnotation] = "" + + // Apply patch – only the diff (annotations) is sent to the API server + if err := m.Client.Patch(ctx, obj, patch); err != nil { + return err + } + + record.Warnf(m.BareMetalMachine, "HetznerBareMetalMachine be remediated: %s", message) + + return nil +} diff --git a/pkg/services/baremetal/baremetal/baremetal.go b/pkg/services/baremetal/baremetal/baremetal.go index 50a8f9275..bf27bd9b1 100644 --- a/pkg/services/baremetal/baremetal/baremetal.go +++ b/pkg/services/baremetal/baremetal/baremetal.go @@ -38,8 +38,7 @@ import ( "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/selection" "k8s.io/utils/ptr" - clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" - capierrors "sigs.k8s.io/cluster-api/errors" //nolint:staticcheck // we will handle that, when we update to capi v1.11 + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" //nolint:staticcheck // we will handle that, when we update to capi v1.11 "sigs.k8s.io/cluster-api/util/conditions" "sigs.k8s.io/cluster-api/util/patch" "sigs.k8s.io/cluster-api/util/record" @@ -208,7 +207,10 @@ func (s *Service) update(ctx context.Context) error { return fmt.Errorf("failed to get host: %w", err) } if host == nil { - s.scope.BareMetalMachine.SetFailure(capierrors.UpdateMachineError, "host not found") + err := s.scope.SetErrorAndRemediate(ctx, "host not found") + if err != nil { + return err + } return fmt.Errorf("host not found for machine %s: %w", s.scope.Machine.Name, err) } @@ -229,30 +231,23 @@ func (s *Service) update(ctx context.Context) error { } // maintenance mode on the host is a fatal error for the machine object - if host.Spec.MaintenanceMode != nil && *host.Spec.MaintenanceMode && s.scope.BareMetalMachine.Status.FailureReason == nil { - s.scope.BareMetalMachine.SetFailure(capierrors.UpdateMachineError, FailureMessageMaintenanceMode) - record.Eventf( - s.scope.BareMetalMachine, - "BareMetalMachineSetFailure", - "set failure reason due to maintenance mode of underlying host", - ) + if host.Spec.MaintenanceMode != nil && *host.Spec.MaintenanceMode { + err := s.scope.SetErrorAndRemediate(ctx, FailureMessageMaintenanceMode) + if err != nil { + return err + } return nil } - // if host has a fatal error, then it should be set on the machine object as well - if (host.Spec.Status.ErrorType == infrav1.FatalError || host.Spec.Status.ErrorType == infrav1.PermanentError) && - s.scope.BareMetalMachine.Status.FailureReason == nil { - s.scope.BareMetalMachine.SetFailure(capierrors.UpdateMachineError, host.Spec.Status.ErrorMessage) - record.Eventf(s.scope.BareMetalMachine, "BareMetalMachineSetFailure", host.Spec.Status.ErrorMessage) + // if host has a fatal error, then it should be set on the hbmm object as well + if host.Spec.Status.ErrorType == infrav1.FatalError || host.Spec.Status.ErrorType == infrav1.PermanentError { + err := s.scope.SetErrorAndRemediate(ctx, host.Spec.Status.ErrorMessage) + if err != nil { + return err + } return nil } - // if host is healthy, the machine is healthy as well - if host.Spec.Status.ErrorType == infrav1.ErrorType("") { - s.scope.BareMetalMachine.Status.FailureMessage = nil - s.scope.BareMetalMachine.Status.FailureReason = nil - } - // ensure that the references are correctly set on host s.setReferencesOnHost(host) @@ -674,7 +669,10 @@ func (s *Service) setProviderID(ctx context.Context) error { } if host == nil { - s.scope.BareMetalMachine.SetFailure(capierrors.UpdateMachineError, "host not found") + err := s.scope.SetErrorAndRemediate(ctx, "host not found") + if err != nil { + return err + } return fmt.Errorf("host not found for machine %s: %w", s.scope.Machine.Name, err) } From 1fa140644cc926df253acdba9ad192b23939877b Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Mon, 17 Nov 2025 12:50:58 +0100 Subject: [PATCH 02/22] make linter happy. --- api/v1beta1/hetznerbaremetalhost_types.go | 2 +- api/v1beta1/hetznerbaremetalmachine_types.go | 1 - api/v1beta1/hetznerbaremetalmachine_types_test.go | 1 - pkg/services/baremetal/baremetal/baremetal.go | 2 +- 4 files changed, 2 insertions(+), 4 deletions(-) diff --git a/api/v1beta1/hetznerbaremetalhost_types.go b/api/v1beta1/hetznerbaremetalhost_types.go index b8b5c2349..d22d60d00 100644 --- a/api/v1beta1/hetznerbaremetalhost_types.go +++ b/api/v1beta1/hetznerbaremetalhost_types.go @@ -232,7 +232,7 @@ type HetznerBareMetalHostSpec struct { // MaintenanceMode indicates that a machine is supposed to be deprovisioned. The CAPI Machine // will get the cluster.x-k8s.io/remediate-machine annotation, and CAPI will deprovision the - // machine. Accordingly new the host won't be selected by any Hetzner bare metal machine. + // machine. Accordingly, the host won't be selected by any Hetzner bare metal machine. MaintenanceMode *bool `json:"maintenanceMode,omitempty"` // Description is a human-entered text used to help identify the host. diff --git a/api/v1beta1/hetznerbaremetalmachine_types.go b/api/v1beta1/hetznerbaremetalmachine_types.go index f8d6bb81a..ed46bc744 100644 --- a/api/v1beta1/hetznerbaremetalmachine_types.go +++ b/api/v1beta1/hetznerbaremetalmachine_types.go @@ -25,7 +25,6 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/selection" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" - //nolint:staticcheck // we will handle that, when we update to capi v1.11 ) const ( diff --git a/api/v1beta1/hetznerbaremetalmachine_types_test.go b/api/v1beta1/hetznerbaremetalmachine_types_test.go index 0beec94db..b8b815686 100644 --- a/api/v1beta1/hetznerbaremetalmachine_types_test.go +++ b/api/v1beta1/hetznerbaremetalmachine_types_test.go @@ -23,7 +23,6 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" "github.com/stretchr/testify/require" - //nolint:staticcheck // we will handle that, when we update to capi v1.11 ) var _ = Describe("Test Image.GetDetails", func() { diff --git a/pkg/services/baremetal/baremetal/baremetal.go b/pkg/services/baremetal/baremetal/baremetal.go index bf27bd9b1..d95f6c5ff 100644 --- a/pkg/services/baremetal/baremetal/baremetal.go +++ b/pkg/services/baremetal/baremetal/baremetal.go @@ -38,7 +38,7 @@ import ( "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/selection" "k8s.io/utils/ptr" - clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" //nolint:staticcheck // we will handle that, when we update to capi v1.11 + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/util/conditions" "sigs.k8s.io/cluster-api/util/patch" "sigs.k8s.io/cluster-api/util/record" From 8c7c7a98c50c8261d1c21272736d481559e0f480 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Mon, 17 Nov 2025 13:11:05 +0100 Subject: [PATCH 03/22] set condition. --- api/v1beta1/conditions_const.go | 12 ++++++++++++ controllers/hetznerbaremetalhost_controller.go | 7 ++++++- controllers/hetznerbaremetalmachine_controller.go | 7 ++++++- .../hetznerbaremetalmachine_controller_test.go | 14 ++++++++++++++ 4 files changed, 38 insertions(+), 2 deletions(-) diff --git a/api/v1beta1/conditions_const.go b/api/v1beta1/conditions_const.go index 9b5111199..ce9f12b71 100644 --- a/api/v1beta1/conditions_const.go +++ b/api/v1beta1/conditions_const.go @@ -238,3 +238,15 @@ const ( // RebootSucceededCondition indicates that the machine got rebooted successfully. RebootSucceededCondition clusterv1.ConditionType = "RebootSucceeded" ) + +const ( + // NoRemediateMachineAnnotationCondition indicates (if False), that the CAPI machine has the + // "cluster.x-k8s.io/remediate-machine" annotation set. The CAPI machine and the corresponding + // infra-machine will be deleted by CAPI soon. + NoRemediateMachineAnnotationCondition clusterv1.ConditionType = "NoRemediateMachineAnnotation" + + // RemediateMachineAnnotationIsSetReason indicates that the CAPI machine has the + // "cluster.x-k8s.io/remediate-machine" annotation set. The CAPI machine and the corresponding + // infra-machine will be deleted by CAPI soon. + RemediateMachineAnnotationIsSetReason = "RemediateMachineAnnotationIsSet" +) diff --git a/controllers/hetznerbaremetalhost_controller.go b/controllers/hetznerbaremetalhost_controller.go index e47fffba6..b0203b1c8 100644 --- a/controllers/hetznerbaremetalhost_controller.go +++ b/controllers/hetznerbaremetalhost_controller.go @@ -42,6 +42,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "github.com/syself/cluster-api-provider-hetzner/api/v1beta1" infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1" "github.com/syself/cluster-api-provider-hetzner/pkg/scope" secretutil "github.com/syself/cluster-api-provider-hetzner/pkg/secrets" @@ -236,9 +237,13 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl _, exits := machine.Annotations[clusterv1.RemediateMachineAnnotation] if exits { // The hbmm of this hsot will be deleted soon. Do no reconcile it. - log.Info("CAPI Machine has RemediateMachineAnnotation. Not reconciling this machine.") + msg := "CAPI Machine has RemediateMachineAnnotation. Not reconciling this machine." + log.Info(msg) + conditions.MarkFalse(bmHost, v1beta1.NoRemediateMachineAnnotationCondition, + v1beta1.RemediateMachineAnnotationIsSetReason, clusterv1.ConditionSeverityInfo, "%s", msg) return reconcile.Result{}, nil } + conditions.MarkTrue(bmHost, v1beta1.NoRemediateMachineAnnotationCondition) // Get Hetzner robot api credentials secretManager := secretutil.NewSecretManager(log, r.Client, r.APIReader) diff --git a/controllers/hetznerbaremetalmachine_controller.go b/controllers/hetznerbaremetalmachine_controller.go index 3dfafa09a..b273053ea 100644 --- a/controllers/hetznerbaremetalmachine_controller.go +++ b/controllers/hetznerbaremetalmachine_controller.go @@ -39,6 +39,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "github.com/syself/cluster-api-provider-hetzner/api/v1beta1" infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1" "github.com/syself/cluster-api-provider-hetzner/pkg/scope" secretutil "github.com/syself/cluster-api-provider-hetzner/pkg/secrets" @@ -162,9 +163,13 @@ func (r *HetznerBareMetalMachineReconciler) Reconcile(ctx context.Context, req r _, exist := machine.Annotations[clusterv1.RemediateMachineAnnotation] if exist { // This hbmm will be deleted soon. Do no reconcile it. - log.Info("CAPI Machine has RemediateMachineAnnotation. Not reconciling this machine.") + msg := "CAPI Machine has RemediateMachineAnnotation. Not reconciling this machine." + log.Info(msg) + conditions.MarkFalse(hbmMachine, v1beta1.NoRemediateMachineAnnotationCondition, + v1beta1.RemediateMachineAnnotationIsSetReason, clusterv1.ConditionSeverityInfo, "%s", msg) return reconcile.Result{}, nil } + conditions.MarkTrue(hbmMachine, v1beta1.NoRemediateMachineAnnotationCondition) return r.reconcileNormal(ctx, machineScope) } diff --git a/controllers/hetznerbaremetalmachine_controller_test.go b/controllers/hetznerbaremetalmachine_controller_test.go index 0421b96c9..4b18179d4 100644 --- a/controllers/hetznerbaremetalmachine_controller_test.go +++ b/controllers/hetznerbaremetalmachine_controller_test.go @@ -38,11 +38,13 @@ import ( "k8s.io/utils/ptr" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/util" + "sigs.k8s.io/cluster-api/util/conditions" "sigs.k8s.io/cluster-api/util/patch" "sigs.k8s.io/controller-runtime/pkg/client" fakeclient "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "github.com/syself/cluster-api-provider-hetzner/api/v1beta1" infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1" robotmock "github.com/syself/cluster-api-provider-hetzner/pkg/services/baremetal/client/mocks/robot" sshmock "github.com/syself/cluster-api-provider-hetzner/pkg/services/baremetal/client/mocks/ssh" @@ -438,14 +440,26 @@ var _ = Describe("HetznerBareMetalMachineReconciler", func() { if err := testEnv.Get(ctx, key, bmMachine); err != nil { return err } + capiMachine, err := util.GetOwnerMachine(ctx, testEnv, bmMachine.ObjectMeta) if err != nil { return err } + _, exists := capiMachine.Annotations[clusterv1.RemediateMachineAnnotation] if !exists { return fmt.Errorf("RemediateMachineAnnotation not set on capi machine") } + + c := conditions.Get(bmMachine, v1beta1.NoRemediateMachineAnnotationCondition) + if c == nil { + return fmt.Errorf("Condition NoRemediateMachineAnnotationCondition does not exist.") + } + + if c.Status != corev1.ConditionFalse { + return fmt.Errorf("Condition NoRemediateMachineAnnotationCondition should be False") + } + return nil }, timeout).Should(Succeed()) }) From 8c81c6c43ab7a8a80f86269b61fa213935819156 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Mon, 17 Nov 2025 14:07:38 +0100 Subject: [PATCH 04/22] feedback of AI. --- api/v1beta1/conditions_const.go | 6 +++--- api/v1beta1/zz_generated.deepcopy.go | 2 +- ...ture.cluster.x-k8s.io_hetznerbaremetalhosts.yaml | 2 +- controllers/hetznerbaremetalhost_controller.go | 13 ++++++------- controllers/hetznerbaremetalmachine_controller.go | 7 +++---- .../hetznerbaremetalmachine_controller_test.go | 3 +-- pkg/scope/baremetalmachine.go | 3 ++- 7 files changed, 17 insertions(+), 19 deletions(-) diff --git a/api/v1beta1/conditions_const.go b/api/v1beta1/conditions_const.go index ce9f12b71..8a1e8bbfb 100644 --- a/api/v1beta1/conditions_const.go +++ b/api/v1beta1/conditions_const.go @@ -240,9 +240,9 @@ const ( ) const ( - // NoRemediateMachineAnnotationCondition indicates (if False), that the CAPI machine has the - // "cluster.x-k8s.io/remediate-machine" annotation set. The CAPI machine and the corresponding - // infra-machine will be deleted by CAPI soon. + // NoRemediateMachineAnnotationCondition is: + // - False when the corresponding CAPI Machine has the "cluster.x-k8s.io/remediate-machine" annotation set and will be remediated by CAPI soon. + // - True otherwise. NoRemediateMachineAnnotationCondition clusterv1.ConditionType = "NoRemediateMachineAnnotation" // RemediateMachineAnnotationIsSetReason indicates that the CAPI machine has the diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go index 55127476c..7d1fa5b2d 100644 --- a/api/v1beta1/zz_generated.deepcopy.go +++ b/api/v1beta1/zz_generated.deepcopy.go @@ -22,7 +22,7 @@ package v1beta1 import ( "github.com/hetznercloud/hcloud-go/v2/hcloud" - v1 "k8s.io/api/core/v1" + "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" apiv1beta1 "sigs.k8s.io/cluster-api/api/v1beta1" diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_hetznerbaremetalhosts.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_hetznerbaremetalhosts.yaml index 2d3500414..8ad6d2bfa 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_hetznerbaremetalhosts.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_hetznerbaremetalhosts.yaml @@ -135,7 +135,7 @@ spec: description: |- MaintenanceMode indicates that a machine is supposed to be deprovisioned. The CAPI Machine will get the cluster.x-k8s.io/remediate-machine annotation, and CAPI will deprovision the - machine. Accordingly new the host won't be selected by any Hetzner bare metal machine. + machine. Accordingly, the host won't be selected by any Hetzner bare metal machine. type: boolean rootDeviceHints: description: |- diff --git a/controllers/hetznerbaremetalhost_controller.go b/controllers/hetznerbaremetalhost_controller.go index b0203b1c8..c6e6ffc7b 100644 --- a/controllers/hetznerbaremetalhost_controller.go +++ b/controllers/hetznerbaremetalhost_controller.go @@ -42,7 +42,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/reconcile" - "github.com/syself/cluster-api-provider-hetzner/api/v1beta1" infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1" "github.com/syself/cluster-api-provider-hetzner/pkg/scope" secretutil "github.com/syself/cluster-api-provider-hetzner/pkg/secrets" @@ -234,16 +233,16 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl return reconcile.Result{}, fmt.Errorf("failed to get owner machine: %w", err) } - _, exits := machine.Annotations[clusterv1.RemediateMachineAnnotation] - if exits { - // The hbmm of this hsot will be deleted soon. Do no reconcile it. + _, exists := machine.Annotations[clusterv1.RemediateMachineAnnotation] + if exists { + // The hbmm of this host will be deleted soon. Do no reconcile it. msg := "CAPI Machine has RemediateMachineAnnotation. Not reconciling this machine." log.Info(msg) - conditions.MarkFalse(bmHost, v1beta1.NoRemediateMachineAnnotationCondition, - v1beta1.RemediateMachineAnnotationIsSetReason, clusterv1.ConditionSeverityInfo, "%s", msg) + conditions.MarkFalse(bmHost, infrav1.NoRemediateMachineAnnotationCondition, + infrav1.RemediateMachineAnnotationIsSetReason, clusterv1.ConditionSeverityInfo, "%s", msg) return reconcile.Result{}, nil } - conditions.MarkTrue(bmHost, v1beta1.NoRemediateMachineAnnotationCondition) + conditions.MarkTrue(bmHost, infrav1.NoRemediateMachineAnnotationCondition) // Get Hetzner robot api credentials secretManager := secretutil.NewSecretManager(log, r.Client, r.APIReader) diff --git a/controllers/hetznerbaremetalmachine_controller.go b/controllers/hetznerbaremetalmachine_controller.go index b273053ea..12d9d5a7a 100644 --- a/controllers/hetznerbaremetalmachine_controller.go +++ b/controllers/hetznerbaremetalmachine_controller.go @@ -39,7 +39,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/reconcile" - "github.com/syself/cluster-api-provider-hetzner/api/v1beta1" infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1" "github.com/syself/cluster-api-provider-hetzner/pkg/scope" secretutil "github.com/syself/cluster-api-provider-hetzner/pkg/secrets" @@ -165,11 +164,11 @@ func (r *HetznerBareMetalMachineReconciler) Reconcile(ctx context.Context, req r // This hbmm will be deleted soon. Do no reconcile it. msg := "CAPI Machine has RemediateMachineAnnotation. Not reconciling this machine." log.Info(msg) - conditions.MarkFalse(hbmMachine, v1beta1.NoRemediateMachineAnnotationCondition, - v1beta1.RemediateMachineAnnotationIsSetReason, clusterv1.ConditionSeverityInfo, "%s", msg) + conditions.MarkFalse(hbmMachine, infrav1.NoRemediateMachineAnnotationCondition, + infrav1.RemediateMachineAnnotationIsSetReason, clusterv1.ConditionSeverityInfo, "%s", msg) return reconcile.Result{}, nil } - conditions.MarkTrue(hbmMachine, v1beta1.NoRemediateMachineAnnotationCondition) + conditions.MarkTrue(hbmMachine, infrav1.NoRemediateMachineAnnotationCondition) return r.reconcileNormal(ctx, machineScope) } diff --git a/controllers/hetznerbaremetalmachine_controller_test.go b/controllers/hetznerbaremetalmachine_controller_test.go index 4b18179d4..2f39afe23 100644 --- a/controllers/hetznerbaremetalmachine_controller_test.go +++ b/controllers/hetznerbaremetalmachine_controller_test.go @@ -44,7 +44,6 @@ import ( fakeclient "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/controller-runtime/pkg/reconcile" - "github.com/syself/cluster-api-provider-hetzner/api/v1beta1" infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1" robotmock "github.com/syself/cluster-api-provider-hetzner/pkg/services/baremetal/client/mocks/robot" sshmock "github.com/syself/cluster-api-provider-hetzner/pkg/services/baremetal/client/mocks/ssh" @@ -451,7 +450,7 @@ var _ = Describe("HetznerBareMetalMachineReconciler", func() { return fmt.Errorf("RemediateMachineAnnotation not set on capi machine") } - c := conditions.Get(bmMachine, v1beta1.NoRemediateMachineAnnotationCondition) + c := conditions.Get(bmMachine, infrav1.NoRemediateMachineAnnotationCondition) if c == nil { return fmt.Errorf("Condition NoRemediateMachineAnnotationCondition does not exist.") } diff --git a/pkg/scope/baremetalmachine.go b/pkg/scope/baremetalmachine.go index 2bd8a781f..27b2a15b6 100644 --- a/pkg/scope/baremetalmachine.go +++ b/pkg/scope/baremetalmachine.go @@ -145,7 +145,8 @@ func (m *BareMetalMachineScope) SetErrorAndRemediate(ctx context.Context, messag return err } - record.Warnf(m.BareMetalMachine, "HetznerBareMetalMachine be remediated: %s", message) + record.Warnf(m.BareMetalMachine, "HetznerBareMetalMachineWillBeRemediated", + "HetznerBareMetalMachine will be remediated: %s", message) return nil } From 5b3a35db50b3eadb2a2ca8949607a684a9278355 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Mon, 17 Nov 2025 14:38:44 +0100 Subject: [PATCH 05/22] set condition on host, too. Do not loose the message... --- .../hetznerbaremetalhost_controller.go | 8 +- .../hetznerbaremetalmachine_controller.go | 9 ++- pkg/scope/baremetalmachine.go | 73 ++++++++++++++++++- pkg/scope/baremetalmachine_test.go | 12 +++ pkg/services/baremetal/baremetal/baremetal.go | 53 +------------- .../baremetal/baremetal/baremetal_test.go | 12 --- 6 files changed, 100 insertions(+), 67 deletions(-) create mode 100644 pkg/scope/baremetalmachine_test.go diff --git a/controllers/hetznerbaremetalhost_controller.go b/controllers/hetznerbaremetalhost_controller.go index c6e6ffc7b..fc71d6909 100644 --- a/controllers/hetznerbaremetalhost_controller.go +++ b/controllers/hetznerbaremetalhost_controller.go @@ -238,8 +238,12 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl // The hbmm of this host will be deleted soon. Do no reconcile it. msg := "CAPI Machine has RemediateMachineAnnotation. Not reconciling this machine." log.Info(msg) - conditions.MarkFalse(bmHost, infrav1.NoRemediateMachineAnnotationCondition, - infrav1.RemediateMachineAnnotationIsSetReason, clusterv1.ConditionSeverityInfo, "%s", msg) + c := conditions.Get(bmHost, infrav1.NoRemediateMachineAnnotationCondition) + if c == nil || c.Status != corev1.ConditionFalse { + // Do not overwrite the message of the condition, if the condition already exists. + conditions.MarkFalse(bmHost, infrav1.NoRemediateMachineAnnotationCondition, + infrav1.RemediateMachineAnnotationIsSetReason, clusterv1.ConditionSeverityInfo, "%s", msg) + } return reconcile.Result{}, nil } conditions.MarkTrue(bmHost, infrav1.NoRemediateMachineAnnotationCondition) diff --git a/controllers/hetznerbaremetalmachine_controller.go b/controllers/hetznerbaremetalmachine_controller.go index 12d9d5a7a..5a25c6d3d 100644 --- a/controllers/hetznerbaremetalmachine_controller.go +++ b/controllers/hetznerbaremetalmachine_controller.go @@ -23,6 +23,7 @@ import ( "time" "github.com/go-logr/logr" + corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" "k8s.io/klog/v2" @@ -164,8 +165,12 @@ func (r *HetznerBareMetalMachineReconciler) Reconcile(ctx context.Context, req r // This hbmm will be deleted soon. Do no reconcile it. msg := "CAPI Machine has RemediateMachineAnnotation. Not reconciling this machine." log.Info(msg) - conditions.MarkFalse(hbmMachine, infrav1.NoRemediateMachineAnnotationCondition, - infrav1.RemediateMachineAnnotationIsSetReason, clusterv1.ConditionSeverityInfo, "%s", msg) + c := conditions.Get(hbmMachine, infrav1.NoRemediateMachineAnnotationCondition) + if c == nil || c.Status != corev1.ConditionFalse { + // Do not overwrite the message of the condition, if the condition already exists. + conditions.MarkFalse(hbmMachine, infrav1.NoRemediateMachineAnnotationCondition, + infrav1.RemediateMachineAnnotationIsSetReason, clusterv1.ConditionSeverityInfo, "%s", msg) + } return reconcile.Result{}, nil } conditions.MarkTrue(hbmMachine, infrav1.NoRemediateMachineAnnotationCondition) diff --git a/pkg/scope/baremetalmachine.go b/pkg/scope/baremetalmachine.go index 27b2a15b6..f1b7a8042 100644 --- a/pkg/scope/baremetalmachine.go +++ b/pkg/scope/baremetalmachine.go @@ -19,8 +19,10 @@ package scope import ( "context" "fmt" + "strings" "github.com/go-logr/logr" + apierrors "k8s.io/apimachinery/pkg/api/errors" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/conditions" @@ -46,7 +48,7 @@ type BareMetalMachineScopeParams struct { // This is meant to be called for each reconcile iteration. func NewBareMetalMachineScope(params BareMetalMachineScopeParams) (*BareMetalMachineScope, error) { if params.Client == nil { - return nil, fmt.Errorf("cannot create baremetal host scope without client") + return nil, fmt.Errorf("cannot create baremetal machine scope without client") } if params.Machine == nil { return nil, fmt.Errorf("failed to generate new scope from nil Machine") @@ -127,7 +129,8 @@ func (m *BareMetalMachineScope) IsBootstrapReady() bool { // SetErrorAndRemediate sets "cluster.x-k8s.io/remediate-machine" annotation on the corresponding // CAPI machine. CAPI will remediate that machine. Additionally, an event of type Warning will be -// created. +// created, and the condition will be set on both the BareMetalMachine and the corresponding +// HetznerBareMetalHost (if found). func (m *BareMetalMachineScope) SetErrorAndRemediate(ctx context.Context, message string) error { obj := m.Machine @@ -148,5 +151,71 @@ func (m *BareMetalMachineScope) SetErrorAndRemediate(ctx context.Context, messag record.Warnf(m.BareMetalMachine, "HetznerBareMetalMachineWillBeRemediated", "HetznerBareMetalMachine will be remediated: %s", message) + conditions.MarkFalse(m.BareMetalMachine, infrav1.NoRemediateMachineAnnotationCondition, + infrav1.RemediateMachineAnnotationIsSetReason, clusterv1.ConditionSeverityInfo, "%s", message) + + // Try to set the condition on the corresponding HetznerBareMetalHost as well + + // Get the host key from the BareMetalMachine annotations + host, patchHelper, err := m.GetAssociatedHost(ctx) + if apierrors.IsNotFound(err) || host == nil { + // It is ok, if there is no coresponding hbmh. + return nil + } + + // Set the condition on the host + conditions.MarkFalse(host, infrav1.NoRemediateMachineAnnotationCondition, + infrav1.RemediateMachineAnnotationIsSetReason, clusterv1.ConditionSeverityInfo, "%s", message) + + // Patch the host + if err := patchHelper.Patch(ctx, host); err != nil { + return fmt.Errorf("failed to patch HetznerBareMetalHost: %w", err) + } + return nil } + +func splitHostKey(key string) (namespace, name string) { + parts := strings.Split(key, "/") + if len(parts) != 2 { + panic("unexpected host key") + } + return parts[0], parts[1] +} + +// GetAssociatedHost gets the associated host by looking for an annotation on the machine +// that contains a reference to the host. Returns nil if not found. Assumes the +// host is in the same namespace as the machine. +func (m *BareMetalMachineScope) GetAssociatedHost(ctx context.Context) (*infrav1.HetznerBareMetalHost, *patch.Helper, error) { + annotations := m.BareMetalMachine.ObjectMeta.GetAnnotations() + // if no annotations exist on machine, no host can be associated + if annotations == nil { + return nil, nil, nil + } + + // check if host annotation is set and return if not + hostKey, ok := annotations[infrav1.HostAnnotation] + if !ok { + return nil, nil, nil + } + + // find associated host object and return it + hostNamespace, hostName := splitHostKey(hostKey) + + host := infrav1.HetznerBareMetalHost{} + key := client.ObjectKey{ + Name: hostName, + Namespace: hostNamespace, + } + + if err := m.Client.Get(ctx, key, &host); err != nil { + return nil, nil, fmt.Errorf("failed to get host object: %w", err) + } + + helper, err := patch.NewHelper(&host, m.Client) + if err != nil { + return nil, nil, fmt.Errorf("failed to create patch helper: %w", err) + } + + return &host, helper, nil +} diff --git a/pkg/scope/baremetalmachine_test.go b/pkg/scope/baremetalmachine_test.go new file mode 100644 index 000000000..a50f507d2 --- /dev/null +++ b/pkg/scope/baremetalmachine_test.go @@ -0,0 +1,12 @@ +package scope + +import ( + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("Test splitHostKey", func() { + namespace, name := splitHostKey("namespace/name") + Expect(namespace).To(Equal("namespace")) + Expect(name).To(Equal("name")) +}) diff --git a/pkg/services/baremetal/baremetal/baremetal.go b/pkg/services/baremetal/baremetal/baremetal.go index d95f6c5ff..bed1a498c 100644 --- a/pkg/services/baremetal/baremetal/baremetal.go +++ b/pkg/services/baremetal/baremetal/baremetal.go @@ -130,7 +130,7 @@ func (s *Service) Reconcile(ctx context.Context) (res reconcile.Result, err erro // Delete implements delete method of bare metal machine. func (s *Service) Delete(ctx context.Context) (res reconcile.Result, err error) { // get host - ignore if not found - host, helper, err := s.getAssociatedHost(ctx) + host, helper, err := s.scope.GetAssociatedHost(ctx) if err != nil && !apierrors.IsNotFound(err) { return reconcile.Result{}, fmt.Errorf("failed to get associated host: %w", err) } @@ -202,7 +202,7 @@ func (s *Service) Delete(ctx context.Context) (res reconcile.Result, err error) // update updates a machine and is invoked by the Machine Controller. func (s *Service) update(ctx context.Context) error { - host, helper, err := s.getAssociatedHost(ctx) + host, helper, err := s.scope.GetAssociatedHost(ctx) if err != nil { return fmt.Errorf("failed to get host: %w", err) } @@ -278,7 +278,7 @@ func (s *Service) update(ctx context.Context) error { func (s *Service) associate(ctx context.Context) error { // look for associated host - associatedHost, _, err := s.getAssociatedHost(ctx) + associatedHost, _, err := s.scope.GetAssociatedHost(ctx) if err != nil { return fmt.Errorf("failed to get associated host: %w", err) } @@ -347,43 +347,6 @@ func (s *Service) associate(ctx context.Context) error { return nil } -// getAssociatedHost gets the associated host by looking for an annotation on the machine -// that contains a reference to the host. Returns nil if not found. Assumes the -// host is in the same namespace as the machine. -func (s *Service) getAssociatedHost(ctx context.Context) (*infrav1.HetznerBareMetalHost, *patch.Helper, error) { - annotations := s.scope.BareMetalMachine.ObjectMeta.GetAnnotations() - // if no annotations exist on machine, no host can be associated - if annotations == nil { - return nil, nil, nil - } - - // check if host annotation is set and return if not - hostKey, ok := annotations[infrav1.HostAnnotation] - if !ok { - return nil, nil, nil - } - - // find associated host object and return it - hostNamespace, hostName := splitHostKey(hostKey) - - host := infrav1.HetznerBareMetalHost{} - key := client.ObjectKey{ - Name: hostName, - Namespace: hostNamespace, - } - - if err := s.scope.Client.Get(ctx, key, &host); err != nil { - return nil, nil, fmt.Errorf("failed to get host object: %w", err) - } - - helper, err := patch.NewHelper(&host, s.scope.Client) - if err != nil { - return nil, nil, fmt.Errorf("failed to create patch helper: %w", err) - } - - return &host, helper, nil -} - // ChooseHost tries to find a free hbmh. // If no hbmh was found, then hbmh and err are nil, and the string // "reason" contains human readable details. @@ -663,7 +626,7 @@ func (s *Service) setProviderID(ctx context.Context) error { } // providerID is based on the ID of the host machine - host, _, err := s.getAssociatedHost(ctx) + host, _, err := s.scope.GetAssociatedHost(ctx) if err != nil { return fmt.Errorf("failed to get host: %w", err) } @@ -878,14 +841,6 @@ func hostKey(host *infrav1.HetznerBareMetalHost) string { return host.GetNamespace() + "/" + host.GetName() } -func splitHostKey(key string) (namespace, name string) { - parts := strings.Split(key, "/") - if len(parts) != 2 { - panic("unexpected host key") - } - return parts[0], parts[1] -} - func checkForRequeueError(err error, errMessage string) (res reconcile.Result, reterr error) { if err == nil { return res, nil diff --git a/pkg/services/baremetal/baremetal/baremetal_test.go b/pkg/services/baremetal/baremetal/baremetal_test.go index 6bbb0e20c..cea63ffdd 100644 --- a/pkg/services/baremetal/baremetal/baremetal_test.go +++ b/pkg/services/baremetal/baremetal/baremetal_test.go @@ -714,12 +714,6 @@ var _ = Describe("Test hostKey", func() { Expect(hostKey(host)).To(Equal("namespace/name")) }) -var _ = Describe("Test splitHostKey", func() { - namespace, name := splitHostKey("namespace/name") - Expect(namespace).To(Equal("namespace")) - Expect(name).To(Equal("name")) -}) - var _ = Describe("Test checkForRequeueError", func() { type testCaseCheckForRequeueError struct { err error @@ -759,12 +753,6 @@ var _ = Describe("Test checkForRequeueError", func() { ) }) -var _ = Describe("Test splitHostKey", func() { - namespace, name := splitHostKey("namespace/name") - Expect(namespace).To(Equal("namespace")) - Expect(name).To(Equal("name")) -}) - var _ = Describe("Test analyzePatchError", func() { type testCaseAnalyzePatchError struct { ignoreNotFound bool From 98bcfff54a94b4fb9f7f7125f6c8761415c233d8 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Mon, 17 Nov 2025 16:25:15 +0100 Subject: [PATCH 06/22] cleanup, do not set condition from hbmm reconcile on hbmh. --- api/v1beta1/hetznerbaremetalhost_types.go | 2 ++ .../hetznerbaremetalhost_controller.go | 19 +++++++++++++-- pkg/scope/baremetalmachine.go | 23 ++----------------- pkg/services/baremetal/baremetal/baremetal.go | 4 ++-- 4 files changed, 23 insertions(+), 25 deletions(-) diff --git a/api/v1beta1/hetznerbaremetalhost_types.go b/api/v1beta1/hetznerbaremetalhost_types.go index d22d60d00..716df9435 100644 --- a/api/v1beta1/hetznerbaremetalhost_types.go +++ b/api/v1beta1/hetznerbaremetalhost_types.go @@ -120,8 +120,10 @@ const ( // RegistrationError is an error condition occurring when the // controller is unable to retrieve information on a specific server via robot. RegistrationError ErrorType = "registration error" + // PreparationError is an error condition occurring when something fails while preparing host reconciliation. PreparationError ErrorType = "preparation error" + // ProvisioningError is an error condition occurring when the controller // fails to provision or deprovision the Host. ProvisioningError ErrorType = "provisioning error" diff --git a/controllers/hetznerbaremetalhost_controller.go b/controllers/hetznerbaremetalhost_controller.go index fc71d6909..d6bc3d0b3 100644 --- a/controllers/hetznerbaremetalhost_controller.go +++ b/controllers/hetznerbaremetalhost_controller.go @@ -185,6 +185,9 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl return res, nil } + // Case "Delete" was handled in reconcileSelectedStates. From now we know that the host has not + // DeletionTimestamp set. + hetznerCluster := &infrav1.HetznerCluster{} hetznerClusterName := client.ObjectKey{ @@ -227,13 +230,13 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl log = log.WithValues("HetznerBareMetalMachine", klog.KObj(hetznerBareMetalMachine)) - machine, err := util.GetOwnerMachine(ctx, r.Client, hetznerBareMetalMachine.ObjectMeta) + capiMachine, err := util.GetOwnerMachine(ctx, r.Client, hetznerBareMetalMachine.ObjectMeta) ctx = ctrl.LoggerInto(ctx, log) if err != nil { return reconcile.Result{}, fmt.Errorf("failed to get owner machine: %w", err) } - _, exists := machine.Annotations[clusterv1.RemediateMachineAnnotation] + _, exists := capiMachine.Annotations[clusterv1.RemediateMachineAnnotation] if exists { // The hbmm of this host will be deleted soon. Do no reconcile it. msg := "CAPI Machine has RemediateMachineAnnotation. Not reconciling this machine." @@ -248,6 +251,18 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl } conditions.MarkTrue(bmHost, infrav1.NoRemediateMachineAnnotationCondition) + /* TODO: maybe do that later. + if bmHost.Spec.Status.ErrorType == infrav1.FatalError || bmHost.Spec.Status.ErrorType == infrav1.PermanentError { + // HetznerBaremetalMachineReconciler will detect that error on the host and call + // BareMetalMachineScope.SetErrorAndRemediate(). Then CAPI will delete the machine and the + // hbmm. + log.Info("ErrorType is Fatal or Permanent. Not reconciling host", + "errorType", bmHost.Spec.Status.ErrorType, + "errorMessage", bmHost.Spec.Status.ErrorMessage) + return reconcile.Result{}, nil + } + */ + // Get Hetzner robot api credentials secretManager := secretutil.NewSecretManager(log, r.Client, r.APIReader) robotCreds, err := getAndValidateRobotCredentials(ctx, req.Namespace, hetznerCluster, secretManager) diff --git a/pkg/scope/baremetalmachine.go b/pkg/scope/baremetalmachine.go index f1b7a8042..2654a6870 100644 --- a/pkg/scope/baremetalmachine.go +++ b/pkg/scope/baremetalmachine.go @@ -22,7 +22,6 @@ import ( "strings" "github.com/go-logr/logr" - apierrors "k8s.io/apimachinery/pkg/api/errors" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/conditions" @@ -130,7 +129,8 @@ func (m *BareMetalMachineScope) IsBootstrapReady() bool { // SetErrorAndRemediate sets "cluster.x-k8s.io/remediate-machine" annotation on the corresponding // CAPI machine. CAPI will remediate that machine. Additionally, an event of type Warning will be // created, and the condition will be set on both the BareMetalMachine and the corresponding -// HetznerBareMetalHost (if found). +// HetznerBareMetalHost (if found). The Condition NoRemediateMachineAnnotationCondition will be set +// on the hbmm. func (m *BareMetalMachineScope) SetErrorAndRemediate(ctx context.Context, message string) error { obj := m.Machine @@ -153,25 +153,6 @@ func (m *BareMetalMachineScope) SetErrorAndRemediate(ctx context.Context, messag conditions.MarkFalse(m.BareMetalMachine, infrav1.NoRemediateMachineAnnotationCondition, infrav1.RemediateMachineAnnotationIsSetReason, clusterv1.ConditionSeverityInfo, "%s", message) - - // Try to set the condition on the corresponding HetznerBareMetalHost as well - - // Get the host key from the BareMetalMachine annotations - host, patchHelper, err := m.GetAssociatedHost(ctx) - if apierrors.IsNotFound(err) || host == nil { - // It is ok, if there is no coresponding hbmh. - return nil - } - - // Set the condition on the host - conditions.MarkFalse(host, infrav1.NoRemediateMachineAnnotationCondition, - infrav1.RemediateMachineAnnotationIsSetReason, clusterv1.ConditionSeverityInfo, "%s", message) - - // Patch the host - if err := patchHelper.Patch(ctx, host); err != nil { - return fmt.Errorf("failed to patch HetznerBareMetalHost: %w", err) - } - return nil } diff --git a/pkg/services/baremetal/baremetal/baremetal.go b/pkg/services/baremetal/baremetal/baremetal.go index bed1a498c..2318cc1db 100644 --- a/pkg/services/baremetal/baremetal/baremetal.go +++ b/pkg/services/baremetal/baremetal/baremetal.go @@ -207,7 +207,7 @@ func (s *Service) update(ctx context.Context) error { return fmt.Errorf("failed to get host: %w", err) } if host == nil { - err := s.scope.SetErrorAndRemediate(ctx, "host not found") + err := s.scope.SetErrorAndRemediate(ctx, "Reconcile of hbmm: host not found") if err != nil { return err } @@ -632,7 +632,7 @@ func (s *Service) setProviderID(ctx context.Context) error { } if host == nil { - err := s.scope.SetErrorAndRemediate(ctx, "host not found") + err := s.scope.SetErrorAndRemediate(ctx, "setProviderID failed: host not found") if err != nil { return err } From fb67a9738819f3023d021d2d50aa65dd55280ce5 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Mon, 17 Nov 2025 16:33:32 +0100 Subject: [PATCH 07/22] copy condition from hbmm to host. No need to read capi machine. --- .../hetznerbaremetalhost_controller.go | 22 ++++++------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/controllers/hetznerbaremetalhost_controller.go b/controllers/hetznerbaremetalhost_controller.go index d6bc3d0b3..56582a14d 100644 --- a/controllers/hetznerbaremetalhost_controller.go +++ b/controllers/hetznerbaremetalhost_controller.go @@ -230,23 +230,15 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl log = log.WithValues("HetznerBareMetalMachine", klog.KObj(hetznerBareMetalMachine)) - capiMachine, err := util.GetOwnerMachine(ctx, r.Client, hetznerBareMetalMachine.ObjectMeta) - ctx = ctrl.LoggerInto(ctx, log) - if err != nil { - return reconcile.Result{}, fmt.Errorf("failed to get owner machine: %w", err) - } - - _, exists := capiMachine.Annotations[clusterv1.RemediateMachineAnnotation] - if exists { + remediateConditionOfHbmm := conditions.Get(hetznerBareMetalMachine, infrav1.NoRemediateMachineAnnotationCondition) + if remediateConditionOfHbmm != nil && remediateConditionOfHbmm.Status == corev1.ConditionFalse { // The hbmm of this host will be deleted soon. Do no reconcile it. - msg := "CAPI Machine has RemediateMachineAnnotation. Not reconciling this machine." + // Take the Condition of the hbmm and make it available on the hbmh. + msg := "hbmm has NoRemediateMachineAnnotationCondition=False. Not reconciling this host." log.Info(msg) - c := conditions.Get(bmHost, infrav1.NoRemediateMachineAnnotationCondition) - if c == nil || c.Status != corev1.ConditionFalse { - // Do not overwrite the message of the condition, if the condition already exists. - conditions.MarkFalse(bmHost, infrav1.NoRemediateMachineAnnotationCondition, - infrav1.RemediateMachineAnnotationIsSetReason, clusterv1.ConditionSeverityInfo, "%s", msg) - } + conditions.MarkFalse(bmHost, infrav1.NoRemediateMachineAnnotationCondition, + remediateConditionOfHbmm.Reason, remediateConditionOfHbmm.Severity, + "%s", remediateConditionOfHbmm.Message) return reconcile.Result{}, nil } conditions.MarkTrue(bmHost, infrav1.NoRemediateMachineAnnotationCondition) From a5521cd6446b87cc4e254286e313cf5fc3781879 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Mon, 17 Nov 2025 17:18:10 +0100 Subject: [PATCH 08/22] delete via RemediateAnnotation does not work well. Code in Remediation controller would need to be changed. --- api/v1beta1/conditions_const.go | 16 +++++----- .../hetznerbaremetalhost_controller.go | 13 ++++---- .../hetznerbaremetalmachine_controller.go | 14 +++------ ...hetznerbaremetalmachine_controller_test.go | 2 +- pkg/scope/baremetalmachine.go | 30 +++++-------------- pkg/services/baremetal/baremetal/baremetal.go | 12 +++++--- 6 files changed, 34 insertions(+), 53 deletions(-) diff --git a/api/v1beta1/conditions_const.go b/api/v1beta1/conditions_const.go index 8a1e8bbfb..fb88854b5 100644 --- a/api/v1beta1/conditions_const.go +++ b/api/v1beta1/conditions_const.go @@ -240,13 +240,11 @@ const ( ) const ( - // NoRemediateMachineAnnotationCondition is: - // - False when the corresponding CAPI Machine has the "cluster.x-k8s.io/remediate-machine" annotation set and will be remediated by CAPI soon. - // - True otherwise. - NoRemediateMachineAnnotationCondition clusterv1.ConditionType = "NoRemediateMachineAnnotation" - - // RemediateMachineAnnotationIsSetReason indicates that the CAPI machine has the - // "cluster.x-k8s.io/remediate-machine" annotation set. The CAPI machine and the corresponding - // infra-machine will be deleted by CAPI soon. - RemediateMachineAnnotationIsSetReason = "RemediateMachineAnnotationIsSet" + // NoPermanentErrorCondition is set to False, when the corresponding capi machine got deleted, + // and the infra-machine will deleted by capi soon. + NoPermanentErrorCondition clusterv1.ConditionType = "NoPermanentError" + + // PermanentErrorConditionIsSet the corresponding capi machine got deleted, and the + // infra-machine will deleted by capi soon. + PermanentErrorConditionIsSet = "PermanentErrorConditionIsSet" ) diff --git a/controllers/hetznerbaremetalhost_controller.go b/controllers/hetznerbaremetalhost_controller.go index 56582a14d..9744806a1 100644 --- a/controllers/hetznerbaremetalhost_controller.go +++ b/controllers/hetznerbaremetalhost_controller.go @@ -230,18 +230,17 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl log = log.WithValues("HetznerBareMetalMachine", klog.KObj(hetznerBareMetalMachine)) - remediateConditionOfHbmm := conditions.Get(hetznerBareMetalMachine, infrav1.NoRemediateMachineAnnotationCondition) - if remediateConditionOfHbmm != nil && remediateConditionOfHbmm.Status == corev1.ConditionFalse { + c := conditions.Get(hetznerBareMetalMachine, infrav1.NoPermanentErrorCondition) + if c != nil && c.Status == corev1.ConditionFalse { // The hbmm of this host will be deleted soon. Do no reconcile it. // Take the Condition of the hbmm and make it available on the hbmh. - msg := "hbmm has NoRemediateMachineAnnotationCondition=False. Not reconciling this host." + msg := "hbmm has NoPermanentErrorCondition=False. Not reconciling this host." log.Info(msg) - conditions.MarkFalse(bmHost, infrav1.NoRemediateMachineAnnotationCondition, - remediateConditionOfHbmm.Reason, remediateConditionOfHbmm.Severity, - "%s", remediateConditionOfHbmm.Message) + conditions.MarkFalse(bmHost, infrav1.NoPermanentErrorCondition, + c.Reason, c.Severity, "%s", c.Message) return reconcile.Result{}, nil } - conditions.MarkTrue(bmHost, infrav1.NoRemediateMachineAnnotationCondition) + conditions.MarkTrue(bmHost, infrav1.NoPermanentErrorCondition) /* TODO: maybe do that later. if bmHost.Spec.Status.ErrorType == infrav1.FatalError || bmHost.Spec.Status.ErrorType == infrav1.PermanentError { diff --git a/controllers/hetznerbaremetalmachine_controller.go b/controllers/hetznerbaremetalmachine_controller.go index 5a25c6d3d..5a1188f9e 100644 --- a/controllers/hetznerbaremetalmachine_controller.go +++ b/controllers/hetznerbaremetalmachine_controller.go @@ -160,20 +160,14 @@ func (r *HetznerBareMetalMachineReconciler) Reconcile(ctx context.Context, req r return r.reconcileDelete(ctx, machineScope) } - _, exist := machine.Annotations[clusterv1.RemediateMachineAnnotation] - if exist { + c := conditions.Get(hbmMachine, infrav1.NoPermanentErrorCondition) + if c != nil && c.Status == corev1.ConditionFalse { // This hbmm will be deleted soon. Do no reconcile it. - msg := "CAPI Machine has RemediateMachineAnnotation. Not reconciling this machine." + msg := "Hbmm has NoRemediateMachineAnnotationCondition=False. CAPI Machine will be deleted soon. Not reconciling this machine." log.Info(msg) - c := conditions.Get(hbmMachine, infrav1.NoRemediateMachineAnnotationCondition) - if c == nil || c.Status != corev1.ConditionFalse { - // Do not overwrite the message of the condition, if the condition already exists. - conditions.MarkFalse(hbmMachine, infrav1.NoRemediateMachineAnnotationCondition, - infrav1.RemediateMachineAnnotationIsSetReason, clusterv1.ConditionSeverityInfo, "%s", msg) - } return reconcile.Result{}, nil } - conditions.MarkTrue(hbmMachine, infrav1.NoRemediateMachineAnnotationCondition) + conditions.MarkTrue(hbmMachine, infrav1.NoPermanentErrorCondition) return r.reconcileNormal(ctx, machineScope) } diff --git a/controllers/hetznerbaremetalmachine_controller_test.go b/controllers/hetznerbaremetalmachine_controller_test.go index 2f39afe23..29417ce6c 100644 --- a/controllers/hetznerbaremetalmachine_controller_test.go +++ b/controllers/hetznerbaremetalmachine_controller_test.go @@ -450,7 +450,7 @@ var _ = Describe("HetznerBareMetalMachineReconciler", func() { return fmt.Errorf("RemediateMachineAnnotation not set on capi machine") } - c := conditions.Get(bmMachine, infrav1.NoRemediateMachineAnnotationCondition) + c := conditions.Get(bmMachine, infrav1.NoPermanentErrorCondition) if c == nil { return fmt.Errorf("Condition NoRemediateMachineAnnotationCondition does not exist.") } diff --git a/pkg/scope/baremetalmachine.go b/pkg/scope/baremetalmachine.go index 2654a6870..2c0c0fe73 100644 --- a/pkg/scope/baremetalmachine.go +++ b/pkg/scope/baremetalmachine.go @@ -126,33 +126,19 @@ func (m *BareMetalMachineScope) IsBootstrapReady() bool { return m.Machine.Spec.Bootstrap.DataSecretName != nil } -// SetErrorAndRemediate sets "cluster.x-k8s.io/remediate-machine" annotation on the corresponding -// CAPI machine. CAPI will remediate that machine. Additionally, an event of type Warning will be -// created, and the condition will be set on both the BareMetalMachine and the corresponding -// HetznerBareMetalHost (if found). The Condition NoRemediateMachineAnnotationCondition will be set -// on the hbmm. -func (m *BareMetalMachineScope) SetErrorAndRemediate(ctx context.Context, message string) error { - obj := m.Machine - - // Create a patch base - patch := client.MergeFrom(obj.DeepCopy()) - - // Modify only annotations on the in-memory copy - if obj.Annotations == nil { - obj.Annotations = map[string]string{} - } - obj.Annotations[clusterv1.RemediateMachineAnnotation] = "" - - // Apply patch – only the diff (annotations) is sent to the API server - if err := m.Client.Patch(ctx, obj, patch); err != nil { - return err +// SetErrorAndDeleteCapiMachine deletes the corresponding CAPI machine. CAPI will remediate that +// machine. Additionally, an event of type Warning will be created, and the condition will be set on +// the BareMetalMachine NoRemediateMachineAnnotationCondition. +func (m *BareMetalMachineScope) SetErrorAndDeleteCapiMachine(ctx context.Context, message string) error { + if err := m.Client.Delete(ctx, m.Machine); err != nil { + return fmt.Errorf("SetErrorAndDeleteCapiMachine Delete failed: %w", err) } record.Warnf(m.BareMetalMachine, "HetznerBareMetalMachineWillBeRemediated", "HetznerBareMetalMachine will be remediated: %s", message) - conditions.MarkFalse(m.BareMetalMachine, infrav1.NoRemediateMachineAnnotationCondition, - infrav1.RemediateMachineAnnotationIsSetReason, clusterv1.ConditionSeverityInfo, "%s", message) + conditions.MarkFalse(m.BareMetalMachine, infrav1.NoPermanentErrorCondition, + infrav1.PermanentErrorConditionIsSet, clusterv1.ConditionSeverityInfo, "%s", message) return nil } diff --git a/pkg/services/baremetal/baremetal/baremetal.go b/pkg/services/baremetal/baremetal/baremetal.go index 2318cc1db..b9da12d0c 100644 --- a/pkg/services/baremetal/baremetal/baremetal.go +++ b/pkg/services/baremetal/baremetal/baremetal.go @@ -116,6 +116,10 @@ func (s *Service) Reconcile(ctx context.Context) (res reconcile.Result, err erro return checkForRequeueError(err, "failed to update machine") } + err = s.scope.SetErrorAndDeleteCapiMachine(ctx, "dummydummydummydummy") + if err != nil { + return reconcile.Result{}, fmt.Errorf("failed SetErrorAndRemediate: %w ", err) + } // set providerID if necessary if err := s.setProviderID(ctx); err != nil { return reconcile.Result{}, fmt.Errorf("failed to set providerID: %w", err) @@ -207,7 +211,7 @@ func (s *Service) update(ctx context.Context) error { return fmt.Errorf("failed to get host: %w", err) } if host == nil { - err := s.scope.SetErrorAndRemediate(ctx, "Reconcile of hbmm: host not found") + err := s.scope.SetErrorAndDeleteCapiMachine(ctx, "Reconcile of hbmm: host not found") if err != nil { return err } @@ -232,7 +236,7 @@ func (s *Service) update(ctx context.Context) error { // maintenance mode on the host is a fatal error for the machine object if host.Spec.MaintenanceMode != nil && *host.Spec.MaintenanceMode { - err := s.scope.SetErrorAndRemediate(ctx, FailureMessageMaintenanceMode) + err := s.scope.SetErrorAndDeleteCapiMachine(ctx, FailureMessageMaintenanceMode) if err != nil { return err } @@ -241,7 +245,7 @@ func (s *Service) update(ctx context.Context) error { // if host has a fatal error, then it should be set on the hbmm object as well if host.Spec.Status.ErrorType == infrav1.FatalError || host.Spec.Status.ErrorType == infrav1.PermanentError { - err := s.scope.SetErrorAndRemediate(ctx, host.Spec.Status.ErrorMessage) + err := s.scope.SetErrorAndDeleteCapiMachine(ctx, host.Spec.Status.ErrorMessage) if err != nil { return err } @@ -632,7 +636,7 @@ func (s *Service) setProviderID(ctx context.Context) error { } if host == nil { - err := s.scope.SetErrorAndRemediate(ctx, "setProviderID failed: host not found") + err := s.scope.SetErrorAndDeleteCapiMachine(ctx, "setProviderID failed: host not found") if err != nil { return err } From 7d834cad78dd6ed299424afcaff42656cbc5174c Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Tue, 18 Nov 2025 10:14:37 +0100 Subject: [PATCH 09/22] revert last commit. Delete via remediation is ok. --- api/v1beta1/conditions_const.go | 16 +++++----- .../hetznerbaremetalhost_controller.go | 13 ++++---- .../hetznerbaremetalmachine_controller.go | 14 ++++++--- ...hetznerbaremetalmachine_controller_test.go | 2 +- pkg/scope/baremetalmachine.go | 30 ++++++++++++++----- pkg/services/baremetal/baremetal/baremetal.go | 4 --- 6 files changed, 49 insertions(+), 30 deletions(-) diff --git a/api/v1beta1/conditions_const.go b/api/v1beta1/conditions_const.go index fb88854b5..8a1e8bbfb 100644 --- a/api/v1beta1/conditions_const.go +++ b/api/v1beta1/conditions_const.go @@ -240,11 +240,13 @@ const ( ) const ( - // NoPermanentErrorCondition is set to False, when the corresponding capi machine got deleted, - // and the infra-machine will deleted by capi soon. - NoPermanentErrorCondition clusterv1.ConditionType = "NoPermanentError" - - // PermanentErrorConditionIsSet the corresponding capi machine got deleted, and the - // infra-machine will deleted by capi soon. - PermanentErrorConditionIsSet = "PermanentErrorConditionIsSet" + // NoRemediateMachineAnnotationCondition is: + // - False when the corresponding CAPI Machine has the "cluster.x-k8s.io/remediate-machine" annotation set and will be remediated by CAPI soon. + // - True otherwise. + NoRemediateMachineAnnotationCondition clusterv1.ConditionType = "NoRemediateMachineAnnotation" + + // RemediateMachineAnnotationIsSetReason indicates that the CAPI machine has the + // "cluster.x-k8s.io/remediate-machine" annotation set. The CAPI machine and the corresponding + // infra-machine will be deleted by CAPI soon. + RemediateMachineAnnotationIsSetReason = "RemediateMachineAnnotationIsSet" ) diff --git a/controllers/hetznerbaremetalhost_controller.go b/controllers/hetznerbaremetalhost_controller.go index 9744806a1..56582a14d 100644 --- a/controllers/hetznerbaremetalhost_controller.go +++ b/controllers/hetznerbaremetalhost_controller.go @@ -230,17 +230,18 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl log = log.WithValues("HetznerBareMetalMachine", klog.KObj(hetznerBareMetalMachine)) - c := conditions.Get(hetznerBareMetalMachine, infrav1.NoPermanentErrorCondition) - if c != nil && c.Status == corev1.ConditionFalse { + remediateConditionOfHbmm := conditions.Get(hetznerBareMetalMachine, infrav1.NoRemediateMachineAnnotationCondition) + if remediateConditionOfHbmm != nil && remediateConditionOfHbmm.Status == corev1.ConditionFalse { // The hbmm of this host will be deleted soon. Do no reconcile it. // Take the Condition of the hbmm and make it available on the hbmh. - msg := "hbmm has NoPermanentErrorCondition=False. Not reconciling this host." + msg := "hbmm has NoRemediateMachineAnnotationCondition=False. Not reconciling this host." log.Info(msg) - conditions.MarkFalse(bmHost, infrav1.NoPermanentErrorCondition, - c.Reason, c.Severity, "%s", c.Message) + conditions.MarkFalse(bmHost, infrav1.NoRemediateMachineAnnotationCondition, + remediateConditionOfHbmm.Reason, remediateConditionOfHbmm.Severity, + "%s", remediateConditionOfHbmm.Message) return reconcile.Result{}, nil } - conditions.MarkTrue(bmHost, infrav1.NoPermanentErrorCondition) + conditions.MarkTrue(bmHost, infrav1.NoRemediateMachineAnnotationCondition) /* TODO: maybe do that later. if bmHost.Spec.Status.ErrorType == infrav1.FatalError || bmHost.Spec.Status.ErrorType == infrav1.PermanentError { diff --git a/controllers/hetznerbaremetalmachine_controller.go b/controllers/hetznerbaremetalmachine_controller.go index 5a1188f9e..5a25c6d3d 100644 --- a/controllers/hetznerbaremetalmachine_controller.go +++ b/controllers/hetznerbaremetalmachine_controller.go @@ -160,14 +160,20 @@ func (r *HetznerBareMetalMachineReconciler) Reconcile(ctx context.Context, req r return r.reconcileDelete(ctx, machineScope) } - c := conditions.Get(hbmMachine, infrav1.NoPermanentErrorCondition) - if c != nil && c.Status == corev1.ConditionFalse { + _, exist := machine.Annotations[clusterv1.RemediateMachineAnnotation] + if exist { // This hbmm will be deleted soon. Do no reconcile it. - msg := "Hbmm has NoRemediateMachineAnnotationCondition=False. CAPI Machine will be deleted soon. Not reconciling this machine." + msg := "CAPI Machine has RemediateMachineAnnotation. Not reconciling this machine." log.Info(msg) + c := conditions.Get(hbmMachine, infrav1.NoRemediateMachineAnnotationCondition) + if c == nil || c.Status != corev1.ConditionFalse { + // Do not overwrite the message of the condition, if the condition already exists. + conditions.MarkFalse(hbmMachine, infrav1.NoRemediateMachineAnnotationCondition, + infrav1.RemediateMachineAnnotationIsSetReason, clusterv1.ConditionSeverityInfo, "%s", msg) + } return reconcile.Result{}, nil } - conditions.MarkTrue(hbmMachine, infrav1.NoPermanentErrorCondition) + conditions.MarkTrue(hbmMachine, infrav1.NoRemediateMachineAnnotationCondition) return r.reconcileNormal(ctx, machineScope) } diff --git a/controllers/hetznerbaremetalmachine_controller_test.go b/controllers/hetznerbaremetalmachine_controller_test.go index 29417ce6c..2f39afe23 100644 --- a/controllers/hetznerbaremetalmachine_controller_test.go +++ b/controllers/hetznerbaremetalmachine_controller_test.go @@ -450,7 +450,7 @@ var _ = Describe("HetznerBareMetalMachineReconciler", func() { return fmt.Errorf("RemediateMachineAnnotation not set on capi machine") } - c := conditions.Get(bmMachine, infrav1.NoPermanentErrorCondition) + c := conditions.Get(bmMachine, infrav1.NoRemediateMachineAnnotationCondition) if c == nil { return fmt.Errorf("Condition NoRemediateMachineAnnotationCondition does not exist.") } diff --git a/pkg/scope/baremetalmachine.go b/pkg/scope/baremetalmachine.go index 2c0c0fe73..2654a6870 100644 --- a/pkg/scope/baremetalmachine.go +++ b/pkg/scope/baremetalmachine.go @@ -126,19 +126,33 @@ func (m *BareMetalMachineScope) IsBootstrapReady() bool { return m.Machine.Spec.Bootstrap.DataSecretName != nil } -// SetErrorAndDeleteCapiMachine deletes the corresponding CAPI machine. CAPI will remediate that -// machine. Additionally, an event of type Warning will be created, and the condition will be set on -// the BareMetalMachine NoRemediateMachineAnnotationCondition. -func (m *BareMetalMachineScope) SetErrorAndDeleteCapiMachine(ctx context.Context, message string) error { - if err := m.Client.Delete(ctx, m.Machine); err != nil { - return fmt.Errorf("SetErrorAndDeleteCapiMachine Delete failed: %w", err) +// SetErrorAndRemediate sets "cluster.x-k8s.io/remediate-machine" annotation on the corresponding +// CAPI machine. CAPI will remediate that machine. Additionally, an event of type Warning will be +// created, and the condition will be set on both the BareMetalMachine and the corresponding +// HetznerBareMetalHost (if found). The Condition NoRemediateMachineAnnotationCondition will be set +// on the hbmm. +func (m *BareMetalMachineScope) SetErrorAndRemediate(ctx context.Context, message string) error { + obj := m.Machine + + // Create a patch base + patch := client.MergeFrom(obj.DeepCopy()) + + // Modify only annotations on the in-memory copy + if obj.Annotations == nil { + obj.Annotations = map[string]string{} + } + obj.Annotations[clusterv1.RemediateMachineAnnotation] = "" + + // Apply patch – only the diff (annotations) is sent to the API server + if err := m.Client.Patch(ctx, obj, patch); err != nil { + return err } record.Warnf(m.BareMetalMachine, "HetznerBareMetalMachineWillBeRemediated", "HetznerBareMetalMachine will be remediated: %s", message) - conditions.MarkFalse(m.BareMetalMachine, infrav1.NoPermanentErrorCondition, - infrav1.PermanentErrorConditionIsSet, clusterv1.ConditionSeverityInfo, "%s", message) + conditions.MarkFalse(m.BareMetalMachine, infrav1.NoRemediateMachineAnnotationCondition, + infrav1.RemediateMachineAnnotationIsSetReason, clusterv1.ConditionSeverityInfo, "%s", message) return nil } diff --git a/pkg/services/baremetal/baremetal/baremetal.go b/pkg/services/baremetal/baremetal/baremetal.go index b9da12d0c..eba8b4776 100644 --- a/pkg/services/baremetal/baremetal/baremetal.go +++ b/pkg/services/baremetal/baremetal/baremetal.go @@ -116,10 +116,6 @@ func (s *Service) Reconcile(ctx context.Context) (res reconcile.Result, err erro return checkForRequeueError(err, "failed to update machine") } - err = s.scope.SetErrorAndDeleteCapiMachine(ctx, "dummydummydummydummy") - if err != nil { - return reconcile.Result{}, fmt.Errorf("failed SetErrorAndRemediate: %w ", err) - } // set providerID if necessary if err := s.setProviderID(ctx); err != nil { return reconcile.Result{}, fmt.Errorf("failed to set providerID: %w", err) From cda32cafe0677705bd5ce8c7f65fd6834d4baad5 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Tue, 18 Nov 2025 14:16:59 +0100 Subject: [PATCH 10/22] WIP: give msg to method, so that a more precise error gets created. --- pkg/services/baremetal/remediation/remediation.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pkg/services/baremetal/remediation/remediation.go b/pkg/services/baremetal/remediation/remediation.go index 371dc9dec..9395404e9 100644 --- a/pkg/services/baremetal/remediation/remediation.go +++ b/pkg/services/baremetal/remediation/remediation.go @@ -55,7 +55,7 @@ func (s *Service) Reconcile(ctx context.Context) (res reconcile.Result, err erro // try to get information about host from bare metal machine annotations key, err := objectKeyFromAnnotations(s.scope.BareMetalMachine.ObjectMeta.GetAnnotations()) if err != nil { - if err := s.setOwnerRemediatedConditionNew(ctx); err != nil { + if err := s.setOwnerRemediatedConditionToFailed(ctx); err != nil { err = fmt.Errorf("failed to set remediated condition on capi machine: %w", err) record.Warn(s.scope.BareMetalRemediation, "FailedSettingConditionOnMachine", err.Error()) return res, err @@ -68,7 +68,7 @@ func (s *Service) Reconcile(ctx context.Context) (res reconcile.Result, err erro var host infrav1.HetznerBareMetalHost if err := s.scope.Client.Get(ctx, key, &host); err != nil { if apierrors.IsNotFound(err) { - if err := s.setOwnerRemediatedConditionNew(ctx); err != nil { + if err := s.setOwnerRemediatedConditionToFailed(ctx); err != nil { err = fmt.Errorf("failed to set remediated condition on capi machine: %w", err) record.Warn(s.scope.BareMetalRemediation, "FailedSettingConditionOnMachine", err.Error()) return res, err @@ -84,7 +84,7 @@ func (s *Service) Reconcile(ctx context.Context) (res reconcile.Result, err erro // if host is not provisioned or in maintenance mode, then we do not try to reboot server if host.Spec.Status.ProvisioningState != infrav1.StateProvisioned || host.Spec.MaintenanceMode != nil && *host.Spec.MaintenanceMode { - if err := s.setOwnerRemediatedConditionNew(ctx); err != nil { + if err := s.setOwnerRemediatedConditionToFailed(ctx); err != nil { err := fmt.Errorf("failed to set remediated condition on capi machine: %w", err) record.Warn(s.scope.BareMetalRemediation, "FailedSettingConditionOnMachine", err.Error()) return res, err @@ -189,7 +189,7 @@ func (s *Service) handlePhaseWaiting(ctx context.Context) (res reconcile.Result, // preflight checks and handles the Machine deletion s.scope.BareMetalRemediation.Status.Phase = infrav1.PhaseDeleting - if err := s.setOwnerRemediatedConditionNew(ctx); err != nil { + if err := s.setOwnerRemediatedConditionToFailed(ctx); err != nil { err := fmt.Errorf("failed to set remediated condition on capi machine: %w", err) record.Warn(s.scope.BareMetalRemediation, "FailedSettingConditionOnMachine", err.Error()) return res, err @@ -217,9 +217,9 @@ func (s *Service) timeUntilNextRemediation(now time.Time) time.Duration { return nextRemediation } -// setOwnerRemediatedConditionNew sets MachineOwnerRemediatedCondition on CAPI machine object +// setOwnerRemediatedConditionToFailed sets MachineOwnerRemediatedCondition on CAPI machine object // that have failed a healthcheck. -func (s *Service) setOwnerRemediatedConditionNew(ctx context.Context) error { +func (s *Service) setOwnerRemediatedConditionToFailed(ctx context.Context, msg string) error { capiMachine, err := util.GetOwnerMachine(ctx, s.scope.Client, s.scope.BareMetalRemediation.ObjectMeta) if err != nil { return fmt.Errorf("failed to get capi machine: %w", err) @@ -235,7 +235,7 @@ func (s *Service) setOwnerRemediatedConditionNew(ctx context.Context) error { clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, - "remediation through reboot failed", + "Remediation finished: %s", msg, ) if err := patchHelper.Patch(ctx, capiMachine); err != nil { From 700d1eca314a66a05f8d5e5df46e3e10b2d7cd63 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Thu, 20 Nov 2025 14:26:20 +0100 Subject: [PATCH 11/22] show exit message of remediation on capi machine condition. --- pkg/services/baremetal/baremetal/baremetal.go | 8 +- .../baremetal/remediation/remediation.go | 110 +++++++++--------- 2 files changed, 59 insertions(+), 59 deletions(-) diff --git a/pkg/services/baremetal/baremetal/baremetal.go b/pkg/services/baremetal/baremetal/baremetal.go index eba8b4776..2318cc1db 100644 --- a/pkg/services/baremetal/baremetal/baremetal.go +++ b/pkg/services/baremetal/baremetal/baremetal.go @@ -207,7 +207,7 @@ func (s *Service) update(ctx context.Context) error { return fmt.Errorf("failed to get host: %w", err) } if host == nil { - err := s.scope.SetErrorAndDeleteCapiMachine(ctx, "Reconcile of hbmm: host not found") + err := s.scope.SetErrorAndRemediate(ctx, "Reconcile of hbmm: host not found") if err != nil { return err } @@ -232,7 +232,7 @@ func (s *Service) update(ctx context.Context) error { // maintenance mode on the host is a fatal error for the machine object if host.Spec.MaintenanceMode != nil && *host.Spec.MaintenanceMode { - err := s.scope.SetErrorAndDeleteCapiMachine(ctx, FailureMessageMaintenanceMode) + err := s.scope.SetErrorAndRemediate(ctx, FailureMessageMaintenanceMode) if err != nil { return err } @@ -241,7 +241,7 @@ func (s *Service) update(ctx context.Context) error { // if host has a fatal error, then it should be set on the hbmm object as well if host.Spec.Status.ErrorType == infrav1.FatalError || host.Spec.Status.ErrorType == infrav1.PermanentError { - err := s.scope.SetErrorAndDeleteCapiMachine(ctx, host.Spec.Status.ErrorMessage) + err := s.scope.SetErrorAndRemediate(ctx, host.Spec.Status.ErrorMessage) if err != nil { return err } @@ -632,7 +632,7 @@ func (s *Service) setProviderID(ctx context.Context) error { } if host == nil { - err := s.scope.SetErrorAndDeleteCapiMachine(ctx, "setProviderID failed: host not found") + err := s.scope.SetErrorAndRemediate(ctx, "setProviderID failed: host not found") if err != nil { return err } diff --git a/pkg/services/baremetal/remediation/remediation.go b/pkg/services/baremetal/remediation/remediation.go index 9395404e9..3619c1acf 100644 --- a/pkg/services/baremetal/remediation/remediation.go +++ b/pkg/services/baremetal/remediation/remediation.go @@ -51,56 +51,44 @@ func NewService(scope *scope.BareMetalRemediationScope) *Service { } // Reconcile implements reconcilement of HetznerBareMetalRemediations. -func (s *Service) Reconcile(ctx context.Context) (res reconcile.Result, err error) { +func (s *Service) Reconcile(ctx context.Context) (reconcile.Result, error) { // try to get information about host from bare metal machine annotations key, err := objectKeyFromAnnotations(s.scope.BareMetalMachine.ObjectMeta.GetAnnotations()) if err != nil { - if err := s.setOwnerRemediatedConditionToFailed(ctx); err != nil { - err = fmt.Errorf("failed to set remediated condition on capi machine: %w", err) - record.Warn(s.scope.BareMetalRemediation, "FailedSettingConditionOnMachine", err.Error()) - return res, err - } - record.Event(s.scope.BareMetalRemediation, "ExitRemediation", "exit remediation because bare metal machine has no host annotation") - return res, nil + return s.setOwnerRemediatedConditionToFailed(ctx, err.Error()) + } + + if key == nil { + // BareMetalMachine has not host annotation. + return s.setOwnerRemediatedConditionToFailed(ctx, + "BareMetalMachine has not host annotation.") } // get host var host infrav1.HetznerBareMetalHost - if err := s.scope.Client.Get(ctx, key, &host); err != nil { + if err := s.scope.Client.Get(ctx, *key, &host); err != nil { if apierrors.IsNotFound(err) { - if err := s.setOwnerRemediatedConditionToFailed(ctx); err != nil { - err = fmt.Errorf("failed to set remediated condition on capi machine: %w", err) - record.Warn(s.scope.BareMetalRemediation, "FailedSettingConditionOnMachine", err.Error()) - return res, err - } - record.Event(s.scope.BareMetalRemediation, "ExitRemediation", "host not found - exit remediation") - return res, nil + return s.setOwnerRemediatedConditionToFailed(ctx, + "host not found - remediation failed.") } + + // retry err := fmt.Errorf("failed to find the unhealthy host: %w", err) record.Warn(s.scope.BareMetalRemediation, "FailedToFindHost", err.Error()) - return res, err + return reconcile.Result{}, err } // if host is not provisioned or in maintenance mode, then we do not try to reboot server if host.Spec.Status.ProvisioningState != infrav1.StateProvisioned || host.Spec.MaintenanceMode != nil && *host.Spec.MaintenanceMode { - if err := s.setOwnerRemediatedConditionToFailed(ctx); err != nil { - err := fmt.Errorf("failed to set remediated condition on capi machine: %w", err) - record.Warn(s.scope.BareMetalRemediation, "FailedSettingConditionOnMachine", err.Error()) - return res, err - } - record.Eventf( - s.scope.BareMetalRemediation, - "ExitRemediation", - "exit remediation because host is not provisioned or in maintenance mode. Provisioning state: %s. Maintenance mode: %v", - host.Spec.Status.ProvisioningState, host.Spec.MaintenanceMode != nil && *host.Spec.MaintenanceMode, - ) - return res, nil + return s.setOwnerRemediatedConditionToFailed(ctx, + fmt.Sprintf("exit remediation because host is not provisioned or in maintenance mode. Provisioning state: %s. Maintenance mode: %v", + host.Spec.Status.ProvisioningState, host.Spec.MaintenanceMode != nil && *host.Spec.MaintenanceMode)) } if s.scope.BareMetalRemediation.Spec.Strategy.Type != infrav1.RemediationTypeReboot { record.Warn(s.scope.BareMetalRemediation, "UnsupportedRemediationStrategy", "unsupported remediation strategy") - return res, nil + return reconcile.Result{}, nil } // If no phase set, default to running @@ -113,9 +101,11 @@ func (s *Service) Reconcile(ctx context.Context) (res reconcile.Result, err erro return s.handlePhaseRunning(ctx, host) case infrav1.PhaseWaiting: return s.handlePhaseWaiting(ctx) + case infrav1.PhaseDeleting: + return reconcile.Result{}, nil + default: + return reconcile.Result{}, fmt.Errorf("internal error, unhandled BareMetalRemediation.Status.Phase: %v", s.scope.BareMetalRemediation.Status.Phase) } - - return res, nil } func (s *Service) handlePhaseRunning(ctx context.Context, host infrav1.HetznerBareMetalHost) (res reconcile.Result, err error) { @@ -184,19 +174,7 @@ func (s *Service) handlePhaseWaiting(ctx context.Context) (res reconcile.Result, return reconcile.Result{RequeueAfter: nextCheck}, nil } - // When machine is still unhealthy after remediation, setting of OwnerRemediatedCondition - // moves control to CAPI machine controller. The owning controller will do - // preflight checks and handles the Machine deletion - s.scope.BareMetalRemediation.Status.Phase = infrav1.PhaseDeleting - - if err := s.setOwnerRemediatedConditionToFailed(ctx); err != nil { - err := fmt.Errorf("failed to set remediated condition on capi machine: %w", err) - record.Warn(s.scope.BareMetalRemediation, "FailedSettingConditionOnMachine", err.Error()) - return res, err - } - record.Event(s.scope.BareMetalRemediation, "SetOwnerRemediatedCondition", "because retryLimit is reached and reboot timed out") - - return res, nil + return s.setOwnerRemediatedConditionToFailed(ctx, "because retryLimit is reached and reboot timed out") } // timeUntilNextRemediation checks if it is time to execute a next remediation step @@ -219,17 +197,30 @@ func (s *Service) timeUntilNextRemediation(now time.Time) time.Duration { // setOwnerRemediatedConditionToFailed sets MachineOwnerRemediatedCondition on CAPI machine object // that have failed a healthcheck. -func (s *Service) setOwnerRemediatedConditionToFailed(ctx context.Context, msg string) error { +func (s *Service) setOwnerRemediatedConditionToFailed(ctx context.Context, msg string) (reconcile.Result, error) { capiMachine, err := util.GetOwnerMachine(ctx, s.scope.Client, s.scope.BareMetalRemediation.ObjectMeta) if err != nil { - return fmt.Errorf("failed to get capi machine: %w", err) + if !apierrors.IsNotFound(err) { + // Maybe a network error. Retry + return reconcile.Result{}, fmt.Errorf("failed to get capi machine: %w", err) + } + record.Event(s.scope.BareMetalRemediation, "CapiMachineGone", "CAPI machine does not exist. Remediation will be stopped. Infra Machine will be deleted soon by GC.") + + // do not retry + s.scope.BareMetalRemediation.Status.Phase = infrav1.PhaseDeleting + return reconcile.Result{}, nil } + // There is a capi-machine. Set condition. patchHelper, err := patch.NewHelper(capiMachine, s.scope.Client) if err != nil { - return fmt.Errorf("failed to init patch helper: %s %s/%s %w", capiMachine.Kind, capiMachine.Namespace, capiMachine.Name, err) + // retry + return reconcile.Result{}, fmt.Errorf("failed to init patch helper: %s %s/%s %w", capiMachine.Kind, capiMachine.Namespace, capiMachine.Name, err) } + // When machine is still unhealthy after remediation, setting of OwnerRemediatedCondition + // moves control to CAPI machine controller. The owning controller will do + // preflight checks and handles the Machine deletion conditions.MarkFalse( capiMachine, clusterv1.MachineOwnerRemediatedCondition, @@ -239,26 +230,35 @@ func (s *Service) setOwnerRemediatedConditionToFailed(ctx context.Context, msg s ) if err := patchHelper.Patch(ctx, capiMachine); err != nil { - return fmt.Errorf("failed to patch: %s %s/%s %w", capiMachine.Kind, capiMachine.Namespace, capiMachine.Name, err) + // retry + return reconcile.Result{}, fmt.Errorf("failed to patch: %s %s/%s %w", capiMachine.Kind, capiMachine.Namespace, capiMachine.Name, err) } - return nil + + record.Event(s.scope.BareMetalRemediation, "ExitRemediation", msg) + + // do not retry + s.scope.BareMetalRemediation.Status.Phase = infrav1.PhaseDeleting + return reconcile.Result{}, nil } -func objectKeyFromAnnotations(annotations map[string]string) (client.ObjectKey, error) { +// objectKeyFromAnnotations returns the objectKey from the annotation. Case-1: There is no +// annotation, then the ObjectKey and the error are nil. Case-2: The key exists, but contains a +// syntax error, then an error gets returned. Case-3: there is a valid objectKey. +func objectKeyFromAnnotations(annotations map[string]string) (*client.ObjectKey, error) { if annotations == nil { - return client.ObjectKey{}, fmt.Errorf("unable to get annotations") + return nil, nil } hostKey, ok := annotations[infrav1.HostAnnotation] if !ok { - return client.ObjectKey{}, fmt.Errorf("unable to get HostAnnotation") + return nil, nil } hostNamespace, hostName, err := splitHostKey(hostKey) if err != nil { - return client.ObjectKey{}, fmt.Errorf("failed to parse host key %q: %w", hostKey, err) + return nil, fmt.Errorf("failed to parse host key %q: %w", hostKey, err) } - return client.ObjectKey{Name: hostName, Namespace: hostNamespace}, nil + return &client.ObjectKey{Name: hostName, Namespace: hostNamespace}, nil } // addRebootAnnotation sets reboot annotation on unhealthy host. From f71d8a233e5ace32262d61273dd7f9427196fe66 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Thu, 20 Nov 2025 14:44:07 +0100 Subject: [PATCH 12/22] no need to try a reboot. --- controllers/hetznerbaremetalhost_controller.go | 2 +- pkg/services/baremetal/baremetal/baremetal.go | 10 ++++------ pkg/services/baremetal/remediation/remediation.go | 10 ++++++++++ 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/controllers/hetznerbaremetalhost_controller.go b/controllers/hetznerbaremetalhost_controller.go index 56582a14d..5ed20d4ae 100644 --- a/controllers/hetznerbaremetalhost_controller.go +++ b/controllers/hetznerbaremetalhost_controller.go @@ -232,7 +232,7 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl remediateConditionOfHbmm := conditions.Get(hetznerBareMetalMachine, infrav1.NoRemediateMachineAnnotationCondition) if remediateConditionOfHbmm != nil && remediateConditionOfHbmm.Status == corev1.ConditionFalse { - // The hbmm of this host will be deleted soon. Do no reconcile it. + // The hbmm of this host is in remediation. Do no reconcile it. // Take the Condition of the hbmm and make it available on the hbmh. msg := "hbmm has NoRemediateMachineAnnotationCondition=False. Not reconciling this host." log.Info(msg) diff --git a/pkg/services/baremetal/baremetal/baremetal.go b/pkg/services/baremetal/baremetal/baremetal.go index 2318cc1db..4de4c7691 100644 --- a/pkg/services/baremetal/baremetal/baremetal.go +++ b/pkg/services/baremetal/baremetal/baremetal.go @@ -207,18 +207,16 @@ func (s *Service) update(ctx context.Context) error { return fmt.Errorf("failed to get host: %w", err) } if host == nil { - err := s.scope.SetErrorAndRemediate(ctx, "Reconcile of hbmm: host not found") - if err != nil { - return err - } + err = errors.Join(s.scope.SetErrorAndRemediate(ctx, "Reconcile of hbmm: host not found")) return fmt.Errorf("host not found for machine %s: %w", s.scope.Machine.Name, err) } readyCondition := conditions.Get(host, clusterv1.ReadyCondition) if readyCondition != nil { - if readyCondition.Status == corev1.ConditionTrue { + switch readyCondition.Status { + case corev1.ConditionTrue: conditions.MarkTrue(s.scope.BareMetalMachine, infrav1.HostReadyCondition) - } else if readyCondition.Status == corev1.ConditionFalse { + case corev1.ConditionFalse: conditions.MarkFalse( s.scope.BareMetalMachine, infrav1.HostReadyCondition, diff --git a/pkg/services/baremetal/remediation/remediation.go b/pkg/services/baremetal/remediation/remediation.go index 3619c1acf..42fb07806 100644 --- a/pkg/services/baremetal/remediation/remediation.go +++ b/pkg/services/baremetal/remediation/remediation.go @@ -24,6 +24,7 @@ import ( "strings" "time" + corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" @@ -78,6 +79,15 @@ func (s *Service) Reconcile(ctx context.Context) (reconcile.Result, error) { return reconcile.Result{}, err } + // SetErrorAndRemediate() was used to stop provisioning. No need to try a reboot. + infraMachineCondition := conditions.Get(s.scope.BareMetalMachine, infrav1.NoRemediateMachineAnnotationCondition) + if infraMachineCondition != nil && infraMachineCondition.Status == corev1.ConditionFalse { + return s.setOwnerRemediatedConditionToFailed(ctx, + fmt.Sprintf("exit remediation because infra machine has condition set: %s: %s", + infraMachineCondition.Reason, + infraMachineCondition.Message)) + } + // if host is not provisioned or in maintenance mode, then we do not try to reboot server if host.Spec.Status.ProvisioningState != infrav1.StateProvisioned || host.Spec.MaintenanceMode != nil && *host.Spec.MaintenanceMode { From 401a4746c235875e5f9e9c4a38c8885d9c1b3be4 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Thu, 20 Nov 2025 16:24:25 +0100 Subject: [PATCH 13/22] more logging, WIP --- controllers/csr_controller.go | 2 +- controllers/hetznerbaremetalhost_controller.go | 8 +++++--- controllers/hetznerbaremetalmachine_controller.go | 5 ++++- pkg/services/baremetal/baremetal/baremetal.go | 8 +++++--- pkg/services/baremetal/host/host.go | 6 +++++- 5 files changed, 20 insertions(+), 9 deletions(-) diff --git a/controllers/csr_controller.go b/controllers/csr_controller.go index 5e5cfd726..254f6cde0 100644 --- a/controllers/csr_controller.go +++ b/controllers/csr_controller.go @@ -112,7 +112,7 @@ func (r *GuestCSRReconciler) Reconcile(ctx context.Context, req reconcile.Reques if errors.Is(err, errNoHetznerBareMetalMachineByProviderIDFound) { log.Info(fmt.Sprintf("ProviderID not set yet. The hbmm seems to be in 'ensure-provision'. Retrying. %s", err.Error())) - return reconcile.Result{RequeueAfter: 5 * time.Second}, nil + return reconcile.Result{RequeueAfter: 15 * time.Second}, nil } if err != nil { log.Error(err, "could not find an associated bm machine or hcloud machine", diff --git a/controllers/hetznerbaremetalhost_controller.go b/controllers/hetznerbaremetalhost_controller.go index 5ed20d4ae..52a9be333 100644 --- a/controllers/hetznerbaremetalhost_controller.go +++ b/controllers/hetznerbaremetalhost_controller.go @@ -139,7 +139,9 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl if err != nil { log.Error(err, "cache sync failed after BootState change") } - log.Info("Wait for update being in local cache", "durationWaitForLocalCacheSync", time.Since(startReadOwnWrite).Round(time.Millisecond)) + log.Info("Wait for update being in local cache", + "durationWaitForLocalCacheSync", time.Since(startReadOwnWrite).Round(time.Millisecond), + "diff", cmp.Diff(initialHost, bmHost)) }() // End: avoid conflict errors. Wait until local cache is up-to-date // ---------------------------------------------------------------- @@ -232,7 +234,7 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl remediateConditionOfHbmm := conditions.Get(hetznerBareMetalMachine, infrav1.NoRemediateMachineAnnotationCondition) if remediateConditionOfHbmm != nil && remediateConditionOfHbmm.Status == corev1.ConditionFalse { - // The hbmm of this host is in remediation. Do no reconcile it. + // The hbmm of this host is in remediation. Do not reconcile it. // Take the Condition of the hbmm and make it available on the hbmh. msg := "hbmm has NoRemediateMachineAnnotationCondition=False. Not reconciling this host." log.Info(msg) @@ -333,7 +335,7 @@ func (r *HetznerBareMetalHostReconciler) reconcileSelectedStates(ctx context.Con return ctrl.Result{RequeueAfter: 10 * time.Second}, nil - // Handle StateDeleting + // Handle StateDeleting case infrav1.StateDeleting: if controllerutil.RemoveFinalizer(bmHost, infrav1.HetznerBareMetalHostFinalizer) || controllerutil.RemoveFinalizer(bmHost, infrav1.DeprecatedBareMetalHostFinalizer) { diff --git a/controllers/hetznerbaremetalmachine_controller.go b/controllers/hetznerbaremetalmachine_controller.go index 5a25c6d3d..3352365d1 100644 --- a/controllers/hetznerbaremetalmachine_controller.go +++ b/controllers/hetznerbaremetalmachine_controller.go @@ -162,7 +162,7 @@ func (r *HetznerBareMetalMachineReconciler) Reconcile(ctx context.Context, req r _, exist := machine.Annotations[clusterv1.RemediateMachineAnnotation] if exist { - // This hbmm will be deleted soon. Do no reconcile it. + // This hbmm will be deleted soon. Do not reconcile it. msg := "CAPI Machine has RemediateMachineAnnotation. Not reconciling this machine." log.Info(msg) c := conditions.Get(hbmMachine, infrav1.NoRemediateMachineAnnotationCondition) @@ -180,10 +180,12 @@ func (r *HetznerBareMetalMachineReconciler) Reconcile(ctx context.Context, req r func (r *HetznerBareMetalMachineReconciler) reconcileDelete(ctx context.Context, machineScope *scope.BareMetalMachineScope) (reconcile.Result, error) { // delete servers + machineScope.Logger.Info("reconcileDelete") result, err := baremetal.NewService(machineScope).Delete(ctx) if err != nil { var requeueError *scope.RequeueAfterError if ok := errors.As(err, &requeueError); ok { + machineScope.Logger.Info("reconcileDelete aaaaaafter", "e", requeueError) return reconcile.Result{Requeue: true, RequeueAfter: requeueError.GetRequeueAfter()}, nil } return reconcile.Result{}, fmt.Errorf("failed to delete servers for HetznerBareMetalMachine %s/%s: %w", @@ -191,6 +193,7 @@ func (r *HetznerBareMetalMachineReconciler) reconcileDelete(ctx context.Context, } emptyResult := reconcile.Result{} if result != emptyResult { + machineScope.Logger.Info("reconcileDelete noooooo empty result", "res", result) return result, nil } // Machine is deleted so remove the finalizer. diff --git a/pkg/services/baremetal/baremetal/baremetal.go b/pkg/services/baremetal/baremetal/baremetal.go index 4de4c7691..e87dd0346 100644 --- a/pkg/services/baremetal/baremetal/baremetal.go +++ b/pkg/services/baremetal/baremetal/baremetal.go @@ -128,7 +128,7 @@ func (s *Service) Reconcile(ctx context.Context) (res reconcile.Result, err erro } // Delete implements delete method of bare metal machine. -func (s *Service) Delete(ctx context.Context) (res reconcile.Result, err error) { +func (s *Service) Delete(ctx context.Context) (reconcile.Result, error) { // get host - ignore if not found host, helper, err := s.scope.GetAssociatedHost(ctx) if err != nil && !apierrors.IsNotFound(err) { @@ -150,7 +150,7 @@ func (s *Service) Delete(ctx context.Context) (res reconcile.Result, err error) // remove the ownerRef to this host, even if the consumerRef references another machine host.OwnerReferences = s.removeOwnerRef(host.OwnerReferences) - return res, nil + return reconcile.Result{}, nil } if removeMachineSpecsFromHost(host) { @@ -164,6 +164,8 @@ func (s *Service) Delete(ctx context.Context) (res reconcile.Result, err error) // check if deprovisioning is done if host.Spec.Status.ProvisioningState != infrav1.StateNone { + s.scope.Logger.Info("hbmm is deleting, but host is not deprovisioned yet. Requeueing", + "ProvisioningState", host.Spec.Status.ProvisioningState) return reconcile.Result{RequeueAfter: requeueAfter}, nil } @@ -197,7 +199,7 @@ func (s *Service) Delete(ctx context.Context) (res reconcile.Result, err error) ) s.scope.BareMetalMachine.Status.Phase = clusterv1.MachinePhaseDeleted - return res, nil + return reconcile.Result{}, nil } // update updates a machine and is invoked by the Machine Controller. diff --git a/pkg/services/baremetal/host/host.go b/pkg/services/baremetal/host/host.go index 33710fbab..cd48550ee 100644 --- a/pkg/services/baremetal/host/host.go +++ b/pkg/services/baremetal/host/host.go @@ -43,6 +43,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/reconcile" infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1" @@ -2187,7 +2188,10 @@ func (s *Service) actionProvisioned(ctx context.Context) actionResult { } // next: None -func (s *Service) actionDeprovisioning(_ context.Context) actionResult { +func (s *Service) actionDeprovisioning(ctx context.Context) actionResult { + logger := log.FromContext(ctx) + logger.Info("actionDeprovisioning actionDeprovisioning actionDeprovisioning actionDeprovisioning actionDeprovisioning") + // Update name in robot API if _, err := s.scope.RobotClient.SetBMServerName( s.scope.HetznerBareMetalHost.Spec.ServerID, From 25890c43cf40e9caef42273e495ba68ccfbe8fa2 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Thu, 20 Nov 2025 16:37:59 +0100 Subject: [PATCH 14/22] fixed test cases. --- pkg/services/baremetal/host/host.go | 7 +++---- .../baremetal/remediation/remediation_test.go | 14 +++++++------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/pkg/services/baremetal/host/host.go b/pkg/services/baremetal/host/host.go index cd48550ee..54a2f7019 100644 --- a/pkg/services/baremetal/host/host.go +++ b/pkg/services/baremetal/host/host.go @@ -43,7 +43,6 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" - "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/reconcile" infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1" @@ -161,8 +160,8 @@ func SaveHostAndReturn(ctx context.Context, cl client.Client, host *infrav1.Hetz if err := cl.Update(ctx, host); err != nil { if apierrors.IsConflict(err) { - log := ctrl.LoggerFrom(ctx) - log.Info("conflict error. Retrying", "err", err) + logger := ctrl.LoggerFrom(ctx) + logger.Info("conflict error. Retrying", "err", err) return reconcile.Result{Requeue: true}, nil } return reconcile.Result{}, fmt.Errorf("failed to update host object: %w", err) @@ -2189,7 +2188,7 @@ func (s *Service) actionProvisioned(ctx context.Context) actionResult { // next: None func (s *Service) actionDeprovisioning(ctx context.Context) actionResult { - logger := log.FromContext(ctx) + logger := ctrl.LoggerFrom(ctx) logger.Info("actionDeprovisioning actionDeprovisioning actionDeprovisioning actionDeprovisioning actionDeprovisioning") // Update name in robot API diff --git a/pkg/services/baremetal/remediation/remediation_test.go b/pkg/services/baremetal/remediation/remediation_test.go index 3fbf68de0..0b68889b6 100644 --- a/pkg/services/baremetal/remediation/remediation_test.go +++ b/pkg/services/baremetal/remediation/remediation_test.go @@ -81,7 +81,7 @@ var _ = Describe("Test ObjectKeyFromAnnotations", func() { type testCaseObjectKeyFromAnnotations struct { annotations map[string]string expectError bool - expectKey client.ObjectKey + expectKey *client.ObjectKey } DescribeTable("Test ObjectKeyFromAnnotations", @@ -101,26 +101,26 @@ var _ = Describe("Test ObjectKeyFromAnnotations", func() { infrav1.HostAnnotation: "mynamespace/myname", }, expectError: false, - expectKey: client.ObjectKey{Namespace: "mynamespace", Name: "myname"}, + expectKey: &client.ObjectKey{Namespace: "mynamespace", Name: "myname"}, }), Entry("nil annotations", testCaseObjectKeyFromAnnotations{ annotations: nil, - expectError: true, - expectKey: client.ObjectKey{}, + expectError: false, + expectKey: nil, }), Entry("no host annotation", testCaseObjectKeyFromAnnotations{ annotations: map[string]string{ "other-key": "mynamespace/myname", }, - expectError: true, - expectKey: client.ObjectKey{}, + expectError: false, + expectKey: nil, }), Entry("incorrect host key", testCaseObjectKeyFromAnnotations{ annotations: map[string]string{ infrav1.HostAnnotation: "mynamespace-myname", }, expectError: true, - expectKey: client.ObjectKey{}, + expectKey: nil, }), ) }) From 7133120b8d5e8228b77a16081d53aef14aacdae0 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Thu, 20 Nov 2025 20:00:30 +0100 Subject: [PATCH 15/22] deduplicated code, cleaned up PR. --- pkg/baremetalutils/baremetalutils.go | 66 ++++++++++++++ pkg/baremetalutils/baremetalutils_test.go | 28 ++++++ pkg/scope/baremetalmachine.go | 49 ++-------- pkg/scope/baremetalmachine_test.go | 12 --- pkg/services/baremetal/baremetal/baremetal.go | 8 +- .../baremetal/remediation/remediation.go | 79 +++++----------- .../baremetal/remediation/remediation_test.go | 91 ------------------- 7 files changed, 131 insertions(+), 202 deletions(-) create mode 100644 pkg/baremetalutils/baremetalutils.go create mode 100644 pkg/baremetalutils/baremetalutils_test.go delete mode 100644 pkg/scope/baremetalmachine_test.go diff --git a/pkg/baremetalutils/baremetalutils.go b/pkg/baremetalutils/baremetalutils.go new file mode 100644 index 000000000..e034ac4c2 --- /dev/null +++ b/pkg/baremetalutils/baremetalutils.go @@ -0,0 +1,66 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package baremetalutils + +import ( + "context" + "fmt" + "strings" + + "sigs.k8s.io/controller-runtime/pkg/client" + + infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1" +) + +func splitHostKey(key string) (namespace, name string) { + parts := strings.Split(key, "/") + if len(parts) != 2 { + panic("unexpected host key") + } + return parts[0], parts[1] +} + +// GetAssociatedHost gets the associated host by looking for an annotation on the +// machine that contains a reference to the host. Returns nil if not found. Assumes the host is in +// the same namespace as the machine. +func GetAssociatedHost(ctx context.Context, crClient client.Client, hbmm *infrav1.HetznerBareMetalMachine) (*infrav1.HetznerBareMetalHost, error) { + annotations := hbmm.GetAnnotations() + // if no annotations exist on machine, no host can be associated + if annotations == nil { + return nil, nil + } + + // check if host annotation is set and return if not + hostKey, ok := annotations[infrav1.HostAnnotation] + if !ok { + return nil, nil + } + + // find associated host object and return it + hostNamespace, hostName := splitHostKey(hostKey) + + host := &infrav1.HetznerBareMetalHost{} + key := client.ObjectKey{ + Name: hostName, + Namespace: hostNamespace, + } + + if err := crClient.Get(ctx, key, host); err != nil { + return nil, fmt.Errorf("failed to get host object: %w", err) + } + return host, nil +} diff --git a/pkg/baremetalutils/baremetalutils_test.go b/pkg/baremetalutils/baremetalutils_test.go new file mode 100644 index 000000000..ed81f537e --- /dev/null +++ b/pkg/baremetalutils/baremetalutils_test.go @@ -0,0 +1,28 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package baremetalutils + +import ( + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("Test splitHostKey", func() { + namespace, name := splitHostKey("namespace/name") + Expect(namespace).To(Equal("namespace")) + Expect(name).To(Equal("name")) +}) diff --git a/pkg/scope/baremetalmachine.go b/pkg/scope/baremetalmachine.go index 2654a6870..a14de7e88 100644 --- a/pkg/scope/baremetalmachine.go +++ b/pkg/scope/baremetalmachine.go @@ -19,7 +19,6 @@ package scope import ( "context" "fmt" - "strings" "github.com/go-logr/logr" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" @@ -30,6 +29,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1" + "github.com/syself/cluster-api-provider-hetzner/pkg/baremetalutils" hcloudclient "github.com/syself/cluster-api-provider-hetzner/pkg/services/hcloud/client" ) @@ -156,47 +156,18 @@ func (m *BareMetalMachineScope) SetErrorAndRemediate(ctx context.Context, messag return nil } -func splitHostKey(key string) (namespace, name string) { - parts := strings.Split(key, "/") - if len(parts) != 2 { - panic("unexpected host key") - } - return parts[0], parts[1] -} - -// GetAssociatedHost gets the associated host by looking for an annotation on the machine -// that contains a reference to the host. Returns nil if not found. Assumes the -// host is in the same namespace as the machine. -func (m *BareMetalMachineScope) GetAssociatedHost(ctx context.Context) (*infrav1.HetznerBareMetalHost, *patch.Helper, error) { - annotations := m.BareMetalMachine.ObjectMeta.GetAnnotations() - // if no annotations exist on machine, no host can be associated - if annotations == nil { - return nil, nil, nil - } - - // check if host annotation is set and return if not - hostKey, ok := annotations[infrav1.HostAnnotation] - if !ok { - return nil, nil, nil - } - - // find associated host object and return it - hostNamespace, hostName := splitHostKey(hostKey) - - host := infrav1.HetznerBareMetalHost{} - key := client.ObjectKey{ - Name: hostName, - Namespace: hostNamespace, - } - - if err := m.Client.Get(ctx, key, &host); err != nil { - return nil, nil, fmt.Errorf("failed to get host object: %w", err) +// GetAssociatedHostAndPatchHelper gets the associated host by looking for an annotation on the +// machine that contains a reference to the host. Returns nil if not found. Assumes the host is in +// the same namespace as the machine. Additionally, a PatchHelper gets returned. +func (m *BareMetalMachineScope) GetAssociatedHostAndPatchHelper(ctx context.Context) (*infrav1.HetznerBareMetalHost, *patch.Helper, error) { + host, err := baremetalutils.GetAssociatedHost(ctx, m.Client, m.BareMetalMachine) + if err != nil { + return nil, nil, err } - - helper, err := patch.NewHelper(&host, m.Client) + helper, err := patch.NewHelper(host, m.Client) if err != nil { return nil, nil, fmt.Errorf("failed to create patch helper: %w", err) } - return &host, helper, nil + return host, helper, nil } diff --git a/pkg/scope/baremetalmachine_test.go b/pkg/scope/baremetalmachine_test.go deleted file mode 100644 index a50f507d2..000000000 --- a/pkg/scope/baremetalmachine_test.go +++ /dev/null @@ -1,12 +0,0 @@ -package scope - -import ( - . "github.com/onsi/ginkgo/v2" - . "github.com/onsi/gomega" -) - -var _ = Describe("Test splitHostKey", func() { - namespace, name := splitHostKey("namespace/name") - Expect(namespace).To(Equal("namespace")) - Expect(name).To(Equal("name")) -}) diff --git a/pkg/services/baremetal/baremetal/baremetal.go b/pkg/services/baremetal/baremetal/baremetal.go index e87dd0346..d9ec3344b 100644 --- a/pkg/services/baremetal/baremetal/baremetal.go +++ b/pkg/services/baremetal/baremetal/baremetal.go @@ -130,7 +130,7 @@ func (s *Service) Reconcile(ctx context.Context) (res reconcile.Result, err erro // Delete implements delete method of bare metal machine. func (s *Service) Delete(ctx context.Context) (reconcile.Result, error) { // get host - ignore if not found - host, helper, err := s.scope.GetAssociatedHost(ctx) + host, helper, err := s.scope.GetAssociatedHostAndPatchHelper(ctx) if err != nil && !apierrors.IsNotFound(err) { return reconcile.Result{}, fmt.Errorf("failed to get associated host: %w", err) } @@ -204,7 +204,7 @@ func (s *Service) Delete(ctx context.Context) (reconcile.Result, error) { // update updates a machine and is invoked by the Machine Controller. func (s *Service) update(ctx context.Context) error { - host, helper, err := s.scope.GetAssociatedHost(ctx) + host, helper, err := s.scope.GetAssociatedHostAndPatchHelper(ctx) if err != nil { return fmt.Errorf("failed to get host: %w", err) } @@ -278,7 +278,7 @@ func (s *Service) update(ctx context.Context) error { func (s *Service) associate(ctx context.Context) error { // look for associated host - associatedHost, _, err := s.scope.GetAssociatedHost(ctx) + associatedHost, _, err := s.scope.GetAssociatedHostAndPatchHelper(ctx) if err != nil { return fmt.Errorf("failed to get associated host: %w", err) } @@ -626,7 +626,7 @@ func (s *Service) setProviderID(ctx context.Context) error { } // providerID is based on the ID of the host machine - host, _, err := s.scope.GetAssociatedHost(ctx) + host, _, err := s.scope.GetAssociatedHostAndPatchHelper(ctx) if err != nil { return fmt.Errorf("failed to get host: %w", err) } diff --git a/pkg/services/baremetal/remediation/remediation.go b/pkg/services/baremetal/remediation/remediation.go index 42fb07806..63d05fa61 100644 --- a/pkg/services/baremetal/remediation/remediation.go +++ b/pkg/services/baremetal/remediation/remediation.go @@ -21,7 +21,6 @@ import ( "context" "encoding/json" "fmt" - "strings" "time" corev1 "k8s.io/api/core/v1" @@ -32,10 +31,10 @@ import ( "sigs.k8s.io/cluster-api/util/conditions" "sigs.k8s.io/cluster-api/util/patch" "sigs.k8s.io/cluster-api/util/record" - "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/reconcile" infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1" + "github.com/syself/cluster-api-provider-hetzner/pkg/baremetalutils" "github.com/syself/cluster-api-provider-hetzner/pkg/scope" ) @@ -53,33 +52,24 @@ func NewService(scope *scope.BareMetalRemediationScope) *Service { // Reconcile implements reconcilement of HetznerBareMetalRemediations. func (s *Service) Reconcile(ctx context.Context) (reconcile.Result, error) { - // try to get information about host from bare metal machine annotations - key, err := objectKeyFromAnnotations(s.scope.BareMetalMachine.ObjectMeta.GetAnnotations()) + host, err := baremetalutils.GetAssociatedHost(ctx, s.scope.Client, s.scope.BareMetalMachine) if err != nil { - return s.setOwnerRemediatedConditionToFailed(ctx, err.Error()) - } - - if key == nil { - // BareMetalMachine has not host annotation. - return s.setOwnerRemediatedConditionToFailed(ctx, - "BareMetalMachine has not host annotation.") - } - - // get host - var host infrav1.HetznerBareMetalHost - if err := s.scope.Client.Get(ctx, *key, &host); err != nil { if apierrors.IsNotFound(err) { return s.setOwnerRemediatedConditionToFailed(ctx, - "host not found - remediation failed.") + "exit remediation because host not found") } // retry - err := fmt.Errorf("failed to find the unhealthy host: %w", err) + err := fmt.Errorf("failed to find the unhealthy host (will retry): %w", err) record.Warn(s.scope.BareMetalRemediation, "FailedToFindHost", err.Error()) return reconcile.Result{}, err } - // SetErrorAndRemediate() was used to stop provisioning. No need to try a reboot. + if host == nil { + return s.setOwnerRemediatedConditionToFailed(ctx, + "exit remediation because hbmm has no host annotation") + } + // if SetErrorAndRemediate() was used to stop provisioning, do not try to reboot server infraMachineCondition := conditions.Get(s.scope.BareMetalMachine, infrav1.NoRemediateMachineAnnotationCondition) if infraMachineCondition != nil && infraMachineCondition.Status == corev1.ConditionFalse { return s.setOwnerRemediatedConditionToFailed(ctx, @@ -88,12 +78,17 @@ func (s *Service) Reconcile(ctx context.Context) (reconcile.Result, error) { infraMachineCondition.Message)) } - // if host is not provisioned or in maintenance mode, then we do not try to reboot server - if host.Spec.Status.ProvisioningState != infrav1.StateProvisioned || - host.Spec.MaintenanceMode != nil && *host.Spec.MaintenanceMode { + // if host is not provisioned, do not try to reboot server + if host.Spec.Status.ProvisioningState != infrav1.StateProvisioned { return s.setOwnerRemediatedConditionToFailed(ctx, - fmt.Sprintf("exit remediation because host is not provisioned or in maintenance mode. Provisioning state: %s. Maintenance mode: %v", - host.Spec.Status.ProvisioningState, host.Spec.MaintenanceMode != nil && *host.Spec.MaintenanceMode)) + fmt.Sprintf("exit remediation because host is not provisioned. Provisioning state: %s.", + host.Spec.Status.ProvisioningState)) + } + + // host is in maintenance mode, do not try to reboot server + if host.Spec.MaintenanceMode != nil && *host.Spec.MaintenanceMode { + return s.setOwnerRemediatedConditionToFailed(ctx, + "exit remediation because host is in maintenance mode") } if s.scope.BareMetalRemediation.Spec.Strategy.Type != infrav1.RemediationTypeReboot { @@ -118,7 +113,7 @@ func (s *Service) Reconcile(ctx context.Context) (reconcile.Result, error) { } } -func (s *Service) handlePhaseRunning(ctx context.Context, host infrav1.HetznerBareMetalHost) (res reconcile.Result, err error) { +func (s *Service) handlePhaseRunning(ctx context.Context, host *infrav1.HetznerBareMetalHost) (res reconcile.Result, err error) { // if host has not been remediated yet, do that now if s.scope.BareMetalRemediation.Status.LastRemediated == nil { if err := s.remediate(ctx, host); err != nil { @@ -147,10 +142,10 @@ func (s *Service) handlePhaseRunning(ctx context.Context, host infrav1.HetznerBa return res, nil } -func (s *Service) remediate(ctx context.Context, host infrav1.HetznerBareMetalHost) error { +func (s *Service) remediate(ctx context.Context, host *infrav1.HetznerBareMetalHost) error { var err error - patchHelper, err := patch.NewHelper(&host, s.scope.Client) + patchHelper, err := patch.NewHelper(host, s.scope.Client) if err != nil { return fmt.Errorf("failed to init patch helper: %s %s/%s %w", host.Kind, host.Namespace, host.Name, err) } @@ -162,7 +157,7 @@ func (s *Service) remediate(ctx context.Context, host infrav1.HetznerBareMetalHo return fmt.Errorf("failed to add reboot annotation: %w", err) } - if err := patchHelper.Patch(ctx, &host); err != nil { + if err := patchHelper.Patch(ctx, host); err != nil { return fmt.Errorf("failed to patch: %s %s/%s %w", host.Kind, host.Namespace, host.Name, err) } @@ -251,26 +246,6 @@ func (s *Service) setOwnerRemediatedConditionToFailed(ctx context.Context, msg s return reconcile.Result{}, nil } -// objectKeyFromAnnotations returns the objectKey from the annotation. Case-1: There is no -// annotation, then the ObjectKey and the error are nil. Case-2: The key exists, but contains a -// syntax error, then an error gets returned. Case-3: there is a valid objectKey. -func objectKeyFromAnnotations(annotations map[string]string) (*client.ObjectKey, error) { - if annotations == nil { - return nil, nil - } - hostKey, ok := annotations[infrav1.HostAnnotation] - if !ok { - return nil, nil - } - - hostNamespace, hostName, err := splitHostKey(hostKey) - if err != nil { - return nil, fmt.Errorf("failed to parse host key %q: %w", hostKey, err) - } - - return &client.ObjectKey{Name: hostName, Namespace: hostNamespace}, nil -} - // addRebootAnnotation sets reboot annotation on unhealthy host. func addRebootAnnotation(annotations map[string]string) (map[string]string, error) { rebootAnnotationArguments := infrav1.RebootAnnotationArguments{Type: infrav1.RebootTypeHardware} @@ -287,11 +262,3 @@ func addRebootAnnotation(annotations map[string]string) (map[string]string, erro annotations[infrav1.RebootAnnotation] = string(b) return annotations, nil } - -func splitHostKey(key string) (namespace, name string, err error) { - parts := strings.Split(key, "/") - if len(parts) != 2 { - return "", "", fmt.Errorf("unexpected host key") - } - return parts[0], parts[1], nil -} diff --git a/pkg/services/baremetal/remediation/remediation_test.go b/pkg/services/baremetal/remediation/remediation_test.go index 0b68889b6..38b5800a4 100644 --- a/pkg/services/baremetal/remediation/remediation_test.go +++ b/pkg/services/baremetal/remediation/remediation_test.go @@ -24,7 +24,6 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "sigs.k8s.io/controller-runtime/pkg/client" infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1" "github.com/syself/cluster-api-provider-hetzner/pkg/scope" @@ -77,96 +76,6 @@ var _ = Describe("Test TimeUntilNextRemediation", func() { ) }) -var _ = Describe("Test ObjectKeyFromAnnotations", func() { - type testCaseObjectKeyFromAnnotations struct { - annotations map[string]string - expectError bool - expectKey *client.ObjectKey - } - - DescribeTable("Test ObjectKeyFromAnnotations", - func(tc testCaseObjectKeyFromAnnotations) { - key, err := objectKeyFromAnnotations(tc.annotations) - - if tc.expectError { - Expect(err).ToNot(BeNil()) - } else { - Expect(err).To(BeNil()) - } - - Expect(key).To(Equal(tc.expectKey)) - }, - Entry("correct host key", testCaseObjectKeyFromAnnotations{ - annotations: map[string]string{ - infrav1.HostAnnotation: "mynamespace/myname", - }, - expectError: false, - expectKey: &client.ObjectKey{Namespace: "mynamespace", Name: "myname"}, - }), - Entry("nil annotations", testCaseObjectKeyFromAnnotations{ - annotations: nil, - expectError: false, - expectKey: nil, - }), - Entry("no host annotation", testCaseObjectKeyFromAnnotations{ - annotations: map[string]string{ - "other-key": "mynamespace/myname", - }, - expectError: false, - expectKey: nil, - }), - Entry("incorrect host key", testCaseObjectKeyFromAnnotations{ - annotations: map[string]string{ - infrav1.HostAnnotation: "mynamespace-myname", - }, - expectError: true, - expectKey: nil, - }), - ) -}) - -var _ = Describe("Test SplitHostKey", func() { - type testCaseSplitHostKey struct { - hostKey string - expectNamespace string - expectName string - expectError bool - } - - DescribeTable("Test SplitHostKey", - func(tc testCaseSplitHostKey) { - ns, name, err := splitHostKey(tc.hostKey) - - Expect(ns).To(Equal(tc.expectNamespace)) - Expect(name).To(Equal(tc.expectName)) - - if tc.expectError { - Expect(err).ToNot(BeNil()) - } else { - Expect(err).To(BeNil()) - } - }, - Entry("correct host key", testCaseSplitHostKey{ - hostKey: "ns/name", - expectNamespace: "ns", - expectName: "name", - expectError: false, - }), - Entry("incorrect host key 1", testCaseSplitHostKey{ - hostKey: "ns-name", - expectNamespace: "", - expectName: "", - expectError: true, - }), - Entry("incorrect host key 2", testCaseSplitHostKey{ - hostKey: "ns/name/other", - expectNamespace: "", - expectName: "", - expectError: true, - }), - ) -}) - var _ = Describe("Test AddRebootAnnotation", func() { type testCaseAddRebootAnnotation struct { annotations map[string]string From 527de14cab9db86f689ef1c2b09128f02ec8ddad Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Thu, 20 Nov 2025 20:12:37 +0100 Subject: [PATCH 16/22] ... --- controllers/hetznerbaremetalhost_controller.go | 5 ++--- pkg/scope/baremetalmachine.go | 5 +++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/controllers/hetznerbaremetalhost_controller.go b/controllers/hetznerbaremetalhost_controller.go index 52a9be333..1c30b1ec7 100644 --- a/controllers/hetznerbaremetalhost_controller.go +++ b/controllers/hetznerbaremetalhost_controller.go @@ -139,9 +139,7 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl if err != nil { log.Error(err, "cache sync failed after BootState change") } - log.Info("Wait for update being in local cache", - "durationWaitForLocalCacheSync", time.Since(startReadOwnWrite).Round(time.Millisecond), - "diff", cmp.Diff(initialHost, bmHost)) + log.Info("Wait for update being in local cache", "durationWaitForLocalCacheSync", time.Since(startReadOwnWrite).Round(time.Millisecond)) }() // End: avoid conflict errors. Wait until local cache is up-to-date // ---------------------------------------------------------------- @@ -231,6 +229,7 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl } log = log.WithValues("HetznerBareMetalMachine", klog.KObj(hetznerBareMetalMachine)) + ctx = ctrl.LoggerInto(ctx, log) remediateConditionOfHbmm := conditions.Get(hetznerBareMetalMachine, infrav1.NoRemediateMachineAnnotationCondition) if remediateConditionOfHbmm != nil && remediateConditionOfHbmm.Status == corev1.ConditionFalse { diff --git a/pkg/scope/baremetalmachine.go b/pkg/scope/baremetalmachine.go index a14de7e88..9af127dd0 100644 --- a/pkg/scope/baremetalmachine.go +++ b/pkg/scope/baremetalmachine.go @@ -164,6 +164,11 @@ func (m *BareMetalMachineScope) GetAssociatedHostAndPatchHelper(ctx context.Cont if err != nil { return nil, nil, err } + + if host == nil { + return nil, nil, nil + } + helper, err := patch.NewHelper(host, m.Client) if err != nil { return nil, nil, fmt.Errorf("failed to create patch helper: %w", err) From 826c0d4d8ab72ade491eafa90f65c4103b2184d8 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Thu, 20 Nov 2025 20:33:33 +0100 Subject: [PATCH 17/22] adapt comments to no longer use failure reason. --- controllers/hetznerbaremetalmachine_controller_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/controllers/hetznerbaremetalmachine_controller_test.go b/controllers/hetznerbaremetalmachine_controller_test.go index 2f39afe23..9595d14aa 100644 --- a/controllers/hetznerbaremetalmachine_controller_test.go +++ b/controllers/hetznerbaremetalmachine_controller_test.go @@ -413,7 +413,7 @@ var _ = Describe("HetznerBareMetalMachineReconciler", func() { }, timeout, time.Second).Should(BeTrue()) }) - It("sets a failure reason when maintenance mode is set on the host", func() { + It("sets RemediateMachineAnnotation when maintenance mode is set on the host", func() { By("making sure that machine is ready") Eventually(func() bool { @@ -433,7 +433,7 @@ var _ = Describe("HetznerBareMetalMachineReconciler", func() { Expect(ph.Patch(ctx, host, patch.WithStatusObservedGeneration{})).To(Succeed()) - By("checking that failure message is set on machine") + By("checking that RemediateMachineAnnotation is set on machine") Eventually(func() error { if err := testEnv.Get(ctx, key, bmMachine); err != nil { From 523dd7bf7649778f79d4c15a301e672d414fc2a9 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Thu, 20 Nov 2025 20:40:48 +0100 Subject: [PATCH 18/22] remove todo. --- controllers/hetznerbaremetalhost_controller.go | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/controllers/hetznerbaremetalhost_controller.go b/controllers/hetznerbaremetalhost_controller.go index 1c30b1ec7..61a7bf1e1 100644 --- a/controllers/hetznerbaremetalhost_controller.go +++ b/controllers/hetznerbaremetalhost_controller.go @@ -244,18 +244,6 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl } conditions.MarkTrue(bmHost, infrav1.NoRemediateMachineAnnotationCondition) - /* TODO: maybe do that later. - if bmHost.Spec.Status.ErrorType == infrav1.FatalError || bmHost.Spec.Status.ErrorType == infrav1.PermanentError { - // HetznerBaremetalMachineReconciler will detect that error on the host and call - // BareMetalMachineScope.SetErrorAndRemediate(). Then CAPI will delete the machine and the - // hbmm. - log.Info("ErrorType is Fatal or Permanent. Not reconciling host", - "errorType", bmHost.Spec.Status.ErrorType, - "errorMessage", bmHost.Spec.Status.ErrorMessage) - return reconcile.Result{}, nil - } - */ - // Get Hetzner robot api credentials secretManager := secretutil.NewSecretManager(log, r.Client, r.APIReader) robotCreds, err := getAndValidateRobotCredentials(ctx, req.Namespace, hetznerCluster, secretManager) From 49a275cb350c93af8edc69e58dfa2cfbee5d7e01 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Thu, 20 Nov 2025 20:43:16 +0100 Subject: [PATCH 19/22] linting. --- controllers/hetznerbaremetalmachine_controller_test.go | 4 ++-- pkg/baremetalutils/baremetalutils.go | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/controllers/hetznerbaremetalmachine_controller_test.go b/controllers/hetznerbaremetalmachine_controller_test.go index 9595d14aa..3c5b7edfc 100644 --- a/controllers/hetznerbaremetalmachine_controller_test.go +++ b/controllers/hetznerbaremetalmachine_controller_test.go @@ -452,11 +452,11 @@ var _ = Describe("HetznerBareMetalMachineReconciler", func() { c := conditions.Get(bmMachine, infrav1.NoRemediateMachineAnnotationCondition) if c == nil { - return fmt.Errorf("Condition NoRemediateMachineAnnotationCondition does not exist.") + return fmt.Errorf("condition NoRemediateMachineAnnotationCondition does not exist") } if c.Status != corev1.ConditionFalse { - return fmt.Errorf("Condition NoRemediateMachineAnnotationCondition should be False") + return fmt.Errorf("condition NoRemediateMachineAnnotationCondition should be False") } return nil diff --git a/pkg/baremetalutils/baremetalutils.go b/pkg/baremetalutils/baremetalutils.go index e034ac4c2..3d6e20b6e 100644 --- a/pkg/baremetalutils/baremetalutils.go +++ b/pkg/baremetalutils/baremetalutils.go @@ -14,6 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ +// Package baremetalutils implements helper functions for working with baremetal. package baremetalutils import ( From 0726e9a59398e1469b02284393f0849683dfd476 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Thu, 20 Nov 2025 20:57:11 +0100 Subject: [PATCH 20/22] make diff to main smaller. --- controllers/csr_controller.go | 2 +- .../hetznerbaremetalmachine_controller.go | 29 ++++-------------- pkg/scope/baremetalmachine.go | 22 -------------- pkg/services/baremetal/baremetal/baremetal.go | 30 ++++++++++++++++--- 4 files changed, 32 insertions(+), 51 deletions(-) diff --git a/controllers/csr_controller.go b/controllers/csr_controller.go index 254f6cde0..5e5cfd726 100644 --- a/controllers/csr_controller.go +++ b/controllers/csr_controller.go @@ -112,7 +112,7 @@ func (r *GuestCSRReconciler) Reconcile(ctx context.Context, req reconcile.Reques if errors.Is(err, errNoHetznerBareMetalMachineByProviderIDFound) { log.Info(fmt.Sprintf("ProviderID not set yet. The hbmm seems to be in 'ensure-provision'. Retrying. %s", err.Error())) - return reconcile.Result{RequeueAfter: 15 * time.Second}, nil + return reconcile.Result{RequeueAfter: 5 * time.Second}, nil } if err != nil { log.Error(err, "could not find an associated bm machine or hcloud machine", diff --git a/controllers/hetznerbaremetalmachine_controller.go b/controllers/hetznerbaremetalmachine_controller.go index 3352365d1..8c02f6741 100644 --- a/controllers/hetznerbaremetalmachine_controller.go +++ b/controllers/hetznerbaremetalmachine_controller.go @@ -23,7 +23,6 @@ import ( "time" "github.com/go-logr/logr" - corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" "k8s.io/klog/v2" @@ -74,20 +73,20 @@ func (r *HetznerBareMetalMachineReconciler) Reconcile(ctx context.Context, req r log = log.WithValues("HetznerBareMetalMachine", klog.KObj(hbmMachine)) // Fetch the Machine. - machine, err := util.GetOwnerMachine(ctx, r.Client, hbmMachine.ObjectMeta) + capiMachine, err := util.GetOwnerMachine(ctx, r.Client, hbmMachine.ObjectMeta) if err != nil { return reconcile.Result{}, fmt.Errorf("failed to get owner machine. BareMetalMachine.ObjectMeta.OwnerReferences %v: %w", hbmMachine.ObjectMeta.OwnerReferences, err) } - if machine == nil { + if capiMachine == nil { log.Info("Machine Controller has not yet set OwnerRef") return reconcile.Result{}, nil } - log = log.WithValues("Machine", klog.KObj(machine)) + log = log.WithValues("Machine", klog.KObj(capiMachine)) // Fetch the Cluster. - cluster, err := util.GetClusterFromMetadata(ctx, r.Client, machine.ObjectMeta) + cluster, err := util.GetClusterFromMetadata(ctx, r.Client, capiMachine.ObjectMeta) if err != nil { log.Info("Machine is missing cluster label or cluster does not exist") return reconcile.Result{}, nil @@ -126,7 +125,7 @@ func (r *HetznerBareMetalMachineReconciler) Reconcile(ctx context.Context, req r machineScope, err := scope.NewBareMetalMachineScope(scope.BareMetalMachineScopeParams{ Client: r.Client, Logger: log, - Machine: machine, + Machine: capiMachine, BareMetalMachine: hbmMachine, HetznerCluster: hetznerCluster, HCloudClient: hcc, @@ -160,32 +159,15 @@ func (r *HetznerBareMetalMachineReconciler) Reconcile(ctx context.Context, req r return r.reconcileDelete(ctx, machineScope) } - _, exist := machine.Annotations[clusterv1.RemediateMachineAnnotation] - if exist { - // This hbmm will be deleted soon. Do not reconcile it. - msg := "CAPI Machine has RemediateMachineAnnotation. Not reconciling this machine." - log.Info(msg) - c := conditions.Get(hbmMachine, infrav1.NoRemediateMachineAnnotationCondition) - if c == nil || c.Status != corev1.ConditionFalse { - // Do not overwrite the message of the condition, if the condition already exists. - conditions.MarkFalse(hbmMachine, infrav1.NoRemediateMachineAnnotationCondition, - infrav1.RemediateMachineAnnotationIsSetReason, clusterv1.ConditionSeverityInfo, "%s", msg) - } - return reconcile.Result{}, nil - } - conditions.MarkTrue(hbmMachine, infrav1.NoRemediateMachineAnnotationCondition) - return r.reconcileNormal(ctx, machineScope) } func (r *HetznerBareMetalMachineReconciler) reconcileDelete(ctx context.Context, machineScope *scope.BareMetalMachineScope) (reconcile.Result, error) { // delete servers - machineScope.Logger.Info("reconcileDelete") result, err := baremetal.NewService(machineScope).Delete(ctx) if err != nil { var requeueError *scope.RequeueAfterError if ok := errors.As(err, &requeueError); ok { - machineScope.Logger.Info("reconcileDelete aaaaaafter", "e", requeueError) return reconcile.Result{Requeue: true, RequeueAfter: requeueError.GetRequeueAfter()}, nil } return reconcile.Result{}, fmt.Errorf("failed to delete servers for HetznerBareMetalMachine %s/%s: %w", @@ -193,7 +175,6 @@ func (r *HetznerBareMetalMachineReconciler) reconcileDelete(ctx context.Context, } emptyResult := reconcile.Result{} if result != emptyResult { - machineScope.Logger.Info("reconcileDelete noooooo empty result", "res", result) return result, nil } // Machine is deleted so remove the finalizer. diff --git a/pkg/scope/baremetalmachine.go b/pkg/scope/baremetalmachine.go index 9af127dd0..fa65b225f 100644 --- a/pkg/scope/baremetalmachine.go +++ b/pkg/scope/baremetalmachine.go @@ -29,7 +29,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1" - "github.com/syself/cluster-api-provider-hetzner/pkg/baremetalutils" hcloudclient "github.com/syself/cluster-api-provider-hetzner/pkg/services/hcloud/client" ) @@ -155,24 +154,3 @@ func (m *BareMetalMachineScope) SetErrorAndRemediate(ctx context.Context, messag infrav1.RemediateMachineAnnotationIsSetReason, clusterv1.ConditionSeverityInfo, "%s", message) return nil } - -// GetAssociatedHostAndPatchHelper gets the associated host by looking for an annotation on the -// machine that contains a reference to the host. Returns nil if not found. Assumes the host is in -// the same namespace as the machine. Additionally, a PatchHelper gets returned. -func (m *BareMetalMachineScope) GetAssociatedHostAndPatchHelper(ctx context.Context) (*infrav1.HetznerBareMetalHost, *patch.Helper, error) { - host, err := baremetalutils.GetAssociatedHost(ctx, m.Client, m.BareMetalMachine) - if err != nil { - return nil, nil, err - } - - if host == nil { - return nil, nil, nil - } - - helper, err := patch.NewHelper(host, m.Client) - if err != nil { - return nil, nil, fmt.Errorf("failed to create patch helper: %w", err) - } - - return host, helper, nil -} diff --git a/pkg/services/baremetal/baremetal/baremetal.go b/pkg/services/baremetal/baremetal/baremetal.go index d9ec3344b..dcd1c795c 100644 --- a/pkg/services/baremetal/baremetal/baremetal.go +++ b/pkg/services/baremetal/baremetal/baremetal.go @@ -46,6 +46,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1" + "github.com/syself/cluster-api-provider-hetzner/pkg/baremetalutils" "github.com/syself/cluster-api-provider-hetzner/pkg/scope" hcloudutil "github.com/syself/cluster-api-provider-hetzner/pkg/services/hcloud/util" "github.com/syself/cluster-api-provider-hetzner/pkg/utils" @@ -130,7 +131,7 @@ func (s *Service) Reconcile(ctx context.Context) (res reconcile.Result, err erro // Delete implements delete method of bare metal machine. func (s *Service) Delete(ctx context.Context) (reconcile.Result, error) { // get host - ignore if not found - host, helper, err := s.scope.GetAssociatedHostAndPatchHelper(ctx) + host, helper, err := s.getAssociatedHostAndPatchHelper(ctx) if err != nil && !apierrors.IsNotFound(err) { return reconcile.Result{}, fmt.Errorf("failed to get associated host: %w", err) } @@ -204,7 +205,7 @@ func (s *Service) Delete(ctx context.Context) (reconcile.Result, error) { // update updates a machine and is invoked by the Machine Controller. func (s *Service) update(ctx context.Context) error { - host, helper, err := s.scope.GetAssociatedHostAndPatchHelper(ctx) + host, helper, err := s.getAssociatedHostAndPatchHelper(ctx) if err != nil { return fmt.Errorf("failed to get host: %w", err) } @@ -278,7 +279,7 @@ func (s *Service) update(ctx context.Context) error { func (s *Service) associate(ctx context.Context) error { // look for associated host - associatedHost, _, err := s.scope.GetAssociatedHostAndPatchHelper(ctx) + associatedHost, _, err := s.getAssociatedHostAndPatchHelper(ctx) if err != nil { return fmt.Errorf("failed to get associated host: %w", err) } @@ -347,6 +348,27 @@ func (s *Service) associate(ctx context.Context) error { return nil } +// getAssociatedHostAndPatchHelper gets the associated host by looking for an annotation on the +// machine that contains a reference to the host. Returns nil if not found. Assumes the host is in +// the same namespace as the machine. Additionally, a PatchHelper gets returned. +func (s *Service) getAssociatedHostAndPatchHelper(ctx context.Context) (*infrav1.HetznerBareMetalHost, *patch.Helper, error) { + host, err := baremetalutils.GetAssociatedHost(ctx, s.scope.Client, s.scope.BareMetalMachine) + if err != nil { + return nil, nil, err + } + + if host == nil { + return nil, nil, nil + } + + helper, err := patch.NewHelper(host, s.scope.Client) + if err != nil { + return nil, nil, fmt.Errorf("failed to create patch helper: %w", err) + } + + return host, helper, nil +} + // ChooseHost tries to find a free hbmh. // If no hbmh was found, then hbmh and err are nil, and the string // "reason" contains human readable details. @@ -626,7 +648,7 @@ func (s *Service) setProviderID(ctx context.Context) error { } // providerID is based on the ID of the host machine - host, _, err := s.scope.GetAssociatedHostAndPatchHelper(ctx) + host, _, err := s.getAssociatedHostAndPatchHelper(ctx) if err != nil { return fmt.Errorf("failed to get host: %w", err) } From ef947365731e1689d52452cc4dcabb8e59952602 Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Thu, 20 Nov 2025 21:09:22 +0100 Subject: [PATCH 21/22] tiny changes. --- pkg/services/baremetal/baremetal/baremetal.go | 2 +- pkg/services/baremetal/host/host.go | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/pkg/services/baremetal/baremetal/baremetal.go b/pkg/services/baremetal/baremetal/baremetal.go index dcd1c795c..5a9f8e638 100644 --- a/pkg/services/baremetal/baremetal/baremetal.go +++ b/pkg/services/baremetal/baremetal/baremetal.go @@ -658,7 +658,7 @@ func (s *Service) setProviderID(ctx context.Context) error { if err != nil { return err } - return fmt.Errorf("host not found for machine %s: %w", s.scope.Machine.Name, err) + return fmt.Errorf("host not found for machine %q", s.scope.Machine.Name) } if host.Spec.Status.ProvisioningState != infrav1.StateProvisioned { diff --git a/pkg/services/baremetal/host/host.go b/pkg/services/baremetal/host/host.go index 54a2f7019..2afede1ed 100644 --- a/pkg/services/baremetal/host/host.go +++ b/pkg/services/baremetal/host/host.go @@ -2187,10 +2187,7 @@ func (s *Service) actionProvisioned(ctx context.Context) actionResult { } // next: None -func (s *Service) actionDeprovisioning(ctx context.Context) actionResult { - logger := ctrl.LoggerFrom(ctx) - logger.Info("actionDeprovisioning actionDeprovisioning actionDeprovisioning actionDeprovisioning actionDeprovisioning") - +func (s *Service) actionDeprovisioning(_ context.Context) actionResult { // Update name in robot API if _, err := s.scope.RobotClient.SetBMServerName( s.scope.HetznerBareMetalHost.Spec.ServerID, From 12cf08b2d153aa4cd9529485f16273307b257b9c Mon Sep 17 00:00:00 2001 From: Thomas Guettler Date: Thu, 20 Nov 2025 21:49:54 +0100 Subject: [PATCH 22/22] :seedling: set GITHUB_TOKEN, so that Lychee does not get rate-limited. --- .github/workflows/pr-lint.yml | 2 ++ Makefile | 1 + 2 files changed, 3 insertions(+) diff --git a/.github/workflows/pr-lint.yml b/.github/workflows/pr-lint.yml index 08b7d333b..a26c6235a 100644 --- a/.github/workflows/pr-lint.yml +++ b/.github/workflows/pr-lint.yml @@ -44,6 +44,8 @@ jobs: # if there's a diff then the workflow will exit here. - name: Run make verify run: make BUILD_IN_CONTAINER=false verify + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Lint Golang Code run: make BUILD_IN_CONTAINER=false lint-golang-ci diff --git a/Makefile b/Makefile index 72887dc58..0c3e3e303 100644 --- a/Makefile +++ b/Makefile @@ -762,6 +762,7 @@ ifeq ($(BUILD_IN_CONTAINER),true) $(BUILDER_IMAGE):$(BUILDER_IMAGE_VERSION) $@; else @lychee --version + @if [ -z "$${GITHUB_TOKEN}" ]; then echo "GITHUB_TOKEN is not set"; exit 1; fi lychee --verbose --config .lychee.toml ./*.md ./docs/**/*.md 2>&1 | grep -vP '\[(200|EXCLUDED)\]' endif