Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions api/v1beta1/hcloudmachine_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,17 +117,17 @@ type HCloudMachineStatus struct {
// +optional
InstanceState *hcloud.ServerStatus `json:"instanceState,omitempty"`

// FailureReason will be set in the event that there is a terminal problem
// FailureReason will be set in the event that there is a terminal problem
// reconciling the Machine and will contain a succinct value suitable
// for machine interpretation.
// +optional
FailureReason *capierrors.MachineStatusError `json:"failureReason,omitempty"`
FailureReasonpppp *capierrors.MachineStatusError `json:"failureReason,omitempty"`

// FailureMessage will be set in the event that there is a terminal problem
// FailureMessage will be set in the event that there is a terminal problem
// reconciling the Machine and will contain a more verbose string suitable
// for logging and human consumption.
// +optional
FailureMessage *string `json:"failureMessage,omitempty"`
FailureMessagepppp *string `json:"failureMessage,omitempty"`

// Conditions define the current service state of the HCloudMachine.
// +optional
Expand Down
8 changes: 4 additions & 4 deletions api/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -259,13 +259,13 @@ spec:
type: object
failureMessage:
description: |-
FailureMessage will be set in the event that there is a terminal problem
FailureMessage will be set in the event that there is a terminal problem
reconciling the Machine and will contain a more verbose string suitable
for logging and human consumption.
type: string
failureReason:
description: |-
FailureReason will be set in the event that there is a terminal problem
FailureReason will be set in the event that there is a terminal problem
reconciling the Machine and will contain a succinct value suitable
for machine interpretation.
type: string
Expand Down
17 changes: 15 additions & 2 deletions controllers/hcloudmachine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (

"github.com/go-logr/logr"
"github.com/google/go-cmp/cmp"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/klog/v2"
Expand All @@ -45,6 +46,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/predicate"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

"github.com/syself/cluster-api-provider-hetzner/api/v1beta1"
infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1"
"github.com/syself/cluster-api-provider-hetzner/pkg/scope"
secretutil "github.com/syself/cluster-api-provider-hetzner/pkg/secrets"
Expand Down Expand Up @@ -233,10 +235,21 @@ func (r *HCloudMachineReconciler) Reconcile(ctx context.Context, req reconcile.R
return r.reconcileDelete(ctx, machineScope)
}

if hcloudMachine.Status.FailureReason != nil {
// This machine will be removed.
_, exists := machine.Annotations[clusterv1.RemediateMachineAnnotation]
if exists {
// This hbmm will be deleted soon. Do no reconcile it.
msg := "CAPI Machine has RemediateMachineAnnotation. Not reconciling this machine."
log.Info(msg)
c := conditions.Get(hcloudMachine, v1beta1.NoRemediateMachineAnnotationCondition)
if c == nil || c.Status != corev1.ConditionFalse {
// Do not overwrite the message of the condition, if the condition already exists.
conditions.MarkFalse(hcloudMachine, v1beta1.NoRemediateMachineAnnotationCondition,
v1beta1.RemediateMachineAnnotationIsSetReason, clusterv1.ConditionSeverityInfo, "%s", msg)
}
return reconcile.Result{}, nil
}
conditions.MarkTrue(hcloudMachine, v1beta1.NoRemediateMachineAnnotationCondition)

return r.reconcileNormal(ctx, machineScope)
}

Expand Down
40 changes: 31 additions & 9 deletions pkg/scope/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,14 @@ import (
"time"

"k8s.io/apimachinery/pkg/types"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
capierrors "sigs.k8s.io/cluster-api/errors" //nolint:staticcheck // we will handle that, when we update to capi v1.11
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" //nolint:staticcheck // we will handle that, when we update to capi v1.11
"sigs.k8s.io/cluster-api/util"
"sigs.k8s.io/cluster-api/util/conditions"
"sigs.k8s.io/cluster-api/util/patch"
"sigs.k8s.io/cluster-api/util/record"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/syself/cluster-api-provider-hetzner/api/v1beta1"
infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1"
secretutil "github.com/syself/cluster-api-provider-hetzner/pkg/secrets"
sshclient "github.com/syself/cluster-api-provider-hetzner/pkg/services/baremetal/client/ssh"
Expand Down Expand Up @@ -126,13 +128,33 @@ func (m *MachineScope) PatchObject(ctx context.Context) error {
return m.patchHelper.Patch(ctx, m.HCloudMachine)
}

// SetError sets the ErrorMessage and ErrorReason fields on the machine and logs
// the message. It assumes the reason is invalid configuration, since that is
// currently the only relevant MachineStatusError choice.
// CAPI will delete the machine and create a new one.
func (m *MachineScope) SetError(message string, reason capierrors.MachineStatusError) {
m.HCloudMachine.Status.FailureMessage = &message
m.HCloudMachine.Status.FailureReason = &reason
// SetErrorAndRemediate sets the "cluster.x-k8s.io/remediate-machine" annotation on the
// corresponding CAPI Machine so that CAPI remediates (deletes and recreates) it.
// Additionally, a Warning event is recorded on the HCloudMachine and the
// NoRemediateMachineAnnotationCondition is marked false with the given message.
// It returns an error only if patching the Machine object fails.
func (m *MachineScope) SetErrorAndRemediate(ctx context.Context, message string) error {
	obj := m.Machine

	// Base for a merge patch so that only the annotation diff is sent to the
	// API server. Named patchBase to avoid shadowing the imported "patch" package.
	patchBase := client.MergeFrom(obj.DeepCopy())

	// Modify only annotations on the in-memory copy.
	if obj.Annotations == nil {
		obj.Annotations = map[string]string{}
	}
	obj.Annotations[clusterv1.RemediateMachineAnnotation] = ""

	if err := m.Client.Patch(ctx, obj, patchBase); err != nil {
		return err
	}

	// record.Warnf expects (object, reason, format, args...). Without an explicit
	// reason the format string would end up in the event's Reason field.
	record.Warnf(m.HCloudMachine, "RemediateMachineAnnotationSet",
		"HCloudMachine will be remediated: %s", message)

	conditions.MarkFalse(m.HCloudMachine, v1beta1.NoRemediateMachineAnnotationCondition,
		v1beta1.RemediateMachineAnnotationIsSetReason, clusterv1.ConditionSeverityInfo, "%s",
		message)

	return nil
}

// SetRegion sets the region field on the machine.
Expand Down
32 changes: 17 additions & 15 deletions pkg/services/hcloud/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,7 @@ import (
"k8s.io/apimachinery/pkg/types"
"k8s.io/utils/ptr"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
capierrors "sigs.k8s.io/cluster-api/errors" //nolint:staticcheck // we will handle that, when we update to capi v1.11
controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" //nolint:staticcheck // we will handle that, when we update to capi v1.11
"sigs.k8s.io/cluster-api/util/conditions"
"sigs.k8s.io/cluster-api/util/record"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
Expand Down Expand Up @@ -122,7 +121,10 @@ func (s *Service) Reconcile(ctx context.Context) (res reconcile.Result, err erro
"BootStateSince", s.scope.HCloudMachine.Status.BootStateSince,
)

s.scope.SetError(msg, capierrors.CreateMachineError)
err := s.scope.SetErrorAndRemediate(ctx, msg)
if err != nil {
return reconcile.Result{}, err
}
s.scope.HCloudMachine.SetBootState(infrav1.HCloudBootStateUnset)
record.Warn(s.scope.HCloudMachine, "NoHCloudServerFound", msg)
conditions.MarkFalse(s.scope.HCloudMachine, infrav1.ServerAvailableCondition,
Expand Down Expand Up @@ -166,7 +168,7 @@ func (s *Service) handleBootStateUnset(ctx context.Context) (reconcile.Result, e
// timeout. Something has failed.
msg := fmt.Sprintf("handleBootStateUnset timed out after %s. Deleting machine",
durationOfState.Round(time.Second).String())
s.scope.SetError(msg, capierrors.CreateMachineError)
s.scope.SetErrorAndRemediate(ctx, msg)
s.scope.Logger.Error(nil, msg)
conditions.MarkFalse(hm, infrav1.ServerAvailableCondition,
"HandleBootStateUnsetTimedOut", clusterv1.ConditionSeverityWarning,
Expand Down Expand Up @@ -235,7 +237,7 @@ func (s *Service) handleBootStateInitializing(ctx context.Context, server *hclou
// timeout. Something has failed.
msg := fmt.Sprintf("handleBootStateInitializing timed out after %s. Deleting machine",
durationOfState.Round(time.Second).String())
s.scope.SetError(msg, capierrors.CreateMachineError)
s.scope.SetErrorAndRemediate(ctx, msg)
s.scope.Logger.Error(nil, msg)
conditions.MarkFalse(hm, infrav1.ServerAvailableCondition,
"BootStateInitializingTimedOut", clusterv1.ConditionSeverityWarning,
Expand Down Expand Up @@ -327,7 +329,7 @@ func (s *Service) handleBootStateEnablingRescue(ctx context.Context, server *hcl
msg := fmt.Sprintf("handleBootStateEnablingRescue timed out after %s. Deleting machine",
durationOfState.Round(time.Second).String())
s.scope.Logger.Error(nil, msg)
s.scope.SetError(msg, capierrors.CreateMachineError)
s.scope.SetErrorAndRemediate(ctx, msg)
conditions.MarkFalse(hm, infrav1.ServerAvailableCondition,
"EnablingRescueTimedOut", clusterv1.ConditionSeverityWarning, "%s", msg)
return reconcile.Result{}, nil
Expand All @@ -338,7 +340,7 @@ func (s *Service) handleBootStateEnablingRescue(ctx context.Context, server *hcl
if hm.Status.ExternalIDs.ActionIDEnableRescueSystem == 0 {
msg := "handleBootStateEnablingRescue ActionIdEnableRescueSystem not set? Can not continue. Provisioning Failed"
s.scope.Logger.Error(nil, msg)
s.scope.SetError(msg, capierrors.CreateMachineError)
s.scope.SetErrorAndRemediate(ctx, msg)
conditions.MarkFalse(hm, infrav1.ServerAvailableCondition,
"ActionIDForEnablingRescueSystemNotSet", clusterv1.ConditionSeverityWarning, "%s", msg)
return reconcile.Result{}, nil
Expand Down Expand Up @@ -369,7 +371,7 @@ func (s *Service) handleBootStateEnablingRescue(ctx context.Context, server *hcl
if err != nil {
err = fmt.Errorf("action %+v failed (wait for rescue enabled): %w", action, err)
s.scope.Logger.Error(err, "")
s.scope.SetError(err.Error(), capierrors.CreateMachineError)
s.scope.SetErrorAndRemediate(ctx, err.Error())
conditions.MarkFalse(hm, infrav1.ServerAvailableCondition,
"EnablingRescueActionFailed", clusterv1.ConditionSeverityWarning,
"%s", err.Error())
Expand Down Expand Up @@ -451,7 +453,7 @@ func (s *Service) handleBootStateBootingToRescue(ctx context.Context, server *hc
// timeout. Something has failed.
msg := fmt.Sprintf("reaching rescue system has timed out after %s. Deleting machine",
durationOfState.Round(time.Second).String())
s.scope.SetError(msg, capierrors.CreateMachineError)
s.scope.SetErrorAndRemediate(ctx, msg)
s.scope.Logger.Error(nil, msg)
conditions.MarkFalse(hm, infrav1.ServerAvailableCondition,
"BootingToRescueTimedOut", clusterv1.ConditionSeverityWarning,
Expand Down Expand Up @@ -500,7 +502,7 @@ func (s *Service) handleBootStateBootingToRescue(ctx context.Context, server *hc
if remoteHostName != "rescue" {
msg := fmt.Sprintf("Remote hostname (via ssh) of hcloud server is %q. Expected 'rescue'. Deleting hcloud machine", remoteHostName)
s.scope.Logger.Error(nil, msg)
s.scope.SetError(msg, capierrors.CreateMachineError)
s.scope.SetErrorAndRemediate(ctx, msg)
conditions.MarkFalse(hm, infrav1.ServerAvailableCondition,
"UnexpectedHostname", clusterv1.ConditionSeverityWarning,
"%s", msg)
Expand Down Expand Up @@ -534,7 +536,7 @@ func (s *Service) handleBootStateBootingToRescue(ctx context.Context, server *hc
"ImageURLCommand", s.scope.ImageURLCommand,
"exitStatus", exitStatus,
"stdoutStderr", stdoutStderr)
s.scope.SetError(msg, capierrors.CreateMachineError)
s.scope.SetErrorAndRemediate(ctx, msg)
conditions.MarkFalse(hm, infrav1.ServerAvailableCondition,
"StartImageURLCommandNoZeroExitCode", clusterv1.ConditionSeverityWarning,
"%s", msg)
Expand Down Expand Up @@ -571,7 +573,7 @@ func (s *Service) handleBootStateRunningImageCommand(ctx context.Context, server
durationOfState.Round(time.Second).String())
err = errors.New(msg)
s.scope.Logger.Error(err, "", "logFile", logFile)
s.scope.SetError(msg, capierrors.CreateMachineError)
s.scope.SetErrorAndRemediate(ctx, msg)
record.Warn(hm, "ImageURLCommandFailed", logFile)
conditions.MarkFalse(hm, infrav1.ServerAvailableCondition,
"RunningImageCommandTimedOut", clusterv1.ConditionSeverityWarning,
Expand Down Expand Up @@ -607,7 +609,7 @@ func (s *Service) handleBootStateRunningImageCommand(ctx context.Context, server
msg := "ImageURLCommand failed. Deleting machine"
err = errors.New(msg)
s.scope.Logger.Error(err, "", "logFile", logFile)
s.scope.SetError(msg, capierrors.CreateMachineError)
s.scope.SetErrorAndRemediate(ctx, msg)
conditions.MarkFalse(hm, infrav1.ServerAvailableCondition,
"ImageCommandFailed", clusterv1.ConditionSeverityWarning,
"%s", msg)
Expand All @@ -632,7 +634,7 @@ func (s *Service) handleBootingToRealOS(ctx context.Context, server *hcloud.Serv
// timeout. Something has failed.
msg := fmt.Sprintf("handleBootingToRealOS timed out after %s. Deleting machine",
durationOfState.Round(time.Second).String())
s.scope.SetError(msg, capierrors.CreateMachineError)
s.scope.SetErrorAndRemediate(ctx, msg)
s.scope.Logger.Error(nil, msg)
conditions.MarkFalse(hm, infrav1.ServerAvailableCondition,
"BootingToRealOSTimedOut", clusterv1.ConditionSeverityWarning,
Expand Down Expand Up @@ -1212,7 +1214,7 @@ func (s *Service) handleServerStatusOff(ctx context.Context, server *hcloud.Serv
}
} else {
// Timed out. Set failure reason
s.scope.SetError("reached timeout of waiting for machines that are switched off", capierrors.CreateMachineError)
s.scope.SetErrorAndRemediate(ctx, "reached timeout of waiting for machines that are switched off")
return res, nil
}
} else {
Expand Down
22 changes: 22 additions & 0 deletions pkg/services/hcloud/server/server_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package server

import (
"bytes"
"context"
"encoding/json"
"testing"

Expand All @@ -26,10 +27,13 @@ import (
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/kubectl/pkg/scheme"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client/fake"

infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1"
"github.com/syself/cluster-api-provider-hetzner/pkg/scope"
Expand Down Expand Up @@ -241,12 +245,30 @@ func newTestServer() *hcloud.Server {
}

// newTestService builds a Service whose MachineScope is backed by a fake
// controller-runtime client. A CAPI Machine with the same name/namespace as
// the HCloudMachine is pre-created in the fake client so that code paths which
// patch the Machine (e.g. SetErrorAndRemediate) can succeed.
func newTestService(hcloudMachine *infrav1.HCloudMachine, hcloudClient hcloudclient.Client) *Service {
	// testScheme avoids shadowing the imported "scheme" package.
	testScheme := runtime.NewScheme()
	utilruntime.Must(infrav1.AddToScheme(testScheme))
	utilruntime.Must(clusterv1.AddToScheme(testScheme))
	fakeClient := fake.NewClientBuilder().WithScheme(testScheme).Build()
	machine := &clusterv1.Machine{
		ObjectMeta: metav1.ObjectMeta{
			Name:      hcloudMachine.Name,
			Namespace: hcloudMachine.Namespace,
		},
	}
	// Test helper: a failure to seed the fake client is a programmer error.
	if err := fakeClient.Create(context.Background(), machine); err != nil {
		panic(err)
	}
	return &Service{
		&scope.MachineScope{
			HCloudMachine: hcloudMachine,
			ClusterScope: scope.ClusterScope{
				HCloudClient: hcloudClient,
				Client:       fakeClient,
			},
			Machine: machine,
		},
	}
}
Loading
Loading