kubernetes-sigs
diff --git a/‎controlplane/kubeadm/api/v1alpha3/conversion.go
Lines changed: 7 additions & 0 deletions b/‎controlplane/kubeadm/api/v1alpha3/conversion.go
Lines changed: 7 additions & 0 deletions
diff --git a/‎controlplane/kubeadm/api/v1alpha3/zz_generated.conversion.go
Lines changed: 2 additions & 0 deletions b/‎controlplane/kubeadm/api/v1alpha3/zz_generated.conversion.go
Lines changed: 2 additions & 0 deletions
diff --git a/‎controlplane/kubeadm/api/v1alpha4/conversion.go
Lines changed: 17 additions & 0 deletions b/‎controlplane/kubeadm/api/v1alpha4/conversion.go
Lines changed: 17 additions & 0 deletions
diff --git a/‎controlplane/kubeadm/api/v1alpha4/zz_generated.conversion.go
Lines changed: 7 additions & 10 deletions b/‎controlplane/kubeadm/api/v1alpha4/zz_generated.conversion.go
Lines changed: 7 additions & 10 deletions
diff --git a/‎controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_types.go
Lines changed: 86 additions & 0 deletions b/‎controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_types.go
Lines changed: 86 additions & 0 deletions
diff --git a/‎controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_webhook.go
Lines changed: 2 additions & 0 deletions b/‎controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_webhook.go
Lines changed: 2 additions & 0 deletions
diff --git a/‎controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_webhook_test.go
Lines changed: 5 additions & 0 deletions b/‎controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_webhook_test.go
Lines changed: 5 additions & 0 deletions
diff --git a/‎controlplane/kubeadm/api/v1beta1/kubeadmcontrolplanetemplate_types.go
Lines changed: 4 additions & 0 deletions b/‎controlplane/kubeadm/api/v1beta1/kubeadmcontrolplanetemplate_types.go
Lines changed: 4 additions & 0 deletions
diff --git a/‎controlplane/kubeadm/api/v1beta1/zz_generated.deepcopy.go
Lines changed: 57 additions & 0 deletions b/‎controlplane/kubeadm/api/v1beta1/zz_generated.deepcopy.go
Lines changed: 57 additions & 0 deletions
@@ -99,6 +99,13 @@ func (src *KubeadmControlPlane) ConvertTo(dstRaw conversion.Hub) error {
 		dst.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy = restored.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy
 	}
 
+	if restored.Spec.RemediationStrategy != nil {
+		dst.Spec.RemediationStrategy = restored.Spec.RemediationStrategy
+	}
+	if restored.Status.LastRemediation != nil {
+		dst.Status.LastRemediation = restored.Status.LastRemediation
+	}
+
 	return nil
 }
 
 
@@ -84,6 +84,13 @@ func (src *KubeadmControlPlane) ConvertTo(dstRaw conversion.Hub) error {
 		dst.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy = restored.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy
 	}
 
+	if restored.Spec.RemediationStrategy != nil {
+		dst.Spec.RemediationStrategy = restored.Spec.RemediationStrategy
+	}
+	if restored.Status.LastRemediation != nil {
+		dst.Status.LastRemediation = restored.Status.LastRemediation
+	}
+
 	return nil
 }
 
@@ -173,6 +180,10 @@ func (src *KubeadmControlPlaneTemplate) ConvertTo(dstRaw conversion.Hub) error {
 		dst.Spec.Template.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy = restored.Spec.Template.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy
 	}
 
+	if restored.Spec.Template.Spec.RemediationStrategy != nil {
+		dst.Spec.Template.Spec.RemediationStrategy = restored.Spec.Template.Spec.RemediationStrategy
+	}
+
 	return nil
 }
 
@@ -262,5 +273,11 @@ func Convert_v1beta1_KubeadmControlPlaneMachineTemplate_To_v1alpha4_KubeadmContr
 
 func Convert_v1beta1_KubeadmControlPlaneSpec_To_v1alpha4_KubeadmControlPlaneSpec(in *controlplanev1.KubeadmControlPlaneSpec, out *KubeadmControlPlaneSpec, scope apiconversion.Scope) error {
 	// .RolloutBefore was added in v1beta1.
+	// .RemediationStrategy was added in v1beta1.
 	return autoConvert_v1beta1_KubeadmControlPlaneSpec_To_v1alpha4_KubeadmControlPlaneSpec(in, out, scope)
 }
+
+func Convert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha4_KubeadmControlPlaneStatus(in *controlplanev1.KubeadmControlPlaneStatus, out *KubeadmControlPlaneStatus, scope apiconversion.Scope) error {
+	// .LastRemediation was added in v1beta1.
+	return autoConvert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha4_KubeadmControlPlaneStatus(in, out, scope)
+}
@@ -17,6 +17,8 @@ limitations under the License.
 package v1beta1
 
 import (
+	"time"
+
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/intstr"
@@ -49,6 +51,23 @@ const (
 	// KubeadmClusterConfigurationAnnotation is a machine annotation that stores the json-marshalled string of KCP ClusterConfiguration.
 	// This annotation is used to detect any changes in ClusterConfiguration and trigger machine rollout in KCP.
 	KubeadmClusterConfigurationAnnotation = "controlplane.cluster.x-k8s.io/kubeadm-cluster-configuration"
+
+	// RemediationInProgressAnnotation is used to keep track that a KCP remediation is in progress, and more
+	// specifically it tracks that the system is in between having deleted an unhealthy machine and recreating its replacement.
+	// NOTE: if something external to CAPI removes this annotation the system cannot detect the above situation; this can lead to
+	// failures in updating remediation retry or remediation count (both counters restart from zero).
+	RemediationInProgressAnnotation = "controlplane.cluster.x-k8s.io/remediation-in-progress"
+
+	// RemediationForAnnotation is used to link a new machine to the unhealthy machine it is replacing;
+	// please note that in case of retry, when also the remediating machine fails, the system keeps track of
+	// the first machine of the sequence only.
+	// NOTE: if something external to CAPI removes this annotation, this can lead to
+	// failures in updating remediation retry (the counter restarts from zero).
+	RemediationForAnnotation = "controlplane.cluster.x-k8s.io/remediation-for"
+
+	// DefaultMinHealthyPeriod defines the default minimum period before we consider a remediation on a
+	// machine unrelated from the previous remediation.
+	DefaultMinHealthyPeriod = 1 * time.Hour
 )
 
 // KubeadmControlPlaneSpec defines the desired state of KubeadmControlPlane.
@@ -91,6 +110,10 @@ type KubeadmControlPlaneSpec struct {
 	// +optional
 	// +kubebuilder:default={type: "RollingUpdate", rollingUpdate: {maxSurge: 1}}
 	RolloutStrategy *RolloutStrategy `json:"rolloutStrategy,omitempty"`
+
+	// The RemediationStrategy that controls how control plane machine remediation happens.
+	// +optional
+	RemediationStrategy *RemediationStrategy `json:"remediationStrategy,omitempty"`
 }
 
 // KubeadmControlPlaneMachineTemplate defines the template for Machines
@@ -158,6 +181,50 @@ type RollingUpdate struct {
 	MaxSurge *intstr.IntOrString `json:"maxSurge,omitempty"`
 }
 
+// RemediationStrategy allows to define how control plane machine remediation happens.
+type RemediationStrategy struct {
+	// MaxRetry is the maximum number of retries while attempting to remediate an unhealthy machine.
+	// A retry happens when a machine that was created as a replacement for an unhealthy machine also fails.
+	// For example, given a control plane with three machines M1, M2, M3:
+	//
+	//	M1 becomes unhealthy; remediation happens, and M1-1 is created as a replacement.
+	//	If M1-1 (replacement of M1) has problems while bootstrapping it will become unhealthy, and then be
+	//	remediated; such operation is considered a retry, remediation-retry #1.
+	//	If M1-2 (replacement of M1-1) becomes unhealthy, remediation-retry #2 will happen, etc.
+	//
+	// A retry could happen only after RetryPeriod from the previous retry.
+	// If a machine is marked as unhealthy after MinHealthyPeriod from the previous remediation expired,
+	// this is not considered a retry anymore because the new issue is assumed unrelated from the previous one.
+	//
+	// If not set, the remediation will be retried infinitely.
+	// +optional
+	MaxRetry *int32 `json:"maxRetry,omitempty"`
+
+	// RetryPeriod is the duration that KCP should wait before remediating a machine being created as a replacement
+	// for an unhealthy machine (a retry).
+	//
+	// If not set, a retry will happen immediately.
+	// +optional
+	RetryPeriod metav1.Duration `json:"retryPeriod,omitempty"`
+
+	// MinHealthyPeriod defines the duration after which KCP will consider any failure to a machine unrelated
+	// from the previous one. In this case the remediation is not considered a retry anymore, and thus the retry
+	// counter restarts from 0. For example, assuming MinHealthyPeriod is set to 1h (default):
+	//
+	//	M1 becomes unhealthy; remediation happens, and M1-1 is created as a replacement.
+	//	If M1-1 (replacement of M1) has problems within 1h of its creation, this machine
+	//	will also be remediated, and this operation is considered a retry (a problem related
+	//	to the original issue that happened to M1).
+	//
+	//	If instead the problem on M1-1 happens after MinHealthyPeriod has expired, e.g. four days after
+	//	M1-1 has been created as a remediation of M1, the problem on M1-1 is considered unrelated to
+	//	the original issue that happened to M1.
+	//
+	// If not set, this value is defaulted to 1h.
+	// +optional
+	MinHealthyPeriod *metav1.Duration `json:"minHealthyPeriod,omitempty"`
+}
+
 // KubeadmControlPlaneStatus defines the observed state of KubeadmControlPlane.
 type KubeadmControlPlaneStatus struct {
 	// Selector is the label selector in string format to avoid introspection
@@ -223,6 +290,25 @@ type KubeadmControlPlaneStatus struct {
 	// Conditions defines current service state of the KubeadmControlPlane.
 	// +optional
 	Conditions clusterv1.Conditions `json:"conditions,omitempty"`
+
+	// LastRemediation stores info about last remediation performed.
+	// +optional
+	LastRemediation *LastRemediationStatus `json:"lastRemediation,omitempty"`
+}
+
+// LastRemediationStatus stores info about last remediation performed.
+// NOTE: if for any reason information about last remediation is lost, RetryCount is going to restart from 0 and thus
+// more remediations than expected might happen.
+type LastRemediationStatus struct {
+	// Machine is the machine name of the latest machine being remediated.
+	Machine string `json:"machine"`
+
+	// Timestamp is when last remediation happened. It is represented in RFC3339 form and is in UTC.
+	Timestamp metav1.Time `json:"timestamp"`
+
+	// RetryCount is used to keep track of remediation retry for the last remediated machine.
+	// A retry happens when a machine that was created as a replacement for an unhealthy machine also fails.
+	RetryCount int32 `json:"retryCount"`
 }
 
 // +kubebuilder:object:root=true
 
@@ -181,6 +181,8 @@ func (in *KubeadmControlPlane) ValidateUpdate(old runtime.Object) error {
 		{spec, "machineTemplate", "nodeDeletionTimeout"},
 		{spec, "replicas"},
 		{spec, "version"},
+		{spec, "remediationStrategy"},
+		{spec, "remediationStrategy", "*"},
 		{spec, "rolloutAfter"},
 		{spec, "rolloutBefore"},
 		{spec, "rolloutBefore", "*"},
 
@@ -410,6 +410,11 @@ func TestKubeadmControlPlaneValidateUpdate(t *testing.T) {
 	validUpdate.Spec.RolloutBefore = &RolloutBefore{
 		CertificatesExpiryDays: pointer.Int32(14),
 	}
+	validUpdate.Spec.RemediationStrategy = &RemediationStrategy{
+		MaxRetry:         pointer.Int32(50),
+		MinHealthyPeriod: &metav1.Duration{Duration: 10 * time.Hour},
+		RetryPeriod:      metav1.Duration{Duration: 10 * time.Minute},
+	}
 	validUpdate.Spec.KubeadmConfigSpec.Format = bootstrapv1.CloudConfig
 
 	scaleToZero := before.DeepCopy()
 
@@ -91,6 +91,10 @@ type KubeadmControlPlaneTemplateResourceSpec struct {
 	// +optional
 	// +kubebuilder:default={type: "RollingUpdate", rollingUpdate: {maxSurge: 1}}
 	RolloutStrategy *RolloutStrategy `json:"rolloutStrategy,omitempty"`
+
+	// The RemediationStrategy that controls how control plane machine remediation happens.
+	// +optional
+	RemediationStrategy *RemediationStrategy `json:"remediationStrategy,omitempty"`
 }
 
 // KubeadmControlPlaneTemplateMachineTemplate defines the template for Machines
Original file line number	Diff line number	Diff line change
`@@ -99,6 +99,13 @@ func (src *KubeadmControlPlane) ConvertTo(dstRaw conversion.Hub) error {`
`99`	`99`	`dst.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy = restored.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy`
`100`	`100`	`}`
`101`	`101`
	`102`	`+ if restored.Spec.RemediationStrategy != nil {`
	`103`	`+ dst.Spec.RemediationStrategy = restored.Spec.RemediationStrategy`
	`104`	`+ }`
	`105`	`+ if restored.Status.LastRemediation != nil {`
	`106`	`+ dst.Status.LastRemediation = restored.Status.LastRemediation`
	`107`	`+ }`
	`108`	`+`
`102`	`109`	`return nil`
`103`	`110`	`}`
`104`	`111`