Skip to content

Commit 11e8e87

Browse files
authored
Merge pull request #7963 from fabriziopandini/improve-kcp-remediation-2
✨ Add support for KCP remediation during cluster provisioning
2 parents 193f13a + e98e2de commit 11e8e87

30 files changed

+2483
-306
lines changed

controlplane/kubeadm/api/v1alpha3/conversion.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,13 @@ func (src *KubeadmControlPlane) ConvertTo(dstRaw conversion.Hub) error {
9999
dst.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy = restored.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy
100100
}
101101

102+
if restored.Spec.RemediationStrategy != nil {
103+
dst.Spec.RemediationStrategy = restored.Spec.RemediationStrategy
104+
}
105+
if restored.Status.LastRemediation != nil {
106+
dst.Status.LastRemediation = restored.Status.LastRemediation
107+
}
108+
102109
return nil
103110
}
104111

controlplane/kubeadm/api/v1alpha3/zz_generated.conversion.go

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

controlplane/kubeadm/api/v1alpha4/conversion.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,13 @@ func (src *KubeadmControlPlane) ConvertTo(dstRaw conversion.Hub) error {
8484
dst.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy = restored.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy
8585
}
8686

87+
if restored.Spec.RemediationStrategy != nil {
88+
dst.Spec.RemediationStrategy = restored.Spec.RemediationStrategy
89+
}
90+
if restored.Status.LastRemediation != nil {
91+
dst.Status.LastRemediation = restored.Status.LastRemediation
92+
}
93+
8794
return nil
8895
}
8996

@@ -173,6 +180,10 @@ func (src *KubeadmControlPlaneTemplate) ConvertTo(dstRaw conversion.Hub) error {
173180
dst.Spec.Template.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy = restored.Spec.Template.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy
174181
}
175182

183+
if restored.Spec.Template.Spec.RemediationStrategy != nil {
184+
dst.Spec.Template.Spec.RemediationStrategy = restored.Spec.Template.Spec.RemediationStrategy
185+
}
186+
176187
return nil
177188
}
178189

@@ -262,5 +273,11 @@ func Convert_v1beta1_KubeadmControlPlaneMachineTemplate_To_v1alpha4_KubeadmContr
262273

263274
func Convert_v1beta1_KubeadmControlPlaneSpec_To_v1alpha4_KubeadmControlPlaneSpec(in *controlplanev1.KubeadmControlPlaneSpec, out *KubeadmControlPlaneSpec, scope apiconversion.Scope) error {
264275
// .RolloutBefore was added in v1beta1.
276+
// .RemediationStrategy was added in v1beta1.
265277
return autoConvert_v1beta1_KubeadmControlPlaneSpec_To_v1alpha4_KubeadmControlPlaneSpec(in, out, scope)
266278
}
279+
280+
func Convert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha4_KubeadmControlPlaneStatus(in *controlplanev1.KubeadmControlPlaneStatus, out *KubeadmControlPlaneStatus, scope apiconversion.Scope) error {
281+
// .LastRemediation was added in v1beta1.
282+
return autoConvert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha4_KubeadmControlPlaneStatus(in, out, scope)
283+
}

controlplane/kubeadm/api/v1alpha4/zz_generated.conversion.go

Lines changed: 7 additions & 10 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_types.go

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ limitations under the License.
1717
package v1beta1
1818

1919
import (
20+
"time"
21+
2022
corev1 "k8s.io/api/core/v1"
2123
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2224
"k8s.io/apimachinery/pkg/util/intstr"
@@ -49,6 +51,23 @@ const (
4951
// KubeadmClusterConfigurationAnnotation is a machine annotation that stores the json-marshalled string of KCP ClusterConfiguration.
5052
// This annotation is used to detect any changes in ClusterConfiguration and trigger machine rollout in KCP.
5153
KubeadmClusterConfigurationAnnotation = "controlplane.cluster.x-k8s.io/kubeadm-cluster-configuration"
54+
55+
// RemediationInProgressAnnotation is used to keep track that a KCP remediation is in progress, and more
56+
// specifically it tracks that the system is in between having deleted an unhealthy machine and recreating its replacement.
57+
// NOTE: if something external to CAPI removes this annotation the system cannot detect the above situation; this can lead to
58+
// failures in updating remediation retry or remediation count (both counters restart from zero).
59+
RemediationInProgressAnnotation = "controlplane.cluster.x-k8s.io/remediation-in-progress"
60+
61+
// RemediationForAnnotation is used to link a new machine to the unhealthy machine it is replacing;
62+
// please note that in case of retry, when also the remediating machine fails, the system keeps track of
63+
// the first machine of the sequence only.
64+
// NOTE: if something external to CAPI removes this annotation, this can lead to
65+
// failures in updating remediation retry (the counter restarts from zero).
66+
RemediationForAnnotation = "controlplane.cluster.x-k8s.io/remediation-for"
67+
68+
// DefaultMinHealthyPeriod defines the default minimum period before we consider a remediation on a
69+
// machine unrelated to the previous remediation.
70+
DefaultMinHealthyPeriod = 1 * time.Hour
5271
)
5372

5473
// KubeadmControlPlaneSpec defines the desired state of KubeadmControlPlane.
@@ -91,6 +110,10 @@ type KubeadmControlPlaneSpec struct {
91110
// +optional
92111
// +kubebuilder:default={type: "RollingUpdate", rollingUpdate: {maxSurge: 1}}
93112
RolloutStrategy *RolloutStrategy `json:"rolloutStrategy,omitempty"`
113+
114+
// The RemediationStrategy that controls how control plane machine remediation happens.
115+
// +optional
116+
RemediationStrategy *RemediationStrategy `json:"remediationStrategy,omitempty"`
94117
}
95118

96119
// KubeadmControlPlaneMachineTemplate defines the template for Machines
@@ -158,6 +181,50 @@ type RollingUpdate struct {
158181
MaxSurge *intstr.IntOrString `json:"maxSurge,omitempty"`
159182
}
160183

184+
// RemediationStrategy allows to define how control plane machine remediation happens.
185+
type RemediationStrategy struct {
186+
// MaxRetry is the maximum number of retries while attempting to remediate an unhealthy machine.
187+
// A retry happens when a machine that was created as a replacement for an unhealthy machine also fails.
188+
// For example, given a control plane with three machines M1, M2, M3:
189+
//
190+
// M1 becomes unhealthy; remediation happens, and M1-1 is created as a replacement.
191+
// If M1-1 (replacement of M1) has problems while bootstrapping it will become unhealthy, and then be
192+
// remediated; such operation is considered a retry, remediation-retry #1.
193+
// If M1-2 (replacement of M1-1) becomes unhealthy, remediation-retry #2 will happen, etc.
194+
//
195+
// A retry could happen only after RetryPeriod from the previous retry.
196+
// If a machine is marked as unhealthy after MinHealthyPeriod from the previous remediation expired,
197+
// this is not considered a retry anymore because the new issue is assumed unrelated to the previous one.
198+
//
199+
// If not set, the remediation will be retried infinitely.
200+
// +optional
201+
MaxRetry *int32 `json:"maxRetry,omitempty"`
202+
203+
// RetryPeriod is the duration that KCP should wait before remediating a machine being created as a replacement
204+
// for an unhealthy machine (a retry).
205+
//
206+
// If not set, a retry will happen immediately.
207+
// +optional
208+
RetryPeriod metav1.Duration `json:"retryPeriod,omitempty"`
209+
210+
// MinHealthyPeriod defines the duration after which KCP will consider any failure to a machine unrelated
211+
// to the previous one. In this case the remediation is not considered a retry anymore, and thus the retry
212+
// counter restarts from 0. For example, assuming MinHealthyPeriod is set to 1h (default)
213+
//
214+
// M1 becomes unhealthy; remediation happens, and M1-1 is created as a replacement.
215+
// If M1-1 (replacement of M1) has problems within the 1hr after the creation, also
216+
// this machine will be remediated and this operation is considered a retry - a problem related
217+
// to the original issue that happened to M1 -.
218+
//
219+
// If instead the problem on M1-1 is happening after MinHealthyPeriod expired, e.g. four days after
220+
// M1-1 has been created as a remediation of M1, the problem on M1-1 is considered unrelated to
221+
// the original issue that happened to M1.
222+
//
223+
// If not set, this value is defaulted to 1h.
224+
// +optional
225+
MinHealthyPeriod *metav1.Duration `json:"minHealthyPeriod,omitempty"`
226+
}
227+
161228
// KubeadmControlPlaneStatus defines the observed state of KubeadmControlPlane.
162229
type KubeadmControlPlaneStatus struct {
163230
// Selector is the label selector in string format to avoid introspection
@@ -223,6 +290,25 @@ type KubeadmControlPlaneStatus struct {
223290
// Conditions defines current service state of the KubeadmControlPlane.
224291
// +optional
225292
Conditions clusterv1.Conditions `json:"conditions,omitempty"`
293+
294+
// LastRemediation stores info about last remediation performed.
295+
// +optional
296+
LastRemediation *LastRemediationStatus `json:"lastRemediation,omitempty"`
297+
}
298+
299+
// LastRemediationStatus stores info about last remediation performed.
300+
// NOTE: if for any reason information about last remediation is lost, RetryCount is going to restart from 0 and thus
301+
// more remediations than expected might happen.
302+
type LastRemediationStatus struct {
303+
// Machine is the machine name of the latest machine being remediated.
304+
Machine string `json:"machine"`
305+
306+
// Timestamp is when last remediation happened. It is represented in RFC3339 form and is in UTC.
307+
Timestamp metav1.Time `json:"timestamp"`
308+
309+
// RetryCount is used to keep track of remediation retries for the last remediated machine.
310+
// A retry happens when a machine that was created as a replacement for an unhealthy machine also fails.
311+
RetryCount int32 `json:"retryCount"`
226312
}
227313

228314
// +kubebuilder:object:root=true

controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_webhook.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,8 @@ func (in *KubeadmControlPlane) ValidateUpdate(old runtime.Object) error {
181181
{spec, "machineTemplate", "nodeDeletionTimeout"},
182182
{spec, "replicas"},
183183
{spec, "version"},
184+
{spec, "remediationStrategy"},
185+
{spec, "remediationStrategy", "*"},
184186
{spec, "rolloutAfter"},
185187
{spec, "rolloutBefore"},
186188
{spec, "rolloutBefore", "*"},

controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_webhook_test.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -410,6 +410,11 @@ func TestKubeadmControlPlaneValidateUpdate(t *testing.T) {
410410
validUpdate.Spec.RolloutBefore = &RolloutBefore{
411411
CertificatesExpiryDays: pointer.Int32(14),
412412
}
413+
validUpdate.Spec.RemediationStrategy = &RemediationStrategy{
414+
MaxRetry: pointer.Int32(50),
415+
MinHealthyPeriod: &metav1.Duration{Duration: 10 * time.Hour},
416+
RetryPeriod: metav1.Duration{Duration: 10 * time.Minute},
417+
}
413418
validUpdate.Spec.KubeadmConfigSpec.Format = bootstrapv1.CloudConfig
414419

415420
scaleToZero := before.DeepCopy()

controlplane/kubeadm/api/v1beta1/kubeadmcontrolplanetemplate_types.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,10 @@ type KubeadmControlPlaneTemplateResourceSpec struct {
9191
// +optional
9292
// +kubebuilder:default={type: "RollingUpdate", rollingUpdate: {maxSurge: 1}}
9393
RolloutStrategy *RolloutStrategy `json:"rolloutStrategy,omitempty"`
94+
95+
// The RemediationStrategy that controls how control plane machine remediation happens.
96+
// +optional
97+
RemediationStrategy *RemediationStrategy `json:"remediationStrategy,omitempty"`
9498
}
9599

96100
// KubeadmControlPlaneTemplateMachineTemplate defines the template for Machines

controlplane/kubeadm/api/v1beta1/zz_generated.deepcopy.go

Lines changed: 57 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)