@@ -17,6 +17,8 @@ limitations under the License.
17
17
package v1beta1
18
18
19
19
import (
20
+ "time"
21
+
20
22
corev1 "k8s.io/api/core/v1"
21
23
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
22
24
"k8s.io/apimachinery/pkg/util/intstr"
@@ -49,6 +51,23 @@ const (
49
51
// KubeadmClusterConfigurationAnnotation is a machine annotation that stores the json-marshalled string of KCP ClusterConfiguration.
50
52
// This annotation is used to detect any changes in ClusterConfiguration and trigger machine rollout in KCP.
51
53
KubeadmClusterConfigurationAnnotation = "controlplane.cluster.x-k8s.io/kubeadm-cluster-configuration"
54
+
55
+ // RemediationInProgressAnnotation is used to keep track that a KCP remediation is in progress, and more
56
+ // specifically it tracks that the system is in between having deleted an unhealthy machine and recreating its replacement.
57
+ // NOTE: if something external to CAPI removes this annotation the system cannot detect the above situation; this can lead to
58
+ // failures in updating remediation retry or remediation count (both counters restart from zero).
59
+ RemediationInProgressAnnotation = "controlplane.cluster.x-k8s.io/remediation-in-progress"
60
+
61
+ // RemediationForAnnotation is used to link a new machine to the unhealthy machine it is replacing;
62
+ // please note that in case of retry, when also the remediating machine fails, the system keeps track of
63
+ // the first machine of the sequence only.
64
+ // NOTE: if something external to CAPI removes this annotation, this can lead to
65
+ // failures in updating remediation retry (the counter restarts from zero).
66
+ RemediationForAnnotation = "controlplane.cluster.x-k8s.io/remediation-for"
67
+
68
+ // DefaultMinHealthyPeriod defines the default minimum period before we consider a remediation on a
69
+ // machine unrelated to the previous remediation.
70
+ DefaultMinHealthyPeriod = 1 * time .Hour
52
71
)
53
72
54
73
// KubeadmControlPlaneSpec defines the desired state of KubeadmControlPlane.
@@ -91,6 +110,10 @@ type KubeadmControlPlaneSpec struct {
91
110
// +optional
92
111
// +kubebuilder:default={type: "RollingUpdate", rollingUpdate: {maxSurge: 1}}
93
112
RolloutStrategy * RolloutStrategy `json:"rolloutStrategy,omitempty"`
113
+
114
+ // The RemediationStrategy that controls how control plane machine remediation happens.
115
+ // +optional
116
+ RemediationStrategy * RemediationStrategy `json:"remediationStrategy,omitempty"`
94
117
}
95
118
96
119
// KubeadmControlPlaneMachineTemplate defines the template for Machines
@@ -158,6 +181,50 @@ type RollingUpdate struct {
158
181
MaxSurge * intstr.IntOrString `json:"maxSurge,omitempty"`
159
182
}
160
183
184
+ // RemediationStrategy defines how control plane machine remediation happens.
185
+ type RemediationStrategy struct {
186
+ // MaxRetry is the maximum number of retries while attempting to remediate an unhealthy machine.
187
+ // A retry happens when a machine that was created as a replacement for an unhealthy machine also fails.
188
+ // For example, given a control plane with three machines M1, M2, M3:
189
+ //
190
+ // M1 becomes unhealthy; remediation happens, and M1-1 is created as a replacement.
191
+ // If M1-1 (replacement of M1) has problems while bootstrapping it will become unhealthy, and then be
192
+ // remediated; such an operation is considered a retry, remediation-retry #1.
193
+ // If M1-2 (replacement of M1-1) becomes unhealthy, remediation-retry #2 will happen, etc.
194
+ //
195
+ // A retry can happen only after RetryPeriod has elapsed since the previous retry.
196
+ // If a machine is marked as unhealthy after MinHealthyPeriod from the previous remediation has expired,
197
+ // this is not considered a retry anymore because the new issue is assumed to be unrelated to the previous one.
198
+ //
199
+ // If not set, the remediation will be retried infinitely.
200
+ // +optional
201
+ MaxRetry * int32 `json:"maxRetry,omitempty"`
202
+
203
+ // RetryPeriod is the duration that KCP should wait before remediating a machine being created as a replacement
204
+ // for an unhealthy machine (a retry).
205
+ //
206
+ // If not set, a retry will happen immediately.
207
+ // +optional
208
+ RetryPeriod metav1.Duration `json:"retryPeriod,omitempty"`
209
+
210
+ // MinHealthyPeriod defines the duration after which KCP will consider any failure of a machine unrelated
211
+ // to the previous one. In this case the remediation is not considered a retry anymore, and thus the retry
212
+ // counter restarts from 0. For example, assuming MinHealthyPeriod is set to 1h (default):
213
+ //
214
+ // M1 becomes unhealthy; remediation happens, and M1-1 is created as a replacement.
215
+ // If M1-1 (replacement of M1) has problems within 1h of its creation, this machine
216
+ // will also be remediated and this operation is considered a retry - a problem related
217
+ // to the original issue that happened to M1 -.
218
+ //
219
+ // If instead the problem on M1-1 happens after MinHealthyPeriod has expired, e.g. four days after
220
+ // M1-1 has been created as a remediation of M1, the problem on M1-1 is considered unrelated to
221
+ // the original issue that happened to M1.
222
+ //
223
+ // If not set, this value is defaulted to 1h.
224
+ // +optional
225
+ MinHealthyPeriod * metav1.Duration `json:"minHealthyPeriod,omitempty"`
226
+ }
227
+
161
228
// KubeadmControlPlaneStatus defines the observed state of KubeadmControlPlane.
162
229
type KubeadmControlPlaneStatus struct {
163
230
// Selector is the label selector in string format to avoid introspection
@@ -223,6 +290,25 @@ type KubeadmControlPlaneStatus struct {
223
290
// Conditions defines current service state of the KubeadmControlPlane.
224
291
// +optional
225
292
Conditions clusterv1.Conditions `json:"conditions,omitempty"`
293
+
294
+ // LastRemediation stores info about last remediation performed.
295
+ // +optional
296
+ LastRemediation * LastRemediationStatus `json:"lastRemediation,omitempty"`
297
+ }
298
+
299
+ // LastRemediationStatus stores information about the last remediation performed.
300
+ // NOTE: if for any reason the information about the last remediation is lost, RetryCount is going to restart from 0 and thus
301
+ // more remediations than expected might happen.
302
+ type LastRemediationStatus struct {
303
+ // Machine is the name of the latest machine being remediated.
304
+ Machine string `json:"machine"`
305
+
306
+ // Timestamp is when the last remediation happened. It is represented in RFC3339 form and is in UTC.
307
+ Timestamp metav1.Time `json:"timestamp"`
308
+
309
+ // RetryCount is used to keep track of remediation retries for the last remediated machine.
310
+ // A retry happens when a machine that was created as a replacement for an unhealthy machine also fails.
311
+ RetryCount int32 `json:"retryCount"`
226
312
}
227
313
228
314
// +kubebuilder:object:root=true
0 commit comments