Skip to content

Commit cca7f8c

Browse files
authored
🌱 Implement grace period for KCP remote conditions (#11339)
* Implement grace period for KCP remote conditions Signed-off-by: Stefan Büringer [email protected] * Fix review findings * Fix review findings * Fix review findings --------- Signed-off-by: Stefan Büringer [email protected]
1 parent a5269e2 commit cca7f8c

20 files changed

+1051
-231
lines changed

api/v1beta1/machine_types.go

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -206,15 +206,11 @@ const (
206206
// during the deletion workflow, or by a user.
207207
MachineNodeDeletedV1Beta2Reason = ObjectDeletedV1Beta2Reason
208208

209-
// MachineNodeRemoteConnectionFailedV1Beta2Reason surfaces that the remote connection failed.
210-
// If the remote connection probe failed for longer than remote conditions grace period,
211-
// this reason is used when setting NodeHealthy and NodeReady conditions to `Unknown`.
212-
MachineNodeRemoteConnectionFailedV1Beta2Reason = RemoteConnectionFailedV1Beta2Reason
213-
214-
// MachineNodeRemoteConnectionDownV1Beta2Reason surfaces that the remote connection is down.
215-
// This is used when setting NodeHealthy and NodeReady conditions to `Unknown`
216-
// when the connection is down and they haven't been set yet.
217-
MachineNodeRemoteConnectionDownV1Beta2Reason = RemoteConnectionDownV1Beta2Reason
209+
// MachineNodeInspectionFailedV1Beta2Reason documents a failure when inspecting the status of a Node.
210+
MachineNodeInspectionFailedV1Beta2Reason = InspectionFailedV1Beta2Reason
211+
212+
// MachineNodeConnectionDownV1Beta2Reason surfaces that the connection to the workload cluster is down.
213+
MachineNodeConnectionDownV1Beta2Reason = ConnectionDownV1Beta2Reason
218214
)
219215

220216
// Machine's HealthCheckSucceeded condition and corresponding reasons that will be used in v1Beta2 API version.

api/v1beta1/v1beta2_condition_consts.go

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -106,8 +106,8 @@ const (
106106
// set to false and with the OwnerRemediated condition set to false by the MachineHealthCheck controller.
107107
RemediatingV1Beta2Reason = "Remediating"
108108

109-
// NotRemediatingV1Beta2Reason surfaces when an object does not own any machines marked as not healthy
110-
// by the MachineHealthCheck controller.
109+
// NotRemediatingV1Beta2Reason surfaces when an object does not own any machines with HealthCheckSucceeded
110+
// set to false and with the OwnerRemediated condition set to false by the MachineHealthCheck controller.
111111
NotRemediatingV1Beta2Reason = "NotRemediating"
112112

113113
// NoReplicasV1Beta2Reason surfaces when an object that manages replicas does not have any.
@@ -142,15 +142,8 @@ const (
142142
// PausedV1Beta2Reason surfaces when an object is paused.
143143
PausedV1Beta2Reason = "Paused"
144144

145-
// RemoteConnectionFailedV1Beta2Reason surfaces that the remote connection failed.
146-
// This is typically used when setting remote conditions (e.g. `NodeHealthy`) to `Unknown`
147-
// after the remote connection probe didn't succeed for remote conditions grace period.
148-
RemoteConnectionFailedV1Beta2Reason = "RemoteConnectionFailed"
149-
150-
// RemoteConnectionDownV1Beta2Reason surfaces that the remote connection is down.
151-
// This is typically used when setting remote conditions (e.g. `NodeHealthy`) to `Unknown`
152-
// when the connection is down and they haven't been set yet.
153-
RemoteConnectionDownV1Beta2Reason = "RemoteConnectionDown"
145+
// ConnectionDownV1Beta2Reason surfaces that the connection to the workload cluster is down.
146+
ConnectionDownV1Beta2Reason = "ConnectionDown"
154147

155148
// DeletionTimestampNotSetV1Beta2Reason surfaces when an object is not deleting because the
156149
// DeletionTimestamp is not set.

controlplane/kubeadm/api/v1beta1/v1beta2_condition_consts.go

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,25 @@ import clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
2020

2121
// KubeadmControlPlane's Available condition and corresponding reasons that will be used in v1Beta2 API version.
2222
const (
23-
// KubeadmControlPlaneAvailableV1Beta2Condition True if the control plane can be reached, EtcdClusterHealthy is true,
23+
// KubeadmControlPlaneAvailableV1Beta2Condition is True if the control plane can be reached, EtcdClusterHealthy is true,
2424
// and CertificatesAvailable is true.
2525
KubeadmControlPlaneAvailableV1Beta2Condition = clusterv1.AvailableV1Beta2Condition
2626
)
2727

28+
// KubeadmControlPlane's Initialized condition and corresponding reasons that will be used in v1Beta2 API version.
29+
const (
30+
// KubeadmControlPlaneInitializedV1Beta2Condition is True when the control plane is functional enough to accept
31+
// requests. This information is usually used as a signal for starting all the provisioning operations that
32+
// depend on a functional API server, but do not require a full HA control plane to exist.
33+
KubeadmControlPlaneInitializedV1Beta2Condition = "Initialized"
34+
35+
// KubeadmControlPlaneInitializedV1Beta2Reason surfaces when the control plane is initialized.
36+
KubeadmControlPlaneInitializedV1Beta2Reason = "Initialized"
37+
38+
// KubeadmControlPlaneNotInitializedV1Beta2Reason surfaces when the control plane is not initialized.
39+
KubeadmControlPlaneNotInitializedV1Beta2Reason = "NotInitialized"
40+
)
41+
2842
// KubeadmControlPlane's CertificatesAvailable condition and corresponding reasons that will be used in v1Beta2 API version.
2943
const (
3044
// KubeadmControlPlaneCertificatesAvailableV1Beta2Condition is True if all the cluster certificates exist.
@@ -52,6 +66,10 @@ const (
5266
// etcd cluster hosted on KubeadmControlPlane controlled machines.
5367
KubeadmControlPlaneEtcdClusterInspectionFailedV1Beta2Reason = clusterv1.InspectionFailedV1Beta2Reason
5468

69+
// KubeadmControlPlaneEtcdClusterConnectionDownV1Beta2Reason surfaces that the connection to the workload
70+
// cluster is down.
71+
KubeadmControlPlaneEtcdClusterConnectionDownV1Beta2Reason = clusterv1.ConnectionDownV1Beta2Reason
72+
5573
// KubeadmControlPlaneEtcdClusterHealthyV1Beta2Reason surfaces when the etcd cluster hosted on KubeadmControlPlane
5674
// machines is healthy.
5775
KubeadmControlPlaneEtcdClusterHealthyV1Beta2Reason = "Healthy"
@@ -77,6 +95,10 @@ const (
7795
// control plane components hosted on KubeadmControlPlane controlled machines.
7896
KubeadmControlPlaneControlPlaneComponentsInspectionFailedV1Beta2Reason = clusterv1.InspectionFailedV1Beta2Reason
7997

98+
// KubeadmControlPlaneControlPlaneComponentsConnectionDownV1Beta2Reason surfaces that the connection to the workload
99+
// cluster is down.
100+
KubeadmControlPlaneControlPlaneComponentsConnectionDownV1Beta2Reason = clusterv1.ConnectionDownV1Beta2Reason
101+
80102
// KubeadmControlPlaneControlPlaneComponentsHealthyV1Beta2Reason surfaces when the Kubernetes control plane components
81103
// hosted on KubeadmControlPlane machines are healthy.
82104
KubeadmControlPlaneControlPlaneComponentsHealthyV1Beta2Reason = "Healthy"
@@ -233,13 +255,13 @@ const (
233255
// pod hosted on a KubeadmControlPlane controlled machine.
234256
KubeadmControlPlaneMachinePodInspectionFailedV1Beta2Reason = clusterv1.InspectionFailedV1Beta2Reason
235257

258+
// KubeadmControlPlaneMachinePodConnectionDownV1Beta2Reason surfaces that the connection to the workload
259+
// cluster is down.
260+
KubeadmControlPlaneMachinePodConnectionDownV1Beta2Reason = clusterv1.ConnectionDownV1Beta2Reason
261+
236262
// KubeadmControlPlaneMachinePodDeletingV1Beta2Reason surfaces when the machine hosting control plane components
237263
// is being deleted.
238264
KubeadmControlPlaneMachinePodDeletingV1Beta2Reason = "Deleting"
239-
240-
// KubeadmControlPlaneMachinePodInternalErrorV1Beta2Reason surfaces unexpected failures when reading pod hosted
241-
// on a KubeadmControlPlane controlled machine.
242-
KubeadmControlPlaneMachinePodInternalErrorV1Beta2Reason = clusterv1.InternalErrorV1Beta2Reason
243265
)
244266

245267
// EtcdMemberHealthy condition and corresponding reasons that will be used for KubeadmControlPlane controlled machines in v1Beta2 API version.
@@ -257,6 +279,10 @@ const (
257279
// etcd member hosted on a KubeadmControlPlane controlled machine.
258280
KubeadmControlPlaneMachineEtcdMemberInspectionFailedV1Beta2Reason = clusterv1.InspectionFailedV1Beta2Reason
259281

282+
// KubeadmControlPlaneMachineEtcdMemberConnectionDownV1Beta2Reason surfaces that the connection to the workload
283+
// cluster is down.
284+
KubeadmControlPlaneMachineEtcdMemberConnectionDownV1Beta2Reason = clusterv1.ConnectionDownV1Beta2Reason
285+
260286
// KubeadmControlPlaneMachineEtcdMemberDeletingV1Beta2Reason surfaces when the machine hosting an etcd member
261287
// is being deleted.
262288
KubeadmControlPlaneMachineEtcdMemberDeletingV1Beta2Reason = "Deleting"

controlplane/kubeadm/controllers/alias.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ type KubeadmControlPlaneReconciler struct {
4040
// WatchFilterValue is the label value used to filter events prior to reconciliation.
4141
WatchFilterValue string
4242

43+
RemoteConditionsGracePeriod time.Duration
44+
4345
// Deprecated: DeprecatedInfraMachineNaming. Name the InfraStructureMachines after the InfraMachineTemplate.
4446
DeprecatedInfraMachineNaming bool
4547
}
@@ -53,6 +55,7 @@ func (r *KubeadmControlPlaneReconciler) SetupWithManager(ctx context.Context, mg
5355
EtcdDialTimeout: r.EtcdDialTimeout,
5456
EtcdCallTimeout: r.EtcdCallTimeout,
5557
WatchFilterValue: r.WatchFilterValue,
58+
RemoteConditionsGracePeriod: r.RemoteConditionsGracePeriod,
5659
DeprecatedInfraMachineNaming: r.DeprecatedInfraMachineNaming,
5760
}).SetupWithManager(ctx, mgr, options)
5861
}

controlplane/kubeadm/internal/control_plane.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,12 @@ func (c *ControlPlane) PatchMachines(ctx context.Context) error {
309309
controlplanev1.MachineSchedulerPodHealthyCondition,
310310
controlplanev1.MachineEtcdPodHealthyCondition,
311311
controlplanev1.MachineEtcdMemberHealthyCondition,
312+
}}, patch.WithOwnedV1Beta2Conditions{Conditions: []string{
313+
controlplanev1.KubeadmControlPlaneMachineAPIServerPodHealthyV1Beta2Condition,
314+
controlplanev1.KubeadmControlPlaneMachineControllerManagerPodHealthyV1Beta2Condition,
315+
controlplanev1.KubeadmControlPlaneMachineSchedulerPodHealthyV1Beta2Condition,
316+
controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyV1Beta2Condition,
317+
controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition,
312318
}}); err != nil {
313319
errList = append(errList, err)
314320
}

0 commit comments

Comments
 (0)