Skip to content

Commit b7e557b

Browse files
committed
🌱 MachineHealthCheck should take Machine's InfraReady condition for startup time
Signed-off-by: Vince Prignano <[email protected]>
1 parent 5b4fc70 commit b7e557b

8 files changed

+122
-25
lines changed

api/v1beta1/clusterclass_types.go

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -270,8 +270,16 @@ type MachineHealthCheckClass struct {
270270
// +kubebuilder:validation:Pattern=^\[[0-9]+-[0-9]+\]$
271271
UnhealthyRange *string `json:"unhealthyRange,omitempty"`
272272

273-
// Machines older than this duration without a node will be considered to have
274-
// failed and will be remediated.
273+
// NodeStartupTimeout allows setting the maximum time for MachineHealthCheck
274+
// to consider a Machine unhealthy if a corresponding Node isn't associated
275+
// through a `Spec.ProviderID` field.
276+
//
277+
// The duration set in this field is compared to the greatest of:
278+
// - Cluster's infrastructure and control plane ready condition timestamp (if and when available)
279+
// - Machine's infrastructure ready condition timestamp (if and when available)
280+
// - Machine's metadata creation timestamp
281+
//
282+
// Defaults to 10 minutes.
275283
// If you wish to disable this feature, set the value explicitly to 0.
276284
// +optional
277285
NodeStartupTimeout *metav1.Duration `json:"nodeStartupTimeout,omitempty"`

api/v1beta1/machinehealthcheck_types.go

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,16 @@ type MachineHealthCheckSpec struct {
6464
// +kubebuilder:validation:Pattern=^\[[0-9]+-[0-9]+\]$
6565
UnhealthyRange *string `json:"unhealthyRange,omitempty"`
6666

67-
// Machines older than this duration without a node will be considered to have
68-
// failed and will be remediated.
69-
// If not set, this value is defaulted to 10 minutes.
67+
// NodeStartupTimeout allows setting the maximum time for MachineHealthCheck
68+
// to consider a Machine unhealthy if a corresponding Node isn't associated
69+
// through a `Spec.ProviderID` field.
70+
//
71+
// The duration set in this field is compared to the greatest of:
72+
// - Cluster's infrastructure and control plane ready condition timestamp (if and when available)
73+
// - Machine's infrastructure ready condition timestamp (if and when available)
74+
// - Machine's metadata creation timestamp
75+
//
76+
// Defaults to 10 minutes.
7077
// If you wish to disable this feature, set the value explicitly to 0.
7178
// +optional
7279
NodeStartupTimeout *metav1.Duration `json:"nodeStartupTimeout,omitempty"`

api/v1beta1/zz_generated.openapi.go

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

config/crd/bases/cluster.x-k8s.io_clusterclasses.yaml

Lines changed: 24 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

config/crd/bases/cluster.x-k8s.io_clusters.yaml

Lines changed: 24 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

config/crd/bases/cluster.x-k8s.io_machinehealthchecks.yaml

Lines changed: 12 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

internal/controllers/machinehealthcheck/machinehealthcheck_targets.go

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
112112
return true, time.Duration(0)
113113
}
114114

115-
// the node does not exist
115+
// Machine has Status.NodeRef set, although we couldn't find the node in the workload cluster.
116116
if t.nodeMissing {
117117
logger.V(3).Info("Target is unhealthy: node is missing")
118118
conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.NodeNotFoundReason, clusterv1.ConditionSeverityWarning, "")
@@ -122,14 +122,14 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
122122
// Don't penalize any Machine/Node if the control plane has not been initialized
123123
// The exception to this rule is the control plane machine itself, so that the first control plane machine can be remediated.
124124
if !conditions.IsTrue(t.Cluster, clusterv1.ControlPlaneInitializedCondition) && !util.IsControlPlaneMachine(t.Machine) {
125-
logger.V(3).Info("Not evaluating target health because the control plane has not yet been initialized")
125+
logger.V(5).Info("Not evaluating target health because the control plane has not yet been initialized")
126126
// Return a nextCheck time of 0 because we'll get requeued when the Cluster is updated.
127127
return false, 0
128128
}
129129

130130
// Don't penalize any Machine/Node if the cluster infrastructure is not ready.
131131
if !conditions.IsTrue(t.Cluster, clusterv1.InfrastructureReadyCondition) {
132-
logger.V(3).Info("Not evaluating target health because the cluster infrastructure is not ready")
132+
logger.V(5).Info("Not evaluating target health because the cluster infrastructure is not ready")
133133
// Return a nextCheck time of 0 because we'll get requeued when the Cluster is updated.
134134
return false, 0
135135
}
@@ -144,18 +144,27 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
144144

145145
controlPlaneInitialized := conditions.GetLastTransitionTime(t.Cluster, clusterv1.ControlPlaneInitializedCondition)
146146
clusterInfraReady := conditions.GetLastTransitionTime(t.Cluster, clusterv1.InfrastructureReadyCondition)
147+
machineInfraReady := conditions.GetLastTransitionTime(t.Machine, clusterv1.InfrastructureReadyCondition)
147148
machineCreationTime := t.Machine.CreationTimestamp.Time
148149

149-
// Use the latest of the 3 times
150+
// Use the latest of the following timestamps.
150151
comparisonTime := machineCreationTime
151-
logger.V(3).Info("Determining comparison time", "machineCreationTime", machineCreationTime, "clusterInfraReadyTime", clusterInfraReady, "controlPlaneInitializedTime", controlPlaneInitialized)
152+
logger.V(5).Info("Determining comparison time",
153+
"machineCreationTime", machineCreationTime,
154+
"clusterInfraReadyTime", clusterInfraReady,
155+
"controlPlaneInitializedTime", controlPlaneInitialized,
156+
"machineInfraReadyTime", machineInfraReady,
157+
)
152158
if conditions.IsTrue(t.Cluster, clusterv1.ControlPlaneInitializedCondition) && controlPlaneInitialized != nil && controlPlaneInitialized.Time.After(comparisonTime) {
153159
comparisonTime = controlPlaneInitialized.Time
154160
}
155161
if conditions.IsTrue(t.Cluster, clusterv1.InfrastructureReadyCondition) && clusterInfraReady != nil && clusterInfraReady.Time.After(comparisonTime) {
156162
comparisonTime = clusterInfraReady.Time
157163
}
158-
logger.V(3).Info("Using comparison time", "time", comparisonTime)
164+
if conditions.IsTrue(t.Machine, clusterv1.InfrastructureReadyCondition) && machineInfraReady != nil && machineInfraReady.Time.After(comparisonTime) {
165+
comparisonTime = machineInfraReady.Time
166+
}
167+
logger.V(5).Info("Using comparison time", "time", comparisonTime)
159168

160169
timeoutDuration := timeoutForMachineToHaveNode.Duration
161170
if comparisonTime.Add(timeoutForMachineToHaveNode.Duration).Before(now) {

internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,23 @@ func TestHealthCheckTargets(t *testing.T) {
239239
}
240240

241241
testMachine := newTestMachine("machine1", namespace, clusterName, "node1", mhcSelector)
242+
testMachineWithInfraReady := testMachine.DeepCopy()
243+
testMachineWithInfraReady.CreationTimestamp = metav1.NewTime(time.Now().Add(-100 * time.Second))
244+
testMachineWithInfraReady.SetConditions(clusterv1.Conditions{
245+
{
246+
Type: clusterv1.InfrastructureReadyCondition,
247+
Status: corev1.ConditionTrue,
248+
Severity: clusterv1.ConditionSeverityInfo,
249+
LastTransitionTime: metav1.NewTime(testMachineWithInfraReady.CreationTimestamp.Add(50 * time.Second)),
250+
},
251+
})
252+
253+
nodeNotYetStartedTargetAndInfraReady := healthCheckTarget{
254+
Cluster: cluster,
255+
MHC: testMHC,
256+
Machine: testMachineWithInfraReady,
257+
Node: nil,
258+
}
242259

243260
// Targets for when the node has not yet been seen by the Machine controller
244261
testMachineCreated1200s := testMachine.DeepCopy()
@@ -416,6 +433,13 @@ func TestHealthCheckTargets(t *testing.T) {
416433
expectedNeedsRemediation: []healthCheckTarget{},
417434
expectedNextCheckTimes: []time.Duration{timeoutForMachineToHaveNode - 400*time.Second},
418435
},
436+
{
437+
desc: "when the node has not yet started for shorter than the timeout, and infra is ready",
438+
targets: []healthCheckTarget{nodeNotYetStartedTargetAndInfraReady},
439+
expectedHealthy: []healthCheckTarget{},
440+
expectedNeedsRemediation: []healthCheckTarget{},
441+
expectedNextCheckTimes: []time.Duration{timeoutForMachineToHaveNode - 50*time.Second},
442+
},
419443
{
420444
desc: "when the node has not yet started for longer than the timeout",
421445
targets: []healthCheckTarget{nodeNotYetStartedTarget1200s},

0 commit comments

Comments
 (0)