Commit f3a67c1

Merge pull request #7079 from fgutmann/healthcheck-messages
🐛 Fix messages of MachineHealthCheckSucceededCondition
2 parents 0cf4492 + 0c45ee8 commit f3a67c1

File tree

2 files changed: +108 -38 lines changed


internal/controllers/machinehealthcheck/machinehealthcheck_targets.go

Lines changed: 6 additions & 5 deletions
@@ -93,13 +93,13 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachineToHaveNode
 	now := time.Now()
 
 	if t.Machine.Status.FailureReason != nil {
-		conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "FailureReason: %v", t.Machine.Status.FailureReason)
+		conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "FailureReason: %v", *t.Machine.Status.FailureReason)
 		logger.V(3).Info("Target is unhealthy", "failureReason", t.Machine.Status.FailureReason)
 		return true, time.Duration(0)
 	}
 
 	if t.Machine.Status.FailureMessage != nil {
-		conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "FailureMessage: %v", t.Machine.Status.FailureMessage)
+		conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "FailureMessage: %v", *t.Machine.Status.FailureMessage)
 		logger.V(3).Info("Target is unhealthy", "failureMessage", t.Machine.Status.FailureMessage)
 		return true, time.Duration(0)
 	}
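
The added dereferences are the substance of the fix: Machine.Status.FailureReason is a pointer to a string-based type and FailureMessage is a *string, and fmt's %v verb renders such pointers as a hex address rather than the value they point to. A minimal standalone sketch of the difference (the string-based type is reproduced here for illustration, not imported from cluster-api):

package main

import "fmt"

// MachineStatusError mirrors the string-based type behind
// Machine.Status.FailureReason (reproduced here for illustration).
type MachineStatusError string

func main() {
	reason := MachineStatusError("UpdateError")
	ptr := &reason

	// Before the fix: %v on the pointer prints an address,
	// e.g. "FailureReason: 0xc000010250".
	fmt.Printf("FailureReason: %v\n", ptr)

	// After the fix: dereferencing prints the value,
	// "FailureReason: UpdateError".
	fmt.Printf("FailureReason: %v\n", *ptr)
}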
@@ -148,14 +148,15 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachineToHaveNode
 	}
 	logger.V(3).Info("Using comparison time", "time", comparisonTime)
 
+	timeoutDuration := timeoutForMachineToHaveNode.Duration
 	if comparisonTime.Add(timeoutForMachineToHaveNode.Duration).Before(now) {
-		conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.NodeStartupTimeoutReason, clusterv1.ConditionSeverityWarning, "Node failed to report startup in %s", timeoutForMachineToHaveNode.String())
-		logger.V(3).Info("Target is unhealthy: machine has no node", "duration", timeoutForMachineToHaveNode.String())
+		conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.NodeStartupTimeoutReason, clusterv1.ConditionSeverityWarning, "Node failed to report startup in %s", timeoutDuration)
+		logger.V(3).Info("Target is unhealthy: machine has no node", "duration", timeoutDuration)
 		return true, time.Duration(0)
 	}
 
 	durationUnhealthy := now.Sub(comparisonTime)
-	nextCheck := timeoutForMachineToHaveNode.Duration - durationUnhealthy + time.Second
+	nextCheck := timeoutDuration - durationUnhealthy + time.Second
 
 	return false, nextCheck
 }
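
Hoisting timeoutDuration removes the repeated field access and the explicit .String() call; with the %s verb, fmt invokes time.Duration's String method anyway, so the rendered message is unchanged. A quick sketch of the equivalence:

package main

import (
	"fmt"
	"time"
)

func main() {
	timeout := 10 * time.Minute

	// Both lines print: Node failed to report startup in 10m0s
	fmt.Printf("Node failed to report startup in %s\n", timeout.String())
	fmt.Printf("Node failed to report startup in %s\n", timeout)
}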

internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go

Lines changed: 102 additions & 33 deletions
@@ -29,6 +29,7 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/client/fake"
 
 	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
+	"sigs.k8s.io/cluster-api/errors"
 	"sigs.k8s.io/cluster-api/util/conditions"
 	"sigs.k8s.io/cluster-api/util/patch"
 )
@@ -209,6 +210,7 @@ func TestHealthCheckTargets(t *testing.T) {
 
 	timeoutForMachineToHaveNode := 10 * time.Minute
 	disabledTimeoutForMachineToHaveNode := time.Duration(0)
+	timeoutForUnhealthyConditions := 5 * time.Minute
 
 	// Create a test MHC
 	testMHC := &clusterv1.MachineHealthCheck{
@@ -225,12 +227,12 @@
 			{
 				Type:    corev1.NodeReady,
 				Status:  corev1.ConditionUnknown,
-				Timeout: metav1.Duration{Duration: 5 * time.Minute},
+				Timeout: metav1.Duration{Duration: timeoutForUnhealthyConditions},
 			},
 			{
 				Type:    corev1.NodeReady,
 				Status:  corev1.ConditionFalse,
-				Timeout: metav1.Duration{Duration: 5 * time.Minute},
+				Timeout: metav1.Duration{Duration: timeoutForUnhealthyConditions},
 			},
 		},
 	},
@@ -249,6 +251,7 @@
 		Machine: testMachineCreated1200s,
 		Node:    nil,
 	}
+	nodeNotYetStartedTarget1200sCondition := newFailedHealthCheckCondition(clusterv1.NodeStartupTimeoutReason, "Node failed to report startup in %s", timeoutForMachineToHaveNode)
 
 	testMachineCreated400s := testMachine.DeepCopy()
 	nowMinus400s := metav1.NewTime(time.Now().Add(-400 * time.Second))
@@ -265,17 +268,18 @@
 	nodeGoneAway := healthCheckTarget{
 		Cluster:     cluster,
 		MHC:         testMHC,
-		Machine:     testMachine,
+		Machine:     testMachine.DeepCopy(),
 		Node:        &corev1.Node{},
 		nodeMissing: true,
 	}
+	nodeGoneAwayCondition := newFailedHealthCheckCondition(clusterv1.NodeNotFoundReason, "")
 
 	// Target for when the node has been in an unknown state for shorter than the timeout
 	testNodeUnknown200 := newTestUnhealthyNode("node1", corev1.NodeReady, corev1.ConditionUnknown, 200*time.Second)
 	nodeUnknown200 := healthCheckTarget{
 		Cluster:     cluster,
 		MHC:         testMHC,
-		Machine:     testMachine,
+		Machine:     testMachine.DeepCopy(),
 		Node:        testNodeUnknown200,
 		nodeMissing: false,
 	}
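
The switch from testMachine to testMachine.DeepCopy() follows from the new per-target condition assertions: conditions.MarkFalse mutates the Machine it is given, so targets sharing one testMachine pointer would leak a condition set in one test case into the next. A generic sketch of the aliasing problem, using hypothetical stand-in types:

package main

import "fmt"

// Machine is a minimal stand-in for clusterv1.Machine (hypothetical type).
type Machine struct{ Conditions []string }

// target mirrors healthCheckTarget's Machine field (hypothetical type).
type target struct{ machine *Machine }

func main() {
	shared := &Machine{}
	a := target{machine: shared}
	b := target{machine: shared} // both targets alias the same Machine

	// Marking a condition through one target is visible through the other.
	a.machine.Conditions = append(a.machine.Conditions, "HealthCheckSucceeded=False")
	fmt.Println(len(b.machine.Conditions)) // prints 1: the write leaked into b

	// A deep copy gives each target independent state
	// (testMachine.DeepCopy() in the real test).
	copied := &Machine{Conditions: append([]string(nil), shared.Conditions...)}
	c := target{machine: copied}
	c.machine.Conditions = append(c.machine.Conditions, "NodeNotFound")
	fmt.Println(len(shared.Conditions)) // still 1: shared is untouched
}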
@@ -285,7 +289,7 @@
 	nodeUnknown100 := healthCheckTarget{
 		Cluster:     cluster,
 		MHC:         testMHC,
-		Machine:     testMachine,
+		Machine:     testMachine.DeepCopy(),
 		Node:        testNodeUnknown100,
 		nodeMissing: false,
 	}
@@ -295,29 +299,55 @@
 	nodeUnknown400 := healthCheckTarget{
 		Cluster:     cluster,
 		MHC:         testMHC,
-		Machine:     testMachine,
+		Machine:     testMachine.DeepCopy(),
 		Node:        testNodeUnknown400,
 		nodeMissing: false,
 	}
+	nodeUnknown400Condition := newFailedHealthCheckCondition(clusterv1.UnhealthyNodeConditionReason, "Condition Ready on node is reporting status Unknown for more than %s", timeoutForUnhealthyConditions)
 
 	// Target for when a node is healthy
 	testNodeHealthy := newTestNode("node1")
 	testNodeHealthy.UID = "12345"
 	nodeHealthy := healthCheckTarget{
 		Cluster:     cluster,
 		MHC:         testMHC,
-		Machine:     testMachine,
+		Machine:     testMachine.DeepCopy(),
 		Node:        testNodeHealthy,
 		nodeMissing: false,
 	}
 
+	// Target for when the machine has a failure reason
+	failureReason := errors.UpdateMachineError
+	testMachineFailureReason := testMachine.DeepCopy()
+	testMachineFailureReason.Status.FailureReason = &failureReason
+	machineFailureReason := healthCheckTarget{
+		Cluster: cluster,
+		MHC:     testMHC,
+		Machine: testMachineFailureReason,
+		Node:    nil,
+	}
+	machineFailureReasonCondition := newFailedHealthCheckCondition(clusterv1.MachineHasFailureReason, "FailureReason: %s", failureReason)
+
+	// Target for when the machine has a failure message
+	failureMsg := "some failure message"
+	testMachineFailureMsg := testMachine.DeepCopy()
+	testMachineFailureMsg.Status.FailureMessage = &failureMsg
+	machineFailureMsg := healthCheckTarget{
+		Cluster: cluster,
+		MHC:     testMHC,
+		Machine: testMachineFailureMsg,
+		Node:    nil,
+	}
+	machineFailureMsgCondition := newFailedHealthCheckCondition(clusterv1.MachineHasFailureReason, "FailureMessage: %s", failureMsg)
+
 	testCases := []struct {
-		desc                        string
-		targets                     []healthCheckTarget
-		timeoutForMachineToHaveNode *time.Duration
-		expectedHealthy             []healthCheckTarget
-		expectedNeedsRemediation    []healthCheckTarget
-		expectedNextCheckTimes      []time.Duration
+		desc                              string
+		targets                           []healthCheckTarget
+		timeoutForMachineToHaveNode       *time.Duration
+		expectedHealthy                   []healthCheckTarget
+		expectedNeedsRemediation          []healthCheckTarget
+		expectedNeedsRemediationCondition []clusterv1.Condition
+		expectedNextCheckTimes            []time.Duration
 	}{
 		{
 			desc: "when the node has not yet started for shorter than the timeout",
@@ -327,18 +357,20 @@
 			expectedNextCheckTimes: []time.Duration{timeoutForMachineToHaveNode - 400*time.Second},
 		},
 		{
-			desc:                     "when the node has not yet started for longer than the timeout",
-			targets:                  []healthCheckTarget{nodeNotYetStartedTarget1200s},
-			expectedHealthy:          []healthCheckTarget{},
-			expectedNeedsRemediation: []healthCheckTarget{nodeNotYetStartedTarget1200s},
-			expectedNextCheckTimes:   []time.Duration{},
+			desc:                              "when the node has not yet started for longer than the timeout",
+			targets:                           []healthCheckTarget{nodeNotYetStartedTarget1200s},
+			expectedHealthy:                   []healthCheckTarget{},
+			expectedNeedsRemediation:          []healthCheckTarget{nodeNotYetStartedTarget1200s},
+			expectedNeedsRemediationCondition: []clusterv1.Condition{nodeNotYetStartedTarget1200sCondition},
+			expectedNextCheckTimes:            []time.Duration{},
 		},
 		{
-			desc:                     "when the node has gone away",
-			targets:                  []healthCheckTarget{nodeGoneAway},
-			expectedHealthy:          []healthCheckTarget{},
-			expectedNeedsRemediation: []healthCheckTarget{nodeGoneAway},
-			expectedNextCheckTimes:   []time.Duration{},
+			desc:                              "when the node has gone away",
+			targets:                           []healthCheckTarget{nodeGoneAway},
+			expectedHealthy:                   []healthCheckTarget{},
+			expectedNeedsRemediation:          []healthCheckTarget{nodeGoneAway},
+			expectedNeedsRemediationCondition: []clusterv1.Condition{nodeGoneAwayCondition},
+			expectedNextCheckTimes:            []time.Duration{},
 		},
 		{
 			desc: "when the node has been in an unknown state for shorter than the timeout",
@@ -348,11 +380,12 @@
 			expectedNextCheckTimes: []time.Duration{100 * time.Second},
 		},
 		{
-			desc:                     "when the node has been in an unknown state for longer than the timeout",
-			targets:                  []healthCheckTarget{nodeUnknown400},
-			expectedHealthy:          []healthCheckTarget{},
-			expectedNeedsRemediation: []healthCheckTarget{nodeUnknown400},
-			expectedNextCheckTimes:   []time.Duration{},
+			desc:                              "when the node has been in an unknown state for longer than the timeout",
+			targets:                           []healthCheckTarget{nodeUnknown400},
+			expectedHealthy:                   []healthCheckTarget{},
+			expectedNeedsRemediation:          []healthCheckTarget{nodeUnknown400},
+			expectedNeedsRemediationCondition: []clusterv1.Condition{nodeUnknown400Condition},
+			expectedNextCheckTimes:            []time.Duration{},
 		},
 		{
 			desc: "when the node is healthy",
@@ -362,11 +395,12 @@
 			expectedNextCheckTimes: []time.Duration{},
 		},
 		{
-			desc:                     "with a mix of healthy and unhealthy nodes",
-			targets:                  []healthCheckTarget{nodeUnknown100, nodeUnknown200, nodeUnknown400, nodeHealthy},
-			expectedHealthy:          []healthCheckTarget{nodeHealthy},
-			expectedNeedsRemediation: []healthCheckTarget{nodeUnknown400},
-			expectedNextCheckTimes:   []time.Duration{200 * time.Second, 100 * time.Second},
+			desc:                              "with a mix of healthy and unhealthy nodes",
+			targets:                           []healthCheckTarget{nodeUnknown100, nodeUnknown200, nodeUnknown400, nodeHealthy},
+			expectedHealthy:                   []healthCheckTarget{nodeHealthy},
+			expectedNeedsRemediation:          []healthCheckTarget{nodeUnknown400},
+			expectedNeedsRemediationCondition: []clusterv1.Condition{nodeUnknown400Condition},
+			expectedNextCheckTimes:            []time.Duration{200 * time.Second, 100 * time.Second},
 		},
 		{
 			desc: "when the node has not started for a long time but the startup timeout is disabled",
@@ -376,6 +410,22 @@
 			expectedNeedsRemediation: []healthCheckTarget{},
 			expectedNextCheckTimes:   []time.Duration{}, // We don't have a timeout so no way to know when to re-check
 		},
+		{
+			desc:                              "when the machine has a failure reason",
+			targets:                           []healthCheckTarget{machineFailureReason},
+			expectedHealthy:                   []healthCheckTarget{},
+			expectedNeedsRemediation:          []healthCheckTarget{machineFailureReason},
+			expectedNeedsRemediationCondition: []clusterv1.Condition{machineFailureReasonCondition},
+			expectedNextCheckTimes:            []time.Duration{},
+		},
+		{
+			desc:                              "when the machine has a failure message",
+			targets:                           []healthCheckTarget{machineFailureMsg},
+			expectedHealthy:                   []healthCheckTarget{},
+			expectedNeedsRemediation:          []healthCheckTarget{machineFailureMsg},
+			expectedNeedsRemediationCondition: []clusterv1.Condition{machineFailureMsgCondition},
+			expectedNextCheckTimes:            []time.Duration{},
+		},
 	}
 
 	for _, tc := range testCases {
@@ -405,9 +455,24 @@
 				return out
 			}
 
+			// Remove the last transition time of the given conditions. Used for comparison with expected conditions.
+			removeLastTransitionTimes := func(in clusterv1.Conditions) clusterv1.Conditions {
+				out := clusterv1.Conditions{}
+				for _, c := range in {
+					withoutTime := c.DeepCopy()
+					withoutTime.LastTransitionTime = metav1.Time{}
+					out = append(out, *withoutTime)
+				}
+				return out
+			}
+
 			gs.Expect(healthy).To(ConsistOf(tc.expectedHealthy))
 			gs.Expect(unhealthy).To(ConsistOf(tc.expectedNeedsRemediation))
 			gs.Expect(nextCheckTimes).To(WithTransform(roundDurations, ConsistOf(tc.expectedNextCheckTimes)))
+			for i, expectedMachineConditions := range tc.expectedNeedsRemediationCondition {
+				actualConditions := unhealthy[i].Machine.GetConditions()
+				gs.Expect(actualConditions).To(WithTransform(removeLastTransitionTimes, ContainElements(expectedMachineConditions)))
+			}
 		})
 	}
 }
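
LastTransitionTime is stamped with the current time when a condition is marked, so it cannot be compared against a precomputed expectation; the helper blanks it out, and Gomega's WithTransform applies that normalization to the actual value before the inner matcher runs. A self-contained sketch of the pattern, using plain strings in place of conditions:

package example

import (
	"strings"
	"testing"

	. "github.com/onsi/gomega"
)

func TestWithTransform(t *testing.T) {
	g := NewWithT(t)

	// WithTransform pipes the actual value through a function before the
	// inner matcher sees it, here normalizing case, in the real test
	// blanking out LastTransitionTime.
	lowercase := func(in []string) []string {
		out := make([]string, 0, len(in))
		for _, s := range in {
			out = append(out, strings.ToLower(s))
		}
		return out
	}

	g.Expect([]string{"Ready", "HealthCheckSucceeded"}).To(
		WithTransform(lowercase, ContainElements("ready", "healthchecksucceeded")))
}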
@@ -477,3 +542,7 @@ func newTestUnhealthyNode(name string, condition corev1.NodeConditionType, status
 		},
 	}
 }
+
+func newFailedHealthCheckCondition(reason string, messageFormat string, messageArgs ...interface{}) clusterv1.Condition {
+	return *conditions.FalseCondition(clusterv1.MachineHealthCheckSucceededCondition, reason, clusterv1.ConditionSeverityWarning, messageFormat, messageArgs...)
+}
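
For reference, conditions.FalseCondition from sigs.k8s.io/cluster-api/util/conditions builds a clusterv1.Condition with Status set to corev1.ConditionFalse, so this helper pins everything except the reason and message. A hypothetical snippet (not part of the commit) showing what it produces for the startup-timeout expectation above:

// Hypothetical usage: the condition expected when a node never starts.
expected := newFailedHealthCheckCondition(
	clusterv1.NodeStartupTimeoutReason,
	"Node failed to report startup in %s", 10*time.Minute,
)
// expected.Type     == clusterv1.MachineHealthCheckSucceededCondition
// expected.Status   == corev1.ConditionFalse
// expected.Severity == clusterv1.ConditionSeverityWarning
// expected.Message  == "Node failed to report startup in 10m0s"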
