
Commit 0c45ee8

Author: Florian Gutmann
Fix messages of MachineHealthCheckSucceededCondition
Messages for the MachineHasFailureReason reason contained memory addresses instead of the failure reason text. This led to hot reconciliation loops, because the message changed on every reconciliation.

    Old: "FailureReason: 0xc000d80d10"
    New: "FailureReason: UpdateError"

Messages for the UnhealthyNodeConditionReason contained the timeout value rendered as a nested struct.

    Old: "Node failed to report startup in &Duration{Duration:10m0s,}"
    New: "Node failed to report startup in 10m0s"

cr: https://code.amazon.com/reviews/CR-74430064
1 parent 8b86290 commit 0c45ee8
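Why the old message was unstable: fmt's %v verb renders a pointer to a type without a String() method as a hex address, and that address differs between reconciliations, so the condition message never converged. A minimal, standalone sketch of the difference between formatting the pointer and formatting the dereferenced value (statusError is a hypothetical stand-in for the string-backed errors.MachineStatusError behind Machine.Status.FailureReason):

package main

import "fmt"

// statusError is a hypothetical stand-in for a string-backed reason type
// such as errors.MachineStatusError; Machine.Status.FailureReason holds a
// pointer to such a value.
type statusError string

func main() {
    reason := statusError("UpdateError")
    failureReason := &reason

    // %v on a pointer to a non-Stringer type prints a memory address. The
    // address changes between runs, so a condition message built this way
    // keeps changing and the object is patched in a hot loop.
    fmt.Printf("FailureReason: %v\n", failureReason) // e.g. FailureReason: 0xc000014070

    // Dereferencing first prints the stable reason text, as in the fix below.
    fmt.Printf("FailureReason: %v\n", *failureReason) // FailureReason: UpdateError
}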

2 files changed (+108, −38 lines)

internal/controllers/machinehealthcheck/machinehealthcheck_targets.go
Lines changed: 6 additions & 5 deletions

@@ -93,13 +93,13 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachineToHaveNode
     now := time.Now()
 
     if t.Machine.Status.FailureReason != nil {
-        conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "FailureReason: %v", t.Machine.Status.FailureReason)
+        conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "FailureReason: %v", *t.Machine.Status.FailureReason)
         logger.V(3).Info("Target is unhealthy", "failureReason", t.Machine.Status.FailureReason)
         return true, time.Duration(0)
     }
 
     if t.Machine.Status.FailureMessage != nil {
-        conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "FailureMessage: %v", t.Machine.Status.FailureMessage)
+        conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "FailureMessage: %v", *t.Machine.Status.FailureMessage)
         logger.V(3).Info("Target is unhealthy", "failureMessage", t.Machine.Status.FailureMessage)
         return true, time.Duration(0)
     }
@@ -148,14 +148,15 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachineToHaveNode
     }
     logger.V(3).Info("Using comparison time", "time", comparisonTime)
 
+    timeoutDuration := timeoutForMachineToHaveNode.Duration
     if comparisonTime.Add(timeoutForMachineToHaveNode.Duration).Before(now) {
-        conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.NodeStartupTimeoutReason, clusterv1.ConditionSeverityWarning, "Node failed to report startup in %s", timeoutForMachineToHaveNode.String())
-        logger.V(3).Info("Target is unhealthy: machine has no node", "duration", timeoutForMachineToHaveNode.String())
+        conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.NodeStartupTimeoutReason, clusterv1.ConditionSeverityWarning, "Node failed to report startup in %s", timeoutDuration)
+        logger.V(3).Info("Target is unhealthy: machine has no node", "duration", timeoutDuration)
         return true, time.Duration(0)
     }
 
     durationUnhealthy := now.Sub(comparisonTime)
-    nextCheck := timeoutForMachineToHaveNode.Duration - durationUnhealthy + time.Second
+    nextCheck := timeoutDuration - durationUnhealthy + time.Second
 
     return false, nextCheck
 }
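The second hunk fixes the startup-timeout message the same way the commit message describes: formatting the *metav1.Duration invoked its String() method, which renders the whole struct (the "&Duration{Duration:10m0s,}" form quoted above), so the fix formats the inner time.Duration instead. A sketch with a hypothetical wrappedDuration type standing in for metav1.Duration:

package main

import (
    "fmt"
    "time"
)

// wrappedDuration is a hypothetical stand-in for metav1.Duration: a struct
// wrapping time.Duration whose String() method dumps the struct form, as
// quoted in the commit message.
type wrappedDuration struct {
    Duration time.Duration
}

func (d *wrappedDuration) String() string {
    return fmt.Sprintf("&Duration{Duration:%v,}", d.Duration)
}

func main() {
    timeout := &wrappedDuration{Duration: 10 * time.Minute}

    // Old behaviour: the wrapper's String() leaks the struct representation.
    fmt.Printf("Node failed to report startup in %s\n", timeout)
    // -> Node failed to report startup in &Duration{Duration:10m0s,}

    // New behaviour: format the inner time.Duration, which has a
    // human-readable String().
    fmt.Printf("Node failed to report startup in %s\n", timeout.Duration)
    // -> Node failed to report startup in 10m0s
}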

internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go
Lines changed: 102 additions & 33 deletions

@@ -29,6 +29,7 @@ import (
     "sigs.k8s.io/controller-runtime/pkg/client/fake"
 
     clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
+    "sigs.k8s.io/cluster-api/errors"
     "sigs.k8s.io/cluster-api/util/conditions"
     "sigs.k8s.io/cluster-api/util/patch"
 )
@@ -209,6 +210,7 @@ func TestHealthCheckTargets(t *testing.T) {
 
     timeoutForMachineToHaveNode := 10 * time.Minute
     disabledTimeoutForMachineToHaveNode := time.Duration(0)
+    timeoutForUnhealthyConditions := 5 * time.Minute
 
     // Create a test MHC
     testMHC := &clusterv1.MachineHealthCheck{
@@ -225,12 +227,12 @@
             {
                 Type:    corev1.NodeReady,
                 Status:  corev1.ConditionUnknown,
-                Timeout: metav1.Duration{Duration: 5 * time.Minute},
+                Timeout: metav1.Duration{Duration: timeoutForUnhealthyConditions},
             },
             {
                 Type:    corev1.NodeReady,
                 Status:  corev1.ConditionFalse,
-                Timeout: metav1.Duration{Duration: 5 * time.Minute},
+                Timeout: metav1.Duration{Duration: timeoutForUnhealthyConditions},
             },
         },
     },
@@ -249,6 +251,7 @@
         Machine: testMachineCreated1200s,
         Node:    nil,
     }
+    nodeNotYetStartedTarget1200sCondition := newFailedHealthCheckCondition(clusterv1.NodeStartupTimeoutReason, "Node failed to report startup in %s", timeoutForMachineToHaveNode)
 
     testMachineCreated400s := testMachine.DeepCopy()
     nowMinus400s := metav1.NewTime(time.Now().Add(-400 * time.Second))
@@ -265,17 +268,18 @@
     nodeGoneAway := healthCheckTarget{
         Cluster:     cluster,
         MHC:         testMHC,
-        Machine:     testMachine,
+        Machine:     testMachine.DeepCopy(),
         Node:        &corev1.Node{},
         nodeMissing: true,
     }
+    nodeGoneAwayCondition := newFailedHealthCheckCondition(clusterv1.NodeNotFoundReason, "")
 
     // Target for when the node has been in an unknown state for shorter than the timeout
     testNodeUnknown200 := newTestUnhealthyNode("node1", corev1.NodeReady, corev1.ConditionUnknown, 200*time.Second)
     nodeUnknown200 := healthCheckTarget{
         Cluster:     cluster,
         MHC:         testMHC,
-        Machine:     testMachine,
+        Machine:     testMachine.DeepCopy(),
         Node:        testNodeUnknown200,
         nodeMissing: false,
     }
@@ -285,7 +289,7 @@
     nodeUnknown100 := healthCheckTarget{
         Cluster:     cluster,
         MHC:         testMHC,
-        Machine:     testMachine,
+        Machine:     testMachine.DeepCopy(),
         Node:        testNodeUnknown100,
         nodeMissing: false,
     }
@@ -295,29 +299,55 @@
     nodeUnknown400 := healthCheckTarget{
         Cluster:     cluster,
         MHC:         testMHC,
-        Machine:     testMachine,
+        Machine:     testMachine.DeepCopy(),
         Node:        testNodeUnknown400,
         nodeMissing: false,
     }
+    nodeUnknown400Condition := newFailedHealthCheckCondition(clusterv1.UnhealthyNodeConditionReason, "Condition Ready on node is reporting status Unknown for more than %s", timeoutForUnhealthyConditions)
 
     // Target for when a node is healthy
     testNodeHealthy := newTestNode("node1")
     testNodeHealthy.UID = "12345"
     nodeHealthy := healthCheckTarget{
         Cluster:     cluster,
         MHC:         testMHC,
-        Machine:     testMachine,
+        Machine:     testMachine.DeepCopy(),
         Node:        testNodeHealthy,
         nodeMissing: false,
     }
 
+    // Target for when the machine has a failure reason
+    failureReason := errors.UpdateMachineError
+    testMachineFailureReason := testMachine.DeepCopy()
+    testMachineFailureReason.Status.FailureReason = &failureReason
+    machineFailureReason := healthCheckTarget{
+        Cluster: cluster,
+        MHC:     testMHC,
+        Machine: testMachineFailureReason,
+        Node:    nil,
+    }
+    machineFailureReasonCondition := newFailedHealthCheckCondition(clusterv1.MachineHasFailureReason, "FailureReason: %s", failureReason)
+
+    // Target for when the machine has a failure message
+    failureMsg := "some failure message"
+    testMachineFailureMsg := testMachine.DeepCopy()
+    testMachineFailureMsg.Status.FailureMessage = &failureMsg
+    machineFailureMsg := healthCheckTarget{
+        Cluster: cluster,
+        MHC:     testMHC,
+        Machine: testMachineFailureMsg,
+        Node:    nil,
+    }
+    machineFailureMsgCondition := newFailedHealthCheckCondition(clusterv1.MachineHasFailureReason, "FailureMessage: %s", failureMsg)
+
     testCases := []struct {
-        desc                        string
-        targets                     []healthCheckTarget
-        timeoutForMachineToHaveNode *time.Duration
-        expectedHealthy             []healthCheckTarget
-        expectedNeedsRemediation    []healthCheckTarget
-        expectedNextCheckTimes      []time.Duration
+        desc                              string
+        targets                           []healthCheckTarget
+        timeoutForMachineToHaveNode       *time.Duration
+        expectedHealthy                   []healthCheckTarget
+        expectedNeedsRemediation          []healthCheckTarget
+        expectedNeedsRemediationCondition []clusterv1.Condition
+        expectedNextCheckTimes            []time.Duration
     }{
         {
             desc: "when the node has not yet started for shorter than the timeout",
@@ -327,18 +357,20 @@
             expectedNextCheckTimes: []time.Duration{timeoutForMachineToHaveNode - 400*time.Second},
         },
         {
-            desc:                     "when the node has not yet started for longer than the timeout",
-            targets:                  []healthCheckTarget{nodeNotYetStartedTarget1200s},
-            expectedHealthy:          []healthCheckTarget{},
-            expectedNeedsRemediation: []healthCheckTarget{nodeNotYetStartedTarget1200s},
-            expectedNextCheckTimes:   []time.Duration{},
+            desc:                              "when the node has not yet started for longer than the timeout",
+            targets:                           []healthCheckTarget{nodeNotYetStartedTarget1200s},
+            expectedHealthy:                   []healthCheckTarget{},
+            expectedNeedsRemediation:          []healthCheckTarget{nodeNotYetStartedTarget1200s},
+            expectedNeedsRemediationCondition: []clusterv1.Condition{nodeNotYetStartedTarget1200sCondition},
+            expectedNextCheckTimes:            []time.Duration{},
         },
         {
-            desc:                     "when the node has gone away",
-            targets:                  []healthCheckTarget{nodeGoneAway},
-            expectedHealthy:          []healthCheckTarget{},
-            expectedNeedsRemediation: []healthCheckTarget{nodeGoneAway},
-            expectedNextCheckTimes:   []time.Duration{},
+            desc:                              "when the node has gone away",
+            targets:                           []healthCheckTarget{nodeGoneAway},
+            expectedHealthy:                   []healthCheckTarget{},
+            expectedNeedsRemediation:          []healthCheckTarget{nodeGoneAway},
+            expectedNeedsRemediationCondition: []clusterv1.Condition{nodeGoneAwayCondition},
+            expectedNextCheckTimes:            []time.Duration{},
         },
         {
             desc: "when the node has been in an unknown state for shorter than the timeout",
@@ -348,11 +380,12 @@
             expectedNextCheckTimes: []time.Duration{100 * time.Second},
         },
         {
-            desc:                     "when the node has been in an unknown state for longer than the timeout",
-            targets:                  []healthCheckTarget{nodeUnknown400},
-            expectedHealthy:          []healthCheckTarget{},
-            expectedNeedsRemediation: []healthCheckTarget{nodeUnknown400},
-            expectedNextCheckTimes:   []time.Duration{},
+            desc:                              "when the node has been in an unknown state for longer than the timeout",
+            targets:                           []healthCheckTarget{nodeUnknown400},
+            expectedHealthy:                   []healthCheckTarget{},
+            expectedNeedsRemediation:          []healthCheckTarget{nodeUnknown400},
+            expectedNeedsRemediationCondition: []clusterv1.Condition{nodeUnknown400Condition},
+            expectedNextCheckTimes:            []time.Duration{},
         },
         {
             desc: "when the node is healthy",
@@ -362,11 +395,12 @@
             expectedNextCheckTimes: []time.Duration{},
         },
         {
-            desc:                     "with a mix of healthy and unhealthy nodes",
-            targets:                  []healthCheckTarget{nodeUnknown100, nodeUnknown200, nodeUnknown400, nodeHealthy},
-            expectedHealthy:          []healthCheckTarget{nodeHealthy},
-            expectedNeedsRemediation: []healthCheckTarget{nodeUnknown400},
-            expectedNextCheckTimes:   []time.Duration{200 * time.Second, 100 * time.Second},
+            desc:                              "with a mix of healthy and unhealthy nodes",
+            targets:                           []healthCheckTarget{nodeUnknown100, nodeUnknown200, nodeUnknown400, nodeHealthy},
+            expectedHealthy:                   []healthCheckTarget{nodeHealthy},
+            expectedNeedsRemediation:          []healthCheckTarget{nodeUnknown400},
+            expectedNeedsRemediationCondition: []clusterv1.Condition{nodeUnknown400Condition},
+            expectedNextCheckTimes:            []time.Duration{200 * time.Second, 100 * time.Second},
         },
         {
             desc: "when the node has not started for a long time but the startup timeout is disabled",
@@ -376,6 +410,22 @@
             expectedNeedsRemediation: []healthCheckTarget{},
             expectedNextCheckTimes:   []time.Duration{}, // We don't have a timeout so no way to know when to re-check
         },
+        {
+            desc:                              "when the machine has a failure reason",
+            targets:                           []healthCheckTarget{machineFailureReason},
+            expectedHealthy:                   []healthCheckTarget{},
+            expectedNeedsRemediation:          []healthCheckTarget{machineFailureReason},
+            expectedNeedsRemediationCondition: []clusterv1.Condition{machineFailureReasonCondition},
+            expectedNextCheckTimes:            []time.Duration{},
+        },
+        {
+            desc:                              "when the machine has a failure message",
+            targets:                           []healthCheckTarget{machineFailureMsg},
+            expectedHealthy:                   []healthCheckTarget{},
+            expectedNeedsRemediation:          []healthCheckTarget{machineFailureMsg},
+            expectedNeedsRemediationCondition: []clusterv1.Condition{machineFailureMsgCondition},
+            expectedNextCheckTimes:            []time.Duration{},
+        },
     }
 
     for _, tc := range testCases {
@@ -405,9 +455,24 @@
            return out
        }
 
+       // Remove the last transition time of the given conditions. Used for comparison with expected conditions.
+       removeLastTransitionTimes := func(in clusterv1.Conditions) clusterv1.Conditions {
+           out := clusterv1.Conditions{}
+           for _, c := range in {
+               withoutTime := c.DeepCopy()
+               withoutTime.LastTransitionTime = metav1.Time{}
+               out = append(out, *withoutTime)
+           }
+           return out
+       }
+
        gs.Expect(healthy).To(ConsistOf(tc.expectedHealthy))
        gs.Expect(unhealthy).To(ConsistOf(tc.expectedNeedsRemediation))
        gs.Expect(nextCheckTimes).To(WithTransform(roundDurations, ConsistOf(tc.expectedNextCheckTimes)))
+       for i, expectedMachineConditions := range tc.expectedNeedsRemediationCondition {
+           actualConditions := unhealthy[i].Machine.GetConditions()
+           gs.Expect(actualConditions).To(WithTransform(removeLastTransitionTimes, ContainElements(expectedMachineConditions)))
+       }
     })
 }
 }
@@ -477,3 +542,7 @@ func newTestUnhealthyNode(name string, condition corev1.NodeConditionType, status
         },
     }
 }
+
+func newFailedHealthCheckCondition(reason string, messageFormat string, messageArgs ...interface{}) clusterv1.Condition {
+    return *conditions.FalseCondition(clusterv1.MachineHealthCheckSucceededCondition, reason, clusterv1.ConditionSeverityWarning, messageFormat, messageArgs...)
+}
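One more pattern worth calling out in the test changes: every target now gets testMachine.DeepCopy() instead of the shared testMachine. The new assertions read conditions off each unhealthy target's Machine, and conditions.MarkFalse writes onto that Machine in place, so a shared object would let one target's condition leak into another's assertion. A stripped-down illustration of the aliasing hazard (hypothetical machine and target types, not the cluster-api ones):

package main

import "fmt"

type machine struct {
    conditions []string
}

type target struct {
    m *machine
}

func main() {
    shared := &machine{}

    // Two targets aliasing the same machine object.
    a := target{m: shared}
    b := target{m: shared}

    // Marking a condition through one target...
    a.m.conditions = append(a.m.conditions, "HealthCheckSucceeded=False")

    // ...is visible through the other, corrupting per-target assertions.
    fmt.Println(b.m.conditions) // [HealthCheckSucceeded=False]

    // Giving each target its own copy keeps recorded conditions isolated.
    ownCopy := &machine{conditions: append([]string(nil), shared.conditions...)}
    c := target{m: ownCopy}
    c.m.conditions = append(c.m.conditions, "NodeNotFound")
    fmt.Println(shared.conditions) // still only [HealthCheckSucceeded=False]
}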
