@@ -29,6 +29,7 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/client/fake"
 
 	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
+	"sigs.k8s.io/cluster-api/errors"
 	"sigs.k8s.io/cluster-api/util/conditions"
 	"sigs.k8s.io/cluster-api/util/patch"
 )
@@ -209,6 +210,7 @@ func TestHealthCheckTargets(t *testing.T) {
 
 	timeoutForMachineToHaveNode := 10 * time.Minute
 	disabledTimeoutForMachineToHaveNode := time.Duration(0)
+	timeoutForUnhealthyConditions := 5 * time.Minute
 
 	// Create a test MHC
 	testMHC := &clusterv1.MachineHealthCheck{
@@ -225,12 +227,12 @@ func TestHealthCheckTargets(t *testing.T) {
225227 {
226228 Type : corev1 .NodeReady ,
227229 Status : corev1 .ConditionUnknown ,
228- Timeout : metav1.Duration {Duration : 5 * time . Minute },
230+ Timeout : metav1.Duration {Duration : timeoutForUnhealthyConditions },
229231 },
230232 {
231233 Type : corev1 .NodeReady ,
232234 Status : corev1 .ConditionFalse ,
233- Timeout : metav1.Duration {Duration : 5 * time . Minute },
235+ Timeout : metav1.Duration {Duration : timeoutForUnhealthyConditions },
234236 },
235237 },
236238 },
@@ -249,6 +251,8 @@ func TestHealthCheckTargets(t *testing.T) {
 		Machine: testMachineCreated1200s,
 		Node:    nil,
 	}
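+	// Condition the health check is expected to set on the machine when node startup exceeds the timeout.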
+	nodeNotYetStartedTarget1200sCondition := newFailedHealthCheckCondition(clusterv1.NodeStartupTimeoutReason, "Node failed to report startup in %s", timeoutForMachineToHaveNode)
 
 	testMachineCreated400s := testMachine.DeepCopy()
 	nowMinus400s := metav1.NewTime(time.Now().Add(-400 * time.Second))
@@ -265,17 +268,19 @@ func TestHealthCheckTargets(t *testing.T) {
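+	// Each target gets its own copy of the machine: the health check sets conditions on it, so a shared instance would leak conditions between targets.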
 	nodeGoneAway := healthCheckTarget{
 		Cluster:     cluster,
 		MHC:         testMHC,
-		Machine:     testMachine,
+		Machine:     testMachine.DeepCopy(),
 		Node:        &corev1.Node{},
 		nodeMissing: true,
 	}
+	nodeGoneAwayCondition := newFailedHealthCheckCondition(clusterv1.NodeNotFoundReason, "")
 
 	// Target for when the node has been in an unknown state for shorter than the timeout
 	testNodeUnknown200 := newTestUnhealthyNode("node1", corev1.NodeReady, corev1.ConditionUnknown, 200*time.Second)
 	nodeUnknown200 := healthCheckTarget{
 		Cluster:     cluster,
 		MHC:         testMHC,
-		Machine:     testMachine,
+		Machine:     testMachine.DeepCopy(),
 		Node:        testNodeUnknown200,
 		nodeMissing: false,
 	}
@@ -285,7 +289,7 @@ func TestHealthCheckTargets(t *testing.T) {
 	nodeUnknown100 := healthCheckTarget{
 		Cluster:     cluster,
 		MHC:         testMHC,
-		Machine:     testMachine,
+		Machine:     testMachine.DeepCopy(),
 		Node:        testNodeUnknown100,
 		nodeMissing: false,
 	}
@@ -295,29 +299,55 @@ func TestHealthCheckTargets(t *testing.T) {
 	nodeUnknown400 := healthCheckTarget{
 		Cluster:     cluster,
 		MHC:         testMHC,
-		Machine:     testMachine,
+		Machine:     testMachine.DeepCopy(),
 		Node:        testNodeUnknown400,
 		nodeMissing: false,
 	}
+	nodeUnknown400Condition := newFailedHealthCheckCondition(clusterv1.UnhealthyNodeConditionReason, "Condition Ready on node is reporting status Unknown for more than %s", timeoutForUnhealthyConditions)
 
 	// Target for when a node is healthy
 	testNodeHealthy := newTestNode("node1")
 	testNodeHealthy.UID = "12345"
 	nodeHealthy := healthCheckTarget{
 		Cluster:     cluster,
 		MHC:         testMHC,
-		Machine:     testMachine,
+		Machine:     testMachine.DeepCopy(),
 		Node:        testNodeHealthy,
 		nodeMissing: false,
 	}
 
+	// Target for when the machine has a failure reason
+	failureReason := errors.UpdateMachineError
+	testMachineFailureReason := testMachine.DeepCopy()
+	testMachineFailureReason.Status.FailureReason = &failureReason
+	machineFailureReason := healthCheckTarget{
+		Cluster: cluster,
+		MHC:     testMHC,
+		Machine: testMachineFailureReason,
+		Node:    nil,
+	}
+	machineFailureReasonCondition := newFailedHealthCheckCondition(clusterv1.MachineHasFailureReason, "FailureReason: %s", failureReason)
+
+	// Target for when the machine has a failure message
+	failureMsg := "some failure message"
+	testMachineFailureMsg := testMachine.DeepCopy()
+	testMachineFailureMsg.Status.FailureMessage = &failureMsg
+	machineFailureMsg := healthCheckTarget{
+		Cluster: cluster,
+		MHC:     testMHC,
+		Machine: testMachineFailureMsg,
+		Node:    nil,
+	}
+	machineFailureMsgCondition := newFailedHealthCheckCondition(clusterv1.MachineHasFailureReason, "FailureMessage: %s", failureMsg)
+
 	testCases := []struct {
-		desc                        string
-		targets                     []healthCheckTarget
-		timeoutForMachineToHaveNode *time.Duration
-		expectedHealthy             []healthCheckTarget
-		expectedNeedsRemediation    []healthCheckTarget
-		expectedNextCheckTimes      []time.Duration
+		desc                              string
+		targets                           []healthCheckTarget
+		timeoutForMachineToHaveNode       *time.Duration
+		expectedHealthy                   []healthCheckTarget
+		expectedNeedsRemediation          []healthCheckTarget
+		expectedNeedsRemediationCondition []clusterv1.Condition
+		expectedNextCheckTimes            []time.Duration
 	}{
 		{
 			desc:                   "when the node has not yet started for shorter than the timeout",
@@ -327,18 +357,20 @@ func TestHealthCheckTargets(t *testing.T) {
 			expectedNextCheckTimes: []time.Duration{timeoutForMachineToHaveNode - 400*time.Second},
 		},
 		{
-			desc:                     "when the node has not yet started for longer than the timeout",
-			targets:                  []healthCheckTarget{nodeNotYetStartedTarget1200s},
-			expectedHealthy:          []healthCheckTarget{},
-			expectedNeedsRemediation: []healthCheckTarget{nodeNotYetStartedTarget1200s},
-			expectedNextCheckTimes:   []time.Duration{},
+			desc:                              "when the node has not yet started for longer than the timeout",
+			targets:                           []healthCheckTarget{nodeNotYetStartedTarget1200s},
+			expectedHealthy:                   []healthCheckTarget{},
+			expectedNeedsRemediation:          []healthCheckTarget{nodeNotYetStartedTarget1200s},
+			expectedNeedsRemediationCondition: []clusterv1.Condition{nodeNotYetStartedTarget1200sCondition},
+			expectedNextCheckTimes:            []time.Duration{},
 		},
 		{
-			desc:                     "when the node has gone away",
-			targets:                  []healthCheckTarget{nodeGoneAway},
-			expectedHealthy:          []healthCheckTarget{},
-			expectedNeedsRemediation: []healthCheckTarget{nodeGoneAway},
-			expectedNextCheckTimes:   []time.Duration{},
+			desc:                              "when the node has gone away",
+			targets:                           []healthCheckTarget{nodeGoneAway},
+			expectedHealthy:                   []healthCheckTarget{},
+			expectedNeedsRemediation:          []healthCheckTarget{nodeGoneAway},
+			expectedNeedsRemediationCondition: []clusterv1.Condition{nodeGoneAwayCondition},
+			expectedNextCheckTimes:            []time.Duration{},
 		},
 		{
 			desc:                   "when the node has been in an unknown state for shorter than the timeout",
@@ -348,11 +380,12 @@ func TestHealthCheckTargets(t *testing.T) {
 			expectedNextCheckTimes: []time.Duration{100 * time.Second},
 		},
 		{
-			desc:                     "when the node has been in an unknown state for longer than the timeout",
-			targets:                  []healthCheckTarget{nodeUnknown400},
-			expectedHealthy:          []healthCheckTarget{},
-			expectedNeedsRemediation: []healthCheckTarget{nodeUnknown400},
-			expectedNextCheckTimes:   []time.Duration{},
+			desc:                              "when the node has been in an unknown state for longer than the timeout",
+			targets:                           []healthCheckTarget{nodeUnknown400},
+			expectedHealthy:                   []healthCheckTarget{},
+			expectedNeedsRemediation:          []healthCheckTarget{nodeUnknown400},
+			expectedNeedsRemediationCondition: []clusterv1.Condition{nodeUnknown400Condition},
+			expectedNextCheckTimes:            []time.Duration{},
 		},
 		{
 			desc:                   "when the node is healthy",
@@ -362,11 +395,12 @@ func TestHealthCheckTargets(t *testing.T) {
 			expectedNextCheckTimes: []time.Duration{},
 		},
 		{
-			desc:                     "with a mix of healthy and unhealthy nodes",
-			targets:                  []healthCheckTarget{nodeUnknown100, nodeUnknown200, nodeUnknown400, nodeHealthy},
-			expectedHealthy:          []healthCheckTarget{nodeHealthy},
-			expectedNeedsRemediation: []healthCheckTarget{nodeUnknown400},
-			expectedNextCheckTimes:   []time.Duration{200 * time.Second, 100 * time.Second},
+			desc:                              "with a mix of healthy and unhealthy nodes",
+			targets:                           []healthCheckTarget{nodeUnknown100, nodeUnknown200, nodeUnknown400, nodeHealthy},
+			expectedHealthy:                   []healthCheckTarget{nodeHealthy},
+			expectedNeedsRemediation:          []healthCheckTarget{nodeUnknown400},
+			expectedNeedsRemediationCondition: []clusterv1.Condition{nodeUnknown400Condition},
+			expectedNextCheckTimes:            []time.Duration{200 * time.Second, 100 * time.Second},
 		},
 		{
 			desc:                        "when the node has not started for a long time but the startup timeout is disabled",
@@ -376,6 +410,22 @@ func TestHealthCheckTargets(t *testing.T) {
 			expectedNeedsRemediation:    []healthCheckTarget{},
 			expectedNextCheckTimes:      []time.Duration{}, // We don't have a timeout so no way to know when to re-check
 		},
+		{
+			desc:                              "when the machine has a failure reason",
+			targets:                           []healthCheckTarget{machineFailureReason},
+			expectedHealthy:                   []healthCheckTarget{},
+			expectedNeedsRemediation:          []healthCheckTarget{machineFailureReason},
+			expectedNeedsRemediationCondition: []clusterv1.Condition{machineFailureReasonCondition},
+			expectedNextCheckTimes:            []time.Duration{},
+		},
+		{
+			desc:                              "when the machine has a failure message",
+			targets:                           []healthCheckTarget{machineFailureMsg},
+			expectedHealthy:                   []healthCheckTarget{},
+			expectedNeedsRemediation:          []healthCheckTarget{machineFailureMsg},
+			expectedNeedsRemediationCondition: []clusterv1.Condition{machineFailureMsgCondition},
+			expectedNextCheckTimes:            []time.Duration{},
+		},
 	}
 
 	for _, tc := range testCases {
@@ -405,9 +455,25 @@ func TestHealthCheckTargets(t *testing.T) {
 			return out
 		}
 
+		// Remove the last transition time of the given conditions. Used for comparison with expected conditions.
+		removeLastTransitionTimes := func(in clusterv1.Conditions) clusterv1.Conditions {
+			out := clusterv1.Conditions{}
+			for _, c := range in {
+				withoutTime := c.DeepCopy()
+				withoutTime.LastTransitionTime = metav1.Time{}
+				out = append(out, *withoutTime)
+			}
+			return out
+		}
+
 		gs.Expect(healthy).To(ConsistOf(tc.expectedHealthy))
 		gs.Expect(unhealthy).To(ConsistOf(tc.expectedNeedsRemediation))
 		gs.Expect(nextCheckTimes).To(WithTransform(roundDurations, ConsistOf(tc.expectedNextCheckTimes)))
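+		// Verify that each machine marked for remediation also got the expected failed health check condition (timestamps are stripped before comparing).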
+		for i, expectedMachineConditions := range tc.expectedNeedsRemediationCondition {
+			actualConditions := unhealthy[i].Machine.GetConditions()
+			gs.Expect(actualConditions).To(WithTransform(removeLastTransitionTimes, ContainElements(expectedMachineConditions)))
+		}
 		})
 	}
 }
@@ -477,3 +542,8 @@ func newTestUnhealthyNode(name string, condition corev1.NodeConditionType, statu
 		},
 	}
 }
+
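+// newFailedHealthCheckCondition builds the MachineHealthCheckSucceeded condition (status False, warning severity) that the health check is expected to set for the given reason and message.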
+func newFailedHealthCheckCondition(reason string, messageFormat string, messageArgs ...interface{}) clusterv1.Condition {
+	return *conditions.FalseCondition(clusterv1.MachineHealthCheckSucceededCondition, reason, clusterv1.ConditionSeverityWarning, messageFormat, messageArgs...)
+}