@@ -29,6 +29,7 @@ import (
29
29
"sigs.k8s.io/controller-runtime/pkg/client/fake"
30
30
31
31
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
32
+ "sigs.k8s.io/cluster-api/errors"
32
33
"sigs.k8s.io/cluster-api/util/conditions"
33
34
"sigs.k8s.io/cluster-api/util/patch"
34
35
)
@@ -209,6 +210,7 @@ func TestHealthCheckTargets(t *testing.T) {
209
210
210
211
timeoutForMachineToHaveNode := 10 * time .Minute
211
212
disabledTimeoutForMachineToHaveNode := time .Duration (0 )
213
+ timeoutForUnhealthyConditions := 5 * time .Minute
212
214
213
215
// Create a test MHC
214
216
testMHC := & clusterv1.MachineHealthCheck {
@@ -225,12 +227,12 @@ func TestHealthCheckTargets(t *testing.T) {
225
227
{
226
228
Type : corev1 .NodeReady ,
227
229
Status : corev1 .ConditionUnknown ,
228
- Timeout : metav1.Duration {Duration : 5 * time . Minute },
230
+ Timeout : metav1.Duration {Duration : timeoutForUnhealthyConditions },
229
231
},
230
232
{
231
233
Type : corev1 .NodeReady ,
232
234
Status : corev1 .ConditionFalse ,
233
- Timeout : metav1.Duration {Duration : 5 * time . Minute },
235
+ Timeout : metav1.Duration {Duration : timeoutForUnhealthyConditions },
234
236
},
235
237
},
236
238
},
@@ -249,6 +251,7 @@ func TestHealthCheckTargets(t *testing.T) {
249
251
Machine : testMachineCreated1200s ,
250
252
Node : nil ,
251
253
}
254
+ nodeNotYetStartedTarget1200sCondition := newFailedHealthCheckCondition (clusterv1 .NodeStartupTimeoutReason , "Node failed to report startup in %s" , timeoutForMachineToHaveNode )
252
255
253
256
testMachineCreated400s := testMachine .DeepCopy ()
254
257
nowMinus400s := metav1 .NewTime (time .Now ().Add (- 400 * time .Second ))
@@ -265,17 +268,18 @@ func TestHealthCheckTargets(t *testing.T) {
265
268
nodeGoneAway := healthCheckTarget {
266
269
Cluster : cluster ,
267
270
MHC : testMHC ,
268
- Machine : testMachine ,
271
+ Machine : testMachine . DeepCopy () ,
269
272
Node : & corev1.Node {},
270
273
nodeMissing : true ,
271
274
}
275
+ nodeGoneAwayCondition := newFailedHealthCheckCondition (clusterv1 .NodeNotFoundReason , "" )
272
276
273
277
// Target for when the node has been in an unknown state for shorter than the timeout
274
278
testNodeUnknown200 := newTestUnhealthyNode ("node1" , corev1 .NodeReady , corev1 .ConditionUnknown , 200 * time .Second )
275
279
nodeUnknown200 := healthCheckTarget {
276
280
Cluster : cluster ,
277
281
MHC : testMHC ,
278
- Machine : testMachine ,
282
+ Machine : testMachine . DeepCopy () ,
279
283
Node : testNodeUnknown200 ,
280
284
nodeMissing : false ,
281
285
}
@@ -285,7 +289,7 @@ func TestHealthCheckTargets(t *testing.T) {
285
289
nodeUnknown100 := healthCheckTarget {
286
290
Cluster : cluster ,
287
291
MHC : testMHC ,
288
- Machine : testMachine ,
292
+ Machine : testMachine . DeepCopy () ,
289
293
Node : testNodeUnknown100 ,
290
294
nodeMissing : false ,
291
295
}
@@ -295,29 +299,55 @@ func TestHealthCheckTargets(t *testing.T) {
295
299
nodeUnknown400 := healthCheckTarget {
296
300
Cluster : cluster ,
297
301
MHC : testMHC ,
298
- Machine : testMachine ,
302
+ Machine : testMachine . DeepCopy () ,
299
303
Node : testNodeUnknown400 ,
300
304
nodeMissing : false ,
301
305
}
306
+ nodeUnknown400Condition := newFailedHealthCheckCondition (clusterv1 .UnhealthyNodeConditionReason , "Condition Ready on node is reporting status Unknown for more than %s" , timeoutForUnhealthyConditions )
302
307
303
308
// Target for when a node is healthy
304
309
testNodeHealthy := newTestNode ("node1" )
305
310
testNodeHealthy .UID = "12345"
306
311
nodeHealthy := healthCheckTarget {
307
312
Cluster : cluster ,
308
313
MHC : testMHC ,
309
- Machine : testMachine ,
314
+ Machine : testMachine . DeepCopy () ,
310
315
Node : testNodeHealthy ,
311
316
nodeMissing : false ,
312
317
}
313
318
319
+ // Target for when the machine has a failure reason
320
+ failureReason := errors .UpdateMachineError
321
+ testMachineFailureReason := testMachine .DeepCopy ()
322
+ testMachineFailureReason .Status .FailureReason = & failureReason
323
+ machineFailureReason := healthCheckTarget {
324
+ Cluster : cluster ,
325
+ MHC : testMHC ,
326
+ Machine : testMachineFailureReason ,
327
+ Node : nil ,
328
+ }
329
+ machineFailureReasonCondition := newFailedHealthCheckCondition (clusterv1 .MachineHasFailureReason , "FailureReason: %s" , failureReason )
330
+
331
+ // Target for when the machine has a failure message
332
+ failureMsg := "some failure message"
333
+ testMachineFailureMsg := testMachine .DeepCopy ()
334
+ testMachineFailureMsg .Status .FailureMessage = & failureMsg
335
+ machineFailureMsg := healthCheckTarget {
336
+ Cluster : cluster ,
337
+ MHC : testMHC ,
338
+ Machine : testMachineFailureMsg ,
339
+ Node : nil ,
340
+ }
341
+ machineFailureMsgCondition := newFailedHealthCheckCondition (clusterv1 .MachineHasFailureReason , "FailureMessage: %s" , failureMsg )
342
+
314
343
testCases := []struct {
315
- desc string
316
- targets []healthCheckTarget
317
- timeoutForMachineToHaveNode * time.Duration
318
- expectedHealthy []healthCheckTarget
319
- expectedNeedsRemediation []healthCheckTarget
320
- expectedNextCheckTimes []time.Duration
344
+ desc string
345
+ targets []healthCheckTarget
346
+ timeoutForMachineToHaveNode * time.Duration
347
+ expectedHealthy []healthCheckTarget
348
+ expectedNeedsRemediation []healthCheckTarget
349
+ expectedNeedsRemediationCondition []clusterv1.Condition
350
+ expectedNextCheckTimes []time.Duration
321
351
}{
322
352
{
323
353
desc : "when the node has not yet started for shorter than the timeout" ,
@@ -327,18 +357,20 @@ func TestHealthCheckTargets(t *testing.T) {
327
357
expectedNextCheckTimes : []time.Duration {timeoutForMachineToHaveNode - 400 * time .Second },
328
358
},
329
359
{
330
- desc : "when the node has not yet started for longer than the timeout" ,
331
- targets : []healthCheckTarget {nodeNotYetStartedTarget1200s },
332
- expectedHealthy : []healthCheckTarget {},
333
- expectedNeedsRemediation : []healthCheckTarget {nodeNotYetStartedTarget1200s },
334
- expectedNextCheckTimes : []time.Duration {},
360
+ desc : "when the node has not yet started for longer than the timeout" ,
361
+ targets : []healthCheckTarget {nodeNotYetStartedTarget1200s },
362
+ expectedHealthy : []healthCheckTarget {},
363
+ expectedNeedsRemediation : []healthCheckTarget {nodeNotYetStartedTarget1200s },
364
+ expectedNeedsRemediationCondition : []clusterv1.Condition {nodeNotYetStartedTarget1200sCondition },
365
+ expectedNextCheckTimes : []time.Duration {},
335
366
},
336
367
{
337
- desc : "when the node has gone away" ,
338
- targets : []healthCheckTarget {nodeGoneAway },
339
- expectedHealthy : []healthCheckTarget {},
340
- expectedNeedsRemediation : []healthCheckTarget {nodeGoneAway },
341
- expectedNextCheckTimes : []time.Duration {},
368
+ desc : "when the node has gone away" ,
369
+ targets : []healthCheckTarget {nodeGoneAway },
370
+ expectedHealthy : []healthCheckTarget {},
371
+ expectedNeedsRemediation : []healthCheckTarget {nodeGoneAway },
372
+ expectedNeedsRemediationCondition : []clusterv1.Condition {nodeGoneAwayCondition },
373
+ expectedNextCheckTimes : []time.Duration {},
342
374
},
343
375
{
344
376
desc : "when the node has been in an unknown state for shorter than the timeout" ,
@@ -348,11 +380,12 @@ func TestHealthCheckTargets(t *testing.T) {
348
380
expectedNextCheckTimes : []time.Duration {100 * time .Second },
349
381
},
350
382
{
351
- desc : "when the node has been in an unknown state for longer than the timeout" ,
352
- targets : []healthCheckTarget {nodeUnknown400 },
353
- expectedHealthy : []healthCheckTarget {},
354
- expectedNeedsRemediation : []healthCheckTarget {nodeUnknown400 },
355
- expectedNextCheckTimes : []time.Duration {},
383
+ desc : "when the node has been in an unknown state for longer than the timeout" ,
384
+ targets : []healthCheckTarget {nodeUnknown400 },
385
+ expectedHealthy : []healthCheckTarget {},
386
+ expectedNeedsRemediation : []healthCheckTarget {nodeUnknown400 },
387
+ expectedNeedsRemediationCondition : []clusterv1.Condition {nodeUnknown400Condition },
388
+ expectedNextCheckTimes : []time.Duration {},
356
389
},
357
390
{
358
391
desc : "when the node is healthy" ,
@@ -362,11 +395,12 @@ func TestHealthCheckTargets(t *testing.T) {
362
395
expectedNextCheckTimes : []time.Duration {},
363
396
},
364
397
{
365
- desc : "with a mix of healthy and unhealthy nodes" ,
366
- targets : []healthCheckTarget {nodeUnknown100 , nodeUnknown200 , nodeUnknown400 , nodeHealthy },
367
- expectedHealthy : []healthCheckTarget {nodeHealthy },
368
- expectedNeedsRemediation : []healthCheckTarget {nodeUnknown400 },
369
- expectedNextCheckTimes : []time.Duration {200 * time .Second , 100 * time .Second },
398
+ desc : "with a mix of healthy and unhealthy nodes" ,
399
+ targets : []healthCheckTarget {nodeUnknown100 , nodeUnknown200 , nodeUnknown400 , nodeHealthy },
400
+ expectedHealthy : []healthCheckTarget {nodeHealthy },
401
+ expectedNeedsRemediation : []healthCheckTarget {nodeUnknown400 },
402
+ expectedNeedsRemediationCondition : []clusterv1.Condition {nodeUnknown400Condition },
403
+ expectedNextCheckTimes : []time.Duration {200 * time .Second , 100 * time .Second },
370
404
},
371
405
{
372
406
desc : "when the node has not started for a long time but the startup timeout is disabled" ,
@@ -376,6 +410,22 @@ func TestHealthCheckTargets(t *testing.T) {
376
410
expectedNeedsRemediation : []healthCheckTarget {},
377
411
expectedNextCheckTimes : []time.Duration {}, // We don't have a timeout so no way to know when to re-check
378
412
},
413
+ {
414
+ desc : "when the machine has a failure reason" ,
415
+ targets : []healthCheckTarget {machineFailureReason },
416
+ expectedHealthy : []healthCheckTarget {},
417
+ expectedNeedsRemediation : []healthCheckTarget {machineFailureReason },
418
+ expectedNeedsRemediationCondition : []clusterv1.Condition {machineFailureReasonCondition },
419
+ expectedNextCheckTimes : []time.Duration {},
420
+ },
421
+ {
422
+ desc : "when the machine has a failure message" ,
423
+ targets : []healthCheckTarget {machineFailureMsg },
424
+ expectedHealthy : []healthCheckTarget {},
425
+ expectedNeedsRemediation : []healthCheckTarget {machineFailureMsg },
426
+ expectedNeedsRemediationCondition : []clusterv1.Condition {machineFailureMsgCondition },
427
+ expectedNextCheckTimes : []time.Duration {},
428
+ },
379
429
}
380
430
381
431
for _ , tc := range testCases {
@@ -405,9 +455,24 @@ func TestHealthCheckTargets(t *testing.T) {
405
455
return out
406
456
}
407
457
458
+ // Remove the last transition time of the given conditions. Used for comparison with expected conditions.
459
+ removeLastTransitionTimes := func (in clusterv1.Conditions ) clusterv1.Conditions {
460
+ out := clusterv1.Conditions {}
461
+ for _ , c := range in {
462
+ withoutTime := c .DeepCopy ()
463
+ withoutTime .LastTransitionTime = metav1.Time {}
464
+ out = append (out , * withoutTime )
465
+ }
466
+ return out
467
+ }
468
+
408
469
gs .Expect (healthy ).To (ConsistOf (tc .expectedHealthy ))
409
470
gs .Expect (unhealthy ).To (ConsistOf (tc .expectedNeedsRemediation ))
410
471
gs .Expect (nextCheckTimes ).To (WithTransform (roundDurations , ConsistOf (tc .expectedNextCheckTimes )))
472
+ for i , expectedMachineConditions := range tc .expectedNeedsRemediationCondition {
473
+ actualConditions := unhealthy [i ].Machine .GetConditions ()
474
+ gs .Expect (actualConditions ).To (WithTransform (removeLastTransitionTimes , ContainElements (expectedMachineConditions )))
475
+ }
411
476
})
412
477
}
413
478
}
@@ -477,3 +542,7 @@ func newTestUnhealthyNode(name string, condition corev1.NodeConditionType, statu
477
542
},
478
543
}
479
544
}
545
+
546
+ // newFailedHealthCheckCondition builds the MachineHealthCheckSucceeded condition with
+ // Status False, severity ConditionSeverityWarning, and the given reason and formatted
+ // message. The tests use it to construct the condition they expect to find on a machine
+ // that was flagged as needing remediation.
+ // NOTE(review): the "machine has a failure message" case above passes
+ // clusterv1.MachineHasFailureReason as the reason — confirm a distinct
+ // MachineHasFailureMessage reason is not intended there.
+ func newFailedHealthCheckCondition (reason string , messageFormat string , messageArgs ... interface {}) clusterv1.Condition {
547
+ return * conditions .FalseCondition (clusterv1 .MachineHealthCheckSucceededCondition , reason , clusterv1 .ConditionSeverityWarning , messageFormat , messageArgs ... )
548
+ }
0 commit comments