@@ -29,6 +29,8 @@ import (
29
29
30
30
"github.com/go-logr/logr"
31
31
"github.com/hetznercloud/hcloud-go/v2/hcloud"
32
+ "golang.org/x/exp/maps"
33
+ "golang.org/x/exp/slices"
32
34
corev1 "k8s.io/api/core/v1"
33
35
"k8s.io/apimachinery/pkg/api/equality"
34
36
apierrors "k8s.io/apimachinery/pkg/api/errors"
@@ -283,19 +285,19 @@ func (s *Service) associate(ctx context.Context) error {
283
285
}
284
286
285
287
// choose new host
286
- host , helper , err := s .chooseHost (ctx )
288
+ host , helper , reason , err := s .chooseHost (ctx )
287
289
if err != nil {
288
290
return fmt .Errorf ("failed to choose host: %w" , err )
289
291
}
290
292
if host == nil {
291
293
s .scope .BareMetalMachine .Status .Phase = clusterv1 .MachinePhasePending
292
- s .scope .V ( 1 ). Info ("No available host found. Requeuing." )
294
+ s .scope .Info ("No available host found. Requeuing." , "reason" , reason )
293
295
conditions .MarkFalse (
294
296
s .scope .BareMetalMachine ,
295
297
infrav1 .HostAssociateSucceededCondition ,
296
298
infrav1 .NoAvailableHostReason ,
297
299
clusterv1 .ConditionSeverityWarning ,
298
- "no available host" ,
300
+ fmt . Sprintf ( "no available host (%s)" , reason ) ,
299
301
)
300
302
return & scope.RequeueAfterError {RequeueAfter : requeueAfter }
301
303
}
@@ -363,71 +365,145 @@ func (s *Service) getAssociatedHost(ctx context.Context) (*infrav1.HetznerBareMe
363
365
return & host , helper , nil
364
366
}
365
367
366
- func (s * Service ) chooseHost (ctx context.Context ) (* infrav1.HetznerBareMetalHost , * patch.Helper , error ) {
368
+ // chooseHost tries to find a free hbmh.
369
+ // If no hbmh was found, then hbmh and err are nil, and the string
370
+ // "reason" contains human readable details.
371
+ func (s * Service ) chooseHost (ctx context.Context ) (
372
+ hbmh * infrav1.HetznerBareMetalHost , patchHelper * patch.Helper , reason string , err error ,
373
+ ) {
367
374
// get list of hosts scoped to namespace of machine
368
375
hosts := infrav1.HetznerBareMetalHostList {}
369
376
opts := & client.ListOptions {
370
377
Namespace : s .scope .BareMetalMachine .Namespace ,
371
378
}
372
379
373
380
if err := s .scope .Client .List (ctx , & hosts , opts ); err != nil {
374
- return nil , nil , fmt .Errorf ("failed to list hosts: %w" , err )
381
+ return nil , nil , "" , fmt .Errorf ("failed to list hosts: %w" , err )
375
382
}
376
383
377
384
labelSelector := s .getLabelSelector ()
378
385
386
+ // count all hosts that are not in use already
387
+ unusedHostsCounter := 0
388
+
389
+ // hosts are "available" if they are not in use already by some Kubernetes cluster and do not have
390
+ // another reason to not be chosen (labels that don't match the selector, maintenance mode, error state, etc.)
379
391
availableHosts := make ([]* infrav1.HetznerBareMetalHost , 0 , len (hosts .Items ))
380
392
393
+ mapOfSkipReasons := make (map [string ]int )
394
+
381
395
for i , host := range hosts .Items {
382
396
if host .Spec .ConsumerRef != nil && consumerRefMatches (host .Spec .ConsumerRef , s .scope .BareMetalMachine ) {
383
397
helper , err := patch .NewHelper (& hosts .Items [i ], s .scope .Client )
384
398
if err != nil {
385
- return nil , nil , fmt .Errorf ("failed to create patch helper: %w" , err )
399
+ return nil , nil , "" , fmt .Errorf ("failed to create patch helper: %w" , err )
386
400
}
387
- return & hosts .Items [i ], helper , nil
401
+ return & hosts .Items [i ], helper , "" , nil
388
402
}
389
403
if host .Spec .ConsumerRef != nil {
390
404
continue
391
405
}
392
- if host .Spec .MaintenanceMode != nil && * host .Spec .MaintenanceMode {
406
+
407
+ // from now on each "continue" should add an entry
408
+ // to mapOfSkipReasons.
409
+ unusedHostsCounter ++
410
+
411
+ // This comes first, because we should not look too deep into machines
412
+ // which are not in our scope.
413
+ if ! labelSelector .Matches (labels .Set (host .ObjectMeta .Labels )) {
414
+ mapOfSkipReasons ["label-selector-does-not-match" ]++
393
415
continue
394
416
}
417
+
395
418
if host .GetDeletionTimestamp () != nil {
419
+ mapOfSkipReasons ["hbmh-has-deletion-timestamp" ]++
396
420
continue
397
421
}
398
- if host .Spec .Status .ErrorMessage != "" {
422
+
423
+ if host .Spec .MaintenanceMode != nil && * host .Spec .MaintenanceMode {
424
+ mapOfSkipReasons ["hbmh-in-maintenance-mode" ]++
399
425
continue
400
426
}
401
-
402
- if ! labelSelector . Matches ( labels . Set ( host . ObjectMeta . Labels )) {
427
+ if host . Spec . Status . ErrorMessage != "" {
428
+ mapOfSkipReasons [ "hbmh-has-error-message-in-status" ] ++
403
429
continue
404
430
}
405
431
406
432
if host .Spec .Status .ProvisioningState != infrav1 .StateNone {
433
+ mapOfSkipReasons ["hbmh-in-wrong-provisioning-state" ]++
407
434
continue
408
435
}
409
436
437
+ // This comes last, because now we would choose the machine, but we check
438
+ // if the config is correct.
439
+ if host .Spec .RootDeviceHints != nil &&
440
+ host .Spec .RootDeviceHints .IsValid () {
441
+ // Even if there are no rootDeviceHints specified, the host should be picked.
442
+ // After the phase registering, the process to provision the server stops and
443
+ // waits for the user to specify the rootDeviceHints.
444
+ // Here (RootDeviceHints exists and is valid) we want to check whether
445
+ // the specified rootDeviceHints fit with the InstallImage configuration
446
+ // of the HetznerBareMetalMachine. If not, it is not valid.
447
+ // Doing that without first choosing the hbmh would be nice, there is a feature request:
448
+ // https://github.com/syself/cluster-api-provider-hetzner/issues/1166
449
+ if s .scope .BareMetalMachine .Spec .InstallImage .Swraid == 1 {
450
+ // Machine should have RAID. Skip machines which have less than two WWNs
451
+ lenOfWwnSlice := len (host .Spec .RootDeviceHints .Raid .WWN )
452
+ if lenOfWwnSlice < 2 {
453
+ mapOfSkipReasons [fmt .Sprintf ("machine-should-use-swraid-but-only-%d-RAID-WWN-in-hbmh" , lenOfWwnSlice )]++
454
+ continue
455
+ }
456
+ } else { //nolint:gocritic
457
+ // Machine should have no RAID.
458
+ if host .Spec .RootDeviceHints .WWN == "" {
459
+ mapOfSkipReasons ["machine-should-use-no-swraid-and-no-non-raid-WWN-in-hbmh" ]++
460
+ continue
461
+ }
462
+ }
463
+ }
464
+
410
465
availableHosts = append (availableHosts , & hosts .Items [i ])
411
466
}
412
467
468
+ // return if all hosts are in use with a specific message
469
+ if unusedHostsCounter == 0 {
470
+ return nil , nil , fmt .Sprintf ("all hosts are in use - found %d hosts" ,
471
+ len (hosts .Items )), nil
472
+ }
473
+
474
+ // found hosts that are not in use, but all of them had some reason to not be chosen
413
475
if len (availableHosts ) == 0 {
414
- return nil , nil , nil
476
+ return nil , nil , reasonString ( mapOfSkipReasons , unusedHostsCounter ), nil
415
477
}
416
478
417
- // choose a host
479
+ // we found available hosts - choose one
418
480
randomNumber , err := rand .Int (rand .Reader , big .NewInt (int64 (len (availableHosts ))))
419
481
if err != nil {
420
- return nil , nil , fmt .Errorf ("failed to create random number: %w" , err )
482
+ return nil , nil , "" , fmt .Errorf ("failed to create random number: %w" , err )
421
483
}
422
484
423
485
chosenHost := availableHosts [randomNumber .Int64 ()]
424
486
425
487
helper , err := patch .NewHelper (chosenHost , s .scope .Client )
426
488
if err != nil {
427
- return nil , nil , fmt .Errorf ("failed to create patch helper: %w" , err )
489
+ return nil , nil , "" , fmt .Errorf ("failed to create patch helper: %w" , err )
428
490
}
429
491
430
- return chosenHost , helper , nil
492
+ return chosenHost , helper , "" , nil
493
+ }
494
+
495
+ func reasonString (mapOfSkipReasons map [string ]int , unusedHostsCounter int ) string {
496
+ reasons := make ([]string , 0 , len (mapOfSkipReasons ))
497
+ keys := maps .Keys (mapOfSkipReasons )
498
+ slices .Sort (keys )
499
+ for _ , key := range keys {
500
+ value := mapOfSkipReasons [key ]
501
+ if value == 0 {
502
+ continue
503
+ }
504
+ reasons = append (reasons , fmt .Sprintf ("%s: %d" , key , value ))
505
+ }
506
+ return fmt .Sprintf ("No available host of %d found: %s" , unusedHostsCounter , strings .Join (reasons , ", " ))
431
507
}
432
508
433
509
func (s * Service ) reconcileLoadBalancerAttachment (ctx context.Context , host * infrav1.HetznerBareMetalHost ) error {
0 commit comments