Skip to content

Commit cb8d5cf

Browse files
authored
🌱 Skip machines with one WWN, if RAID is desired. (#1162)
Additionally a better error message gets returned, if no matching HetznerBaremetalHost was found.
1 parent 5449a8f commit cb8d5cf

File tree

4 files changed

+241
-39
lines changed

4 files changed

+241
-39
lines changed

controllers/hetznerbaremetalhost_controller_test.go

Lines changed: 75 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -368,7 +368,7 @@ var _ = Describe("HetznerBareMetalHostReconciler", func() {
368368
Expect(testEnv.Cleanup(ctx, host)).To(Succeed())
369369
})
370370

371-
It("gets selected from a bm machine and provisions", func() {
371+
It("gets selected from a bm machine and provisions (context 'provision host')", func() {
372372
Eventually(func() bool {
373373
if err := testEnv.Get(ctx, key, host); err != nil {
374374
return false
@@ -382,12 +382,11 @@ var _ = Describe("HetznerBareMetalHostReconciler", func() {
382382
})
383383
})
384384

385-
Context("provision host with rootDeviceHints raid", func() {
385+
Context("Test secret owner refs", func() {
386386
BeforeEach(func() {
387387
host = helpers.BareMetalHost(
388388
hostName,
389389
testNs.Name,
390-
helpers.WithRootDeviceHintRaid(),
391390
helpers.WithHetznerClusterRef(hetznerClusterName),
392391
)
393392
Expect(testEnv.Create(ctx, host)).To(Succeed())
@@ -399,24 +398,84 @@ var _ = Describe("HetznerBareMetalHostReconciler", func() {
399398
Expect(testEnv.Cleanup(ctx, host)).To(Succeed())
400399
})
401400

402-
It("gets selected from a bm machine and provisions", func() {
401+
It("should create an owner ref of the cluster in the rescue ssh secret", func() {
403402
Eventually(func() bool {
404-
if err := testEnv.Get(ctx, key, host); err != nil {
403+
var secret corev1.Secret
404+
secretKey := types.NamespacedName{Name: rescueSSHSecret.Name, Namespace: rescueSSHSecret.Namespace}
405+
if err := testEnv.Get(ctx, secretKey, &secret); err != nil {
405406
return false
406407
}
407-
if host.Spec.Status.ProvisioningState == infrav1.StateProvisioned {
408-
return true
408+
409+
for _, owner := range secret.GetOwnerReferences() {
410+
if owner.UID == hetznerCluster.GetUID() {
411+
return true
412+
}
409413
}
414+
410415
return false
411-
}, timeout).Should(BeTrue())
416+
}, timeout, time.Second).Should(BeTrue())
412417
})
413418
})
419+
})
414420

415-
Context("Test secret owner refs", func() {
421+
Context("Tests with bm machine (with RAID)", func() {
422+
BeforeEach(func() {
423+
capiMachine = &clusterv1.Machine{
424+
ObjectMeta: metav1.ObjectMeta{
425+
GenerateName: "capi-machine-",
426+
Namespace: testNs.Name,
427+
Finalizers: []string{clusterv1.MachineFinalizer},
428+
Labels: map[string]string{
429+
clusterv1.ClusterNameLabel: capiCluster.Name,
430+
},
431+
},
432+
Spec: clusterv1.MachineSpec{
433+
ClusterName: capiCluster.Name,
434+
InfrastructureRef: corev1.ObjectReference{
435+
APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1",
436+
Kind: "HetznerBareMetalMachine",
437+
Name: bmMachineName,
438+
},
439+
FailureDomain: &defaultFailureDomain,
440+
Bootstrap: clusterv1.Bootstrap{
441+
DataSecretName: ptr.To("bootstrap-secret"),
442+
},
443+
},
444+
}
445+
Expect(testEnv.Create(ctx, capiMachine)).To(Succeed())
446+
447+
bmMachine = &infrav1.HetznerBareMetalMachine{
448+
ObjectMeta: metav1.ObjectMeta{
449+
Name: bmMachineName,
450+
Namespace: testNs.Name,
451+
Labels: map[string]string{
452+
clusterv1.ClusterNameLabel: capiCluster.Name,
453+
},
454+
OwnerReferences: []metav1.OwnerReference{
455+
{
456+
APIVersion: "cluster.x-k8s.io/v1beta1",
457+
Kind: "Machine",
458+
Name: capiMachine.Name,
459+
UID: capiMachine.UID,
460+
},
461+
},
462+
},
463+
Spec: getDefaultHetznerBareMetalMachineSpec(),
464+
}
465+
bmMachine.Spec.InstallImage.Swraid = 1
466+
Expect(testEnv.Create(ctx, bmMachine)).To(Succeed())
467+
})
468+
469+
AfterEach(func() {
470+
Expect(testEnv.Cleanup(ctx, capiMachine, bmMachine)).To(Succeed())
471+
})
472+
473+
Context("provision host with rootDeviceHints RAID", func() {
416474
BeforeEach(func() {
417475
host = helpers.BareMetalHost(
418476
hostName,
419477
testNs.Name,
478+
helpers.WithRootDeviceHintRaid(),
420479
helpers.WithHetznerClusterRef(hetznerClusterName),
421480
)
422481
Expect(testEnv.Create(ctx, host)).To(Succeed())
@@ -428,22 +487,17 @@ var _ = Describe("HetznerBareMetalHostReconciler", func() {
428487
Expect(testEnv.Cleanup(ctx, host)).To(Succeed())
429488
})
430489

431-
It("should create an owner ref of the cluster in the rescue ssh secret", func() {
490+
It("gets selected from a bm machine and provisions (rootDeviceHints RAID)", func() {
491+
Expect(len(host.Spec.RootDeviceHints.Raid.WWN)).To(Equal(2))
432492
Eventually(func() bool {
433-
var secret corev1.Secret
434-
secretKey := types.NamespacedName{Name: rescueSSHSecret.Name, Namespace: rescueSSHSecret.Namespace}
435-
if err := testEnv.Get(ctx, secretKey, &secret); err != nil {
493+
if err := testEnv.Get(ctx, key, host); err != nil {
436494
return false
437495
}
438-
439-
for _, owner := range secret.GetOwnerReferences() {
440-
if owner.UID == hetznerCluster.GetUID() {
441-
return true
442-
}
496+
if host.Spec.Status.ProvisioningState == infrav1.StateProvisioned {
497+
return true
443498
}
444-
445499
return false
446-
}, timeout, time.Second).Should(BeTrue())
500+
}, timeout).Should(BeTrue())
447501
})
448502
})
449503
})
@@ -519,7 +573,7 @@ var _ = Describe("HetznerBareMetalHostReconciler", func() {
519573
Expect(testEnv.Cleanup(ctx, host)).To(Succeed())
520574
})
521575

522-
It("gets selected from a bm machine and provisions", func() {
576+
It("gets selected from a bm machine and provisions (different ports after installImage)", func() {
523577
Eventually(func() bool {
524578
if err := testEnv.Get(ctx, key, host); err != nil {
525579
return false

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ require (
1414
github.com/syself/hrobot-go v0.2.5
1515
go.uber.org/zap v1.27.0
1616
golang.org/x/crypto v0.20.0
17+
golang.org/x/exp v0.0.0-20230905200255-921286631fa9
1718
golang.org/x/mod v0.15.0
1819
k8s.io/api v0.28.4
1920
k8s.io/apimachinery v0.28.4
@@ -110,7 +111,6 @@ require (
110111
github.com/subosito/gotenv v1.6.0 // indirect
111112
github.com/valyala/fastjson v1.6.4 // indirect
112113
go.uber.org/multierr v1.11.0 // indirect
113-
golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect
114114
golang.org/x/net v0.21.0 // indirect
115115
golang.org/x/oauth2 v0.14.0 // indirect
116116
golang.org/x/sync v0.5.0 // indirect

pkg/services/baremetal/baremetal/baremetal.go

Lines changed: 92 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ import (
2929

3030
"github.com/go-logr/logr"
3131
"github.com/hetznercloud/hcloud-go/v2/hcloud"
32+
"golang.org/x/exp/maps"
33+
"golang.org/x/exp/slices"
3234
corev1 "k8s.io/api/core/v1"
3335
"k8s.io/apimachinery/pkg/api/equality"
3436
apierrors "k8s.io/apimachinery/pkg/api/errors"
@@ -283,19 +285,19 @@ func (s *Service) associate(ctx context.Context) error {
283285
}
284286

285287
// choose new host
286-
host, helper, err := s.chooseHost(ctx)
288+
host, helper, reason, err := s.chooseHost(ctx)
287289
if err != nil {
288290
return fmt.Errorf("failed to choose host: %w", err)
289291
}
290292
if host == nil {
291293
s.scope.BareMetalMachine.Status.Phase = clusterv1.MachinePhasePending
292-
s.scope.V(1).Info("No available host found. Requeuing.")
294+
s.scope.Info("No available host found. Requeuing.", "reason", reason)
293295
conditions.MarkFalse(
294296
s.scope.BareMetalMachine,
295297
infrav1.HostAssociateSucceededCondition,
296298
infrav1.NoAvailableHostReason,
297299
clusterv1.ConditionSeverityWarning,
298-
"no available host",
300+
fmt.Sprintf("no available host (%s)", reason),
299301
)
300302
return &scope.RequeueAfterError{RequeueAfter: requeueAfter}
301303
}
@@ -363,71 +365,145 @@ func (s *Service) getAssociatedHost(ctx context.Context) (*infrav1.HetznerBareMe
363365
return &host, helper, nil
364366
}
365367

366-
func (s *Service) chooseHost(ctx context.Context) (*infrav1.HetznerBareMetalHost, *patch.Helper, error) {
368+
// chooseHost tries to find a free hbmh.
369+
// If no hbmh was found, then hbmh and err are nil, and the string
370+
// "reason" contains human readable details.
371+
func (s *Service) chooseHost(ctx context.Context) (
372+
hbmh *infrav1.HetznerBareMetalHost, patchHelper *patch.Helper, reason string, err error,
373+
) {
367374
// get list of hosts scoped to namespace of machine
368375
hosts := infrav1.HetznerBareMetalHostList{}
369376
opts := &client.ListOptions{
370377
Namespace: s.scope.BareMetalMachine.Namespace,
371378
}
372379

373380
if err := s.scope.Client.List(ctx, &hosts, opts); err != nil {
374-
return nil, nil, fmt.Errorf("failed to list hosts: %w", err)
381+
return nil, nil, "", fmt.Errorf("failed to list hosts: %w", err)
375382
}
376383

377384
labelSelector := s.getLabelSelector()
378385

386+
// count all hosts that are not in use already
387+
unusedHostsCounter := 0
388+
389+
// hosts are "available" if they are not in use already by some Kubernetes cluster and do not have
390+
// another reason to not be chosen (labels that don't match the selector, maintenance mode, error state, etc.)
379391
availableHosts := make([]*infrav1.HetznerBareMetalHost, 0, len(hosts.Items))
380392

393+
mapOfSkipReasons := make(map[string]int)
394+
381395
for i, host := range hosts.Items {
382396
if host.Spec.ConsumerRef != nil && consumerRefMatches(host.Spec.ConsumerRef, s.scope.BareMetalMachine) {
383397
helper, err := patch.NewHelper(&hosts.Items[i], s.scope.Client)
384398
if err != nil {
385-
return nil, nil, fmt.Errorf("failed to create patch helper: %w", err)
399+
return nil, nil, "", fmt.Errorf("failed to create patch helper: %w", err)
386400
}
387-
return &hosts.Items[i], helper, nil
401+
return &hosts.Items[i], helper, "", nil
388402
}
389403
if host.Spec.ConsumerRef != nil {
390404
continue
391405
}
392-
if host.Spec.MaintenanceMode != nil && *host.Spec.MaintenanceMode {
406+
407+
// from now on each "continue" should add an entry
408+
// to mapOfSkipReasons.
409+
unusedHostsCounter++
410+
411+
// This comes first, because we should not look too deep into machines
412+
// which are not in our scope.
413+
if !labelSelector.Matches(labels.Set(host.ObjectMeta.Labels)) {
414+
mapOfSkipReasons["label-selector-does-not-match"]++
393415
continue
394416
}
417+
395418
if host.GetDeletionTimestamp() != nil {
419+
mapOfSkipReasons["hbmh-has-deletion-timestamp"]++
396420
continue
397421
}
398-
if host.Spec.Status.ErrorMessage != "" {
422+
423+
if host.Spec.MaintenanceMode != nil && *host.Spec.MaintenanceMode {
424+
mapOfSkipReasons["hbmh-in-maintenance-mode"]++
399425
continue
400426
}
401-
402-
if !labelSelector.Matches(labels.Set(host.ObjectMeta.Labels)) {
427+
if host.Spec.Status.ErrorMessage != "" {
428+
mapOfSkipReasons["hbmh-has-error-message-in-status"]++
403429
continue
404430
}
405431

406432
if host.Spec.Status.ProvisioningState != infrav1.StateNone {
433+
mapOfSkipReasons["hbmh-in-wrong-provisioning-state"]++
407434
continue
408435
}
409436

437+
// This comes last, because now we would choose the machine, but we check
438+
// if the config is correct.
439+
if host.Spec.RootDeviceHints != nil &&
440+
host.Spec.RootDeviceHints.IsValid() {
441+
// Even if there are no rootDeviceHints specified, the host should be picked.
442+
// After the phase registering, the process to provision the server stops and
443+
// waits for the user to specify the rootDeviceHints.
444+
// Here (RootDeviceHints exists and is valid) we want to check whether
445+
// the specified rootDeviceHints fit with the InstallImage configuration
446+
// of the HetznerBareMetalMachine. If not, it is not valid.
447+
// Doing that without first choosing the hbmh would be nice, there is a feature request:
448+
// https://github.com/syself/cluster-api-provider-hetzner/issues/1166
449+
if s.scope.BareMetalMachine.Spec.InstallImage.Swraid == 1 {
450+
// Machine should have RAID. Skip machines which have less than two WWNs
451+
lenOfWwnSlice := len(host.Spec.RootDeviceHints.Raid.WWN)
452+
if lenOfWwnSlice < 2 {
453+
mapOfSkipReasons[fmt.Sprintf("machine-should-use-swraid-but-only-%d-RAID-WWN-in-hbmh", lenOfWwnSlice)]++
454+
continue
455+
}
456+
} else { //nolint:gocritic
457+
// Machine should have no RAID.
458+
if host.Spec.RootDeviceHints.WWN == "" {
459+
mapOfSkipReasons["machine-should-use-no-swraid-and-no-non-raid-WWN-in-hbmh"]++
460+
continue
461+
}
462+
}
463+
}
464+
410465
availableHosts = append(availableHosts, &hosts.Items[i])
411466
}
412467

468+
// return if all hosts are in use with a specific message
469+
if unusedHostsCounter == 0 {
470+
return nil, nil, fmt.Sprintf("all hosts are in use - found %d hosts",
471+
len(hosts.Items)), nil
472+
}
473+
474+
// found hosts that are not in use, but all of them had some reason to not be chosen
413475
if len(availableHosts) == 0 {
414-
return nil, nil, nil
476+
return nil, nil, reasonString(mapOfSkipReasons, unusedHostsCounter), nil
415477
}
416478

417-
// choose a host
479+
// we found available hosts - choose one
418480
randomNumber, err := rand.Int(rand.Reader, big.NewInt(int64(len(availableHosts))))
419481
if err != nil {
420-
return nil, nil, fmt.Errorf("failed to create random number: %w", err)
482+
return nil, nil, "", fmt.Errorf("failed to create random number: %w", err)
421483
}
422484

423485
chosenHost := availableHosts[randomNumber.Int64()]
424486

425487
helper, err := patch.NewHelper(chosenHost, s.scope.Client)
426488
if err != nil {
427-
return nil, nil, fmt.Errorf("failed to create patch helper: %w", err)
489+
return nil, nil, "", fmt.Errorf("failed to create patch helper: %w", err)
428490
}
429491

430-
return chosenHost, helper, nil
492+
return chosenHost, helper, "", nil
493+
}
494+
495+
func reasonString(mapOfSkipReasons map[string]int, unusedHostsCounter int) string {
496+
reasons := make([]string, 0, len(mapOfSkipReasons))
497+
keys := maps.Keys(mapOfSkipReasons)
498+
slices.Sort(keys)
499+
for _, key := range keys {
500+
value := mapOfSkipReasons[key]
501+
if value == 0 {
502+
continue
503+
}
504+
reasons = append(reasons, fmt.Sprintf("%s: %d", key, value))
505+
}
506+
return fmt.Sprintf("No available host of %d found: %s", unusedHostsCounter, strings.Join(reasons, ", "))
431507
}
432508

433509
func (s *Service) reconcileLoadBalancerAttachment(ctx context.Context, host *infrav1.HetznerBareMetalHost) error {

0 commit comments

Comments
 (0)