Skip to content

Commit 6eda8ea

Browse files
improve logging for the MachineSet scale up/down workflow
1 parent 0043155 commit 6eda8ea

File tree

1 file changed

+40
-36
lines changed

1 file changed

+40
-36
lines changed

internal/controllers/machineset/machineset_controller.go

Lines changed: 40 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,6 @@ func patchMachineSet(ctx context.Context, patchHelper *patch.Helper, machineSet
196196

197197
func (r *Reconciler) reconcile(ctx context.Context, cluster *clusterv1.Cluster, machineSet *clusterv1.MachineSet) (ctrl.Result, error) {
198198
log := ctrl.LoggerFrom(ctx)
199-
log.V(4).Info("Reconcile MachineSet")
200199

201200
// Reconcile and retrieve the Cluster object.
202201
if machineSet.Labels == nil {
@@ -285,7 +284,7 @@ func (r *Reconciler) reconcile(ctx context.Context, cluster *clusterv1.Cluster,
285284
continue
286285
}
287286
if conditions.IsFalse(machine, clusterv1.MachineOwnerRemediatedCondition) {
288-
log.Info("Deleting unhealthy machine")
287+
log.Info("Deleting machine because marked as unhealthy by the MachineHealthCheck controller")
289288
patch := client.MergeFrom(machine.DeepCopy())
290289
if err := r.Client.Delete(ctx, machine); err != nil {
291290
errs = append(errs, errors.Wrap(err, "failed to delete"))
@@ -335,7 +334,6 @@ func (r *Reconciler) reconcile(ctx context.Context, cluster *clusterv1.Cluster,
335334

336335
// Quickly reconcile until the nodes become Ready.
337336
if machineSet.Status.ReadyReplicas != replicas {
338-
log.V(4).Info("Some nodes are not ready yet, requeuing until they are ready")
339337
return ctrl.Result{RequeueAfter: 15 * time.Second}, nil
340338
}
341339

@@ -352,10 +350,10 @@ func (r *Reconciler) syncReplicas(ctx context.Context, ms *clusterv1.MachineSet,
352350
switch {
353351
case diff < 0:
354352
diff *= -1
355-
log.Info("Too few replicas", "need", *(ms.Spec.Replicas), "creating", diff)
353+
log.Info(fmt.Sprintf("MachineSet is scaling up to %d replicas by creating %d machines", *(ms.Spec.Replicas), diff), "replicas", *(ms.Spec.Replicas), "machineCount", len(machines))
356354
if ms.Annotations != nil {
357355
if _, ok := ms.Annotations[clusterv1.DisableMachineCreate]; ok {
358-
log.V(2).Info("Automatic creation of new machines disabled for machine set")
356+
log.Info("Automatic creation of new machines disabled for machine set")
359357
return nil
360358
}
361359
}
@@ -365,11 +363,7 @@ func (r *Reconciler) syncReplicas(ctx context.Context, ms *clusterv1.MachineSet,
365363
)
366364

367365
for i := 0; i < diff; i++ {
368-
log.Info(fmt.Sprintf("Creating machine %d of %d, ( spec.replicas(%d) > currentMachineCount(%d) )",
369-
i+1, diff, *(ms.Spec.Replicas), len(machines)))
370-
371366
machine := r.getNewMachine(ms)
372-
log = log.WithValues("Machine", klog.KObj(machine))
373367

374368
// Clone and set the infrastructure and bootstrap references.
375369
var (
@@ -394,9 +388,10 @@ func (r *Reconciler) syncReplicas(ctx context.Context, ms *clusterv1.MachineSet,
394388
})
395389
if err != nil {
396390
conditions.MarkFalse(ms, clusterv1.MachinesCreatedCondition, clusterv1.BootstrapTemplateCloningFailedReason, clusterv1.ConditionSeverityError, err.Error())
397-
return errors.Wrapf(err, "failed to clone bootstrap configuration for MachineSet %q in namespace %q", ms.Name, ms.Namespace)
391+
return errors.Wrapf(err, "failed to clone bootstrap configuration from %s %s while creating a machine", machine.Spec.Bootstrap.ConfigRef.Kind, klog.KRef(machine.Spec.Bootstrap.ConfigRef.Namespace, machine.Spec.Bootstrap.ConfigRef.Name))
398392
}
399393
machine.Spec.Bootstrap.ConfigRef = bootstrapRef
394+
log = log.WithValues(bootstrapRef.Kind, klog.KRef(bootstrapRef.Namespace, bootstrapRef.Name))
400395
}
401396

402397
infraRef, err = external.CreateFromTemplate(ctx, &external.CreateFromTemplateInput{
@@ -415,30 +410,31 @@ func (r *Reconciler) syncReplicas(ctx context.Context, ms *clusterv1.MachineSet,
415410
})
416411
if err != nil {
417412
conditions.MarkFalse(ms, clusterv1.MachinesCreatedCondition, clusterv1.InfrastructureTemplateCloningFailedReason, clusterv1.ConditionSeverityError, err.Error())
418-
return errors.Wrapf(err, "failed to clone infrastructure configuration for MachineSet %q in namespace %q", ms.Name, ms.Namespace)
413+
return errors.Wrapf(err, "failed to clone infrastructure machine from %s %s while creating a machine", machine.Spec.InfrastructureRef.Kind, klog.KRef(machine.Spec.InfrastructureRef.Namespace, machine.Spec.InfrastructureRef.Name))
419414
}
415+
log = log.WithValues(infraRef.Kind, klog.KRef(infraRef.Namespace, infraRef.Name))
420416
machine.Spec.InfrastructureRef = *infraRef
421417

422418
if err := r.Client.Create(ctx, machine); err != nil {
423-
log.Error(err, "Unable to create Machine")
424-
r.recorder.Eventf(ms, corev1.EventTypeWarning, "FailedCreate", "Failed to create machine %q: %v", machine.Name, err)
419+
log.Error(err, "Error while creating a machine")
420+
r.recorder.Eventf(ms, corev1.EventTypeWarning, "FailedCreate", "Failed to create machine: %v", err)
425421
errs = append(errs, err)
426422
conditions.MarkFalse(ms, clusterv1.MachinesCreatedCondition, clusterv1.MachineCreationFailedReason,
427423
clusterv1.ConditionSeverityError, err.Error())
428424

429425
// Try to cleanup the external objects if the Machine creation failed.
430426
if err := r.Client.Delete(ctx, util.ObjectReferenceToUnstructured(*infraRef)); !apierrors.IsNotFound(err) {
431-
log.Error(err, "Failed to cleanup infrastructure configuration object after Machine creation error")
427+
log.Error(err, "Failed to cleanup infrastructure machine object after Machine creation error", infraRef.Kind, klog.KRef(infraRef.Namespace, infraRef.Name))
432428
}
433429
if bootstrapRef != nil {
434430
if err := r.Client.Delete(ctx, util.ObjectReferenceToUnstructured(*bootstrapRef)); !apierrors.IsNotFound(err) {
435-
log.Error(err, "Failed to cleanup bootstrap configuration object after Machine creation error")
431+
log.Error(err, "Failed to cleanup bootstrap configuration object after Machine creation error", bootstrapRef.Kind, klog.KRef(bootstrapRef.Namespace, bootstrapRef.Name))
436432
}
437433
}
438434
continue
439435
}
440436

441-
log.Info(fmt.Sprintf("Created machine %d of %d with name %q", i+1, diff, machine.Name))
437+
log.Info(fmt.Sprintf("Created machine %d of %d", i+1, diff), "Machine", klog.KObj(machine))
442438
r.recorder.Eventf(ms, corev1.EventTypeNormal, "SuccessfulCreate", "Created machine %q", machine.Name)
443439
machineList = append(machineList, machine)
444440
}
@@ -448,26 +444,29 @@ func (r *Reconciler) syncReplicas(ctx context.Context, ms *clusterv1.MachineSet,
448444
}
449445
return r.waitForMachineCreation(ctx, machineList)
450446
case diff > 0:
451-
log.Info("Too many replicas", "need", *(ms.Spec.Replicas), "deleting", diff)
447+
log.Info(fmt.Sprintf("MachineSet is scaling down to %d replicas by deleting %d machines", *(ms.Spec.Replicas), diff), "replicas", *(ms.Spec.Replicas), "machineCount", len(machines), "deletePolicy", ms.Spec.DeletePolicy)
452448

453449
deletePriorityFunc, err := getDeletePriorityFunc(ms)
454450
if err != nil {
455451
return err
456452
}
457-
log.Info("Found delete policy", "delete-policy", ms.Spec.DeletePolicy)
458453

459454
var errs []error
460455
machinesToDelete := getMachinesToDeletePrioritized(machines, diff, deletePriorityFunc)
461-
for _, machine := range machinesToDelete {
462-
log = log.WithValues("Machine", klog.KObj(machine))
463-
if err := r.Client.Delete(ctx, machine); err != nil {
464-
log.Error(err, "Unable to delete Machine")
465-
r.recorder.Eventf(ms, corev1.EventTypeWarning, "FailedDelete", "Failed to delete machine %q: %v", machine.Name, err)
466-
errs = append(errs, err)
467-
continue
456+
for i, machine := range machinesToDelete {
457+
log := log.WithValues("Machine", klog.KObj(machine))
458+
if machine.GetDeletionTimestamp().IsZero() {
459+
log.Info(fmt.Sprintf("Deleting machine %d of %d", i+1, diff))
460+
if err := r.Client.Delete(ctx, machine); err != nil {
461+
log.Error(err, "Unable to delete Machine")
462+
r.recorder.Eventf(ms, corev1.EventTypeWarning, "FailedDelete", "Failed to delete machine %q: %v", machine.Name, err)
463+
errs = append(errs, err)
464+
continue
465+
}
466+
r.recorder.Eventf(ms, corev1.EventTypeNormal, "SuccessfulDelete", "Deleted machine %q", machine.Name)
467+
} else {
468+
log.Info(fmt.Sprintf("Waiting for machine %d of %d to be deleted", i+1, diff))
468469
}
469-
log.Info("Deleted machine")
470-
r.recorder.Eventf(ms, corev1.EventTypeNormal, "SuccessfulDelete", "Deleted machine %q", machine.Name)
471470
}
472471

473472
if len(errs) > 0 {
@@ -667,13 +666,13 @@ func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluste
667666
}
668667

669668
if machine.Status.NodeRef == nil {
670-
log.V(2).Info("Unable to retrieve Node status, missing NodeRef")
669+
log.V(4).Info("Waiting for the machine controller to set status.NodeRef on the Machine")
671670
continue
672671
}
673672

674673
node, err := r.getMachineNode(ctx, cluster, machine)
675-
if err != nil {
676-
log.Error(err, "Unable to retrieve Node status")
674+
if err != nil && machine.GetDeletionTimestamp().IsZero() {
675+
log.Error(err, "Unable to retrieve Node status", "node", klog.KObj(node))
677676
continue
678677
}
679678

@@ -682,6 +681,8 @@ func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluste
682681
if noderefutil.IsNodeAvailable(node, ms.Spec.MinReadySeconds, metav1.Now()) {
683682
availableReplicasCount++
684683
}
684+
} else if machine.GetDeletionTimestamp().IsZero() {
685+
log.Info("Waiting for the Kubernetes node on the machine to report ready state")
685686
}
686687
}
687688

@@ -696,17 +697,17 @@ func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluste
696697
ms.Status.ReadyReplicas != newStatus.ReadyReplicas ||
697698
ms.Status.AvailableReplicas != newStatus.AvailableReplicas ||
698699
ms.Generation != ms.Status.ObservedGeneration {
699-
// Save the generation number we acted on, otherwise we might wrongfully indicate
700-
// that we've seen a spec update when we retry.
701-
newStatus.ObservedGeneration = ms.Generation
702-
newStatus.DeepCopyInto(&ms.Status)
703-
704700
log.V(4).Info("Updating status: " +
705701
fmt.Sprintf("replicas %d->%d (need %d), ", ms.Status.Replicas, newStatus.Replicas, desiredReplicas) +
706702
fmt.Sprintf("fullyLabeledReplicas %d->%d, ", ms.Status.FullyLabeledReplicas, newStatus.FullyLabeledReplicas) +
707703
fmt.Sprintf("readyReplicas %d->%d, ", ms.Status.ReadyReplicas, newStatus.ReadyReplicas) +
708704
fmt.Sprintf("availableReplicas %d->%d, ", ms.Status.AvailableReplicas, newStatus.AvailableReplicas) +
709-
fmt.Sprintf("sequence No: %v->%v", ms.Status.ObservedGeneration, newStatus.ObservedGeneration))
705+
fmt.Sprintf("observedGeneration %v->%v", ms.Status.ObservedGeneration, ms.Generation))
706+
707+
// Save the generation number we acted on, otherwise we might wrongfully indicate
708+
// that we've seen a spec update when we retry.
709+
newStatus.ObservedGeneration = ms.Generation
710+
newStatus.DeepCopyInto(&ms.Status)
710711
}
711712
switch {
712713
// We are scaling up
@@ -722,6 +723,9 @@ func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluste
722723
// NOTE: we are checking the number of machines ready so we report resize completed only when the machines
723724
// are actually provisioned (vs reporting completed immediately after the last machine object is created). This convention is also used by KCP.
724725
if newStatus.ReadyReplicas == newStatus.Replicas {
726+
if conditions.IsFalse(ms, clusterv1.ResizedCondition) {
727+
log.Info("All the replicas are ready", "replicas", newStatus.ReadyReplicas)
728+
}
725729
conditions.MarkTrue(ms, clusterv1.ResizedCondition)
726730
}
727731
// This means that there was no error in generating the desired number of machine objects

0 commit comments

Comments
 (0)