
Commit 825da0b

Merge pull request #7026 from fabriziopandini/improve-machineset-scale-up-down-logs
🌱 Improve logging for the MachineSet scale up/down workflow
2 parents fd7b379 + 6eda8ea commit 825da0b

1 file changed: +40 −36 lines changed


internal/controllers/machineset/machineset_controller.go

Lines changed: 40 additions & 36 deletions
@@ -202,7 +202,6 @@ func patchMachineSet(ctx context.Context, patchHelper *patch.Helper, machineSet
 
 func (r *Reconciler) reconcile(ctx context.Context, cluster *clusterv1.Cluster, machineSet *clusterv1.MachineSet) (ctrl.Result, error) {
     log := ctrl.LoggerFrom(ctx)
-    log.V(4).Info("Reconcile MachineSet")
 
     // Reconcile and retrieve the Cluster object.
     if machineSet.Labels == nil {
@@ -291,7 +290,7 @@ func (r *Reconciler) reconcile(ctx context.Context, cluster *clusterv1.Cluster,
             continue
         }
         if conditions.IsFalse(machine, clusterv1.MachineOwnerRemediatedCondition) {
-            log.Info("Deleting unhealthy machine")
+            log.Info("Deleting machine because marked as unhealthy by the MachineHealthCheck controller")
             patch := client.MergeFrom(machine.DeepCopy())
             if err := r.Client.Delete(ctx, machine); err != nil {
                 errs = append(errs, errors.Wrap(err, "failed to delete"))
@@ -341,7 +340,6 @@ func (r *Reconciler) reconcile(ctx context.Context, cluster *clusterv1.Cluster,
 
     // Quickly reconcile until the nodes become Ready.
     if machineSet.Status.ReadyReplicas != replicas {
-        log.V(4).Info("Some nodes are not ready yet, requeuing until they are ready")
         return ctrl.Result{RequeueAfter: 15 * time.Second}, nil
     }
 
@@ -358,10 +356,10 @@ func (r *Reconciler) syncReplicas(ctx context.Context, ms *clusterv1.MachineSet,
     switch {
     case diff < 0:
         diff *= -1
-        log.Info("Too few replicas", "need", *(ms.Spec.Replicas), "creating", diff)
+        log.Info(fmt.Sprintf("MachineSet is scaling up to %d replicas by creating %d machines", *(ms.Spec.Replicas), diff), "replicas", *(ms.Spec.Replicas), "machineCount", len(machines))
         if ms.Annotations != nil {
             if _, ok := ms.Annotations[clusterv1.DisableMachineCreate]; ok {
-                log.V(2).Info("Automatic creation of new machines disabled for machine set")
+                log.Info("Automatic creation of new machines disabled for machine set")
                 return nil
             }
         }
@@ -371,11 +369,7 @@ func (r *Reconciler) syncReplicas(ctx context.Context, ms *clusterv1.MachineSet,
         )
 
         for i := 0; i < diff; i++ {
-            log.Info(fmt.Sprintf("Creating machine %d of %d, ( spec.replicas(%d) > currentMachineCount(%d) )",
-                i+1, diff, *(ms.Spec.Replicas), len(machines)))
-
             machine := r.getNewMachine(ms)
-            log = log.WithValues("Machine", klog.KObj(machine))
 
             // Clone and set the infrastructure and bootstrap references.
             var (
@@ -400,9 +394,10 @@ func (r *Reconciler) syncReplicas(ctx context.Context, ms *clusterv1.MachineSet,
                 })
                 if err != nil {
                     conditions.MarkFalse(ms, clusterv1.MachinesCreatedCondition, clusterv1.BootstrapTemplateCloningFailedReason, clusterv1.ConditionSeverityError, err.Error())
-                    return errors.Wrapf(err, "failed to clone bootstrap configuration for MachineSet %q in namespace %q", ms.Name, ms.Namespace)
+                    return errors.Wrapf(err, "failed to clone bootstrap configuration from %s %s while creating a machine", machine.Spec.Bootstrap.ConfigRef.Kind, klog.KRef(machine.Spec.Bootstrap.ConfigRef.Namespace, machine.Spec.Bootstrap.ConfigRef.Name))
                 }
                 machine.Spec.Bootstrap.ConfigRef = bootstrapRef
+                log = log.WithValues(bootstrapRef.Kind, klog.KRef(bootstrapRef.Namespace, bootstrapRef.Name))
             }
 
             infraRef, err = external.CreateFromTemplate(ctx, &external.CreateFromTemplateInput{
@@ -421,30 +416,31 @@ func (r *Reconciler) syncReplicas(ctx context.Context, ms *clusterv1.MachineSet,
             })
             if err != nil {
                 conditions.MarkFalse(ms, clusterv1.MachinesCreatedCondition, clusterv1.InfrastructureTemplateCloningFailedReason, clusterv1.ConditionSeverityError, err.Error())
-                return errors.Wrapf(err, "failed to clone infrastructure configuration for MachineSet %q in namespace %q", ms.Name, ms.Namespace)
+                return errors.Wrapf(err, "failed to clone infrastructure machine from %s %s while creating a machine", machine.Spec.InfrastructureRef.Kind, klog.KRef(machine.Spec.InfrastructureRef.Namespace, machine.Spec.InfrastructureRef.Name))
             }
+            log = log.WithValues(infraRef.Kind, klog.KRef(infraRef.Namespace, infraRef.Name))
             machine.Spec.InfrastructureRef = *infraRef
 
             if err := r.Client.Create(ctx, machine); err != nil {
-                log.Error(err, "Unable to create Machine")
-                r.recorder.Eventf(ms, corev1.EventTypeWarning, "FailedCreate", "Failed to create machine %q: %v", machine.Name, err)
+                log.Error(err, "Error while creating a machine")
+                r.recorder.Eventf(ms, corev1.EventTypeWarning, "FailedCreate", "Failed to create machine: %v", err)
                 errs = append(errs, err)
                 conditions.MarkFalse(ms, clusterv1.MachinesCreatedCondition, clusterv1.MachineCreationFailedReason,
                     clusterv1.ConditionSeverityError, err.Error())
 
                 // Try to cleanup the external objects if the Machine creation failed.
                 if err := r.Client.Delete(ctx, util.ObjectReferenceToUnstructured(*infraRef)); !apierrors.IsNotFound(err) {
-                    log.Error(err, "Failed to cleanup infrastructure configuration object after Machine creation error")
+                    log.Error(err, "Failed to cleanup infrastructure machine object after Machine creation error", infraRef.Kind, klog.KRef(infraRef.Namespace, infraRef.Name))
                 }
                 if bootstrapRef != nil {
                     if err := r.Client.Delete(ctx, util.ObjectReferenceToUnstructured(*bootstrapRef)); !apierrors.IsNotFound(err) {
-                        log.Error(err, "Failed to cleanup bootstrap configuration object after Machine creation error")
+                        log.Error(err, "Failed to cleanup bootstrap configuration object after Machine creation error", bootstrapRef.Kind, klog.KRef(bootstrapRef.Namespace, bootstrapRef.Name))
                     }
                 }
                 continue
             }
 
-            log.Info(fmt.Sprintf("Created machine %d of %d with name %q", i+1, diff, machine.Name))
+            log.Info(fmt.Sprintf("Created machine %d of %d", i+1, diff), "Machine", klog.KObj(machine))
             r.recorder.Eventf(ms, corev1.EventTypeNormal, "SuccessfulCreate", "Created machine %q", machine.Name)
             machineList = append(machineList, machine)
         }
@@ -454,26 +450,29 @@ func (r *Reconciler) syncReplicas(ctx context.Context, ms *clusterv1.MachineSet,
         }
         return r.waitForMachineCreation(ctx, machineList)
     case diff > 0:
-        log.Info("Too many replicas", "need", *(ms.Spec.Replicas), "deleting", diff)
+        log.Info(fmt.Sprintf("MachineSet is scaling down to %d replicas by deleting %d machines", *(ms.Spec.Replicas), diff), "replicas", *(ms.Spec.Replicas), "machineCount", len(machines), "deletePolicy", ms.Spec.DeletePolicy)
 
         deletePriorityFunc, err := getDeletePriorityFunc(ms)
         if err != nil {
             return err
         }
-        log.Info("Found delete policy", "delete-policy", ms.Spec.DeletePolicy)
 
         var errs []error
         machinesToDelete := getMachinesToDeletePrioritized(machines, diff, deletePriorityFunc)
-        for _, machine := range machinesToDelete {
-            log = log.WithValues("Machine", klog.KObj(machine))
-            if err := r.Client.Delete(ctx, machine); err != nil {
-                log.Error(err, "Unable to delete Machine")
-                r.recorder.Eventf(ms, corev1.EventTypeWarning, "FailedDelete", "Failed to delete machine %q: %v", machine.Name, err)
-                errs = append(errs, err)
-                continue
+        for i, machine := range machinesToDelete {
+            log := log.WithValues("Machine", klog.KObj(machine))
+            if machine.GetDeletionTimestamp().IsZero() {
+                log.Info(fmt.Sprintf("Deleting machine %d of %d", i+1, diff))
+                if err := r.Client.Delete(ctx, machine); err != nil {
+                    log.Error(err, "Unable to delete Machine")
+                    r.recorder.Eventf(ms, corev1.EventTypeWarning, "FailedDelete", "Failed to delete machine %q: %v", machine.Name, err)
+                    errs = append(errs, err)
+                    continue
+                }
+                r.recorder.Eventf(ms, corev1.EventTypeNormal, "SuccessfulDelete", "Deleted machine %q", machine.Name)
+            } else {
+                log.Info(fmt.Sprintf("Waiting for machine %d of %d to be deleted", i+1, diff))
             }
-            log.Info("Deleted machine")
-            r.recorder.Eventf(ms, corev1.EventTypeNormal, "SuccessfulDelete", "Deleted machine %q", machine.Name)
         }
 
         if len(errs) > 0 {
@@ -673,13 +672,13 @@ func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluste
         }
 
         if machine.Status.NodeRef == nil {
-            log.V(2).Info("Unable to retrieve Node status, missing NodeRef")
+            log.V(4).Info("Waiting for the machine controller to set status.NodeRef on the Machine")
             continue
         }
 
         node, err := r.getMachineNode(ctx, cluster, machine)
-        if err != nil {
-            log.Error(err, "Unable to retrieve Node status")
+        if err != nil && machine.GetDeletionTimestamp().IsZero() {
+            log.Error(err, "Unable to retrieve Node status", "node", klog.KObj(node))
             continue
         }
 
@@ -688,6 +687,8 @@ func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluste
             if noderefutil.IsNodeAvailable(node, ms.Spec.MinReadySeconds, metav1.Now()) {
                 availableReplicasCount++
             }
+        } else if machine.GetDeletionTimestamp().IsZero() {
+            log.Info("Waiting for the Kubernetes node on the machine to report ready state")
         }
     }
 
@@ -702,17 +703,17 @@ func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluste
         ms.Status.ReadyReplicas != newStatus.ReadyReplicas ||
         ms.Status.AvailableReplicas != newStatus.AvailableReplicas ||
         ms.Generation != ms.Status.ObservedGeneration {
-        // Save the generation number we acted on, otherwise we might wrongfully indicate
-        // that we've seen a spec update when we retry.
-        newStatus.ObservedGeneration = ms.Generation
-        newStatus.DeepCopyInto(&ms.Status)
-
         log.V(4).Info("Updating status: " +
             fmt.Sprintf("replicas %d->%d (need %d), ", ms.Status.Replicas, newStatus.Replicas, desiredReplicas) +
             fmt.Sprintf("fullyLabeledReplicas %d->%d, ", ms.Status.FullyLabeledReplicas, newStatus.FullyLabeledReplicas) +
            fmt.Sprintf("readyReplicas %d->%d, ", ms.Status.ReadyReplicas, newStatus.ReadyReplicas) +
            fmt.Sprintf("availableReplicas %d->%d, ", ms.Status.AvailableReplicas, newStatus.AvailableReplicas) +
-            fmt.Sprintf("sequence No: %v->%v", ms.Status.ObservedGeneration, newStatus.ObservedGeneration))
+            fmt.Sprintf("observedGeneration %v->%v", ms.Status.ObservedGeneration, ms.Generation))
+
+        // Save the generation number we acted on, otherwise we might wrongfully indicate
+        // that we've seen a spec update when we retry.
+        newStatus.ObservedGeneration = ms.Generation
+        newStatus.DeepCopyInto(&ms.Status)
     }
     switch {
     // We are scaling up
@@ -728,6 +729,9 @@ func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluste
         // NOTE: we are checking the number of machines ready so we report resize completed only when the machines
         // are actually provisioned (vs reporting completed immediately after the last machine object is created). This convention is also used by KCP.
         if newStatus.ReadyReplicas == newStatus.Replicas {
+            if conditions.IsFalse(ms, clusterv1.ResizedCondition) {
+                log.Info("All the replicas are ready", "replicas", newStatus.ReadyReplicas)
+            }
            conditions.MarkTrue(ms, clusterv1.ResizedCondition)
        }
        // This means that there was no error in generating the desired number of machine objects
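For context on the helpers the new log lines lean on: klog.KObj and klog.KRef from k8s.io/klog/v2 turn an object (or a namespace/name pair) into a structured namespace/name reference, so the rewritten messages carry machine-readable key/value pairs alongside a human-readable summary. The snippet below is a minimal standalone sketch, not part of this commit; the fakeMachine type and the example names and counts are hypothetical stand-ins for a clusterv1.Machine and its bootstrap config.

package main

import (
	"fmt"

	"k8s.io/klog/v2"
)

// fakeMachine is a hypothetical stand-in for a clusterv1.Machine; klog.KObj only
// needs the GetName/GetNamespace accessors (the klog.KMetadata interface).
type fakeMachine struct {
	name, namespace string
}

func (m fakeMachine) GetName() string      { return m.name }
func (m fakeMachine) GetNamespace() string { return m.namespace }

func main() {
	machine := fakeMachine{name: "ms-1-abcde", namespace: "default"}

	// Mirrors the shape of the new scale-up message: a human-readable summary in the
	// message itself, plus structured key/value pairs for filtering and correlation.
	klog.InfoS(fmt.Sprintf("MachineSet is scaling up to %d replicas by creating %d machines", 3, 2),
		"replicas", 3,
		"machineCount", 1,
		// klog.KObj renders as "default/ms-1-abcde".
		"Machine", klog.KObj(machine),
		// klog.KRef builds the same kind of reference from a namespace/name pair,
		// similar to how the diff keys values by bootstrapRef.Kind / infraRef.Kind.
		"KubeadmConfig", klog.KRef("default", "ms-1-abcde-bootstrap"),
	)
}

In the controller itself the logger comes from ctrl.LoggerFrom(ctx) (a logr.Logger) rather than klog directly, but the KObj/KRef values render as the same kind of namespace/name references in the structured output.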
