Commit 7f63ad0

fix: avoid long backoff when trying to bootstrap the cluster

The long backoff makes `Sidero` tests fail, because the controller gets too many errors during the bootstrap, as we populate the address before `apid` is ready.

Signed-off-by: Artem Chernyshev <[email protected]>
1 parent 8fc9a6c commit 7f63ad0
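
For context: when a controller-runtime reconciler returns a non-nil error, the request is put back through the workqueue's default rate limiter, whose per-item delay grows exponentially with consecutive failures (up to roughly 1000 seconds by default), so repeated early bootstrap errors quickly push the next attempt far into the future. Returning a result with `RequeueAfter` set and a nil error retries at a fixed interval instead. The sketch below contrasts the two return styles with a hypothetical reconciler; `bootstrapReconciler` and `tryBootstrap` are illustrative names, not code from this repository.

```go
package controllers

import (
    "context"
    "errors"
    "time"

    ctrl "sigs.k8s.io/controller-runtime"
)

// bootstrapReconciler is a hypothetical reconciler used only to contrast the
// two return styles; it is not the TalosControlPlaneReconciler.
type bootstrapReconciler struct{}

func (r *bootstrapReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
    if err := tryBootstrap(ctx); err != nil {
        // Returning the error hands the request to the workqueue's default
        // rate limiter, so each consecutive failure waits exponentially
        // longer before the next attempt:
        //
        //     return ctrl.Result{}, err
        //
        // Requeueing with a fixed delay (and a nil error) retries at a
        // steady, short interval while the nodes come up:
        return ctrl.Result{RequeueAfter: 20 * time.Second}, nil
    }

    return ctrl.Result{}, nil
}

// tryBootstrap stands in for the real bootstrap call; it always fails here so
// the retry path is exercised.
func tryBootstrap(_ context.Context) error {
    return errors.New("apid is not ready yet")
}
```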

2 files changed: 29 additions, 27 deletions

controllers/etcd.go (5 additions, 5 deletions)

```diff
@@ -46,7 +46,7 @@ func (r *TalosControlPlaneReconciler) etcdHealthcheck(ctx context.Context, clust
         params = append(params, "node", machine.Name)
     }

-    r.Log.Info("Verifying etcd health on all nodes", params...)
+    r.Log.Info("verifying etcd health on all nodes", params...)

     svcs, err := c.ServiceInfo(ctx, service)
     if err != nil {
@@ -105,7 +105,7 @@ func (r *TalosControlPlaneReconciler) etcdHealthcheck(ctx context.Context, clust
 // gracefulEtcdLeave removes a given machine from the etcd cluster by forfeiting leadership
 // and issuing a "leave" request from the machine itself.
 func (r *TalosControlPlaneReconciler) gracefulEtcdLeave(ctx context.Context, c *talosclient.Client, cluster client.ObjectKey, machineToLeave clusterv1.Machine) error {
-    r.Log.Info("Verifying etcd status", "machine", machineToLeave.Name, "node", machineToLeave.Status.NodeRef.Name)
+    r.Log.Info("verifying etcd status", "machine", machineToLeave.Name, "node", machineToLeave.Status.NodeRef.Name)

     svcs, err := c.ServiceInfo(ctx, "etcd")
     if err != nil {
@@ -114,14 +114,14 @@ func (r *TalosControlPlaneReconciler) gracefulEtcdLeave(ctx context.Context, c *

     for _, svc := range svcs {
         if svc.Service.State != "Finished" {
-            r.Log.Info("Forfeiting leadership", "machine", machineToLeave.Status.NodeRef.Name)
+            r.Log.Info("forfeiting leadership", "machine", machineToLeave.Status.NodeRef.Name)

             _, err = c.EtcdForfeitLeadership(ctx, &machine.EtcdForfeitLeadershipRequest{})
             if err != nil {
                 return err
             }

-            r.Log.Info("Leaving etcd", "machine", machineToLeave.Name, "node", machineToLeave.Status.NodeRef.Name)
+            r.Log.Info("leaving etcd", "machine", machineToLeave.Name, "node", machineToLeave.Status.NodeRef.Name)

             err = c.EtcdLeaveCluster(ctx, &machine.EtcdLeaveClusterRequest{})
             if err != nil {
@@ -136,7 +136,7 @@ func (r *TalosControlPlaneReconciler) gracefulEtcdLeave(ctx context.Context, c *
 // forceEtcdLeave removes a given machine from the etcd cluster by telling another CP node to remove the member.
 // This is used in times when the machine was deleted out from under us.
 func (r *TalosControlPlaneReconciler) forceEtcdLeave(ctx context.Context, c *talosclient.Client, cluster client.ObjectKey, memberName string) error {
-    r.Log.Info("Removing etcd member", "memberName", memberName)
+    r.Log.Info("removing etcd member", "memberName", memberName)

     return c.EtcdRemoveMember(
         ctx,
```
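
The `controllers/etcd.go` changes only lowercase the first word of each log message, presumably to keep messages consistent with the project's logging style (lowercase, like Go error strings); the key/value pairs are untouched. As a reminder of how this logr call pattern works, here is a minimal standalone sketch; the logger backend, service, and node names are made up for illustration.

```go
package main

import (
    ctrlzap "sigs.k8s.io/controller-runtime/pkg/log/zap"
)

func main() {
    // zap.New is the logger setup commonly used with controller-runtime;
    // any logr.Logger implementation behaves the same way.
    logger := ctrlzap.New(ctrlzap.UseDevMode(true))

    // params mirrors how the controller builds optional key/value pairs
    // (e.g. appending "node", machine.Name) before logging.
    params := []interface{}{"service", "etcd", "node", "talos-cp-1"}

    // The message is a constant, lowercase string; all variable data goes
    // into the alternating key/value arguments.
    logger.Info("verifying etcd health on all nodes", params...)
    logger.Error(nil, "failed attempt to contact workload cluster")
}
```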

controllers/taloscontrolplane_controller.go (24 additions, 22 deletions)

```diff
@@ -102,7 +102,7 @@ func (r *TalosControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.Re
     cluster, err := util.GetOwnerCluster(ctx, r.Client, tcp.ObjectMeta)
     if err != nil {
         if !apierrors.IsNotFound(err) {
-            logger.Error(err, "Failed to retrieve owner Cluster from the API Server")
+            logger.Error(err, "failed to retrieve owner Cluster from the API Server")

             return ctrl.Result{}, err
         }
@@ -111,27 +111,27 @@ func (r *TalosControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.Re
     }

     if cluster == nil {
-        logger.Info("Cluster Controller has not yet set OwnerRef")
+        logger.Info("cluster Controller has not yet set OwnerRef")
         return ctrl.Result{Requeue: true}, nil
     }
     logger = logger.WithValues("cluster", cluster.Name)

     if annotations.IsPaused(cluster, tcp) {
-        logger.Info("Reconciliation is paused for this object")
+        logger.Info("reconciliation is paused for this object")
         return ctrl.Result{Requeue: true}, nil
     }

     // Wait for the cluster infrastructure to be ready before creating machines
     if !cluster.Status.InfrastructureReady {
-        logger.Info("Cluster infra not ready")
+        logger.Info("cluster infra not ready")

         return ctrl.Result{Requeue: true}, nil
     }

     // Initialize the patch helper.
     patchHelper, err := patch.NewHelper(tcp, r.Client)
     if err != nil {
-        logger.Error(err, "Failed to configure the patch helper")
+        logger.Error(err, "failed to configure the patch helper")
         return ctrl.Result{Requeue: true}, nil
     }

@@ -143,7 +143,7 @@ func (r *TalosControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.Re
     // because the main defer may take too much time to get cluster status

     if err := patchTalosControlPlane(ctx, patchHelper, tcp, patch.WithStatusObservedGeneration{}); err != nil {
-        logger.Error(err, "Failed to add finalizer to TalosControlPlane")
+        logger.Error(err, "failed to add finalizer to TalosControlPlane")
         return ctrl.Result{}, err
     }

@@ -158,18 +158,18 @@ func (r *TalosControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.Re
     }

     defer func() {
-        r.Log.Info("Attempting to set control plane status")
+        r.Log.Info("attempting to set control plane status")

         // Always attempt to update status.
         if err := r.updateStatus(ctx, tcp, cluster); err != nil {
-            logger.Error(err, "Failed to update TalosControlPlane Status")
+            logger.Error(err, "failed to update TalosControlPlane Status")

             reterr = kerrors.NewAggregate([]error{reterr, err})
         }

         // Always attempt to Patch the TalosControlPlane object and status after each reconciliation.
         if err := patchTalosControlPlane(ctx, patchHelper, tcp, patch.WithStatusObservedGeneration{}); err != nil {
-            logger.Error(err, "Failed to patch TalosControlPlane")
+            logger.Error(err, "failed to patch TalosControlPlane")
             reterr = kerrors.NewAggregate([]error{reterr, err})
         }

@@ -182,7 +182,7 @@ func (r *TalosControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.Re
             }
         }

-        r.Log.Info("Successfully updated control plane status")
+        r.Log.Info("successfully updated control plane status")
     }()

     // Update ownerrefs on infra templates
@@ -192,7 +192,7 @@ func (r *TalosControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.Re

     // If ControlPlaneEndpoint is not set, return early
     if cluster.Spec.ControlPlaneEndpoint.IsZero() {
-        logger.Info("Cluster does not yet have a ControlPlaneEndpoint defined")
+        logger.Info("cluster does not yet have a ControlPlaneEndpoint defined")
         return ctrl.Result{}, nil
     }

@@ -251,7 +251,7 @@ func (r *TalosControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.Re
     // We are creating the first replica
     case numMachines < desiredReplicas && numMachines == 0:
         // Create new Machine w/ init
-        logger.Info("Initializing control plane", "Desired", desiredReplicas, "Existing", numMachines)
+        logger.Info("initializing control plane", "Desired", desiredReplicas, "Existing", numMachines)

         return r.bootControlPlane(ctx, cluster, tcp, controlPlane, true)
     // We are scaling up
@@ -261,7 +261,7 @@ func (r *TalosControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.Re
             desiredReplicas, numMachines)

         // Create a new Machine w/ join
-        logger.Info("Scaling up control plane", "Desired", desiredReplicas, "Existing", numMachines)
+        logger.Info("scaling up control plane", "Desired", desiredReplicas, "Existing", numMachines)

         return r.bootControlPlane(ctx, cluster, tcp, controlPlane, false)
     // We are scaling down
@@ -279,23 +279,23 @@ func (r *TalosControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.Re
         }

         if err := r.ensureNodesBooted(ctx, cluster, ownedMachines); err != nil {
-            logger.Info("Waiting for all nodes to finish boot sequence", "error", err)
+            logger.Info("waiting for all nodes to finish boot sequence", "error", err)

             return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
         }

         if !conditions.IsTrue(tcp, controlplanev1.EtcdClusterHealthyCondition) {
-            logger.Info("Waiting for etcd to become healthy before scaling down")
+            logger.Info("waiting for etcd to become healthy before scaling down")

             return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
         }

-        logger.Info("Scaling down control plane", "Desired", desiredReplicas, "Existing", numMachines)
+        logger.Info("scaling down control plane", "Desired", desiredReplicas, "Existing", numMachines)

         res, err = r.scaleDownControlPlane(ctx, util.ObjectKey(cluster), controlPlane.TCP.Name, ownedMachines)
         if err != nil {
             if res.Requeue || res.RequeueAfter > 0 {
-                logger.Info("Failed to scale down control plane", "error", err)
+                logger.Info("failed to scale down control plane", "error", err)

                 return res, nil
             }
@@ -307,7 +307,9 @@ func (r *TalosControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.Re
     if err := r.bootstrapCluster(ctx, cluster, ownedMachines); err != nil {
         conditions.MarkFalse(tcp, controlplanev1.MachinesBootstrapped, controlplanev1.WaitingForTalosBootReason, clusterv1.ConditionSeverityInfo, err.Error())

-        return ctrl.Result{}, err
+        logger.Info("bootstrap failed, retrying in 20 seconds", "error", err)
+
+        return ctrl.Result{RequeueAfter: time.Second * 20}, nil
     }

     conditions.MarkTrue(tcp, controlplanev1.MachinesBootstrapped)
@@ -336,7 +338,7 @@ func (r *TalosControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.Re
 func (r *TalosControlPlaneReconciler) ClusterToTalosControlPlane(o client.Object) []ctrl.Request {
     c, ok := o.(*clusterv1.Cluster)
     if !ok {
-        r.Log.Error(nil, fmt.Sprintf("Expected a Cluster but got a %T", o))
+        r.Log.Error(nil, fmt.Sprintf("expected a Cluster but got a %T", o))
         return nil
     }

@@ -352,7 +354,7 @@ func (r *TalosControlPlaneReconciler) reconcileDelete(ctx context.Context, clust
     // Get list of all control plane machines
     ownedMachines, err := r.getControlPlaneMachinesForCluster(ctx, util.ObjectKey(cluster), tcp.Name)
     if err != nil {
-        r.Log.Error(err, "Failed to retrieve control plane machines for cluster")
+        r.Log.Error(err, "failed to retrieve control plane machines for cluster")

         return ctrl.Result{}, err
     }
@@ -370,7 +372,7 @@ func (r *TalosControlPlaneReconciler) reconcileDelete(ctx context.Context, clust
         }
         // Submit deletion request
         if err := r.Client.Delete(ctx, &ownedMachine); err != nil && !apierrors.IsNotFound(err) {
-            r.Log.Error(err, "Failed to cleanup owned machine")
+            r.Log.Error(err, "failed to cleanup owned machine")
             return ctrl.Result{}, err
         }
     }
@@ -822,7 +824,7 @@ func (r *TalosControlPlaneReconciler) updateStatus(ctx context.Context, tcp *con
             conditions.MarkTrue(tcp, controlplanev1.AvailableCondition)
         }
     } else {
-        r.Log.Error(err, "Failed attempt to contact workload cluster")
+        r.Log.Error(err, "failed attempt to contact workload cluster")
     }

     conditions.SetAggregate(tcp, controlplanev1.MachinesReadyCondition, conditionGetters, conditions.AddSourceRef(), conditions.WithStepCounterIf(false))
```
