Commit a8959e3

🌱 clustercache: do not use RequeueAfter when hitting ErrClusterNotConnected (#11736)
* clustercache: do not use RequeueAfter when hitting ErrClusterNotConnected and return error instead
* review fixes
1 parent 819d3f3 · commit a8959e3

File tree: 9 files changed, +21 −136 lines
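Every file below applies the same transformation to its Reconcile function. As a minimal sketch of the before/after shape (not code from the repository; `reconcileInner` is a hypothetical stand-in for each controller's inner reconcile helper, and the import paths follow the Cluster API tree):

```go
package example

import (
	"context"
	"time"

	"github.com/pkg/errors"
	ctrl "sigs.k8s.io/controller-runtime"

	"sigs.k8s.io/cluster-api/controllers/clustercache"
)

// reconcileInner is a hypothetical stand-in for the per-controller reconcile helpers.
func reconcileInner(ctx context.Context) (ctrl.Result, error) {
	return ctrl.Result{}, clustercache.ErrClusterNotConnected
}

// reconcileBefore shows the removed pattern: the error is swallowed and replaced
// by a flat one-minute requeue, so it never reaches the workqueue's rate limiter
// or the controller's error metrics.
func reconcileBefore(ctx context.Context) (ctrl.Result, error) {
	res, err := reconcileInner(ctx)
	if errors.Is(err, clustercache.ErrClusterNotConnected) {
		return ctrl.Result{RequeueAfter: time.Minute}, nil
	}
	return res, err
}

// reconcileAfter shows the new pattern: the error is returned unchanged, letting
// controller-runtime requeue with exponential backoff and record the failure.
func reconcileAfter(ctx context.Context) (ctrl.Result, error) {
	return reconcileInner(ctx)
}
```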

‎bootstrap/kubeadm/internal/controllers/kubeadmconfig_controller.go

Lines changed: 1 addition & 6 deletions

```diff
@@ -279,12 +279,7 @@ func (r *KubeadmConfigReconciler) Reconcile(ctx context.Context, req ctrl.Reques
 		return ctrl.Result{}, nil
 	}
 
-	res, err := r.reconcile(ctx, scope, cluster, config, configOwner)
-	if err != nil && errors.Is(err, clustercache.ErrClusterNotConnected) {
-		log.V(5).Info("Requeuing because connection to the workload cluster is down")
-		return ctrl.Result{RequeueAfter: time.Minute}, nil
-	}
-	return res, err
+	return r.reconcile(ctx, scope, cluster, config, configOwner)
 }
 
 func (r *KubeadmConfigReconciler) reconcile(ctx context.Context, scope *Scope, cluster *clusterv1.Cluster, config *bootstrapv1.KubeadmConfig, configOwner *bsutil.ConfigOwner) (ctrl.Result, error) {
```

‎controlplane/kubeadm/internal/controllers/controller.go

Lines changed: 2 additions & 12 deletions

```diff
@@ -255,21 +255,11 @@ func (r *KubeadmControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.
 
 	if !kcp.ObjectMeta.DeletionTimestamp.IsZero() {
 		// Handle deletion reconciliation loop.
-		res, err = r.reconcileDelete(ctx, controlPlane)
-		if errors.Is(err, clustercache.ErrClusterNotConnected) {
-			log.V(5).Info("Requeuing because connection to the workload cluster is down")
-			return ctrl.Result{RequeueAfter: time.Minute}, nil
-		}
-		return res, err
+		return r.reconcileDelete(ctx, controlPlane)
 	}
 
 	// Handle normal reconciliation loop.
-	res, err = r.reconcile(ctx, controlPlane)
-	if errors.Is(err, clustercache.ErrClusterNotConnected) {
-		log.V(5).Info("Requeuing because connection to the workload cluster is down")
-		return ctrl.Result{RequeueAfter: time.Minute}, nil
-	}
-	return res, err
+	return r.reconcile(ctx, controlPlane)
 }
 
 // initControlPlaneScope initializes the control plane scope; this includes also checking for orphan machines and
```

‎exp/addons/internal/controllers/clusterresourceset_controller.go

Lines changed: 1 addition & 16 deletions

```diff
@@ -183,17 +183,9 @@ func (r *ClusterResourceSetReconciler) Reconcile(ctx context.Context, req ctrl.R
 	}
 
 	errs := []error{}
-	errClusterNotConnectedOccurred := false
 	for _, cluster := range clusters {
 		if err := r.ApplyClusterResourceSet(ctx, cluster, clusterResourceSet); err != nil {
-			// Requeue if the reconcile failed because connection to workload cluster was down.
-			if errors.Is(err, clustercache.ErrClusterNotConnected) {
-				log.V(5).Info("Requeuing because connection to the workload cluster is down")
-				errClusterNotConnectedOccurred = true
-			} else {
-				// Append the error if the error is not ErrClusterLocked.
-				errs = append(errs, err)
-			}
+			errs = append(errs, err)
 		}
 	}
 
@@ -218,13 +210,6 @@ func (r *ClusterResourceSetReconciler) Reconcile(ctx context.Context, req ctrl.R
 		return ctrl.Result{}, kerrors.NewAggregate(errs)
 	}
 
-	// Requeue if ErrClusterNotConnected was returned for one of the clusters.
-	if errClusterNotConnectedOccurred {
-		// Requeue after a minute to not end up in exponential delayed requeue which
-		// could take up to 16m40s.
-		return ctrl.Result{RequeueAfter: time.Minute}, nil
-	}
-
 	return ctrl.Result{}, nil
 }
```
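The removed comment pins down why the flat requeue existed: client-go's default per-item backoff tops out at 1000 seconds, which is the "16m40s" it cites. After this change, a controller that wants a shorter ceiling would configure its rate limiter rather than return RequeueAfter from inside Reconcile. A minimal sketch, assuming a recent controller-runtime with typed workqueues; the function name is illustrative and the wiring is not from this commit:

```go
package example

import (
	"time"

	"k8s.io/client-go/util/workqueue"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/controller"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"

	addonsv1 "sigs.k8s.io/cluster-api/exp/addons/api/v1beta1"
)

// setupWithCappedBackoff (hypothetical) wires a reconciler whose retry backoff
// is capped at one minute instead of the default 1000s (16m40s) ceiling.
func setupWithCappedBackoff(mgr ctrl.Manager, r reconcile.Reconciler) error {
	return ctrl.NewControllerManagedBy(mgr).
		For(&addonsv1.ClusterResourceSet{}).
		WithOptions(controller.Options{
			// Same 5ms base delay as the default rate limiter, lower max delay.
			RateLimiter: workqueue.NewTypedItemExponentialFailureRateLimiter[reconcile.Request](
				5*time.Millisecond, time.Minute),
		}).
		Complete(r)
}
```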

‎exp/internal/controllers/machinepool_controller.go

Lines changed: 2 additions & 15 deletions

```diff
@@ -228,28 +228,15 @@ func (r *MachinePoolReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 
 	// Handle deletion reconciliation loop.
 	if !mp.ObjectMeta.DeletionTimestamp.IsZero() {
-		err := r.reconcileDelete(ctx, cluster, mp)
-		// Requeue if the reconcile failed because connection to workload cluster was down.
-		if errors.Is(err, clustercache.ErrClusterNotConnected) {
-			log.V(5).Info("Requeuing because connection to the workload cluster is down")
-			return ctrl.Result{RequeueAfter: time.Minute}, nil
-		}
-
-		return ctrl.Result{}, err
+		return ctrl.Result{}, r.reconcileDelete(ctx, cluster, mp)
 	}
 
 	// Handle normal reconciliation loop.
 	scope := &scope{
 		cluster:     cluster,
 		machinePool: mp,
 	}
-	res, err := r.reconcile(ctx, scope)
-	// Requeue if the reconcile failed because connection to workload cluster was down.
-	if errors.Is(err, clustercache.ErrClusterNotConnected) {
-		log.V(5).Info("Requeuing because connection to the workload cluster is down")
-		return ctrl.Result{RequeueAfter: time.Minute}, nil
-	}
-	return res, err
+	return r.reconcile(ctx, scope)
 }
 
 func (r *MachinePoolReconciler) reconcile(ctx context.Context, s *scope) (ctrl.Result, error) {
```

‎internal/controllers/machine/machine_controller.go

Lines changed: 6 additions & 32 deletions

```diff
@@ -201,8 +201,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Re
 		return ctrl.Result{}, err
 	}
 
-	log := ctrl.LoggerFrom(ctx).WithValues("Cluster", klog.KRef(m.Namespace, m.Spec.ClusterName))
-	ctx = ctrl.LoggerInto(ctx, log)
+	ctx = ctrl.LoggerInto(ctx, ctrl.LoggerFrom(ctx).WithValues("Cluster", klog.KRef(m.Namespace, m.Spec.ClusterName)))
 
 	// Add finalizer first if not set to avoid the race condition between init and delete.
 	if finalizerAdded, err := finalizers.EnsureFinalizer(ctx, r.Client, m, clusterv1.MachineFinalizer); err != nil || finalizerAdded {
@@ -211,7 +210,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Re
 
 	// AddOwners adds the owners of Machine as k/v pairs to the logger.
 	// Specifically, it will add KubeadmControlPlane, MachineSet and MachineDeployment.
-	ctx, log, err := clog.AddOwners(ctx, r.Client, m)
+	ctx, _, err := clog.AddOwners(ctx, r.Client, m)
 	if err != nil {
 		return ctrl.Result{}, err
 	}
@@ -276,23 +275,11 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Re
 			r.reconcileDelete,
 		)
 
-		res, err := doReconcile(ctx, reconcileDelete, s)
-		// Requeue if the reconcile failed because connection to workload cluster was down.
-		if errors.Is(err, clustercache.ErrClusterNotConnected) {
-			log.V(5).Info("Requeuing because connection to the workload cluster is down")
-			return ctrl.Result{RequeueAfter: time.Minute}, nil
-		}
-		return res, err
+		return doReconcile(ctx, reconcileDelete, s)
 	}
 
 	// Handle normal reconciliation loop.
-	res, err := doReconcile(ctx, alwaysReconcile, s)
-	// Requeue if the reconcile failed because connection to workload cluster was down.
-	if errors.Is(err, clustercache.ErrClusterNotConnected) {
-		log.V(5).Info("Requeuing because connection to the workload cluster is down")
-		return ctrl.Result{RequeueAfter: time.Minute}, nil
-	}
-	return res, err
+	return doReconcile(ctx, alwaysReconcile, s)
 }
 
 func patchMachine(ctx context.Context, patchHelper *patch.Helper, machine *clusterv1.Machine, options ...patch.Option) error {
@@ -819,14 +806,7 @@ func (r *Reconciler) drainNode(ctx context.Context, s *scope) (ctrl.Result, erro
 
 	remoteClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster))
 	if err != nil {
-		if errors.Is(err, clustercache.ErrClusterNotConnected) {
-			log.V(5).Info("Requeuing drain Node because connection to the workload cluster is down")
-			s.deletingReason = clusterv1.MachineDeletingDrainingNodeV1Beta2Reason
-			s.deletingMessage = "Requeuing drain Node because connection to the workload cluster is down"
-			return ctrl.Result{RequeueAfter: time.Minute}, nil
-		}
-		log.Error(err, "Error creating a remote client for cluster while draining Node, won't retry")
-		return ctrl.Result{}, nil
+		return ctrl.Result{}, errors.Wrapf(err, "failed to drain Node %s", nodeName)
 	}
 
 	node := &corev1.Node{}
@@ -992,15 +972,9 @@ func (r *Reconciler) shouldWaitForNodeVolumes(ctx context.Context, s *scope) (ct
 }
 
 func (r *Reconciler) deleteNode(ctx context.Context, cluster *clusterv1.Cluster, name string) error {
-	log := ctrl.LoggerFrom(ctx)
-
 	remoteClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster))
 	if err != nil {
-		if errors.Is(err, clustercache.ErrClusterNotConnected) {
-			return errors.Wrapf(err, "failed deleting Node because connection to the workload cluster is down")
-		}
-		log.Error(err, "Error creating a remote client for cluster while deleting Node, won't retry")
-		return nil
+		return errors.Wrapf(err, "failed deleting Node because connection to the workload cluster is down")
 	}
 
 	node := &corev1.Node{
```
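Note that drainNode and deleteNode now wrap the client error with errors.Wrapf instead of logging it and returning nil. Because the github.com/pkg/errors wrappers implement Unwrap, callers can still match the sentinel with errors.Is after wrapping. A small self-contained sketch; the local ErrClusterNotConnected stands in for the real clustercache sentinel:

```go
package main

import (
	stderrors "errors"
	"fmt"

	"github.com/pkg/errors"
)

// ErrClusterNotConnected stands in for clustercache.ErrClusterNotConnected;
// the real sentinel lives in sigs.k8s.io/cluster-api/controllers/clustercache.
var ErrClusterNotConnected = stderrors.New("connection to the workload cluster is down")

func main() {
	// Wrapf preserves the error chain, so errors.Is still matches the
	// sentinel after wrapping -- which is what keeps disconnected-cluster
	// detection working for callers further up the stack.
	wrapped := errors.Wrapf(ErrClusterNotConnected, "failed deleting Node %q", "worker-0")
	fmt.Println(stderrors.Is(wrapped, ErrClusterNotConnected)) // true
}
```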

‎internal/controllers/machinehealthcheck/machinehealthcheck_controller.go

Lines changed: 1 addition & 13 deletions

```diff
@@ -188,19 +188,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Re
 	}
 	m.Labels[clusterv1.ClusterNameLabel] = m.Spec.ClusterName
 
-	result, err := r.reconcile(ctx, log, cluster, m)
-	if err != nil {
-		// Requeue if the reconcile failed because connection to workload cluster was down.
-		if errors.Is(err, clustercache.ErrClusterNotConnected) {
-			log.V(5).Info("Requeuing because connection to the workload cluster is down")
-			return ctrl.Result{RequeueAfter: time.Minute}, nil
-		}
-
-		// Requeue immediately if any errors occurred
-		return ctrl.Result{}, err
-	}
-
-	return result, nil
+	return r.reconcile(ctx, log, cluster, m)
 }
 
 func (r *Reconciler) reconcile(ctx context.Context, logger logr.Logger, cluster *clusterv1.Cluster, m *clusterv1.MachineHealthCheck) (ctrl.Result, error) {
```

‎internal/controllers/machineset/machineset_controller.go

Lines changed: 3 additions & 17 deletions

```diff
@@ -168,8 +168,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (retres ct
 		return ctrl.Result{}, err
 	}
 
-	log := ctrl.LoggerFrom(ctx).WithValues("Cluster", klog.KRef(machineSet.Namespace, machineSet.Spec.ClusterName))
-	ctx = ctrl.LoggerInto(ctx, log)
+	ctx = ctrl.LoggerInto(ctx, ctrl.LoggerFrom(ctx).WithValues("Cluster", klog.KRef(machineSet.Namespace, machineSet.Spec.ClusterName)))
 
 	// Add finalizer first if not set to avoid the race condition between init and delete.
 	if finalizerAdded, err := finalizers.EnsureFinalizer(ctx, r.Client, machineSet, clusterv1.MachineSetFinalizer); err != nil || finalizerAdded {
@@ -178,7 +177,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (retres ct
 
 	// AddOwners adds the owners of MachineSet as k/v pairs to the logger.
 	// Specifically, it will add MachineDeployment.
-	ctx, log, err := clog.AddOwners(ctx, r.Client, machineSet)
+	ctx, _, err := clog.AddOwners(ctx, r.Client, machineSet)
 	if err != nil {
 		return ctrl.Result{}, err
 	}
@@ -257,20 +256,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (retres ct
 		wrapErrMachineSetReconcileFunc(r.syncReplicas, "failed to sync replicas"),
 	)
 
-	result, kerr := doReconcile(ctx, s, reconcileNormal)
-	if kerr != nil {
-		// Requeue if the reconcile failed because connection to workload cluster was down.
-		if errors.Is(kerr, clustercache.ErrClusterNotConnected) {
-			if len(kerr.Errors()) > 1 {
-				log.Error(kerr, "Requeuing because connection to the workload cluster is down")
-			} else {
-				log.V(5).Info("Requeuing because connection to the workload cluster is down")
-			}
-			return ctrl.Result{RequeueAfter: time.Minute}, nil
-		}
-		err = kerr
-	}
-	return result, err
+	return doReconcile(ctx, s, reconcileNormal)
 }
 
 type scope struct {
```

‎internal/controllers/topology/cluster/cluster_controller.go

Lines changed: 1 addition & 11 deletions

```diff
@@ -278,8 +278,6 @@ func (r *Reconciler) SetupForDryRun(recorder record.EventRecorder) {
 }
 
 func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) {
-	log := ctrl.LoggerFrom(ctx)
-
 	// Fetch the Cluster instance.
 	cluster := &clusterv1.Cluster{}
 	if err := r.Client.Get(ctx, req.NamespacedName, cluster); err != nil {
@@ -340,15 +338,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Re
 	}
 
 	// Handle normal reconciliation loop.
-	result, err := r.reconcile(ctx, s)
-	if err != nil {
-		// Requeue if the reconcile failed because connection to workload cluster was down.
-		if errors.Is(err, clustercache.ErrClusterNotConnected) {
-			log.V(5).Info("Requeuing because connection to the workload cluster is down")
-			return ctrl.Result{RequeueAfter: time.Minute}, nil
-		}
-	}
-	return result, err
+	return r.reconcile(ctx, s)
 }
 
 // reconcile handles cluster reconciliation.
```

‎test/infrastructure/docker/internal/controllers/dockermachine_controller.go

Lines changed: 4 additions & 14 deletions

```diff
@@ -27,6 +27,7 @@ import (
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
+	kerrors "k8s.io/apimachinery/pkg/util/errors"
 	"k8s.io/klog/v2"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/builder"
@@ -70,8 +71,7 @@ type DockerMachineReconciler struct {
 // +kubebuilder:rbac:groups="",resources=configmaps,verbs=get;list;watch
 
 // Reconcile handles DockerMachine events.
-func (r *DockerMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, rerr error) {
-	log := ctrl.LoggerFrom(ctx)
+func (r *DockerMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) {
 	ctx = container.RuntimeInto(ctx, r.ContainerRuntime)
 
 	// Fetch the DockerMachine instance.
@@ -150,10 +150,7 @@ func (r *DockerMachineReconciler) Reconcile(ctx context.Context, req ctrl.Reques
 	// Always attempt to Patch the DockerMachine object and status after each reconciliation.
 	defer func() {
 		if err := patchDockerMachine(ctx, patchHelper, dockerMachine); err != nil {
-			log.Error(err, "Failed to patch DockerMachine")
-			if rerr == nil {
-				rerr = err
-			}
+			reterr = kerrors.NewAggregate([]error{reterr, err})
 		}
 	}()
 
@@ -188,14 +185,7 @@ func (r *DockerMachineReconciler) Reconcile(ctx context.Context, req ctrl.Reques
 	}
 
 	// Handle non-deleted machines
-	res, err := r.reconcileNormal(ctx, cluster, dockerCluster, machine, dockerMachine, externalMachine, externalLoadBalancer)
-	// Requeue if the reconcile failed because the ClusterCacheTracker was locked for
-	// the current cluster because of concurrent access.
-	if errors.Is(err, clustercache.ErrClusterNotConnected) {
-		log.V(5).Info("Requeuing because connection to the workload cluster is down")
-		return ctrl.Result{RequeueAfter: time.Minute}, nil
-	}
-	return res, err
+	return r.reconcileNormal(ctx, cluster, dockerCluster, machine, dockerMachine, externalMachine, externalLoadBalancer)
 }
 
 func patchDockerMachine(ctx context.Context, patchHelper *patch.Helper, dockerMachine *infrav1.DockerMachine) error {
```
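The deferred patch now folds the patch error into reterr with kerrors.NewAggregate instead of the manual nil check. NewAggregate filters out nil entries and returns nil for an all-nil list, which is what makes the one-liner safe. A small sketch:

```go
package main

import (
	"errors"
	"fmt"

	kerrors "k8s.io/apimachinery/pkg/util/errors"
)

func main() {
	patchErr := errors.New("failed to patch DockerMachine")

	// NewAggregate ignores nil errors and returns nil when nothing is left,
	// so combining reterr (possibly nil) with a patch error needs no nil check.
	fmt.Println(kerrors.NewAggregate([]error{nil, nil}))      // <nil>
	fmt.Println(kerrors.NewAggregate([]error{nil, patchErr})) // failed to patch DockerMachine
}
```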
