Make ClusterResourceSet controller more predictable

fabriziopandini · fabriziopandini · commit 5cfb124ff45c · 2024-07-15T19:26:30.000+02:00
diff --git a/exp/addons/internal/controllers/clusterresourceset_controller.go b/exp/addons/internal/controllers/clusterresourceset_controller.go
@@ -176,6 +176,27 @@ func (r *ClusterResourceSetReconciler) Reconcile(ctx context.Context, req ctrl.R
 
 	// Return an aggregated error if errors occurred.
 	if len(errs) > 0 {
+		// When there is more than one ClusterResourceSet targeting the same cluster,
+		// there might be conflicts when reconciling those ClusterResourceSets in parallel because they all try to
+		// patch the same ClusterResourceSetBinding object.
+		// In case of patching conflicts we don't want to go on exponential backoff, otherwise it might take an
+		// arbitrarily long time to get to a stable state due to the backoff delay quickly growing.
+		// Instead, we are requeuing with a fixed interval to make the system a little bit more predictable (and stabilize tests).
+		// NOTE: The fact that we rely on conflict errors + requeue to reach the stable state isn't ideal, and
+		// it might also become an issue at scale.
+		// e.g. From an empirical observation, it takes 20s for 10 ClusterResourceSets to get to a stable state
+		// on the same ClusterResourceSetBinding; with fewer ClusterResourceSets the issue is less relevant
+		// (e.g. with 5 ClusterResourceSets it takes about 4 seconds).
+		// NOTE: Conflicts happen mostly when the ClusterResourceSetBinding is initialized / an entry is added for each
+		// ClusterResourceSet targeting the same cluster.
+		for _, err := range errs {
+			if aggregate, ok := err.(kerrors.Aggregate); ok {
+				if len(aggregate.Errors()) == 1 && apierrors.IsConflict(aggregate.Errors()[0]) {
+					log.Info("Conflict in patching a ClusterResourceSetBinding that is updated by more than one ClusterResourceSet, requeing")
+					return ctrl.Result{RequeueAfter: 100 * time.Millisecond}, nil
+				}
+			}
+		}
 		return ctrl.Result{}, kerrors.NewAggregate(errs)
 	}