@@ -176,6 +176,27 @@ func (r *ClusterResourceSetReconciler) Reconcile(ctx context.Context, req ctrl.R
176
176
177
177
// Return an aggregated error if errors occurred.
178
178
if len (errs ) > 0 {
179
+ // When there is more than one ClusterResourceSet targeting the same cluster,
180
+ // there might be conflicts when reconciling those ClusterResourceSets in parallel because they all try to
181
+ // patch the same ClusterResourceSetBinding Object.
182
+ // In case of patching conflicts we don't want to go on exponential backoff, otherwise it might take an
183
+ // arbitrarily long time to get to a stable state due to the backoff delay quickly growing.
184
+ // Instead, we are requeuing with an interval to make the system a little bit more predictable (and stabilize tests).
185
+ // NOTE: The fact that we rely on conflict errors + requeue to reach the stable state isn't ideal, and
186
+ // it might also become an issue at scale.
187
+ // e.g. From an empirical observation, it takes 20s for 10 ClusterResourceSet to get to a stable state
188
+ // on the same ClusterResourceSetBinding; with fewer ClusterResourceSets the issue is less relevant
189
+ // (e.g. with 5 ClusterResourceSet it takes about 4 seconds).
190
+ // NOTE: Conflicts happen mostly when the ClusterResourceSetBinding is initialized / an entry is added for each
191
+ // cluster resource set targeting the same cluster.
192
+ for _ , err := range errs {
193
+ if aggregate , ok := err .(kerrors.Aggregate ); ok {
194
+ if len (aggregate .Errors ()) == 1 && apierrors .IsConflict (aggregate .Errors ()[0 ]) {
195
+ log .Info ("Conflict in patching a ClusterResourceSetBinding that is updated by more than one ClusterResourceSet, requeing" )
196
+ return ctrl.Result {RequeueAfter : 100 * time .Millisecond }, nil
197
+ }
198
+ }
199
+ }
179
200
return ctrl.Result {}, kerrors .NewAggregate (errs )
180
201
}
181
202
0 commit comments