Skip to content

Commit dda0a63

Browse files
authored
controller: consolidate status update logic (#915)
Previously, Status updates could fail due to version conflicts. This commit pulls all status update logic into a helper function that utilizes retry.RetryOnConflict to ensure that .Status updates get persisted in spite of resource version changes. Additionally, this commit removes the early bailout that triggered when the controller detected a divergence between the currently loaded resource and the version stored in the API. This change should have no notable effects, as the check took place after any actors ran. It also fixes a buglet that overwrote the reconciler's context, negating the maximum runtime for a single reconciliation loop.
1 parent dbefb9b commit dda0a63

File tree

4 files changed

+18
-30
lines changed

4 files changed

+18
-30
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
1111

1212
* Delete the CancelLoop function, fixing a cluster status update bug
1313
* Correctly detect failed version checker Pods
14+
* retry cluster status updates, reducing test flakes
1415

1516
# [v2.7.0](https://github.com/cockroachdb/cockroach-operator/compare/v2.6.0...v2.7.0)
1617

pkg/controller/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ go_library(
2424
"@io_k8s_api//policy/v1beta1:go_default_library",
2525
"@io_k8s_apimachinery//pkg/runtime:go_default_library",
2626
"@io_k8s_client_go//kubernetes:go_default_library",
27+
"@io_k8s_client_go//util/retry:go_default_library",
2728
"@io_k8s_sigs_controller_runtime//:go_default_library",
2829
"@io_k8s_sigs_controller_runtime//pkg/client:go_default_library",
2930
"@io_k8s_sigs_controller_runtime//pkg/reconcile:go_default_library",

pkg/controller/cluster_controller.go

Lines changed: 16 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ import (
3737
policy "k8s.io/api/policy/v1beta1"
3838
"k8s.io/apimachinery/pkg/runtime"
3939
"k8s.io/client-go/kubernetes"
40+
"k8s.io/client-go/util/retry"
4041
ctrl "sigs.k8s.io/controller-runtime"
4142
"sigs.k8s.io/controller-runtime/pkg/client"
4243
"sigs.k8s.io/controller-runtime/pkg/reconcile"
@@ -121,8 +122,8 @@ func (r *ClusterReconciler) Reconcile(ctx context.Context, req reconcile.Request
121122
// we added a state called Starting for field ClusterStatus to accomplish this
122123
if cluster.Status().ClusterStatus == "" {
123124
cluster.SetClusterStatusOnFirstReconcile()
124-
if err := r.Client.Status().Update(ctx, cluster.Unwrap()); err != nil {
125-
log.Error(err, "failed to update cluster status on action")
125+
if err := r.updateClusterStatus(ctx, log, &cluster); err != nil {
126+
log.Error(err, "failed to update cluster status")
126127
return requeueIfError(err)
127128
}
128129
return requeueImmediately()
@@ -132,8 +133,8 @@ func (r *ClusterReconciler) Reconcile(ctx context.Context, req reconcile.Request
132133
if cluster.True(api.CrdbVersionChecked) {
133134
if cluster.GetCockroachDBImageName() != cluster.Status().CrdbContainerImage {
134135
cluster.SetFalse(api.CrdbVersionChecked)
135-
if err := r.Client.Status().Update(ctx, cluster.Unwrap()); err != nil {
136-
log.Error(err, "failed to update cluster status on action")
136+
if err := r.updateClusterStatus(ctx, log, &cluster); err != nil {
137+
log.Error(err, "failed to update cluster status")
137138
return requeueIfError(err)
138139
}
139140
return requeueImmediately()
@@ -148,16 +149,14 @@ func (r *ClusterReconciler) Reconcile(ctx context.Context, req reconcile.Request
148149
return noRequeue()
149150
}
150151

151-
ctx = context.Background()
152-
153152
log.Info(fmt.Sprintf("Running action with name: %s", actorToExecute.GetActionType()))
154153
if err := actorToExecute.Act(ctx, &cluster, log); err != nil {
155154
// Save the error on the Status for each action
156155
log.Info("Error on action", "Action", actorToExecute.GetActionType(), "err", err.Error())
157156
cluster.SetActionFailed(actorToExecute.GetActionType(), err.Error())
158157

159158
defer func(ctx context.Context, cluster *resource.Cluster) {
160-
if err := r.Client.Status().Update(ctx, cluster.Unwrap()); err != nil {
159+
if err := r.updateClusterStatus(ctx, log, cluster); err != nil {
161160
log.Error(err, "failed to update cluster status")
162161
}
163162
}(ctx, &cluster)
@@ -193,20 +192,7 @@ func (r *ClusterReconciler) Reconcile(ctx context.Context, req reconcile.Request
193192
cluster.SetActionFinished(actorToExecute.GetActionType())
194193
}
195194

196-
// Check if the resource has been updated while the controller worked on it
197-
fresh, err := cluster.IsFresh(fetcher)
198-
if err != nil {
199-
return requeueIfError(err)
200-
}
201-
202-
// If the resource was updated, it is needed to start all over again
203-
// to ensure that the latest state was reconciled
204-
if !fresh {
205-
log.V(int(zapcore.DebugLevel)).Info("cluster resources is not up to date")
206-
return requeueImmediately()
207-
}
208-
cluster.SetClusterStatus()
209-
if err := r.Client.Status().Update(ctx, cluster.Unwrap()); err != nil {
195+
if err := r.updateClusterStatus(ctx, log, &cluster); err != nil {
210196
log.Error(err, "failed to update cluster status")
211197
return requeueIfError(err)
212198
}
@@ -215,6 +201,15 @@ func (r *ClusterReconciler) Reconcile(ctx context.Context, req reconcile.Request
215201
return noRequeue()
216202
}
217203

204+
// updateClusterStatus preprocesses a cluster's Status and then persists it to
205+
// the Kubernetes API. updateClusterStatus will retry on conflict errors.
206+
func (r *ClusterReconciler) updateClusterStatus(ctx context.Context, log logr.Logger, cluster *resource.Cluster) error {
207+
cluster.SetClusterStatus()
208+
return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
209+
return r.Client.Status().Update(ctx, cluster.Unwrap())
210+
})
211+
}
212+
218213
// SetupWithManager registers the controller with the controller.Manager from controller-runtime
219214
func (r *ClusterReconciler) SetupWithManager(mgr ctrl.Manager) error {
220215
var ingress client.Object

pkg/resource/cluster.go

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -319,15 +319,6 @@ func (cluster Cluster) SecureMode() string {
319319
return "--insecure"
320320
}
321321

322-
func (cluster Cluster) IsFresh(fetcher Fetcher) (bool, error) {
323-
actual := ClusterPlaceholder(cluster.Name())
324-
if err := fetcher.Fetch(actual); err != nil {
325-
return false, errors.Wrapf(err, "failed to fetch cluster resource")
326-
}
327-
328-
return cluster.cr.ResourceVersion == actual.ResourceVersion, nil
329-
}
330-
331322
func (cluster Cluster) LoggingConfiguration(fetcher Fetcher) (string, error) {
332323
if cluster.Spec().LogConfigMap != "" {
333324
cm := &corev1.ConfigMap{

0 commit comments

Comments
 (0)