Skip to content

Commit f9ae0e2

Browse files
committed
operator v1: use uncached client in initial Get Cluster
It's possible that Get returns a cached, stale value - even if the missed change was performed in the same process. This is because of various trade-offs made in controller-runtime's cache layers. This brings the controller in a dangerous situtation, in practice it happens that a previous run reduced status.currentReplicas from 3 to 2, but the next run reads a stale object and sees 3. To avoid this bug, and other bugs we can not yet foresee, we will perform this one Get with an uncached read. It is a trade-off, where we accept a performance hit, and increased load on kube-api server, but we value the consistency higher. This is limited to this one, most important Get call, of Cluster resource only (one per entire Reconcile). Since controller-runtime has throttling built in, and it's only "that on Cluster Get call", we believe this trade-off is acceptable, and the benefits outweight the cost.
1 parent c17bd45 commit f9ae0e2

File tree

2 files changed

+26
-9
lines changed

2 files changed

+26
-9
lines changed

operator/cmd/run/run.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727
_ "k8s.io/client-go/plugin/pkg/client/auth"
2828
ctrl "sigs.k8s.io/controller-runtime"
2929
"sigs.k8s.io/controller-runtime/pkg/cache"
30+
ctrlClient "sigs.k8s.io/controller-runtime/pkg/client"
3031
"sigs.k8s.io/controller-runtime/pkg/healthz"
3132
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
3233
"sigs.k8s.io/controller-runtime/pkg/webhook"
@@ -252,8 +253,15 @@ func Run(
252253

253254
adminAPIClientFactory := adminutils.CachedNodePoolAdminAPIClientFactory(adminutils.NewNodePoolInternalAdminAPI)
254255

256+
restConfig := mgr.GetConfig()
257+
uncachedClient, err := ctrlClient.New(restConfig, ctrlClient.Options{Scheme: mgr.GetScheme(), Mapper: mgr.GetRESTMapper()})
258+
if err != nil {
259+
return fmt.Errorf("failed to create uncached client: %w", err)
260+
}
261+
255262
if err = (&vectorizedcontrollers.ClusterReconciler{
256263
Client: mgr.GetClient(),
264+
UncachedClient: uncachedClient,
257265
Log: ctrl.Log.WithName("controllers").WithName("redpanda").WithName("Cluster"),
258266
Scheme: mgr.GetScheme(),
259267
AdminAPIClientFactory: adminutils.NewNodePoolInternalAdminAPI,

operator/internal/controller/vectorized/cluster_controller.go

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ var (
6969
// ClusterReconciler reconciles a Cluster object
7070
type ClusterReconciler struct {
7171
client.Client
72+
UncachedClient client.Client
7273
Log logr.Logger
7374
configuratorSettings resources.ConfiguratorSettings
7475
clusterDomain string
@@ -124,17 +125,25 @@ func (r *ClusterReconciler) Reconcile(
124125

125126
var vectorizedCluster vectorizedv1alpha1.Cluster
126127
ar := newAttachedResources(ctx, r, log, &vectorizedCluster)
127-
if err := r.Get(ctx, req.NamespacedName, &vectorizedCluster); err != nil {
128-
// we'll ignore not-found errors, since they can't be fixed by an immediate
129-
// requeue (we'll need to wait for a new notification), and we can get them
130-
// on deleted requests.
131-
if apierrors.IsNotFound(err) {
132-
if removeError := ar.getClusterRoleBinding().RemoveSubject(ctx, req.NamespacedName); removeError != nil {
133-
return ctrl.Result{}, fmt.Errorf("unable to remove subject in ClusterroleBinding: %w", removeError)
128+
{
129+
// Perform Get with uncached client, to avoid cache races, where we don't
130+
// see previous changes performed via Patch/Update.
131+
getClient := r.Client
132+
if r.UncachedClient != nil {
133+
getClient = r.UncachedClient
134+
}
135+
if err := getClient.Get(ctx, req.NamespacedName, &vectorizedCluster); err != nil {
136+
// we'll ignore not-found errors, since they can't be fixed by an immediate
137+
// requeue (we'll need to wait for a new notification), and we can get them
138+
// on deleted requests.
139+
if apierrors.IsNotFound(err) {
140+
if removeError := ar.getClusterRoleBinding().RemoveSubject(ctx, req.NamespacedName); removeError != nil {
141+
return ctrl.Result{}, fmt.Errorf("unable to remove subject in ClusterroleBinding: %w", removeError)
142+
}
143+
return ctrl.Result{}, nil
134144
}
135-
return ctrl.Result{}, nil
145+
return ctrl.Result{}, fmt.Errorf("unable to retrieve Cluster resource: %w", err)
136146
}
137-
return ctrl.Result{}, fmt.Errorf("unable to retrieve Cluster resource: %w", err)
138147
}
139148

140149
// After every reconciliation, update status:

0 commit comments

Comments
 (0)