Skip to content

Commit df94a14

Browse files
authored
cmd/k8s-operator: don't error for transient failures (tailscale#14073)
Every so often, the ProxyGroup and other controllers lose an optimistic locking race with other controllers that update the objects they create. Stop treating this as an error event, and instead just log an info level log line for it. Fixes tailscale#14072 Signed-off-by: Tom Proctor <[email protected]>
1 parent 7f9ebc0 commit df94a14

File tree

8 files changed

+84
-17
lines changed

8 files changed

+84
-17
lines changed

cmd/k8s-operator/connector.go

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"fmt"
1111
"net/netip"
1212
"slices"
13+
"strings"
1314
"sync"
1415
"time"
1516

@@ -35,6 +36,7 @@ import (
3536

3637
const (
3738
reasonConnectorCreationFailed = "ConnectorCreationFailed"
39+
reasonConnectorCreating = "ConnectorCreating"
3840
reasonConnectorCreated = "ConnectorCreated"
3941
reasonConnectorInvalid = "ConnectorInvalid"
4042

@@ -134,17 +136,24 @@ func (a *ConnectorReconciler) Reconcile(ctx context.Context, req reconcile.Reque
134136
}
135137

136138
if err := a.validate(cn); err != nil {
137-
logger.Errorf("error validating Connector spec: %w", err)
138139
message := fmt.Sprintf(messageConnectorInvalid, err)
139140
a.recorder.Eventf(cn, corev1.EventTypeWarning, reasonConnectorInvalid, message)
140141
return setStatus(cn, tsapi.ConnectorReady, metav1.ConditionFalse, reasonConnectorInvalid, message)
141142
}
142143

143144
if err = a.maybeProvisionConnector(ctx, logger, cn); err != nil {
144-
logger.Errorf("error creating Connector resources: %w", err)
145+
reason := reasonConnectorCreationFailed
145146
message := fmt.Sprintf(messageConnectorCreationFailed, err)
146-
a.recorder.Eventf(cn, corev1.EventTypeWarning, reasonConnectorCreationFailed, message)
147-
return setStatus(cn, tsapi.ConnectorReady, metav1.ConditionFalse, reasonConnectorCreationFailed, message)
147+
if strings.Contains(err.Error(), optimisticLockErrorMsg) {
148+
reason = reasonConnectorCreating
149+
message = fmt.Sprintf("optimistic lock error, retrying: %s", err)
150+
err = nil
151+
logger.Info(message)
152+
} else {
153+
a.recorder.Eventf(cn, corev1.EventTypeWarning, reason, message)
154+
}
155+
156+
return setStatus(cn, tsapi.ConnectorReady, metav1.ConditionFalse, reason, message)
148157
}
149158

150159
logger.Info("Connector resources synced")

cmd/k8s-operator/dnsrecords.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"encoding/json"
1111
"fmt"
1212
"slices"
13+
"strings"
1314

1415
"go.uber.org/zap"
1516
corev1 "k8s.io/api/core/v1"
@@ -98,7 +99,15 @@ func (dnsRR *dnsRecordsReconciler) Reconcile(ctx context.Context, req reconcile.
9899
return reconcile.Result{}, nil
99100
}
100101

101-
return reconcile.Result{}, dnsRR.maybeProvision(ctx, headlessSvc, logger)
102+
if err := dnsRR.maybeProvision(ctx, headlessSvc, logger); err != nil {
103+
if strings.Contains(err.Error(), optimisticLockErrorMsg) {
104+
logger.Infof("optimistic lock error, retrying: %s", err)
105+
} else {
106+
return reconcile.Result{}, err
107+
}
108+
}
109+
110+
return reconcile.Result{}, nil
102111
}
103112

104113
// maybeProvision ensures that dnsrecords ConfigMap contains a record for the

cmd/k8s-operator/egress-services.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,15 @@ func (esr *egressSvcsReconciler) Reconcile(ctx context.Context, req reconcile.Re
156156
return res, err
157157
}
158158

159-
return res, esr.maybeProvision(ctx, svc, l)
159+
if err := esr.maybeProvision(ctx, svc, l); err != nil {
160+
if strings.Contains(err.Error(), optimisticLockErrorMsg) {
161+
l.Infof("optimistic lock error, retrying: %s", err)
162+
} else {
163+
return reconcile.Result{}, err
164+
}
165+
}
166+
167+
return res, nil
160168
}
161169

162170
func (esr *egressSvcsReconciler) maybeProvision(ctx context.Context, svc *corev1.Service, l *zap.SugaredLogger) (err error) {

cmd/k8s-operator/ingress.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,15 @@ func (a *IngressReconciler) Reconcile(ctx context.Context, req reconcile.Request
7676
return reconcile.Result{}, a.maybeCleanup(ctx, logger, ing)
7777
}
7878

79-
return reconcile.Result{}, a.maybeProvision(ctx, logger, ing)
79+
if err := a.maybeProvision(ctx, logger, ing); err != nil {
80+
if strings.Contains(err.Error(), optimisticLockErrorMsg) {
81+
logger.Infof("optimistic lock error, retrying: %s", err)
82+
} else {
83+
return reconcile.Result{}, err
84+
}
85+
}
86+
87+
return reconcile.Result{}, nil
8088
}
8189

8290
func (a *IngressReconciler) maybeCleanup(ctx context.Context, logger *zap.SugaredLogger, ing *networkingv1.Ingress) error {

cmd/k8s-operator/nameserver.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"context"
1010
"fmt"
1111
"slices"
12+
"strings"
1213
"sync"
1314

1415
_ "embed"
@@ -131,7 +132,12 @@ func (a *NameserverReconciler) Reconcile(ctx context.Context, req reconcile.Requ
131132
}
132133
}
133134
if err := a.maybeProvision(ctx, &dnsCfg, logger); err != nil {
134-
return reconcile.Result{}, fmt.Errorf("error provisioning nameserver resources: %w", err)
135+
if strings.Contains(err.Error(), optimisticLockErrorMsg) {
136+
logger.Infof("optimistic lock error, retrying: %s", err)
137+
return reconcile.Result{}, nil
138+
} else {
139+
return reconcile.Result{}, fmt.Errorf("error provisioning nameserver resources: %w", err)
140+
}
135141
}
136142

137143
a.mu.Lock()

cmd/k8s-operator/proxygroup.go

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212
"fmt"
1313
"net/http"
1414
"slices"
15+
"strings"
1516
"sync"
1617

1718
"github.com/pkg/errors"
@@ -45,6 +46,9 @@ const (
4546
reasonProxyGroupReady = "ProxyGroupReady"
4647
reasonProxyGroupCreating = "ProxyGroupCreating"
4748
reasonProxyGroupInvalid = "ProxyGroupInvalid"
49+
50+
// Copied from k8s.io/apiserver/pkg/registry/generic/registry/store.go@cccad306d649184bf2a0e319ba830c53f65c445c
51+
optimisticLockErrorMsg = "the object has been modified; please apply your changes to the latest version and try again"
4852
)
4953

5054
var gaugeProxyGroupResources = clientmetric.NewGauge(kubetypes.MetricProxyGroupEgressCount)
@@ -166,9 +170,17 @@ func (r *ProxyGroupReconciler) Reconcile(ctx context.Context, req reconcile.Requ
166170
}
167171

168172
if err = r.maybeProvision(ctx, pg, proxyClass); err != nil {
169-
err = fmt.Errorf("error provisioning ProxyGroup resources: %w", err)
170-
r.recorder.Eventf(pg, corev1.EventTypeWarning, reasonProxyGroupCreationFailed, err.Error())
171-
return setStatusReady(pg, metav1.ConditionFalse, reasonProxyGroupCreationFailed, err.Error())
173+
reason := reasonProxyGroupCreationFailed
174+
msg := fmt.Sprintf("error provisioning ProxyGroup resources: %s", err)
175+
if strings.Contains(err.Error(), optimisticLockErrorMsg) {
176+
reason = reasonProxyGroupCreating
177+
msg = fmt.Sprintf("optimistic lock error, retrying: %s", err)
178+
err = nil
179+
logger.Info(msg)
180+
} else {
181+
r.recorder.Eventf(pg, corev1.EventTypeWarning, reason, msg)
182+
}
183+
return setStatusReady(pg, metav1.ConditionFalse, reason, msg)
172184
}
173185

174186
desiredReplicas := int(pgReplicas(pg))

cmd/k8s-operator/svc.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,15 @@ func (a *ServiceReconciler) Reconcile(ctx context.Context, req reconcile.Request
121121
return reconcile.Result{}, a.maybeCleanup(ctx, logger, svc)
122122
}
123123

124-
return reconcile.Result{}, a.maybeProvision(ctx, logger, svc)
124+
if err := a.maybeProvision(ctx, logger, svc); err != nil {
125+
if strings.Contains(err.Error(), optimisticLockErrorMsg) {
126+
logger.Infof("optimistic lock error, retrying: %s", err)
127+
} else {
128+
return reconcile.Result{}, err
129+
}
130+
}
131+
132+
return reconcile.Result{}, nil
125133
}
126134

127135
// maybeCleanup removes any existing resources related to serving svc over tailscale.

cmd/k8s-operator/tsrecorder.go

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"fmt"
1212
"net/http"
1313
"slices"
14+
"strings"
1415
"sync"
1516

1617
"github.com/pkg/errors"
@@ -38,6 +39,7 @@ import (
3839

3940
const (
4041
reasonRecorderCreationFailed = "RecorderCreationFailed"
42+
reasonRecorderCreating = "RecorderCreating"
4143
reasonRecorderCreated = "RecorderCreated"
4244
reasonRecorderInvalid = "RecorderInvalid"
4345

@@ -119,23 +121,28 @@ func (r *RecorderReconciler) Reconcile(ctx context.Context, req reconcile.Reques
119121
logger.Infof("ensuring Recorder is set up")
120122
tsr.Finalizers = append(tsr.Finalizers, FinalizerName)
121123
if err := r.Update(ctx, tsr); err != nil {
122-
logger.Errorf("error adding finalizer: %w", err)
123124
return setStatusReady(tsr, metav1.ConditionFalse, reasonRecorderCreationFailed, reasonRecorderCreationFailed)
124125
}
125126
}
126127

127128
if err := r.validate(tsr); err != nil {
128-
logger.Errorf("error validating Recorder spec: %w", err)
129129
message := fmt.Sprintf("Recorder is invalid: %s", err)
130130
r.recorder.Eventf(tsr, corev1.EventTypeWarning, reasonRecorderInvalid, message)
131131
return setStatusReady(tsr, metav1.ConditionFalse, reasonRecorderInvalid, message)
132132
}
133133

134134
if err = r.maybeProvision(ctx, tsr); err != nil {
135-
logger.Errorf("error creating Recorder resources: %w", err)
135+
reason := reasonRecorderCreationFailed
136136
message := fmt.Sprintf("failed creating Recorder: %s", err)
137-
r.recorder.Eventf(tsr, corev1.EventTypeWarning, reasonRecorderCreationFailed, message)
138-
return setStatusReady(tsr, metav1.ConditionFalse, reasonRecorderCreationFailed, message)
137+
if strings.Contains(err.Error(), optimisticLockErrorMsg) {
138+
reason = reasonRecorderCreating
139+
message = fmt.Sprintf("optimistic lock error, retrying: %s", err)
140+
err = nil
141+
logger.Info(message)
142+
} else {
143+
r.recorder.Eventf(tsr, corev1.EventTypeWarning, reasonRecorderCreationFailed, message)
144+
}
145+
return setStatusReady(tsr, metav1.ConditionFalse, reason, message)
139146
}
140147

141148
logger.Info("Recorder resources synced")

0 commit comments

Comments
 (0)