Skip to content

Commit d8a3683

Browse files
authored
cmd/k8s-operator: restart ProxyGroup pods less (tailscale#14045)
We currently annotate pods with a hash of the tailscaled config so that we can trigger pod restarts whenever it changes. However, the hash updates more frequently than is necessary causing more restarts than is necessary. This commit removes two causes; scaling up/down and removing the auth key after pods have initially authed to control. However, note that pods will still restart on scale-up/down because of the updated set of volumes mounted into each pod. Hopefully we can fix that in a planned follow-up PR. Updates tailscale#13406 Signed-off-by: Tom Proctor <[email protected]>
1 parent 4e0fc03 commit d8a3683

File tree

3 files changed

+66
-26
lines changed

3 files changed

+66
-26
lines changed

cmd/k8s-operator/proxygroup.go

Lines changed: 28 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -353,7 +353,7 @@ func (r *ProxyGroupReconciler) deleteTailnetDevice(ctx context.Context, id tailc
353353

354354
func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(ctx context.Context, pg *tsapi.ProxyGroup, proxyClass *tsapi.ProxyClass) (hash string, err error) {
355355
logger := r.logger(pg.Name)
356-
var allConfigs []tailscaledConfigs
356+
var configSHA256Sum string
357357
for i := range pgReplicas(pg) {
358358
cfgSecret := &corev1.Secret{
359359
ObjectMeta: metav1.ObjectMeta{
@@ -389,7 +389,6 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(ctx context.Context, p
389389
if err != nil {
390390
return "", fmt.Errorf("error creating tailscaled config: %w", err)
391391
}
392-
allConfigs = append(allConfigs, configs)
393392

394393
for cap, cfg := range configs {
395394
cfgJSON, err := json.Marshal(cfg)
@@ -399,6 +398,32 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(ctx context.Context, p
399398
mak.Set(&cfgSecret.StringData, tsoperator.TailscaledConfigFileName(cap), string(cfgJSON))
400399
}
401400

401+
// The config sha256 sum is a value for a hash annotation used to trigger
402+
// pod restarts when tailscaled config changes. Any config changes apply
403+
// to all replicas, so it is sufficient to only hash the config for the
404+
// first replica.
405+
//
406+
// In future, we're aiming to eliminate restarts altogether and have
407+
// pods dynamically reload their config when it changes.
408+
if i == 0 {
409+
sum := sha256.New()
410+
for _, cfg := range configs {
411+
// Zero out the auth key so it doesn't affect the sha256 hash when we
412+
// remove it from the config after the pods have all authed. Otherwise
413+
// all the pods will need to restart immediately after authing.
414+
cfg.AuthKey = nil
415+
b, err := json.Marshal(cfg)
416+
if err != nil {
417+
return "", err
418+
}
419+
if _, err := sum.Write(b); err != nil {
420+
return "", err
421+
}
422+
}
423+
424+
configSHA256Sum = fmt.Sprintf("%x", sum.Sum(nil))
425+
}
426+
402427
if existingCfgSecret != nil {
403428
logger.Debugf("patching the existing ProxyGroup config Secret %s", cfgSecret.Name)
404429
if err := r.Patch(ctx, cfgSecret, client.MergeFrom(existingCfgSecret)); err != nil {
@@ -412,16 +437,7 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(ctx context.Context, p
412437
}
413438
}
414439

415-
sum := sha256.New()
416-
b, err := json.Marshal(allConfigs)
417-
if err != nil {
418-
return "", err
419-
}
420-
if _, err := sum.Write(b); err != nil {
421-
return "", err
422-
}
423-
424-
return fmt.Sprintf("%x", sum.Sum(nil)), nil
440+
return configSHA256Sum, nil
425441
}
426442

427443
func pgTailscaledConfig(pg *tsapi.ProxyGroup, class *tsapi.ProxyClass, idx int32, authKey string, oldSecret *corev1.Secret) (tailscaledConfigs, error) {

cmd/k8s-operator/proxygroup_specs.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,10 @@ func pgStatefulSet(pg *tsapi.ProxyGroup, namespace, image, tsFirewallMode, cfgHa
9393
c.Image = image
9494
c.VolumeMounts = func() []corev1.VolumeMount {
9595
var mounts []corev1.VolumeMount
96+
97+
// TODO(tomhjp): Read config directly from the secret instead. The
98+
// mounts change on scaling up/down which causes unnecessary restarts
99+
// for pods that haven't meaningfully changed.
96100
for i := range pgReplicas(pg) {
97101
mounts = append(mounts, corev1.VolumeMount{
98102
Name: fmt.Sprintf("tailscaledconfig-%d", i),

cmd/k8s-operator/proxygroup_test.go

Lines changed: 34 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ var defaultProxyClassAnnotations = map[string]string{
3535
}
3636

3737
func TestProxyGroup(t *testing.T) {
38+
const initialCfgHash = "6632726be70cf224049580deb4d317bba065915b5fd415461d60ed621c91b196"
39+
3840
pc := &tsapi.ProxyClass{
3941
ObjectMeta: metav1.ObjectMeta{
4042
Name: "default-pc",
@@ -80,6 +82,7 @@ func TestProxyGroup(t *testing.T) {
8082

8183
tsoperator.SetProxyGroupCondition(pg, tsapi.ProxyGroupReady, metav1.ConditionFalse, reasonProxyGroupCreating, "the ProxyGroup's ProxyClass default-pc is not yet in a ready state, waiting...", 0, cl, zl.Sugar())
8284
expectEqual(t, fc, pg, nil)
85+
expectProxyGroupResources(t, fc, pg, false, "")
8386
})
8487

8588
t.Run("observe_ProxyGroupCreating_status_reason", func(t *testing.T) {
@@ -100,10 +103,11 @@ func TestProxyGroup(t *testing.T) {
100103

101104
tsoperator.SetProxyGroupCondition(pg, tsapi.ProxyGroupReady, metav1.ConditionFalse, reasonProxyGroupCreating, "0/2 ProxyGroup pods running", 0, cl, zl.Sugar())
102105
expectEqual(t, fc, pg, nil)
106+
expectProxyGroupResources(t, fc, pg, true, initialCfgHash)
103107
if expected := 1; reconciler.proxyGroups.Len() != expected {
104108
t.Fatalf("expected %d recorders, got %d", expected, reconciler.proxyGroups.Len())
105109
}
106-
expectProxyGroupResources(t, fc, pg, true)
110+
expectProxyGroupResources(t, fc, pg, true, initialCfgHash)
107111
keyReq := tailscale.KeyCapabilities{
108112
Devices: tailscale.KeyDeviceCapabilities{
109113
Create: tailscale.KeyDeviceCreateCapabilities{
@@ -135,7 +139,7 @@ func TestProxyGroup(t *testing.T) {
135139
}
136140
tsoperator.SetProxyGroupCondition(pg, tsapi.ProxyGroupReady, metav1.ConditionTrue, reasonProxyGroupReady, reasonProxyGroupReady, 0, cl, zl.Sugar())
137141
expectEqual(t, fc, pg, nil)
138-
expectProxyGroupResources(t, fc, pg, true)
142+
expectProxyGroupResources(t, fc, pg, true, initialCfgHash)
139143
})
140144

141145
t.Run("scale_up_to_3", func(t *testing.T) {
@@ -146,6 +150,7 @@ func TestProxyGroup(t *testing.T) {
146150
expectReconciled(t, reconciler, "", pg.Name)
147151
tsoperator.SetProxyGroupCondition(pg, tsapi.ProxyGroupReady, metav1.ConditionFalse, reasonProxyGroupCreating, "2/3 ProxyGroup pods running", 0, cl, zl.Sugar())
148152
expectEqual(t, fc, pg, nil)
153+
expectProxyGroupResources(t, fc, pg, true, initialCfgHash)
149154

150155
addNodeIDToStateSecrets(t, fc, pg)
151156
expectReconciled(t, reconciler, "", pg.Name)
@@ -155,19 +160,34 @@ func TestProxyGroup(t *testing.T) {
155160
TailnetIPs: []string{"1.2.3.4", "::1"},
156161
})
157162
expectEqual(t, fc, pg, nil)
158-
expectProxyGroupResources(t, fc, pg, true)
163+
expectProxyGroupResources(t, fc, pg, true, initialCfgHash)
159164
})
160165

161166
t.Run("scale_down_to_1", func(t *testing.T) {
162167
pg.Spec.Replicas = ptr.To[int32](1)
163168
mustUpdate(t, fc, "", pg.Name, func(p *tsapi.ProxyGroup) {
164169
p.Spec = pg.Spec
165170
})
171+
166172
expectReconciled(t, reconciler, "", pg.Name)
173+
167174
pg.Status.Devices = pg.Status.Devices[:1] // truncate to only the first device.
168175
expectEqual(t, fc, pg, nil)
176+
expectProxyGroupResources(t, fc, pg, true, initialCfgHash)
177+
})
178+
179+
t.Run("trigger_config_change_and_observe_new_config_hash", func(t *testing.T) {
180+
pc.Spec.TailscaleConfig = &tsapi.TailscaleConfig{
181+
AcceptRoutes: true,
182+
}
183+
mustUpdate(t, fc, "", pc.Name, func(p *tsapi.ProxyClass) {
184+
p.Spec = pc.Spec
185+
})
169186

170-
expectProxyGroupResources(t, fc, pg, true)
187+
expectReconciled(t, reconciler, "", pg.Name)
188+
189+
expectEqual(t, fc, pg, nil)
190+
expectProxyGroupResources(t, fc, pg, true, "518a86e9fae64f270f8e0ec2a2ea6ca06c10f725035d3d6caca132cd61e42a74")
171191
})
172192

173193
t.Run("delete_and_cleanup", func(t *testing.T) {
@@ -191,13 +211,13 @@ func TestProxyGroup(t *testing.T) {
191211
})
192212
}
193213

194-
func expectProxyGroupResources(t *testing.T, fc client.WithWatch, pg *tsapi.ProxyGroup, shouldExist bool) {
214+
func expectProxyGroupResources(t *testing.T, fc client.WithWatch, pg *tsapi.ProxyGroup, shouldExist bool, cfgHash string) {
195215
t.Helper()
196216

197217
role := pgRole(pg, tsNamespace)
198218
roleBinding := pgRoleBinding(pg, tsNamespace)
199219
serviceAccount := pgServiceAccount(pg, tsNamespace)
200-
statefulSet, err := pgStatefulSet(pg, tsNamespace, testProxyImage, "auto", "")
220+
statefulSet, err := pgStatefulSet(pg, tsNamespace, testProxyImage, "auto", cfgHash)
201221
if err != nil {
202222
t.Fatal(err)
203223
}
@@ -207,9 +227,7 @@ func expectProxyGroupResources(t *testing.T, fc client.WithWatch, pg *tsapi.Prox
207227
expectEqual(t, fc, role, nil)
208228
expectEqual(t, fc, roleBinding, nil)
209229
expectEqual(t, fc, serviceAccount, nil)
210-
expectEqual(t, fc, statefulSet, func(ss *appsv1.StatefulSet) {
211-
ss.Spec.Template.Annotations[podAnnotationLastSetConfigFileHash] = ""
212-
})
230+
expectEqual(t, fc, statefulSet, nil)
213231
} else {
214232
expectMissing[rbacv1.Role](t, fc, role.Namespace, role.Name)
215233
expectMissing[rbacv1.RoleBinding](t, fc, roleBinding.Namespace, roleBinding.Name)
@@ -218,11 +236,13 @@ func expectProxyGroupResources(t *testing.T, fc client.WithWatch, pg *tsapi.Prox
218236
}
219237

220238
var expectedSecrets []string
221-
for i := range pgReplicas(pg) {
222-
expectedSecrets = append(expectedSecrets,
223-
fmt.Sprintf("%s-%d", pg.Name, i),
224-
fmt.Sprintf("%s-%d-config", pg.Name, i),
225-
)
239+
if shouldExist {
240+
for i := range pgReplicas(pg) {
241+
expectedSecrets = append(expectedSecrets,
242+
fmt.Sprintf("%s-%d", pg.Name, i),
243+
fmt.Sprintf("%s-%d-config", pg.Name, i),
244+
)
245+
}
226246
}
227247
expectSecrets(t, fc, expectedSecrets)
228248
}

0 commit comments

Comments
 (0)