Skip to content

Commit cdaeed9

Browse files
committed
fix: resolve webhook caBundle deadlock during helm upgrade
During helm upgrade, the webhook configurations' caBundle field was reset to empty, causing new pods to fail readiness checks while the old leader pod never detected the change (only watched the cert Secret, with a 24h requeue). This created a deadlock where no pod could fix the caBundle. - Watch ValidatingWebhookConfiguration and MutatingWebhookConfiguration so the leader detects caBundle changes immediately - Use bytes.Equal for caBundle comparison instead of len==0 so stale values are corrected, not just empty ones - Remove caBundle from Helm webhook templates so upgrades stop resetting operator-managed values
1 parent 71c90a6 commit cdaeed9

File tree

4 files changed

+51
-8
lines changed

4 files changed

+51
-8
lines changed

chart/templates/mutating-webhook.yaml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ webhooks:
77
- admissionReviewVersions:
88
- v1
99
clientConfig:
10-
caBundle: ""
1110
service:
1211
name: skyhook-operator-webhook-service
1312
namespace: {{ .Release.Namespace }}
@@ -35,7 +34,6 @@ webhooks:
3534
- admissionReviewVersions:
3635
- v1
3736
clientConfig:
38-
caBundle: ""
3937
service:
4038
name: skyhook-operator-webhook-service
4139
namespace: {{ .Release.Namespace }}

chart/templates/validating-webhook.yaml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ webhooks:
77
- admissionReviewVersions:
88
- v1
99
clientConfig:
10-
caBundle: ""
1110
service:
1211
name: skyhook-operator-webhook-service
1312
namespace: {{ .Release.Namespace }}
@@ -34,7 +33,6 @@ webhooks:
3433
- admissionReviewVersions:
3534
- v1
3635
clientConfig:
37-
caBundle: ""
3836
service:
3937
name: skyhook-operator-webhook-service
4038
namespace: {{ .Release.Namespace }}

operator/internal/controller/webhook_controller.go

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727
"time"
2828

2929
"sigs.k8s.io/controller-runtime/pkg/builder"
30+
"sigs.k8s.io/controller-runtime/pkg/handler"
3031
"sigs.k8s.io/controller-runtime/pkg/predicate"
3132

3233
"github.com/NVIDIA/skyhook/operator/api/v1alpha1"
@@ -127,9 +128,29 @@ func (r *WebhookController) SetupWithManager(mgr ctrl.Manager) error {
127128
For(&corev1.Secret{}, builder.WithPredicates(predicate.NewPredicateFuncs(func(obj client.Object) bool {
128129
return obj.GetNamespace() == r.namespace && obj.GetName() == r.opts.SecretName
129130
}))).
131+
// Watch webhook configurations so the leader detects external changes (e.g. Helm upgrade
132+
// resetting caBundle) and fixes them immediately instead of waiting for the 24h requeue.
133+
Watches(&admissionregistrationv1.ValidatingWebhookConfiguration{},
134+
handler.EnqueueRequestsFromMapFunc(r.webhookConfigToSecret),
135+
builder.WithPredicates(predicate.NewPredicateFuncs(func(obj client.Object) bool {
136+
return obj.GetName() == validatingWebhookConfigName
137+
}))).
138+
Watches(&admissionregistrationv1.MutatingWebhookConfiguration{},
139+
handler.EnqueueRequestsFromMapFunc(r.webhookConfigToSecret),
140+
builder.WithPredicates(predicate.NewPredicateFuncs(func(obj client.Object) bool {
141+
return obj.GetName() == mutatingWebhookConfigName
142+
}))).
130143
Complete(r)
131144
}
132145

146+
// webhookConfigToSecret maps webhook config change events back to the cert Secret,
147+
// so the existing Reconcile() can verify and fix the caBundle.
148+
func (r *WebhookController) webhookConfigToSecret(_ context.Context, _ client.Object) []reconcile.Request {
149+
return []reconcile.Request{{
150+
NamespacedName: types.NamespacedName{Name: r.opts.SecretName, Namespace: r.namespace},
151+
}}
152+
}
153+
133154
// permissions
134155
//+kubebuilder:rbac:groups=admissionregistration.k8s.io,resources=validatingwebhookconfigurations;mutatingwebhookconfigurations,verbs=get;list;watch;create;update;patch;delete
135156
//+kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch;create;update;patch;delete
@@ -480,8 +501,8 @@ func deploymentPolicyMutatingRules() []admissionregistrationv1.RuleWithOperation
480501
func validatingWebhookNeedsUpdate(webhook *admissionregistrationv1.ValidatingWebhook, caBundle []byte, expectedRules []admissionregistrationv1.RuleWithOperations) bool {
481502
needUpdate := false
482503

483-
// Check if CABundle needs to be set
484-
if len(webhook.ClientConfig.CABundle) == 0 {
504+
// Check if CABundle needs to be set or updated (catches both empty and stale values)
505+
if !bytes.Equal(webhook.ClientConfig.CABundle, caBundle) {
485506
webhook.ClientConfig.CABundle = caBundle
486507
needUpdate = true
487508
}
@@ -499,8 +520,8 @@ func validatingWebhookNeedsUpdate(webhook *admissionregistrationv1.ValidatingWeb
499520
func mutatingWebhookNeedsUpdate(webhook *admissionregistrationv1.MutatingWebhook, caBundle []byte, expectedRules []admissionregistrationv1.RuleWithOperations) bool {
500521
needUpdate := false
501522

502-
// Check if CABundle needs to be set
503-
if len(webhook.ClientConfig.CABundle) == 0 {
523+
// Check if CABundle needs to be set or updated (catches both empty and stale values)
524+
if !bytes.Equal(webhook.ClientConfig.CABundle, caBundle) {
504525
webhook.ClientConfig.CABundle = caBundle
505526
needUpdate = true
506527
}

operator/internal/controller/webhook_controller_test.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,32 @@ var _ = Describe("WebhookController", Ordered, func() {
293293
Expect(needsUpdate).To(BeTrue(), "should detect empty CABundle")
294294
Expect(webhook.ClientConfig.CABundle).To(Equal(caBundle), "CABundle should be updated")
295295
})
296+
297+
It("should update CABundle when stale (non-empty but wrong)", func() {
298+
expectedRules := skyhookRules()
299+
correctCA := []byte("correct-ca")
300+
staleCA := []byte("stale-ca-from-previous-cert")
301+
302+
validatingWebhook := admissionregistrationv1.ValidatingWebhook{
303+
ClientConfig: admissionregistrationv1.WebhookClientConfig{
304+
CABundle: staleCA,
305+
},
306+
Rules: expectedRules,
307+
}
308+
needsUpdate := validatingWebhookNeedsUpdate(&validatingWebhook, correctCA, expectedRules)
309+
Expect(needsUpdate).To(BeTrue(), "should detect stale validating CABundle")
310+
Expect(validatingWebhook.ClientConfig.CABundle).To(Equal(correctCA))
311+
312+
mutatingWebhook := admissionregistrationv1.MutatingWebhook{
313+
ClientConfig: admissionregistrationv1.WebhookClientConfig{
314+
CABundle: staleCA,
315+
},
316+
Rules: expectedRules,
317+
}
318+
needsUpdate = mutatingWebhookNeedsUpdate(&mutatingWebhook, correctCA, expectedRules)
319+
Expect(needsUpdate).To(BeTrue(), "should detect stale mutating CABundle")
320+
Expect(mutatingWebhook.ClientConfig.CABundle).To(Equal(correctCA))
321+
})
296322
})
297323

298324
Describe("Disk and Secret-to-Disk Sync Logic", func() {

0 commit comments

Comments
 (0)