Skip to content

Commit afd86af

Browse files
committed
Fix service-CA certificate rotation not triggering pod restarts
When service-CA rotates certificates, the cluster-monitoring-operator wasn't watching service-CA generated secrets, causing pods to continue using expired certificates until manually restarted. This change adds detect changes happeing for service-CA secrets in monitoring namespaces (secrets ending with -tls or -cert) and triggers reconciliation when they change, which triggers pod restarts and certificate pickup. Fixes the issue where EUS clusters running 3+ years without upgrades require manual intervention after service-CA rotation. Signed-off-by: Daniel Mellado <[email protected]>
1 parent af22cdb commit afd86af

File tree

2 files changed

+117
-0
lines changed

2 files changed

+117
-0
lines changed

pkg/operator/operator.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -622,6 +622,16 @@ func (o *Operator) keyFunc(obj interface{}) (string, bool) {
622622
return k, true
623623
}
624624

625+
// isServiceCASecret checks if the given key represents a service-CA generated secret
626+
// that should trigger reconciliation when rotated.
627+
func (o *Operator) isServiceCASecret(key string) bool {
628+
if strings.HasPrefix(key, o.namespace+"/") || strings.HasPrefix(key, o.namespaceUserWorkload+"/") {
629+
secretName := strings.Split(key, "/")[1]
630+
return strings.HasSuffix(secretName, "-tls") || strings.HasSuffix(secretName, "-cert")
631+
}
632+
return false
633+
}
634+
625635
func (o *Operator) handleEvent(obj interface{}) {
626636
cmoConfigMap := o.namespace + "/" + o.configMapName
627637

@@ -671,6 +681,10 @@ func (o *Operator) handleEvent(obj interface{}) {
671681
case federateClientCerts:
672682
case uwmConfigMap:
673683
default:
684+
if o.isServiceCASecret(key) {
685+
klog.V(4).Infof("Service-CA secret updated, triggering reconciliation: %s", key)
686+
break
687+
}
674688
klog.V(5).Infof("ConfigMap or Secret (%s) not triggering an update.", key)
675689
return
676690
}
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
// Copyright 2021 The Cluster Monitoring Operator Authors
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package e2e
16+
17+
import (
18+
"context"
19+
"fmt"
20+
"testing"
21+
"time"
22+
23+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
24+
25+
"github.com/openshift/cluster-monitoring-operator/test/e2e/framework"
26+
)
27+
28+
func TestServiceCASecretRotation(t *testing.T) {
29+
ctx := context.Background()
30+
31+
testCases := []struct {
32+
name string
33+
secretName string
34+
}{
35+
{
36+
name: "monitoring-plugin-cert change is handled gracefully",
37+
secretName: "monitoring-plugin-cert",
38+
},
39+
}
40+
41+
for _, tc := range testCases {
42+
t.Run(tc.name, func(t *testing.T) {
43+
s, err := f.KubeClient.CoreV1().Secrets(f.Ns).Get(ctx, tc.secretName, metav1.GetOptions{})
44+
if err != nil {
45+
t.Skipf("secret %s/%s not found, skipping test: %v", f.Ns, tc.secretName, err)
46+
}
47+
48+
t.Cleanup(func() {
49+
if s, err := f.KubeClient.CoreV1().Secrets(f.Ns).Get(ctx, tc.secretName, metav1.GetOptions{}); err == nil && s.Annotations != nil {
50+
if _, exists := s.Annotations["test.openshift.io/service-ca-test-rotation"]; exists {
51+
delete(s.Annotations, "test.openshift.io/service-ca-test-rotation")
52+
_ = f.OperatorClient.CreateOrUpdateSecret(ctx, s)
53+
}
54+
}
55+
})
56+
57+
initialCO, err := f.OpenShiftConfigClient.ConfigV1().ClusterOperators().Get(ctx, "monitoring", metav1.GetOptions{})
58+
if err != nil {
59+
t.Fatalf("error getting initial cluster operator status: %v", err)
60+
}
61+
62+
if s.Annotations == nil {
63+
s.Annotations = make(map[string]string)
64+
}
65+
s.Annotations["test.openshift.io/service-ca-test-rotation"] = fmt.Sprintf("%d", time.Now().Unix())
66+
67+
if err := f.OperatorClient.CreateOrUpdateSecret(ctx, s); err != nil {
68+
t.Fatalf("error updating secret %s/%s: %v", f.Ns, tc.secretName, err)
69+
}
70+
71+
err = framework.Poll(5*time.Second, 2*time.Minute, func() error {
72+
co, err := f.OpenShiftConfigClient.ConfigV1().ClusterOperators().Get(ctx, "monitoring", metav1.GetOptions{})
73+
if err != nil {
74+
return fmt.Errorf("error getting cluster operator status: %w", err)
75+
}
76+
77+
for _, condition := range co.Status.Conditions {
78+
switch condition.Type {
79+
case "Available":
80+
if condition.Status != "True" {
81+
return fmt.Errorf("cluster operator is not available: %s", condition.Message)
82+
}
83+
case "Degraded":
84+
if condition.Status == "True" {
85+
return fmt.Errorf("cluster operator is degraded: %s", condition.Message)
86+
}
87+
case "Progressing":
88+
if condition.Status == "True" && condition.Reason != "" {
89+
if co.Generation == initialCO.Generation {
90+
return fmt.Errorf("operator is progressing but generation hasn't changed")
91+
}
92+
}
93+
}
94+
}
95+
96+
return nil
97+
})
98+
if err != nil {
99+
t.Fatalf("service-CA secret rotation test failed: %v", err)
100+
}
101+
})
102+
}
103+
}

0 commit comments

Comments
 (0)