Skip to content

Commit bb42bad

Browse files
committed
feat: add secret expiration reconciler
Before this patch, the Prometheus User Workload Token Secret did not have a reliable reconciliation pattern for handling expiration. This patch adds a controller that removes any such Secret once it has expired, so that it can be reconciled. The UWM Token Secret's lifetime is reduced from 1 year to 1 week, and the controller checks for expired tokens once per day. Signed-off-by: Kaiyi Liu <[email protected]>
1 parent f210aba commit bb42bad

File tree

14 files changed

+532
-61
lines changed

14 files changed

+532
-61
lines changed

bundle/manifests/kepler-operator.clusterserviceversion.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,8 @@ spec:
225225
- args:
226226
- --openshift
227227
- --deployment-namespace=power-monitor
228+
- --exp.reconciler.token.refresh-interval=24h
229+
- --exp.uwm.token.ttl=168h
228230
- --leader-elect
229231
- --kepler.image=$(RELATED_IMAGE_KEPLER)
230232
- --kube-rbac-proxy.image=$(RELATED_IMAGE_KUBE_RBAC_PROXY)

cmd/main.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"fmt"
1010
"os"
1111
"strings"
12+
"time"
1213

1314
// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
1415
// to ensure that exec-entrypoint and run can make use of them.
@@ -71,6 +72,8 @@ func main() {
7172
var enableHTTP2 bool
7273
var tlsOpts []func(*tls.Config)
7374
var additionalNamespaces stringList
75+
var tokenRefreshInterval time.Duration
76+
var tokenTTL time.Duration
7477

7578
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to."+
7679
"Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.")
@@ -92,6 +95,12 @@ func main() {
9295
flag.BoolVar(&openshift, "openshift", false,
9396
"Indicate if the operator is running on an OpenShift cluster.")
9497

98+
flag.DurationVar(&tokenRefreshInterval, "exp.reconciler.token.refresh-interval", controller.Config.TokenRefreshInterval,
99+
"Interval at which the token expiry reconciler requeues for reconciliation.")
100+
101+
flag.DurationVar(&tokenTTL, "exp.uwm.token.ttl", controller.Config.TokenTTL,
102+
"Time-to-live duration for user workload monitoring tokens.")
103+
95104
// NOTE: RELATED_IMAGE_KEPLER can be set as env or flag, flag takes precedence over env
96105
keplerImage := os.Getenv("RELATED_IMAGE_KEPLER")
97106
flag.StringVar(&controller.Config.Image, "kepler.image", keplerImage, "kepler image")
@@ -136,6 +145,10 @@ func main() {
136145
}
137146
}
138147

148+
controller.Config.TokenRefreshInterval = tokenRefreshInterval
149+
controller.Config.TokenTTL = tokenTTL
150+
powermonitor.TokenTTL = tokenTTL
151+
139152
// Metrics endpoint is enabled in 'config/default/kustomization.yaml'. The Metrics options configure the server.
140153
// More info:
141154
// - https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/metrics/server
@@ -200,6 +213,13 @@ func main() {
200213
os.Exit(1)
201214
}
202215

216+
if err = (&controller.TokenExpiryReconciler{
217+
Client: mgr.GetClient(),
218+
Scheme: mgr.GetScheme(),
219+
}).SetupWithManager(mgr); err != nil {
220+
setupLog.Error(err, "unable to create controller", "controller", "token-expiry")
221+
os.Exit(1)
222+
}
203223
if err = (&controller.PowerMonitorReconciler{
204224
Client: mgr.GetClient(),
205225
Scheme: mgr.GetScheme(),

config/manager/overlays/openshift/kustomization.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,9 @@ patches:
1515
- op: add
1616
path: /spec/template/spec/containers/0/args/1
1717
value: --deployment-namespace=power-monitor
18+
- op: add
19+
path: /spec/template/spec/containers/0/args/2
20+
value: --exp.reconciler.token.refresh-interval=24h
21+
- op: add
22+
path: /spec/template/spec/containers/0/args/3
23+
value: --exp.uwm.token.ttl=168h

config/manifests/bases/kepler-operator.clusterserviceversion.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,8 @@ spec:
9898
- --health-probe-bind-address=:8081
9999
- --metrics-bind-address=127.0.0.1:8080
100100
- --leader-elect
101+
- --exp.reconciler.token.refresh-interval=24h
102+
- --exp.uwm.token.ttl=168h
101103
command:
102104
- /manager
103105
image: quay.io/sustainable_computing_io/kepler-operator:latest

internal/controller/config.go

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,25 @@
33

44
package controller
55

6-
import "github.com/sustainable.computing.io/kepler-operator/pkg/utils/k8s"
6+
import (
7+
"time"
8+
9+
"github.com/sustainable.computing.io/kepler-operator/pkg/utils/k8s"
10+
)
711

812
// Config holds configuration shared across all controllers. This struct
913
// should be initialized in main
1014

1115
var Config = struct {
12-
KubeRbacProxyImage string
13-
Image string
14-
Cluster k8s.Cluster
16+
KubeRbacProxyImage string
17+
Image string
18+
Cluster k8s.Cluster
19+
TokenRefreshInterval time.Duration
20+
TokenTTL time.Duration
1521
}{
16-
KubeRbacProxyImage: "quay.io/brancz/kube-rbac-proxy:v0.19.0",
17-
Image: "",
18-
Cluster: k8s.Kubernetes,
22+
KubeRbacProxyImage: "quay.io/brancz/kube-rbac-proxy:v0.19.0",
23+
Image: "",
24+
Cluster: k8s.Kubernetes,
25+
TokenRefreshInterval: 24 * time.Hour,
26+
TokenTTL: 168 * time.Hour,
1927
}

internal/controller/power_monitor_internal.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ package controller
66
import (
77
"context"
88
"fmt"
9+
910
"slices"
1011
"time"
1112

@@ -30,6 +31,7 @@ import (
3031
"k8s.io/apimachinery/pkg/types"
3132
"k8s.io/client-go/util/retry"
3233

34+
monv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
3335
appsv1 "k8s.io/api/apps/v1"
3436
corev1 "k8s.io/api/core/v1"
3537
rbacv1 "k8s.io/api/rbac/v1"
@@ -134,6 +136,7 @@ func (r *PowerMonitorInternalReconciler) SetupWithManager(mgr ctrl.Manager) erro
134136
Owns(&corev1.ServiceAccount{}, genChanged).
135137
Owns(&corev1.Service{}, genChanged).
136138
Owns(&appsv1.DaemonSet{}, resVerChanged).
139+
Owns(&monv1.ServiceMonitor{}, genChanged).
137140
Owns(&rbacv1.ClusterRoleBinding{}, genChanged).
138141
Owns(&rbacv1.ClusterRole{}, genChanged).
139142
// NOTE: requires resVerChanged for ConfigMap & Secret since
@@ -476,6 +479,8 @@ func powerMonitorExporters(pmi *v1alpha1.PowerMonitorInternal, ds *appsv1.Daemon
476479
fmt.Sprintf("%s:%s", powermonitor.UWMNamespace, powermonitor.UWMServiceAccountName),
477480
)
478481

482+
sm := powermonitor.NewPowerMonitorServiceMonitor(components.Full, pmi)
483+
479484
// cluster-scoped resources first
480485
// update cluster role before cluster role binding
481486
rs := resourceReconcilers(updateResource,
@@ -504,6 +509,7 @@ func powerMonitorExporters(pmi *v1alpha1.PowerMonitorInternal, ds *appsv1.Daemon
504509
Pmi: pmi,
505510
Cluster: cluster,
506511
Ds: ds,
512+
Sm: sm,
507513
EnableRBAC: enableRBAC,
508514
EnableUWM: enableUWM,
509515
},
@@ -516,6 +522,7 @@ func powerMonitorExporters(pmi *v1alpha1.PowerMonitorInternal, ds *appsv1.Daemon
516522
rs = append(rs,
517523
reconciler.PowerMonitorServiceMonitorReconciler{
518524
Pmi: pmi,
525+
Sm: sm,
519526
EnableRBAC: enableRBAC,
520527
EnableUWM: enableUWM,
521528
},
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
// SPDX-FileCopyrightText: 2025 The Kepler Authors
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package controller
5+
6+
import (
7+
"context"
8+
"time"
9+
10+
"github.com/go-logr/logr"
11+
"sigs.k8s.io/controller-runtime/pkg/builder"
12+
"sigs.k8s.io/controller-runtime/pkg/client"
13+
"sigs.k8s.io/controller-runtime/pkg/log"
14+
"sigs.k8s.io/controller-runtime/pkg/predicate"
15+
16+
powermonitor "github.com/sustainable.computing.io/kepler-operator/pkg/components/power-monitor"
17+
"github.com/sustainable.computing.io/kepler-operator/pkg/reconciler"
18+
19+
"k8s.io/apimachinery/pkg/api/errors"
20+
"k8s.io/apimachinery/pkg/runtime"
21+
"sigs.k8s.io/controller-runtime/pkg/event"
22+
23+
corev1 "k8s.io/api/core/v1"
24+
25+
ctrl "sigs.k8s.io/controller-runtime"
26+
)
27+
28+
type TokenExpiryReconciler struct {
29+
client.Client
30+
Scheme *runtime.Scheme
31+
logger logr.Logger
32+
}
33+
34+
// RBAC for TokenExpirationReconciler
35+
//+kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch;delete
36+
37+
// SetupWithManager sets up the controller with the Manager.
38+
func (r *TokenExpiryReconciler) SetupWithManager(mgr ctrl.Manager) error {
39+
secretPredicate := builder.WithPredicates(predicate.Funcs{
40+
CreateFunc: func(e event.CreateEvent) bool {
41+
return r.inPowerMonitorNamespace(e.Object) && r.isPrometheusUserWorkloadToken(e.Object)
42+
},
43+
UpdateFunc: func(e event.UpdateEvent) bool {
44+
return r.inPowerMonitorNamespace(e.ObjectNew) && r.isPrometheusUserWorkloadToken(e.ObjectNew)
45+
},
46+
DeleteFunc: func(e event.DeleteEvent) bool {
47+
return false
48+
},
49+
GenericFunc: func(e event.GenericEvent) bool {
50+
return r.inPowerMonitorNamespace(e.Object) && r.isPrometheusUserWorkloadToken(e.Object)
51+
},
52+
})
53+
54+
return ctrl.NewControllerManagedBy(mgr).
55+
For(&corev1.Secret{}, secretPredicate).
56+
Complete(r)
57+
}
58+
59+
// inPowerMonitorNamespace checks if object is in the PowerMonitorDeploymentNS namespace
60+
func (r *TokenExpiryReconciler) inPowerMonitorNamespace(obj client.Object) bool {
61+
return obj.GetNamespace() == PowerMonitorDeploymentNS
62+
}
63+
64+
// isPrometheusUserWorkloadToken checks if the secret is the prometheus-user-workload-token
65+
func (r *TokenExpiryReconciler) isPrometheusUserWorkloadToken(obj client.Object) bool {
66+
return obj.GetName() == powermonitor.SecretUWMTokenName
67+
}
68+
69+
// hasExpirationAnnotation checks if the secret has an expiration annotation
70+
func (r *TokenExpiryReconciler) hasExpirationAnnotation(obj client.Object) bool {
71+
secret, ok := obj.(*corev1.Secret)
72+
if !ok {
73+
return false
74+
}
75+
76+
annotations := secret.GetAnnotations()
77+
if annotations == nil {
78+
return false
79+
}
80+
81+
_, exists := annotations[powermonitor.SecretTokenExpirationAnnotation]
82+
return exists
83+
}
84+
85+
// deleteResources is a helper function that creates and runs deleter reconcilers for the given resources
86+
func (r *TokenExpiryReconciler) deleteResources(ctx context.Context, resources ...client.Object) (ctrl.Result, error) {
87+
reconcilers := resourceReconcilers(deleteResource, resources...)
88+
89+
return reconciler.Runner{
90+
Reconcilers: reconcilers,
91+
Client: r.Client,
92+
Scheme: r.Scheme,
93+
Logger: r.logger,
94+
}.Run(ctx)
95+
}
96+
97+
func (r *TokenExpiryReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
98+
logger := log.FromContext(ctx)
99+
r.logger = logger
100+
101+
logger.Info("Start of reconcile")
102+
defer logger.Info("End of reconcile")
103+
104+
secret := &corev1.Secret{}
105+
err := r.Get(ctx, req.NamespacedName, secret)
106+
if err != nil {
107+
if errors.IsNotFound(err) {
108+
r.logger.Info("secret not found, continue without error")
109+
return ctrl.Result{}, nil
110+
}
111+
r.logger.Error(err, "failed to retrieve secret")
112+
return ctrl.Result{}, err
113+
}
114+
115+
if !r.hasExpirationAnnotation(secret) {
116+
r.logger.Info("prometheus-user-workload-token does not have expiration annotation, deleting it")
117+
return r.deleteResources(ctx, secret)
118+
}
119+
120+
expired, expirationTime, err := r.isSecretExpired(secret)
121+
if err != nil {
122+
r.logger.Error(err, "failed to extract expiration time")
123+
return ctrl.Result{RequeueAfter: time.Minute * 5}, nil
124+
}
125+
126+
if expired {
127+
r.logger.Info("secret has expired, reconciling", "expiration-time", expirationTime)
128+
return r.deleteResources(ctx, secret)
129+
}
130+
131+
timeUntilExpiration := time.Until(expirationTime)
132+
r.logger.Info("secret not expired yet, requeuing", "expiration-time", expirationTime, "time-until-expiration", timeUntilExpiration)
133+
134+
return ctrl.Result{RequeueAfter: Config.TokenRefreshInterval}, nil
135+
}
136+
137+
// isSecretExpired checks if the secret has expired according to the expiration annotation
138+
func (r *TokenExpiryReconciler) isSecretExpired(secret *corev1.Secret) (bool, time.Time, error) {
139+
expirationTime, err := powermonitor.GetExpirationFromAnnotation(&secret.ObjectMeta, powermonitor.SecretTokenExpirationAnnotation)
140+
if err != nil {
141+
return false, time.Time{}, err
142+
}
143+
if expirationTime == nil {
144+
return false, time.Time{}, nil
145+
}
146+
return time.Now().After(expirationTime.Add(-(Config.TokenRefreshInterval * 2))), *expirationTime, nil
147+
}

0 commit comments

Comments
 (0)