Skip to content

Commit 654516b

Browse files
authored
🌱 Add clustercache metrics (#11789)
* add healthcheck metric * add connection metric * comments * improve help text * address comments + add labels * update label naming
1 parent 065f159 commit 654516b

File tree

2 files changed

+69
-2
lines changed

2 files changed

+69
-2
lines changed

controllers/clustercache/cluster_accessor.go

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,10 @@ func (ca *clusterAccessor) Connect(ctx context.Context) (retErr error) {
260260
defer func() {
261261
if retErr != nil {
262262
log.Error(retErr, "Connect failed")
263+
connectionUp.WithLabelValues(ca.cluster.Name, ca.cluster.Namespace).Set(0)
263264
ca.lockedState.lastConnectionCreationErrorTimestamp = time.Now()
265+
} else {
266+
connectionUp.WithLabelValues(ca.cluster.Name, ca.cluster.Namespace).Set(1)
264267
}
265268
}()
266269

@@ -303,15 +306,17 @@ func (ca *clusterAccessor) Connect(ctx context.Context) (retErr error) {
303306
// Disconnect disconnects a connection to the workload cluster.
304307
func (ca *clusterAccessor) Disconnect(ctx context.Context) {
305308
log := ctrl.LoggerFrom(ctx)
306-
307309
if !ca.Connected(ctx) {
308310
log.V(6).Info("Skipping disconnect, already disconnected")
309311
return
310312
}
311313

312314
ca.lock(ctx)
313-
defer ca.unlock(ctx)
314315

316+
defer func() {
317+
ca.unlock(ctx)
318+
connectionUp.WithLabelValues(ca.cluster.Name, ca.cluster.Namespace).Set(0)
319+
}()
315320
log.Info("Disconnecting")
316321

317322
// Stopping the cache is non-blocking, so it's okay to do it while holding the lock.
@@ -356,14 +361,20 @@ func (ca *clusterAccessor) HealthCheck(ctx context.Context) (bool, bool) {
356361
unauthorizedErrorOccurred = true
357362
ca.lockedState.healthChecking.consecutiveFailures++
358363
log.V(6).Info(fmt.Sprintf("Health probe failed (unauthorized error occurred): %v", err))
364+
healthCheck.WithLabelValues(ca.cluster.Name, ca.cluster.Namespace).Set(0)
365+
healthChecksTotal.WithLabelValues(ca.cluster.Name, ca.cluster.Namespace, "error").Inc()
359366
case err != nil:
360367
ca.lockedState.healthChecking.consecutiveFailures++
361368
log.V(6).Info(fmt.Sprintf("Health probe failed (%d/%d): %v",
362369
ca.lockedState.healthChecking.consecutiveFailures, ca.config.HealthProbe.FailureThreshold, err))
370+
healthCheck.WithLabelValues(ca.cluster.Name, ca.cluster.Namespace).Set(0)
371+
healthChecksTotal.WithLabelValues(ca.cluster.Name, ca.cluster.Namespace, "error").Inc()
363372
default:
364373
ca.lockedState.healthChecking.consecutiveFailures = 0
365374
ca.lockedState.healthChecking.lastProbeSuccessTimestamp = ca.lockedState.healthChecking.lastProbeTimestamp
366375
log.V(6).Info("Health probe succeeded")
376+
healthCheck.WithLabelValues(ca.cluster.Name, ca.cluster.Namespace).Set(1)
377+
healthChecksTotal.WithLabelValues(ca.cluster.Name, ca.cluster.Namespace, "success").Inc()
367378
}
368379

369380
tooManyConsecutiveFailures := ca.lockedState.healthChecking.consecutiveFailures >= ca.config.HealthProbe.FailureThreshold

controllers/clustercache/metrics.go

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
/*
2+
Copyright 2024 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package clustercache
18+
19+
import (
20+
"github.com/prometheus/client_golang/prometheus"
21+
ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
22+
)
23+
24+
// init registers the three clustercache collectors (healthCheck, connectionUp
// and healthChecksTotal) with the controller-runtime metrics registry when the
// package is loaded.
// MustRegister is expected to panic if a collector cannot be registered
// (Prometheus client convention for Must* helpers).
func init() {
25+
// Register the metrics at the controller-runtime metrics registry.
26+
ctrlmetrics.Registry.MustRegister(healthCheck)
27+
ctrlmetrics.Registry.MustRegister(connectionUp)
28+
ctrlmetrics.Registry.MustRegister(healthChecksTotal)
29+
}
30+
31+
// Prometheus collectors for the cluster cache. All of them are labelled with
// the name and namespace of the Cluster the accessor belongs to.
// NOTE(review): nothing visible here removes a cluster's label series once the
// cluster is gone — confirm stale series are cleaned up elsewhere.
var (
32+
// healthCheck is a gauge holding the outcome of the most recent health probe:
// HealthCheck sets it to 1 after a successful probe and to 0 after a failed
// one (including the unauthorized-error case).
healthCheck = prometheus.NewGaugeVec(
33+
prometheus.GaugeOpts{
34+
Name: "capi_cluster_cache_healthcheck",
35+
Help: "Result of the last clustercache healthcheck for a cluster.",
36+
}, []string{
37+
"cluster_name", "cluster_namespace",
38+
},
39+
)
40+
// healthChecksTotal counts health probes. HealthCheck increments it once per
// probe with the "status" label set to "success" or "error".
healthChecksTotal = prometheus.NewCounterVec(
41+
prometheus.CounterOpts{
42+
Name: "capi_cluster_cache_healthchecks_total",
43+
Help: "Results of all clustercache healthchecks.",
44+
}, []string{
45+
"cluster_name", "cluster_namespace", "status",
46+
},
47+
)
48+
// connectionUp is a gauge that Connect sets to 1 when it returns without an
// error and to 0 when it fails; Disconnect sets it to 0 after tearing down an
// existing connection (it is left untouched when already disconnected).
connectionUp = prometheus.NewGaugeVec(
49+
prometheus.GaugeOpts{
50+
Name: "capi_cluster_cache_connection_up",
51+
Help: "Whether the connection to the cluster is up.",
52+
}, []string{
53+
"cluster_name", "cluster_namespace",
54+
},
55+
)
56+
)

0 commit comments

Comments
 (0)