Skip to content

Commit af29be3

Browse files
committed
Add basic metrics for the controller engine
This'll let folks monitor things like: * How many XR controllers are running * How many watches are running * Whether an XR controller is flapping * Whether an XR controller is watching its XR and CompRev * How many types of composed resource an XR controller is watching Signed-off-by: Nic Cope <[email protected]>
1 parent bbc7c4d commit af29be3

File tree

4 files changed

+161
-19
lines changed

4 files changed

+161
-19
lines changed

cmd/crossplane/core/core.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -244,14 +244,14 @@ func (c *startCommand) Run(s *runtime.Scheme, log logging.Logger) error { //noli
244244
return errors.Wrap(err, "cannot load client TLS certificates")
245245
}
246246

247-
m := xfn.NewMetrics()
248-
metrics.Registry.MustRegister(m)
247+
xfnm := xfn.NewPrometheusMetrics()
248+
metrics.Registry.MustRegister(xfnm)
249249

250250
// We want all XR controllers to share the same gRPC clients.
251251
functionRunner := xfn.NewPackagedFunctionRunner(mgr.GetClient(),
252252
xfn.WithLogger(log),
253253
xfn.WithTLSConfig(clienttls),
254-
xfn.WithInterceptorCreators(m),
254+
xfn.WithInterceptorCreators(xfnm),
255255
)
256256

257257
// Periodically remove clients for Functions that no longer exist.
@@ -375,6 +375,9 @@ func (c *startCommand) Run(s *runtime.Scheme, log logging.Logger) error { //noli
375375
return errors.Wrap(err, "cannot create uncached client for API extension controllers")
376376
}
377377

378+
cem := engine.NewPrometheusMetrics()
379+
metrics.Registry.MustRegister(cem)
380+
378381
// It's important the engine's client is wrapped with unstructured.NewClient
379382
// because controller-runtime always caches *unstructured.Unstructured, not
380383
// our wrapper types like *composite.Unstructured. This client takes care of
@@ -384,6 +387,7 @@ func (c *startCommand) Run(s *runtime.Scheme, log logging.Logger) error { //noli
384387
unstructured.NewClient(cached),
385388
unstructured.NewClient(uncached),
386389
engine.WithLogger(log),
390+
engine.WithMetrics(cem),
387391
)
388392

389393
// TODO(negz): Garbage collect informers for CRs that are still defined

internal/engine/engine.go

Lines changed: 33 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -61,13 +61,12 @@ type ControllerEngine struct {
6161
// exists.
6262
uncached client.Client
6363

64-
log logging.Logger
65-
66-
// Protects everything below.
67-
mx sync.RWMutex
68-
69-
// Running controllers, by name.
64+
// Running controllers, by name. Protected by the mutex.
7065
controllers map[string]*controller
66+
mx sync.RWMutex
67+
68+
log logging.Logger
69+
metrics Metrics
7170
}
7271

7372
// TrackingInformers is a set of Informers. It tracks which are active.
@@ -76,15 +75,31 @@ type TrackingInformers interface {
7675
ActiveInformers() []schema.GroupVersionKind
7776
}
7877

78+
// Metrics for the controller engine.
79+
type Metrics interface {
80+
// ControllerStarted records a controller start.
81+
ControllerStarted(name string)
82+
83+
// ControllerStopped records a controller stop.
84+
ControllerStopped(name string)
85+
86+
// WatchStarted records a watch start for a controller.
87+
WatchStarted(name string, t WatchType)
88+
89+
// WatchStopped records a watch stop for a controller.
90+
WatchStopped(name string, t WatchType)
91+
}
92+
7993
// New creates a new controller engine.
8094
func New(mgr manager.Manager, infs TrackingInformers, c client.Client, nc client.Client, o ...ControllerEngineOption) *ControllerEngine {
8195
e := &ControllerEngine{
8296
mgr: mgr,
8397
infs: infs,
8498
cached: c,
8599
uncached: nc,
86-
log: logging.NewNopLogger(),
87100
controllers: make(map[string]*controller),
101+
log: logging.NewNopLogger(),
102+
metrics: &NopMetrics{},
88103
}
89104

90105
for _, fn := range o {
@@ -104,6 +119,13 @@ func WithLogger(l logging.Logger) ControllerEngineOption {
104119
}
105120
}
106121

122+
// WithMetrics configures an Engine to expose metrics.
123+
func WithMetrics(m Metrics) ControllerEngineOption {
124+
return func(e *ControllerEngine) {
125+
e.metrics = m
126+
}
127+
}
128+
107129
type controller struct {
108130
// The running controller.
109131
ctrl kcontroller.Controller
@@ -213,6 +235,7 @@ func (e *ControllerEngine) Start(name string, o ...ControllerOption) error {
213235
<-e.mgr.Elected()
214236

215237
e.log.Debug("Starting new controller", "controller", name)
238+
e.metrics.ControllerStarted(name)
216239

217240
// Run the controller until its context is cancelled.
218241
if err := c.Start(ctx); err != nil {
@@ -225,6 +248,7 @@ func (e *ControllerEngine) Start(name string, o ...ControllerOption) error {
225248
}
226249

227250
e.log.Debug("Stopped controller", "controller", name)
251+
e.metrics.ControllerStopped(name)
228252
}()
229253

230254
if co.gc != nil {
@@ -281,7 +305,6 @@ func (e *ControllerEngine) Stop(ctx context.Context, name string) error {
281305
c.cancel()
282306
delete(e.controllers, name)
283307

284-
e.log.Debug("Stopped controller", "controller", name)
285308
return nil
286309
}
287310

@@ -441,6 +464,7 @@ func (e *ControllerEngine) StartWatches(ctx context.Context, name string, ws ...
441464
c.sources[wid] = src
442465

443466
e.log.Debug("Started watching GVK", "controller", name, "watch-type", wid.Type, "watched-gvk", wid.GVK)
467+
e.metrics.WatchStarted(name, wid.Type)
444468
}
445469

446470
return nil
@@ -510,6 +534,7 @@ func (e *ControllerEngine) StopWatches(ctx context.Context, name string, ws ...W
510534
}
511535
delete(c.sources, wid)
512536
e.log.Debug("Stopped watching GVK", "controller", name, "watch-type", wid.Type, "watched-gvk", wid.GVK)
537+
e.metrics.WatchStopped(name, wid.Type)
513538
stopped++
514539
}
515540

internal/engine/engine_metrics.go

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
/*
2+
Copyright 2025 The Crossplane Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License"); you may not use
5+
this file except in compliance with the License. You may obtain a copy of the
6+
License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software distributed
11+
under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
specific language governing permissions and limitations under the License.
14+
*/
15+
16+
package engine
17+
18+
import (
19+
"github.com/prometheus/client_golang/prometheus"
20+
)
21+
22+
// NopMetrics does nothing.
23+
type NopMetrics struct{}
24+
25+
// ControllerStarted does nothing.
26+
func (m *NopMetrics) ControllerStarted(_ string) {}
27+
28+
// ControllerStopped does nothing.
29+
func (m *NopMetrics) ControllerStopped(_ string) {}
30+
31+
// WatchStarted does nothing.
32+
func (m *NopMetrics) WatchStarted(_ string, _ WatchType) {}
33+
34+
// WatchStopped does nothing.
35+
func (m *NopMetrics) WatchStopped(_ string, _ WatchType) {}
36+
37+
// PrometheusMetrics for the controller engine.
38+
type PrometheusMetrics struct {
39+
controllersStarted *prometheus.CounterVec
40+
controllersStopped *prometheus.CounterVec
41+
42+
watchesStarted *prometheus.CounterVec
43+
watchesStopped *prometheus.CounterVec
44+
}
45+
46+
// NewPrometheusMetrics exposes controller engine metrics via Prometheus.
47+
func NewPrometheusMetrics() *PrometheusMetrics {
48+
return &PrometheusMetrics{
49+
controllersStarted: prometheus.NewCounterVec(prometheus.CounterOpts{
50+
Subsystem: "composition",
51+
Name: "controllers_started_total",
52+
Help: "Total number of XR controllers started.",
53+
}, []string{"controller"}),
54+
55+
controllersStopped: prometheus.NewCounterVec(prometheus.CounterOpts{
56+
Subsystem: "composition",
57+
Name: "controllers_stopped_total",
58+
Help: "Total number of XR controllers stopped.",
59+
}, []string{"controller"}),
60+
61+
watchesStarted: prometheus.NewCounterVec(prometheus.CounterOpts{
62+
Subsystem: "composition",
63+
Name: "watches_started_total",
64+
Help: "Total number of watches started.",
65+
}, []string{"controller", "type"}),
66+
67+
watchesStopped: prometheus.NewCounterVec(prometheus.CounterOpts{
68+
Subsystem: "composition",
69+
Name: "watches_stopped_total",
70+
Help: "Total number of watches stopped.",
71+
}, []string{"controller", "type"}),
72+
}
73+
}
74+
75+
// ControllerStarted records a controller start.
76+
func (m *PrometheusMetrics) ControllerStarted(name string) {
77+
m.controllersStarted.With(prometheus.Labels{"controller": name}).Inc()
78+
}
79+
80+
// ControllerStopped records a controller stop.
81+
func (m *PrometheusMetrics) ControllerStopped(name string) {
82+
m.controllersStopped.With(prometheus.Labels{"controller": name}).Inc()
83+
}
84+
85+
// WatchStarted records a watch start for a controller.
86+
func (m *PrometheusMetrics) WatchStarted(name string, t WatchType) {
87+
m.watchesStarted.With(prometheus.Labels{"controller": name, "type": string(t)}).Inc()
88+
}
89+
90+
// WatchStopped records a watch stop for a controller.
91+
func (m *PrometheusMetrics) WatchStopped(name string, t WatchType) {
92+
m.watchesStopped.With(prometheus.Labels{"controller": name, "type": string(t)}).Inc()
93+
}
94+
95+
// Describe sends the super-set of all possible descriptors of metrics
96+
// collected by this Collector to the provided channel and returns once
97+
// the last descriptor has been sent.
98+
func (m *PrometheusMetrics) Describe(ch chan<- *prometheus.Desc) {
99+
m.controllersStarted.Describe(ch)
100+
m.controllersStopped.Describe(ch)
101+
m.watchesStarted.Describe(ch)
102+
m.watchesStopped.Describe(ch)
103+
}
104+
105+
// Collect is called by the Prometheus registry when collecting
106+
// metrics. The implementation sends each collected metric via the
107+
// provided channel and returns once the last metric has been sent.
108+
func (m *PrometheusMetrics) Collect(ch chan<- prometheus.Metric) {
109+
m.controllersStarted.Collect(ch)
110+
m.controllersStopped.Collect(ch)
111+
m.watchesStarted.Collect(ch)
112+
m.watchesStopped.Collect(ch)
113+
}

internal/xfn/function_runner_metrics.go

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -26,17 +26,17 @@ import (
2626
fnv1 "github.com/crossplane/crossplane/apis/apiextensions/fn/proto/v1"
2727
)
2828

29-
// Metrics are requests, errors, and duration (RED) metrics for composition
29+
// PrometheusMetrics are requests, errors, and duration (RED) metrics for composition
3030
// function runs.
31-
type Metrics struct {
31+
type PrometheusMetrics struct {
3232
requests *prometheus.CounterVec
3333
responses *prometheus.CounterVec
3434
duration *prometheus.HistogramVec
3535
}
3636

37-
// NewMetrics creates metrics for composition function runs.
38-
func NewMetrics() *Metrics {
39-
return &Metrics{
37+
// NewPrometheusMetrics creates metrics for composition function runs.
38+
func NewPrometheusMetrics() *PrometheusMetrics {
39+
return &PrometheusMetrics{
4040
requests: prometheus.NewCounterVec(prometheus.CounterOpts{
4141
Subsystem: "composition",
4242
Name: "run_function_request_total",
@@ -61,7 +61,7 @@ func NewMetrics() *Metrics {
6161
// Describe sends the super-set of all possible descriptors of metrics
6262
// collected by this Collector to the provided channel and returns once
6363
// the last descriptor has been sent.
64-
func (m *Metrics) Describe(ch chan<- *prometheus.Desc) {
64+
func (m *PrometheusMetrics) Describe(ch chan<- *prometheus.Desc) {
6565
m.requests.Describe(ch)
6666
m.responses.Describe(ch)
6767
m.duration.Describe(ch)
@@ -70,15 +70,15 @@ func (m *Metrics) Describe(ch chan<- *prometheus.Desc) {
7070
// Collect is called by the Prometheus registry when collecting
7171
// metrics. The implementation sends each collected metric via the
7272
// provided channel and returns once the last metric has been sent.
73-
func (m *Metrics) Collect(ch chan<- prometheus.Metric) {
73+
func (m *PrometheusMetrics) Collect(ch chan<- prometheus.Metric) {
7474
m.requests.Collect(ch)
7575
m.responses.Collect(ch)
7676
m.duration.Collect(ch)
7777
}
7878

7979
// CreateInterceptor returns a gRPC UnaryClientInterceptor for the named
8080
// function. The supplied package (pkg) should be the package's OCI reference.
81-
func (m *Metrics) CreateInterceptor(name, pkg string) grpc.UnaryClientInterceptor {
81+
func (m *PrometheusMetrics) CreateInterceptor(name, pkg string) grpc.UnaryClientInterceptor {
8282
return func(ctx context.Context, method string, req, reply any, cc *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error {
8383
l := prometheus.Labels{"function_name": name, "function_package": pkg, "grpc_target": cc.Target(), "grpc_method": method}
8484

0 commit comments

Comments
 (0)