Skip to content

Commit ab34693

Browse files
authored
metrics: add route validation gauge metrics (zalando#3550)
**Prometheus metrics:** - `skipper_route_invalid{reason}` (gauge) - Current number of invalid routes by reason **CodaHale metrics:** - `route.invalid.{reason}` (gauge) - Current invalid routes count by reason **Failure Reasons Tracked:** - `failed_backend_split` - Backend URL parsing errors - `unknown_filter` - Unknown or disabled filters used - `unknown_predicate` - Unknown predicates used - `invalid_filter_params` - Invalid filter parameters - `invalid_predicate_params` - Invalid predicate parameters - Any other validation errors that may occur ## How error rate will be calculated - **Graph Representation**: Displays a graph with points over specific time intervals. - **Calculation**: `(routes.total - sum(route.invalid.%s)) / routes.total`, where `routes.total` is an existing gauge metric Signed-off-by: Veronika Volokitina <[email protected]>
1 parent b80dfdb commit ab34693

File tree

8 files changed

+336
-60
lines changed

8 files changed

+336
-60
lines changed

docs/operation/operation.md

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -789,6 +789,72 @@ Metrics explanation:
789789

790790
If you want to read more about RouteSRV see [deploy RouteSRV](../kubernetes/ingress-controller.md#routesrv).
791791

792+
### Route validation metrics
793+
794+
Skipper provides metrics to track the success and failure rates of route processing during configuration updates. These
795+
metrics help monitor the health of route definitions and identify common configuration issues.
796+
797+
#### Gauge metrics
798+
799+
The following gauge metrics show the current count of invalid routes by failure reason:
800+
801+
- `skipper_route_invalid{reason="<reason>"}`: Current number of invalid routes by reason
802+
- `routes.total`: Total number of valid routes currently loaded (available as
803+
`routesrv_custom_gauges{key="routes.total"}` in RouteSRV)
804+
805+
The `routes.total` gauge metric represents the current number of successfully loaded and valid routes. This metric can
806+
be used in combination with the invalid route metrics to calculate error rates during route processing.
807+
808+
#### Failure reasons
809+
810+
The metrics track different types of route validation failures:
811+
812+
- `unknown_filter`: Route uses a filter that is not registered or available
813+
- `invalid_filter_params`: Route has a filter with invalid parameters
814+
- `unknown_predicate`: Route uses a predicate that is not registered or available
815+
- `invalid_predicate_params`: Route has a predicate with invalid parameters
816+
- `failed_backend_split`: Route has an invalid backend URL or configuration
817+
- `invalid_matcher`: Route was reported as invalid while building the route matcher
- `other`: Route has other unclassified validation errors
818+
819+
#### Prometheus example
820+
821+
```
822+
# HELP skipper_route_invalid Number of invalid routes by reason.
823+
# TYPE skipper_route_invalid gauge
824+
skipper_route_invalid{reason="unknown_filter"} 3
825+
skipper_route_invalid{reason="invalid_filter_params"} 1
826+
skipper_route_invalid{reason="failed_backend_split"} 2
827+
828+
# HELP skipper_custom_gauges Gauges number of custom metrics.
829+
# TYPE skipper_custom_gauges gauge
830+
skipper_custom_gauges{key="routes.total"} 1250
831+
```
832+
833+
#### Codahale example
834+
835+
```json
836+
{
837+
"gauges": {
838+
"route.invalid.unknown_filter": {
839+
"value": 3
840+
},
841+
"route.invalid.invalid_filter_params": {
842+
"value": 1
843+
},
844+
"route.invalid.failed_backend_split": {
845+
"value": 2
846+
}
847+
}
848+
}
849+
```
850+
851+
These metrics are particularly useful for:
852+
853+
- Monitoring configuration deployment health
854+
- Identifying common route definition errors
855+
- Alerting on configuration issues
856+
- Tracking the success rate of route updates
857+
792858
## OpenTracing
793859

794860
Skipper has support for different [OpenTracing API](http://opentracing.io/) vendors, including

metrics/all_kind.go

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -131,14 +131,9 @@ func (a *All) IncErrorsStreaming(routeId string) {
131131

132132
}
133133

134-
func (a *All) IncValidRoutes() {
135-
a.prometheus.IncValidRoutes()
136-
a.codaHale.IncValidRoutes()
137-
}
138-
139-
func (a *All) IncInvalidRoutes(reason string) {
140-
a.prometheus.IncInvalidRoutes(reason)
141-
a.codaHale.IncInvalidRoutes(reason)
134+
// UpdateInvalidRoute forwards the per-reason invalid route counts to both
// the Prometheus and the CodaHale backends, in that order.
func (a *All) UpdateInvalidRoute(reasonCounts map[string]int) {
135+
a.prometheus.UpdateInvalidRoute(reasonCounts)
136+
a.codaHale.UpdateInvalidRoute(reasonCounts)
142137
}
143138

144139
func (a *All) Close() {

metrics/codahale.go

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ const (
3333

3434
KeyErrorsBackend = "errors.backend.%s"
3535
KeyErrorsStreaming = "errors.streaming.%s"
36-
KeyValidRoutes = "route.valid"
3736
KeyInvalidRoutes = "route.invalid.%s"
3837

3938
statsRefreshDuration = time.Duration(5 * time.Second)
@@ -266,12 +265,10 @@ func (c *CodaHale) IncErrorsStreaming(routeId string) {
266265
}
267266
}
268267

269-
func (c *CodaHale) IncValidRoutes() {
270-
c.incCounter(KeyValidRoutes, 1)
271-
}
272-
273-
func (c *CodaHale) IncInvalidRoutes(reason string) {
274-
c.incCounter(fmt.Sprintf(KeyInvalidRoutes, reason), 1)
268+
// UpdateInvalidRoute writes one gauge per entry of reasonCounts. The gauge
// key is KeyInvalidRoutes formatted with the reason and the value is the
// count converted to float64.
//
// Only reasons present in reasonCounts are written; this method does not
// touch gauges of reasons that are missing from the map.
// NOTE(review): presumably a reason that stops occurring keeps its previous
// value unless the caller passes it with a count of 0 - confirm.
func (c *CodaHale) UpdateInvalidRoute(reasonCounts map[string]int) {
269+
for reason, count := range reasonCounts {
270+
c.UpdateGauge(fmt.Sprintf(KeyInvalidRoutes, reason), float64(count))
271+
}
275272
}
276273

277274
func (c *CodaHale) Close() {

metrics/metrics.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,7 @@ type Metrics interface {
8080
IncErrorsStreaming(routeId string)
8181
RegisterHandler(path string, handler *http.ServeMux)
8282
UpdateGauge(key string, value float64)
83-
IncValidRoutes()
84-
IncInvalidRoutes(reason string)
83+
UpdateInvalidRoute(reasonCounts map[string]int)
8584
Close()
8685
}
8786

metrics/metricstest/metricsmock.go

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -225,12 +225,10 @@ func (m *MockMetrics) Gauge(key string) (v float64, ok bool) {
225225
return
226226
}
227227

228-
func (m *MockMetrics) IncValidRoutes() {
229-
m.IncCounter("route.valid")
230-
}
231-
232-
func (m *MockMetrics) IncInvalidRoutes(reason string) {
233-
m.IncCounter("route.invalid." + reason)
228+
// UpdateInvalidRoute records each entry of reasonCounts as a gauge named
// "route.invalid." followed by the reason, with the count converted to
// float64. Reasons that are missing from the map are not written.
func (m *MockMetrics) UpdateInvalidRoute(reasonCounts map[string]int) {
229+
for reason, count := range reasonCounts {
230+
m.UpdateGauge("route.invalid."+reason, float64(count))
231+
}
234232
}
235233

236234
func (m *MockMetrics) Close() {}

metrics/prometheus.go

Lines changed: 9 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,7 @@ type Prometheus struct {
7171
customHistogramM *prometheus.HistogramVec
7272
customCounterM *prometheus.CounterVec
7373
customGaugeM *prometheus.GaugeVec
74-
validRoutesM *prometheus.CounterVec
75-
invalidRoutesM *prometheus.CounterVec
74+
invalidRouteM *prometheus.GaugeVec
7675

7776
opts Options
7877
registry *prometheus.Registry
@@ -308,18 +307,11 @@ func NewPrometheus(opts Options) *Prometheus {
308307
Buckets: opts.HistogramBuckets,
309308
}, []string{"key"}))
310309

311-
p.validRoutesM = register(p, prometheus.NewCounterVec(prometheus.CounterOpts{
310+
p.invalidRouteM = register(p, prometheus.NewGaugeVec(prometheus.GaugeOpts{
312311
Namespace: namespace,
313312
Subsystem: promRouteSubsystem,
314-
Name: "valid_routes",
315-
Help: "Total number of successfully processed routes.",
316-
}, []string{}))
317-
318-
p.invalidRoutesM = register(p, prometheus.NewCounterVec(prometheus.CounterOpts{
319-
Namespace: namespace,
320-
Subsystem: promRouteSubsystem,
321-
Name: "invalid_routes",
322-
Help: "Total number of invalid routes with failure reasons.",
313+
Name: "invalid",
314+
Help: "Number of invalid routes by reason.",
323315
}, []string{"reason"}))
324316

325317
// Register prometheus runtime collectors if required.
@@ -531,14 +523,11 @@ func (p *Prometheus) IncErrorsStreaming(routeID string) {
531523
p.proxyStreamingErrorsM.WithLabelValues(routeID).Inc()
532524
}
533525

534-
// IncValidRoutes satisfies Metrics interface.
535-
func (p *Prometheus) IncValidRoutes() {
536-
p.validRoutesM.WithLabelValues().Inc()
537-
}
538-
539-
// IncInvalidRoutes satisfies Metrics interface.
540-
func (p *Prometheus) IncInvalidRoutes(reason string) {
541-
p.invalidRoutesM.WithLabelValues(reason).Inc()
526+
// UpdateInvalidRoute satisfies Metrics interface.
//
// For every entry of reasonCounts it sets the invalid route gauge labelled
// with that reason to the given count. Label values that are missing from
// the map are not modified by this call and the gauge vector is not reset.
// NOTE(review): presumably a reason that no longer occurs keeps exporting
// its last value unless the caller passes it with a count of 0 - confirm.
527+
func (p *Prometheus) UpdateInvalidRoute(reasonCounts map[string]int) {
528+
for reason, count := range reasonCounts {
529+
p.invalidRouteM.WithLabelValues(reason).Set(float64(count))
530+
}
542531
}
543532

544533
func (p *Prometheus) Close() {}

routing/datasource.go

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ var (
3434
errUnknownPredicate = invalidDefinitionError("unknown_predicate")
3535
errInvalidPredicateParams = invalidDefinitionError("invalid_predicate_params")
3636
errFailedBackendSplit = invalidDefinitionError("failed_backend_split")
37+
errInvalidMatcher = invalidDefinitionError("invalid_matcher")
3738
)
3839

3940
func (it incomingType) String() string {
@@ -509,10 +510,6 @@ func processRouteDef(o *Options, cpm map[string]PredicateSpec, def *eskip.Route)
509510
return nil, err
510511
}
511512

512-
if o.Metrics != nil {
513-
o.Metrics.IncValidRoutes()
514-
}
515-
516513
return r, nil
517514
}
518515

@@ -529,6 +526,8 @@ func mapPredicates(cps []PredicateSpec) map[string]PredicateSpec {
529526
// processes a set of route definitions for the routing table
530527
func processRouteDefs(o *Options, defs []*eskip.Route) (routes []*Route, invalidDefs []*eskip.Route) {
531528
cpm := mapPredicates(o.Predicates)
529+
reasonCounts := make(map[string]int)
530+
532531
for _, def := range defs {
533532
route, err := processRouteDef(o, cpm, def)
534533
if err == nil {
@@ -538,13 +537,19 @@ func processRouteDefs(o *Options, defs []*eskip.Route) (routes []*Route, invalid
538537
o.Log.Errorf("failed to process route %s: %v", def.Id, err)
539538

540539
var defErr invalidDefinitionError
541-
if errors.As(err, &defErr) && o.Metrics != nil {
542-
o.Metrics.IncInvalidRoutes(defErr.Code())
543-
} else if o.Metrics != nil {
544-
o.Metrics.IncInvalidRoutes("other")
540+
reason := "other"
541+
if errors.As(err, &defErr) {
542+
reason = defErr.Code()
545543
}
544+
545+
reasonCounts[reason]++
546546
}
547547
}
548+
549+
if o.Metrics != nil {
550+
o.Metrics.UpdateInvalidRoute(reasonCounts)
551+
}
552+
548553
return
549554
}
550555

@@ -613,6 +618,10 @@ func receiveRouteMatcher(o Options, out chan<- *routeTable, quit <-chan struct{}
613618
invalidRouteIds[err.ID] = struct{}{}
614619
}
615620

621+
if o.Metrics != nil {
622+
o.Metrics.UpdateInvalidRoute(map[string]int{errInvalidMatcher.Code(): len(errs)})
623+
}
624+
616625
for i := range routes {
617626
r := routes[i]
618627
if _, found := invalidRouteIds[r.Id]; found {

0 commit comments

Comments
 (0)