Skip to content

Commit e5d599b

Browse files
authored
Roffe/metrics polish (#595)
* update metrics docs & dashboard * renamed `namespace` label to `svc_namespace` for service metrics as it would be overwritten by most Prometheus setups * Made histograms for all the controller sync times for better visualization * added `controller_routes_sync_time`, `controller_bgp_advertisements_sent` & `controller_policy_chains_sync_time` metrics
1 parent 0cdaa43 commit e5d599b

File tree

8 files changed

+1639
-1389
lines changed

8 files changed

+1639
-1389
lines changed

dashboard/dashboard.png

412 KB
Loading

dashboard/kube-router.json

Lines changed: 1533 additions & 1342 deletions
Large diffs are not rendered by default.

docs/metrics.md

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ The default values unless other specified is
2222
By enabling [Kubernetes SD](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#<kubernetes_sd_config>) in the Prometheus configuration & adding the required annotations, Prometheus can automatically discover & scrape kube-router metrics
2323

2424
## Version notes
25+
kube-router v0.2.4 received a metrics overhaul: some metrics were changed into histograms and additional metrics were added. Please make sure you are using the latest dashboard version with kube-router versions >= v0.2.4
26+
2527
kube-router 0.1.0-rc2 and upwards support runtime configuration for controlling where to expose the metrics. If you are using an older version, the metrics path & port are locked to `/metrics` & `8080`
2628

2729
## Supported annotations
@@ -56,14 +58,20 @@ The following metrics is exposed by kube-router prefixed by `kube_router_`
5658
* controller_bgp_peers
5759
Number of BGP peers of the instance
5860
* controller_bgp_advertisements_received
59-
Number of total BGP advertisements received since kube-router start
61+
Total number of BGP advertisements received since kube-router started
62+
* controller_bgp_advertisements_sent
63+
Total number of BGP advertisements sent since kube-router started
6064
* controller_bgp_internal_peers_sync_time
6165
Time it took for the BGP internal peer sync loop to complete
66+
* controller_routes_sync_time
67+
Time it took for controller to sync routes
6268

6369
### run-firewall=true
6470

6571
* controller_iptables_sync_time
6672
Time it took for the iptables sync loop to complete
73+
* controller_policy_chains_sync_time
74+
Time it took for controller to sync policy chains
6775

6876
### run-service-proxy = true
6977

@@ -95,7 +103,7 @@ The following metrics is exposed by kube-router prefixed by `kube_router_`
95103
Outgoing bytes per second
96104

97105
To get a grouped list of CPS for each service, a Prometheus query could look like this, e.g.:
98-
`sum(kube_router_service_cps) by (namespace, service_name)`
106+
`sum(kube_router_service_cps) by (svc_namespace, service_name)`
99107

100108
## Grafana Dashboard
101109

pkg/controllers/netpol/network_policy_controller.go

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"errors"
77
"fmt"
88
"net"
9+
"regexp"
910
"strconv"
1011
"strings"
1112
"sync"
@@ -26,7 +27,6 @@ import (
2627
"k8s.io/client-go/kubernetes"
2728
listers "k8s.io/client-go/listers/core/v1"
2829
"k8s.io/client-go/tools/cache"
29-
"regexp"
3030
)
3131

3232
const (
@@ -215,7 +215,7 @@ func (npc *NetworkPolicyController) Sync() error {
215215
defer func() {
216216
endTime := time.Since(start)
217217
if npc.MetricsEnabled {
218-
metrics.ControllerIptablesSyncTime.WithLabelValues().Set(float64(endTime.Seconds()))
218+
metrics.ControllerIptablesSyncTime.Observe(endTime.Seconds())
219219
}
220220
glog.V(1).Infof("sync iptables took %v", endTime)
221221
}()
@@ -258,7 +258,12 @@ func (npc *NetworkPolicyController) Sync() error {
258258
// policyspec is evaluated to set of matching pods, which are grouped in to a
259259
// ipset used for source ip addr matching.
260260
func (npc *NetworkPolicyController) syncNetworkPolicyChains(version string) (map[string]bool, map[string]bool, error) {
261-
261+
start := time.Now()
262+
defer func() {
263+
endTime := time.Since(start)
264+
metrics.ControllerPolicyChainsSyncTime.Observe(endTime.Seconds())
265+
glog.V(2).Infof("Syncing network policy chains took %v", endTime)
266+
}()
262267
activePolicyChains := make(map[string]bool)
263268
activePolicyIpSets := make(map[string]bool)
264269

@@ -1536,6 +1541,7 @@ func NewNetworkPolicyController(clientset kubernetes.Interface,
15361541
if config.MetricsEnabled {
15371542
//Register the metrics for this controller
15381543
prometheus.MustRegister(metrics.ControllerIptablesSyncTime)
1544+
prometheus.MustRegister(metrics.ControllerPolicyChainsSyncTime)
15391545
npc.MetricsEnabled = true
15401546
}
15411547

pkg/controllers/proxy/network_services_controller.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -373,7 +373,9 @@ func (nsc *NetworkServicesController) publishMetrics(serviceInfoMap serviceInfoM
373373
defer func() {
374374
endTime := time.Since(start)
375375
glog.V(2).Infof("Publishing IPVS metrics took %v", endTime)
376-
metrics.ControllerIpvsMetricsExportTime.WithLabelValues().Set(float64(endTime.Seconds()))
376+
if nsc.MetricsEnabled {
377+
metrics.ControllerIpvsMetricsExportTime.Observe(float64(endTime.Seconds()))
378+
}
377379
}()
378380

379381
ipvsSvcs, err := nsc.ln.ipvsGetServices()
@@ -429,7 +431,7 @@ func (nsc *NetworkServicesController) publishMetrics(serviceInfoMap serviceInfoM
429431
metrics.ServicePpsIn.WithLabelValues(svc.namespace, svc.name, svcVip, svc.protocol, strconv.Itoa(svc.port)).Set(float64(ipvsSvc.Stats.PPSIn))
430432
metrics.ServicePpsOut.WithLabelValues(svc.namespace, svc.name, svcVip, svc.protocol, strconv.Itoa(svc.port)).Set(float64(ipvsSvc.Stats.PPSOut))
431433
metrics.ServiceTotalConn.WithLabelValues(svc.namespace, svc.name, svcVip, svc.protocol, strconv.Itoa(svc.port)).Set(float64(ipvsSvc.Stats.Connections))
432-
metrics.ControllerIpvsServices.WithLabelValues().Set(float64(len(ipvsSvcs)))
434+
metrics.ControllerIpvsServices.Set(float64(len(ipvsSvcs)))
433435
}
434436
}
435437
}
@@ -528,7 +530,7 @@ func (nsc *NetworkServicesController) syncIpvsServices(serviceInfoMap serviceInf
528530
defer func() {
529531
endTime := time.Since(start)
530532
if nsc.MetricsEnabled {
531-
metrics.ControllerIpvsServicesSyncTime.WithLabelValues().Set(float64(endTime.Seconds()))
533+
metrics.ControllerIpvsServicesSyncTime.Observe(endTime.Seconds())
532534
}
533535
glog.V(1).Infof("sync ipvs services took %v", endTime)
534536
}()

pkg/controllers/routing/bgp_peers.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,9 @@ func (nrc *NetworkRoutingController) syncInternalPeers() {
3030
start := time.Now()
3131
defer func() {
3232
endTime := time.Since(start)
33-
metrics.ControllerBGPInternalPeersSyncTime.WithLabelValues().Set(float64(endTime.Seconds()))
33+
if nrc.MetricsEnabled {
34+
metrics.ControllerBGPInternalPeersSyncTime.Observe(endTime.Seconds())
35+
}
3436
glog.V(2).Infof("Syncing BGP peers for the node took %v", endTime)
3537
}()
3638

@@ -40,8 +42,9 @@ func (nrc *NetworkRoutingController) syncInternalPeers() {
4042
glog.Errorf("Failed to list nodes from API server due to: %s. Can not perform BGP peer sync", err.Error())
4143
return
4244
}
43-
44-
metrics.ControllerBPGpeers.WithLabelValues().Set(float64(len(nodes.Items)))
45+
if nrc.MetricsEnabled {
46+
metrics.ControllerBPGpeers.Set(float64(len(nodes.Items)))
47+
}
4548
// establish peer and add Pod CIDRs with current set of nodes
4649
currentNodes := make([]string, 0)
4750
for _, node := range nodes.Items {

pkg/controllers/routing/network_routes_controller.go

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -325,7 +325,7 @@ func (nrc *NetworkRoutingController) watchBgpUpdates() {
325325
case *gobgp.WatchEventBestPath:
326326
glog.V(3).Info("Processing bgp route advertisement from peer")
327327
if nrc.MetricsEnabled {
328-
metrics.ControllerBGPadvertisementsReceived.WithLabelValues().Add(float64(1))
328+
metrics.ControllerBGPadvertisementsReceived.Inc()
329329
}
330330
for _, path := range msg.PathList {
331331
if path.IsLocal() {
@@ -342,6 +342,9 @@ func (nrc *NetworkRoutingController) watchBgpUpdates() {
342342
}
343343

344344
func (nrc *NetworkRoutingController) advertisePodRoute() error {
345+
if nrc.MetricsEnabled {
346+
metrics.ControllerBGPadvertisementsSent.Inc()
347+
}
345348
cidr, err := utils.GetPodCidrFromNodeSpec(nrc.clientset, nrc.hostnameOverride)
346349
if err != nil {
347350
return err
@@ -486,6 +489,12 @@ func (nrc *NetworkRoutingController) Cleanup() {
486489
}
487490

488491
func (nrc *NetworkRoutingController) syncNodeIPSets() error {
492+
start := time.Now()
493+
defer func() {
494+
if nrc.MetricsEnabled {
495+
metrics.ControllerRoutesSyncTime.Observe(time.Since(start).Seconds())
496+
}
497+
}()
489498
// Get the current list of the nodes from API server
490499
nodes, err := nrc.clientset.CoreV1().Nodes().List(metav1.ListOptions{})
491500
if err != nil {
@@ -786,6 +795,7 @@ func NewNetworkRoutingController(clientset kubernetes.Interface,
786795
prometheus.MustRegister(metrics.ControllerBGPadvertisementsReceived)
787796
prometheus.MustRegister(metrics.ControllerBGPInternalPeersSyncTime)
788797
prometheus.MustRegister(metrics.ControllerBPGpeers)
798+
prometheus.MustRegister(metrics.ControllerRoutesSyncTime)
789799
nrc.MetricsEnabled = true
790800
}
791801

pkg/metrics/metrics_controller.go

Lines changed: 65 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -21,108 +21,138 @@ const (
2121
)
2222

2323
var (
24+
// ServiceTotalConn Total incoming connections made
2425
ServiceTotalConn = prometheus.NewGaugeVec(prometheus.GaugeOpts{
2526
Namespace: namespace,
2627
Name: "service_total_connections",
2728
Help: "Total incoming connections made",
28-
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
29+
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
30+
// ServicePacketsIn Total incoming packets
2931
ServicePacketsIn = prometheus.NewGaugeVec(prometheus.GaugeOpts{
3032
Namespace: namespace,
3133
Name: "service_packets_in",
3234
Help: "Total incoming packets",
33-
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
35+
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
36+
// ServicePacketsOut Total outgoing packets
3437
ServicePacketsOut = prometheus.NewGaugeVec(prometheus.GaugeOpts{
3538
Namespace: namespace,
3639
Name: "service_packets_out",
3740
Help: "Total outgoing packets",
38-
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
41+
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
42+
// ServiceBytesIn Total incoming bytes
3943
ServiceBytesIn = prometheus.NewGaugeVec(prometheus.GaugeOpts{
4044
Namespace: namespace,
4145
Name: "service_bytes_in",
4246
Help: "Total incoming bytes",
43-
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
47+
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
48+
// ServiceBytesOut Total outgoing bytes
4449
ServiceBytesOut = prometheus.NewGaugeVec(prometheus.GaugeOpts{
4550
Namespace: namespace,
4651
Name: "service_bytes_out",
4752
Help: "Total outgoing bytes",
48-
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
53+
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
54+
// ServicePpsIn Incoming packets per second
4955
ServicePpsIn = prometheus.NewGaugeVec(prometheus.GaugeOpts{
5056
Namespace: namespace,
5157
Name: "service_pps_in",
5258
Help: "Incoming packets per second",
53-
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
59+
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
60+
// ServicePpsOut Outgoing packets per second
5461
ServicePpsOut = prometheus.NewGaugeVec(prometheus.GaugeOpts{
5562
Namespace: namespace,
5663
Name: "service_pps_out",
5764
Help: "Outgoing packets per second",
58-
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
65+
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
66+
// ServiceCPS Service connections per second
5967
ServiceCPS = prometheus.NewGaugeVec(prometheus.GaugeOpts{
6068
Namespace: namespace,
6169
Name: "service_cps",
6270
Help: "Service connections per second",
63-
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
71+
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
72+
// ServiceBpsIn Incoming bytes per second
6473
ServiceBpsIn = prometheus.NewGaugeVec(prometheus.GaugeOpts{
6574
Namespace: namespace,
6675
Name: "service_bps_in",
6776
Help: "Incoming bytes per second",
68-
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
77+
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
78+
// ServiceBpsOut Outgoing bytes per second
6979
ServiceBpsOut = prometheus.NewGaugeVec(prometheus.GaugeOpts{
7080
Namespace: namespace,
7181
Name: "service_bps_out",
7282
Help: "Outgoing bytes per second",
73-
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
74-
ControllerIpvsServices = prometheus.NewGaugeVec(prometheus.GaugeOpts{
83+
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
84+
// ControllerIpvsServices Number of ipvs services in the instance
85+
ControllerIpvsServices = prometheus.NewGauge(prometheus.GaugeOpts{
7586
Namespace: namespace,
7687
Name: "controller_ipvs_services",
7788
Help: "Number of ipvs services in the instance",
78-
}, []string{})
79-
ControllerIptablesSyncTime = prometheus.NewGaugeVec(prometheus.GaugeOpts{
89+
})
90+
// ControllerIptablesSyncTime Time it took for controller to sync iptables
91+
ControllerIptablesSyncTime = prometheus.NewHistogram(prometheus.HistogramOpts{
8092
Namespace: namespace,
8193
Name: "controller_iptables_sync_time",
8294
Help: "Time it took for controller to sync iptables",
83-
}, []string{})
84-
ControllerPublishMetricsTime = prometheus.NewGaugeVec(prometheus.GaugeOpts{
85-
Namespace: namespace,
86-
Name: "controller_publish_metrics_time",
87-
Help: "Time it took to publish metrics",
88-
}, []string{})
89-
ControllerIpvsServicesSyncTime = prometheus.NewGaugeVec(prometheus.GaugeOpts{
95+
})
96+
// ControllerIpvsServicesSyncTime Time it took for controller to sync ipvs services
97+
ControllerIpvsServicesSyncTime = prometheus.NewHistogram(prometheus.HistogramOpts{
9098
Namespace: namespace,
9199
Name: "controller_ipvs_services_sync_time",
92100
Help: "Time it took for controller to sync ipvs services",
93-
}, []string{})
94-
ControllerBPGpeers = prometheus.NewGaugeVec(prometheus.GaugeOpts{
101+
})
102+
// ControllerRoutesSyncTime Time it took for controller to sync routes
103+
ControllerRoutesSyncTime = prometheus.NewHistogram(prometheus.HistogramOpts{
104+
Namespace: namespace,
105+
Name: "controller_routes_sync_time",
106+
Help: "Time it took for controller to sync routes",
107+
})
108+
// ControllerBPGpeers BGP peers in the runtime configuration
109+
ControllerBPGpeers = prometheus.NewGauge(prometheus.GaugeOpts{
95110
Namespace: namespace,
96111
Name: "controller_bgp_peers",
97112
Help: "BGP peers in the runtime configuration",
98-
}, []string{})
99-
ControllerBGPInternalPeersSyncTime = prometheus.NewGaugeVec(prometheus.GaugeOpts{
113+
})
114+
// ControllerBGPInternalPeersSyncTime Time it took to sync internal bgp peers
115+
ControllerBGPInternalPeersSyncTime = prometheus.NewHistogram(prometheus.HistogramOpts{
100116
Namespace: namespace,
101117
Name: "controller_bgp_internal_peers_sync_time",
102118
Help: "Time it took to sync internal bgp peers",
103-
}, []string{})
104-
ControllerBGPadvertisementsReceived = prometheus.NewGaugeVec(prometheus.GaugeOpts{
119+
})
120+
// ControllerBGPadvertisementsReceived BGP advertisements received
121+
ControllerBGPadvertisementsReceived = prometheus.NewCounter(prometheus.CounterOpts{
105122
Namespace: namespace,
106123
Name: "controller_bgp_advertisements_received",
107-
Help: "Time it took to sync internal bgp peers",
108-
}, []string{})
109-
ControllerIpvsMetricsExportTime = prometheus.NewGaugeVec(prometheus.GaugeOpts{
124+
Help: "BGP advertisements received",
125+
})
126+
// ControllerBGPadvertisementsSent BGP advertisements sent
127+
ControllerBGPadvertisementsSent = prometheus.NewCounter(prometheus.CounterOpts{
128+
Namespace: namespace,
129+
Name: "controller_bgp_advertisements_sent",
130+
Help: "BGP advertisements sent",
131+
})
132+
// ControllerIpvsMetricsExportTime Time it took to export metrics
133+
ControllerIpvsMetricsExportTime = prometheus.NewHistogram(prometheus.HistogramOpts{
110134
Namespace: namespace,
111135
Name: "controller_ipvs_metrics_export_time",
112136
Help: "Time it took to export metrics",
113-
}, []string{})
137+
})
138+
// ControllerPolicyChainsSyncTime Time it took for controller to sync policy chains
139+
ControllerPolicyChainsSyncTime = prometheus.NewHistogram(prometheus.HistogramOpts{
140+
Namespace: namespace,
141+
Name: "controller_policy_chains_sync_time",
142+
Help: "Time it took for controller to sync policy chains",
143+
})
114144
)
115145

116-
// MetricsController Holds settings for the metrics controller
117-
type MetricsController struct {
146+
// Controller Holds settings for the metrics controller
147+
type Controller struct {
118148
MetricsPath string
119149
MetricsPort uint16
120150
mu sync.Mutex
121151
nodeIP net.IP
122152
}
123153

124154
// Run prometheus metrics controller
125-
func (mc *MetricsController) Run(healthChan chan<- *healthcheck.ControllerHeartbeat, stopCh <-chan struct{}, wg *sync.WaitGroup) error {
155+
func (mc *Controller) Run(healthChan chan<- *healthcheck.ControllerHeartbeat, stopCh <-chan struct{}, wg *sync.WaitGroup) error {
126156
t := time.NewTicker(3 * time.Second)
127157
defer wg.Done()
128158
glog.Info("Starting metrics controller")
@@ -157,8 +187,8 @@ func (mc *MetricsController) Run(healthChan chan<- *healthcheck.ControllerHeartb
157187
}
158188

159189
// NewMetricsController returns a new metrics Controller object
160-
func NewMetricsController(clientset kubernetes.Interface, config *options.KubeRouterConfig) (*MetricsController, error) {
161-
mc := MetricsController{}
190+
func NewMetricsController(clientset kubernetes.Interface, config *options.KubeRouterConfig) (*Controller, error) {
191+
mc := Controller{}
162192
mc.MetricsPath = config.MetricsPath
163193
mc.MetricsPort = config.MetricsPort
164194
return &mc, nil

0 commit comments

Comments
 (0)