Skip to content

Commit 8697342

Browse files
authored
Merge pull request #147 from mohit-sheth/ovn-dash-improvements
OpenShift Networking dashboard improvements
2 parents 3e24721 + 59d1aff commit 8697342

File tree

3 files changed

+153
-27
lines changed

3 files changed

+153
-27
lines changed

assets/ovn-monitoring/panels.libsonnet

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,15 @@ local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonn
9393
'mean',
9494
'max',
9595
])
96-
+ options.legend.withDisplayMode('table'),
96+
+ options.legend.withDisplayMode('table')
97+
+ {
98+
options+: {
99+
legend+: {
100+
sortBy: 'Max',
101+
sortDesc: true,
102+
},
103+
},
104+
},
97105

98106

99107
},

assets/ovn-monitoring/queries.libsonnet

Lines changed: 108 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -44,35 +44,131 @@ local generateTimeSeriesQuery(query, legend) = [
4444
generateTimeSeriesQuery('topk(10, sum(container_memory_rss{pod=~"ovnkube-node-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}) by (pod,node))', '{{pod}} - {{node}}'),
4545
},
4646

47+
topOvnkubenodePodCPU: {
48+
query():
49+
generateTimeSeriesQuery(
50+
'topk(10, (sum(irate(container_cpu_usage_seconds_total{name!="",container!~"POD|",namespace=~"openshift-ovn-kubernetes", node=~"$_worker_node"}[2m]) * 100) by (pod, namespace, node)) > 0)',
51+
'{{pod}} - {{node}}'
52+
),
53+
},
54+
55+
topOvnkubenodePodMem: {
56+
query():
57+
generateTimeSeriesQuery(
58+
'topk(10, sum(container_memory_rss{name!="",container!~"POD|",namespace=~"openshift-ovn-kubernetes", node=~"$_worker_node"}) by (pod, namespace, node))',
59+
'{{pod}} - {{node}}'
60+
),
61+
},
62+
63+
topNorthdCPU: {
64+
query():
65+
generateTimeSeriesQuery(
66+
'topk(10, sum(irate(container_cpu_usage_seconds_total{container="northd", namespace="openshift-ovn-kubernetes"}[2m])*100) by (pod, node))',
67+
'{{pod}} - {{node}}'
68+
),
69+
},
70+
71+
topNorthdMem: {
72+
query():
73+
generateTimeSeriesQuery(
74+
'topk(10, sum(container_memory_rss{container="northd", namespace="openshift-ovn-kubernetes"}) by (pod, node))',
75+
'{{pod}} - {{node}}'
76+
),
77+
},
78+
79+
topSbdbCPU: {
80+
query():
81+
generateTimeSeriesQuery(
82+
'topk(10, sum(irate(container_cpu_usage_seconds_total{container="sbdb", namespace="openshift-ovn-kubernetes"}[2m])*100) by (pod, node))',
83+
'{{pod}} - {{node}}'
84+
),
85+
},
86+
87+
topSbdbMem: {
88+
query():
89+
generateTimeSeriesQuery(
90+
'topk(10, sum(container_memory_rss{container="sbdb", namespace="openshift-ovn-kubernetes"}) by (pod, node))',
91+
'{{pod}} - {{node}}'
92+
),
93+
},
94+
95+
topNbdbCPU: {
96+
query():
97+
generateTimeSeriesQuery(
98+
'topk(10, sum(irate(container_cpu_usage_seconds_total{container="nbdb", namespace="openshift-ovn-kubernetes"}[2m])*100) by (pod, node))',
99+
'{{pod}} - {{node}}'
100+
),
101+
},
102+
103+
topNbdbMem: {
104+
query():
105+
generateTimeSeriesQuery(
106+
'topk(10, sum(container_memory_rss{container="nbdb", namespace="openshift-ovn-kubernetes"}) by (pod, node))',
107+
'{{pod}} - {{node}}'
108+
),
109+
},
110+
111+
topOvnkubeControllerCPU: {
112+
query():
113+
generateTimeSeriesQuery(
114+
'topk(10, sum(irate(container_cpu_usage_seconds_total{container="ovnkube-controller", namespace="openshift-ovn-kubernetes"}[2m])*100) by (pod, node))',
115+
'{{pod}} - {{node}}'
116+
),
117+
},
118+
119+
topOvnkubeControllerMem: {
120+
query():
121+
generateTimeSeriesQuery(
122+
'topk(10, sum(container_memory_rss{container="ovnkube-controller", namespace="openshift-ovn-kubernetes"}) by (pod, node))',
123+
'{{pod}} - {{node}}'
124+
),
125+
},
126+
127+
podSchedulingLatency: {
128+
query():
129+
generateTimeSeriesQuery('histogram_quantile(0.99, rate(scheduler_pod_scheduling_sli_duration_seconds_bucket[5m])) > 0', '{{pod}}'),
130+
},
131+
132+
firstSeenToLSPCreated: {
133+
query():
134+
generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(ovnkube_controller_pod_first_seen_lsp_created_duration_seconds_bucket[2m])) by (pod, le)) > 0', '{{pod}}'),
135+
},
136+
47137
ovnAnnotationLatency: {
48138
query():
49-
generateTimeSeriesQuery('histogram_quantile(0.99, sum by (pod, le) (rate(ovnkube_controller_pod_creation_latency_seconds_bucket[2m]))) > 0', '{{pod}} - Pod Annotation latency'),
139+
generateTimeSeriesQuery('histogram_quantile(0.99, sum by (pod, le) (rate(ovnkube_controller_pod_creation_latency_seconds_bucket[2m]))) > 0', '{{pod}}'),
140+
},
141+
142+
lspCreated: {
143+
query():
144+
generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(ovnkube_controller_pod_lsp_created_port_binding_duration_seconds_bucket[2m])) by (pod,le)) > 0', '{{pod}}'),
145+
},
146+
147+
lspToChassis: {
148+
query():
149+
generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(ovnkube_controller_pod_port_binding_port_binding_chassis_duration_seconds_bucket[2m])) by (pod, le)) > 0', '{{pod}}'),
150+
},
151+
152+
portMarkedAsUp: {
153+
query():
154+
generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(ovnkube_controller_pod_port_binding_chassis_port_binding_up_duration_seconds_bucket[2m])) by (pod, le)) > 0', '{{pod}}'),
50155
},
51156

52157
ovnCNIAdd: {
53158
query():
54159
generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(ovnkube_node_cni_request_duration_seconds_bucket{command="ADD"}[2m])) by (pod,le)) > 0', '{{pod}}'),
55160
},
56161

57-
podLatency: {
162+
networkProgrammingComplete: {
58163
query():
59-
generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(ovnkube_master_pod_lsp_created_port_binding_duration_seconds_bucket[2m])) by (pod,le))', '{{pod}} - LSP created')
60-
+ generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(ovnkube_master_pod_port_binding_port_binding_chassis_duration_seconds_bucket[2m])) by (pod,le))', '{{pod}} - Port Binding')
61-
+ generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(ovnkube_master_pod_port_binding_chassis_port_binding_up_duration_seconds_bucket[2m])) by (pod,le))', '{{pod}} - Port Binding Up')
62-
+ generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(ovnkube_master_pod_first_seen_lsp_created_duration_seconds_bucket[2m])) by (pod,le))', '{{pod}} - Pod First seen'),
164+
generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(ovnkube_controller_network_programming_duration_seconds_bucket[2m])) by (pod, le)) > 0', '{{pod}}'),
63165
},
64166

65167
synclatency: {
66168
query():
67169
generateTimeSeriesQuery('rate(ovnkube_master_sync_service_latency_seconds_sum[2m])', '{{pod}} - Sync service latency'),
68170
},
69171

70-
ovnLatencyCalculate: {
71-
query():
72-
generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(ovnkube_master_network_programming_duration_seconds_bucket[2m])) by (pod, le))', '{{pod}} - Kind Pod')
73-
+ generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(ovnkube_master_network_programming_duration_seconds_bucket[2m])) by (service, le))', '{{service}} - Kind Service'),
74-
},
75-
76172
ovnkubeNodeReadyLatency: {
77173
query():
78174
generateTimeSeriesQuery('ovnkube_node_ready_duration_seconds{pod=~"ovnkube-node-.*",namespace="openshift-ovn-kubernetes",container!~"POD|"}', '{{pod}}'),

templates/General/ovn-dashboard.jsonnet

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ local queries = import '../../assets/ovn-monitoring/queries.libsonnet';
33
local variables = import '../../assets/ovn-monitoring/variables.libsonnet';
44
local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
55

6-
g.dashboard.new('OVN-Monitoring-dashboard')
6+
g.dashboard.new('Openshift Networking')
77
+ g.dashboard.time.withFrom('now-1h')
88
+ g.dashboard.time.withTo('now')
99
+ g.dashboard.withTimezone('utc')
@@ -26,24 +26,46 @@ g.dashboard.new('OVN-Monitoring-dashboard')
2626
+ g.panel.row.withCollapsed(true)
2727
+ g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })
2828
+ g.panel.row.withPanels([
29-
panels.stat.genericstatThresoldPanel('OVNKube Master', 'none', queries.ovnClusterManagerLeader.query(), { x: 0, y: 0, w: 8, h: 4 }),
29+
panels.stat.genericstatThresoldPanel('OVNKube Cluster Manager Leader', 'none', queries.ovnClusterManagerLeader.query(), { x: 0, y: 0, w: 8, h: 4 }),
3030
panels.stat.genericstatThresoldPanel('OVN Northd Status', 'none', queries.ovnNorthd.query(), { x: 8, y: 0, w: 8, h: 4 }),
31-
panels.stat.genericstatThresoldOVNControllerPanel('OVN controller', 'none', queries.numOnvController.query(), { x: 16, y: 0, w: 8, h: 4 }),
32-
panels.timeSeries.genericTimeSeriesLegendPanel('ovnkube-control-plane CPU Usage', 'percent', queries.ovnKubeControlPlaneCPU.query(), { x: 0, y: 4, w: 12, h: 10 }),
33-
panels.timeSeries.genericTimeSeriesLegendPanel('ovnkube-control-plane Memory Usage', 'bytes', queries.ovnKubeControlPlaneMem.query(), { x: 12, y: 4, w: 12, h: 10 }),
34-
panels.timeSeries.genericTimeSeriesLegendPanel('Top 10 ovn-controller CPU Usage', 'percent', queries.topOvnControllerCPU.query(), { x: 0, y: 12, w: 12, h: 10 }),
35-
panels.timeSeries.genericTimeSeriesLegendPanel('Top 10 ovn-controller Memory Usage', 'bytes', queries.topOvnControllerMem.query(), { x: 12, y: 12, w: 12, h: 10 }),
31+
panels.stat.genericstatThresoldOVNControllerPanel('OVN Controller Count', 'none', queries.numOnvController.query(), { x: 16, y: 0, w: 8, h: 4 }),
32+
panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Control Plane CPU Usage', 'percent', queries.ovnKubeControlPlaneCPU.query(), { x: 0, y: 4, w: 12, h: 10 }),
33+
panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Control Plane Memory Usage', 'bytes', queries.ovnKubeControlPlaneMem.query(), { x: 12, y: 4, w: 12, h: 10 }),
3634
]),
37-
g.panel.row.new('Latency Monitoring')
35+
g.panel.row.new('Pod Startup Latency Breakdown')
3836
+ g.panel.row.withCollapsed(true)
3937
+ g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })
4038
+ g.panel.row.withPanels([
41-
panels.timeSeries.genericTimeSeriesLegendPanel('Pod Annotation Latency', 's', queries.ovnAnnotationLatency.query(), { x: 0, y: 0, w: 12, h: 10 }),
42-
panels.timeSeries.genericTimeSeriesLegendPanel('CNI Request ADD Latency', 's', queries.ovnCNIAdd.query(), { x: 12, y: 0, w: 12, h: 10 }),
43-
panels.timeSeries.genericTimeSeriesLegendPanel('Pod creation Latency', 's', queries.podLatency.query(), { x: 0, y: 8, w: 24, h: 10 }),
44-
panels.timeSeries.genericTimeSeriesLegendPanel('Sync Service Latency', 's', queries.synclatency.query(), { x: 0, y: 16, w: 24, h: 10 }),
45-
panels.timeSeries.genericTimeSeriesLegendPanel('Duration for OVN to apply network configuration', 's', queries.ovnLatencyCalculate.query(), { x: 0, y: 24, w: 24, h: 10 }),
46-
panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Node Ready Latency', 's', queries.ovnkubeNodeReadyLatency.query(), { x: 0, y: 32, w: 24, h: 10 }),
39+
panels.timeSeries.genericTimeSeriesLegendPanel('Scheduler Pod Scheduling Duration (P99)', 's', queries.podSchedulingLatency.query(), { x: 0, y: 0, w: 12, h: 10 }),
40+
panels.timeSeries.genericTimeSeriesLegendPanel('Pod First Seen to LSP Created Latency (P99)', 's', queries.firstSeenToLSPCreated.query(), { x: 12, y: 0, w: 12, h: 10 }),
41+
panels.timeSeries.genericTimeSeriesLegendPanel('Pod Annotation Latency (P99)', 's', queries.ovnAnnotationLatency.query(), { x: 0, y: 10, w: 12, h: 10 }),
42+
panels.timeSeries.genericTimeSeriesLegendPanel('Port Binding After LSP Creation Latency (P99)', 's', queries.lspCreated.query(), { x: 12, y: 10, w: 12, h: 10 }),
43+
panels.timeSeries.genericTimeSeriesLegendPanel('Port Binding to Chassis Assignment Latency (P99)', 's', queries.lspToChassis.query(), { x: 0, y: 20, w: 12, h: 10 }),
44+
panels.timeSeries.genericTimeSeriesLegendPanel('Port Marked As Up (P99)', 's', queries.portMarkedAsUp.query(), { x: 12, y: 20, w: 12, h: 10 }),
45+
panels.timeSeries.genericTimeSeriesLegendPanel('CNI Request ADD Latency (P99)', 's', queries.ovnCNIAdd.query(), { x: 0, y: 30, w: 12, h: 10 }),
46+
panels.timeSeries.genericTimeSeriesLegendPanel('Network Programming Complete (P99)', 's', queries.networkProgrammingComplete.query(), { x: 12, y: 30, w: 12, h: 10 }),
47+
panels.timeSeries.genericTimeSeriesLegendPanel('Sync Service Latency', 's', queries.synclatency.query(), { x: 0, y: 40, w: 12, h: 10 }),
48+
panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Node Ready Latency', 's', queries.ovnkubeNodeReadyLatency.query(), { x: 12, y: 40, w: 12, h: 10 }),
49+
]),
50+
g.panel.row.new('OVN Component Resource Usage')
51+
+ g.panel.row.withCollapsed(true)
52+
+ g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })
53+
+ g.panel.row.withPanels([
54+
// Worker node pod resource usage
55+
panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Node Pods CPU Usage (Top 10)', 'percent', queries.topOvnkubenodePodCPU.query(), { x: 0, y: 0, w: 12, h: 10 }),
56+
panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Node Pods Memory Usage (Top 10)', 'bytes', queries.topOvnkubenodePodMem.query(), { x: 12, y: 0, w: 12, h: 10 }),
57+
58+
// Component resource usage
59+
panels.timeSeries.genericTimeSeriesLegendPanel('Northd CPU Usage (Top 10)', 'percent', queries.topNorthdCPU.query(), { x: 0, y: 8, w: 12, h: 10 }),
60+
panels.timeSeries.genericTimeSeriesLegendPanel('Northd Memory Usage (Top 10)', 'bytes', queries.topNorthdMem.query(), { x: 12, y: 8, w: 12, h: 10 }),
61+
panels.timeSeries.genericTimeSeriesLegendPanel('Sbdb CPU Usage (Top 10)', 'percent', queries.topSbdbCPU.query(), { x: 0, y: 16, w: 12, h: 10 }),
62+
panels.timeSeries.genericTimeSeriesLegendPanel('Sbdb Memory Usage (Top 10)', 'bytes', queries.topSbdbMem.query(), { x: 12, y: 16, w: 12, h: 10 }),
63+
panels.timeSeries.genericTimeSeriesLegendPanel('Nbdb CPU Usage (Top 10)', 'percent', queries.topNbdbCPU.query(), { x: 0, y: 24, w: 12, h: 10 }),
64+
panels.timeSeries.genericTimeSeriesLegendPanel('Nbdb Memory Usage (Top 10)', 'bytes', queries.topNbdbMem.query(), { x: 12, y: 24, w: 12, h: 10 }),
65+
panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Controller CPU Usage (Top 10)', 'percent', queries.topOvnkubeControllerCPU.query(), { x: 0, y: 32, w: 12, h: 10 }),
66+
panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Controller Memory Usage (Top 10)', 'bytes', queries.topOvnkubeControllerMem.query(), { x: 12, y: 32, w: 12, h: 10 }),
67+
panels.timeSeries.genericTimeSeriesLegendPanel('OVN Controller CPU Usage (Top 10)', 'percent', queries.topOvnControllerCPU.query(), { x: 0, y: 40, w: 12, h: 10 }),
68+
panels.timeSeries.genericTimeSeriesLegendPanel('OVN Controller Memory Usage (Top 10)', 'bytes', queries.topOvnControllerMem.query(), { x: 12, y: 40, w: 12, h: 10 }),
4769
]),
4870
g.panel.row.new('WorkQueue Monitoring')
4971
+ g.panel.row.withCollapsed(true)

0 commit comments

Comments
 (0)