Skip to content

Commit 8eab4b9

Browse files
committed
fix(dashboard): fix CNI network health dashboard queries
- Change Total Peers and Reachable Peers from sum() to avg() to show the per-node average instead of the total across all nodes.
- Fix the Network Partition and CNI Health tables to use increase() with label_replace() so they show the current status derived from Counter metrics.
- Update value mappings to match the new label values (Healthy/PARTITIONED, Healthy/UNHEALTHY).
- Rename "Network & CNI Conditions (True)" to "Network Issue Activity (rate/min)" and split it into problem conditions vs. unhealthy conditions.
1 parent d60d61f commit 8eab4b9

File tree

1 file changed

+106
-62
lines changed

1 file changed

+106
-62
lines changed

dashboards/node-doctor-cni-network-health.json

Lines changed: 106 additions & 62 deletions
Original file line number | Diff line number | Diff line change
@@ -449,12 +449,12 @@
449449
"mappings": [
450450
{
451451
"options": {
452-
"False": {
452+
"Healthy": {
453453
"color": "green",
454454
"index": 0,
455455
"text": "Healthy"
456456
},
457-
"True": {
457+
"PARTITIONED": {
458458
"color": "red",
459459
"index": 1,
460460
"text": "PARTITIONED"
@@ -485,6 +485,21 @@
485485
"value": 150
486486
}
487487
]
488+
},
489+
{
490+
"matcher": {
491+
"id": "byName",
492+
"options": "Status"
493+
},
494+
"properties": [
495+
{
496+
"id": "custom.cellOptions",
497+
"value": {
498+
"mode": "basic",
499+
"type": "color-background"
500+
}
501+
}
502+
]
488503
}
489504
]
490505
},
@@ -512,44 +527,40 @@
512527
"type": "prometheus",
513528
"uid": "${datasource}"
514529
},
515-
"expr": "topk(100, node_doctor_monitor_conditions_total{node=~\"$node\", condition_type=\"NetworkPartitioned\"}) by (node, status)",
530+
"expr": "label_replace(increase(node_doctor_monitor_conditions_total{node=~\"$node\", condition_type=\"NetworkPartitioned\", status=\"False\"}[5m]) > 0, \"partition_status\", \"Healthy\", \"\", \"\") or label_replace(increase(node_doctor_monitor_conditions_total{node=~\"$node\", condition_type=\"NetworkPartitioned\", status=\"True\"}[5m]) > 0, \"partition_status\", \"PARTITIONED\", \"\", \"\")",
516531
"format": "table",
517532
"instant": true,
518-
"legendFormat": "{{node}} - {{status}}",
533+
"legendFormat": "__auto",
519534
"refId": "A"
520535
}
521536
],
522537
"title": "Network Partition Status by Node",
523538
"transformations": [
524-
{
525-
"id": "groupBy",
526-
"options": {
527-
"fields": {
528-
"Value": {
529-
"aggregations": ["lastNotNull"],
530-
"operation": "aggregate"
531-
},
532-
"node": {
533-
"aggregations": [],
534-
"operation": "groupby"
535-
},
536-
"status": {
537-
"aggregations": [],
538-
"operation": "groupby"
539-
}
540-
}
541-
}
542-
},
543539
{
544540
"id": "organize",
545541
"options": {
546542
"excludeByName": {
547-
"Value (lastNotNull)": true
543+
"Time": true,
544+
"Value": true,
545+
"__name__": true,
546+
"cluster": true,
547+
"condition_type": true,
548+
"container": true,
549+
"endpoint": true,
550+
"instance": true,
551+
"job": true,
552+
"namespace": true,
553+
"pod": true,
554+
"service": true,
555+
"status": true
556+
},
557+
"indexByName": {
558+
"node": 0,
559+
"partition_status": 1
548560
},
549-
"indexByName": {},
550561
"renameByName": {
551562
"node": "Node",
552-
"status": "Partitioned"
563+
"partition_status": "Status"
553564
}
554565
}
555566
}
@@ -576,12 +587,12 @@
576587
"mappings": [
577588
{
578589
"options": {
579-
"False": {
590+
"UNHEALTHY": {
580591
"color": "red",
581592
"index": 1,
582593
"text": "UNHEALTHY"
583594
},
584-
"True": {
595+
"Healthy": {
585596
"color": "green",
586597
"index": 0,
587598
"text": "Healthy"
@@ -600,7 +611,35 @@
600611
]
601612
}
602613
},
603-
"overrides": []
614+
"overrides": [
615+
{
616+
"matcher": {
617+
"id": "byName",
618+
"options": "Node"
619+
},
620+
"properties": [
621+
{
622+
"id": "custom.width",
623+
"value": 150
624+
}
625+
]
626+
},
627+
{
628+
"matcher": {
629+
"id": "byName",
630+
"options": "CNI Status"
631+
},
632+
"properties": [
633+
{
634+
"id": "custom.cellOptions",
635+
"value": {
636+
"mode": "basic",
637+
"type": "color-background"
638+
}
639+
}
640+
]
641+
}
642+
]
604643
},
605644
"gridPos": {
606645
"h": 8,
@@ -626,44 +665,40 @@
626665
"type": "prometheus",
627666
"uid": "${datasource}"
628667
},
629-
"expr": "topk(100, node_doctor_monitor_conditions_total{node=~\"$node\", condition_type=\"CNIHealthy\"}) by (node, status)",
668+
"expr": "label_replace(increase(node_doctor_monitor_conditions_total{node=~\"$node\", condition_type=\"CNIHealthy\", status=\"True\"}[5m]) > 0, \"health_status\", \"Healthy\", \"\", \"\") or label_replace(increase(node_doctor_monitor_conditions_total{node=~\"$node\", condition_type=\"CNIHealthy\", status=\"False\"}[5m]) > 0, \"health_status\", \"UNHEALTHY\", \"\", \"\")",
630669
"format": "table",
631670
"instant": true,
632-
"legendFormat": "{{node}} - {{status}}",
671+
"legendFormat": "__auto",
633672
"refId": "A"
634673
}
635674
],
636675
"title": "CNI Health Status by Node",
637676
"transformations": [
638-
{
639-
"id": "groupBy",
640-
"options": {
641-
"fields": {
642-
"Value": {
643-
"aggregations": ["lastNotNull"],
644-
"operation": "aggregate"
645-
},
646-
"node": {
647-
"aggregations": [],
648-
"operation": "groupby"
649-
},
650-
"status": {
651-
"aggregations": [],
652-
"operation": "groupby"
653-
}
654-
}
655-
}
656-
},
657677
{
658678
"id": "organize",
659679
"options": {
660680
"excludeByName": {
661-
"Value (lastNotNull)": true
681+
"Time": true,
682+
"Value": true,
683+
"__name__": true,
684+
"cluster": true,
685+
"condition_type": true,
686+
"container": true,
687+
"endpoint": true,
688+
"instance": true,
689+
"job": true,
690+
"namespace": true,
691+
"pod": true,
692+
"service": true,
693+
"status": true
694+
},
695+
"indexByName": {
696+
"node": 0,
697+
"health_status": 1
662698
},
663-
"indexByName": {},
664699
"renameByName": {
665700
"node": "Node",
666-
"status": "CNI Healthy"
701+
"health_status": "CNI Status"
667702
}
668703
}
669704
}
@@ -1003,12 +1038,12 @@
10031038
"type": "prometheus",
10041039
"uid": "${datasource}"
10051040
},
1006-
"expr": "sum(node_doctor_monitor_peers_total{node=~\"$node\"})",
1007-
"legendFormat": "Total Peers",
1041+
"expr": "avg(node_doctor_monitor_peers_total{node=~\"$node\"})",
1042+
"legendFormat": "Avg Peers per Node",
10081043
"refId": "A"
10091044
}
10101045
],
1011-
"title": "Total Peers",
1046+
"title": "Avg Peers per Node",
10121047
"type": "stat"
10131048
},
10141049
{
@@ -1061,12 +1096,12 @@
10611096
"type": "prometheus",
10621097
"uid": "${datasource}"
10631098
},
1064-
"expr": "sum(node_doctor_monitor_peers_reachable_total{node=~\"$node\"})",
1065-
"legendFormat": "Reachable Peers",
1099+
"expr": "avg(node_doctor_monitor_peers_reachable_total{node=~\"$node\"})",
1100+
"legendFormat": "Avg Reachable per Node",
10661101
"refId": "A"
10671102
}
10681103
],
1069-
"title": "Reachable Peers",
1104+
"title": "Avg Reachable per Node",
10701105
"type": "stat"
10711106
},
10721107
{
@@ -2272,12 +2307,21 @@
22722307
"type": "prometheus",
22732308
"uid": "${datasource}"
22742309
},
2275-
"expr": "sum by (condition_type) (node_doctor_monitor_conditions_total{node=~\"$node\", condition_type=~\"NetworkPartitioned|NetworkDegraded|CNIHealthy|CNIConfigValid|CNIInterfacesHealthy\", status=\"True\"})",
2276-
"legendFormat": "{{condition_type}}",
2310+
"expr": "sum by (condition_type) (rate(node_doctor_monitor_conditions_total{node=~\"$node\", condition_type=~\"NetworkPartitioned|NetworkDegraded\", status=\"True\"}[5m])) * 60",
2311+
"legendFormat": "{{condition_type}} (problems)",
22772312
"refId": "A"
2313+
},
2314+
{
2315+
"datasource": {
2316+
"type": "prometheus",
2317+
"uid": "${datasource}"
2318+
},
2319+
"expr": "sum by (condition_type) (rate(node_doctor_monitor_conditions_total{node=~\"$node\", condition_type=~\"CNIHealthy|CNIConfigValid|CNIInterfacesHealthy\", status=\"False\"}[5m])) * 60",
2320+
"legendFormat": "{{condition_type}} (unhealthy)",
2321+
"refId": "B"
22782322
}
22792323
],
2280-
"title": "Network & CNI Conditions (True)",
2324+
"title": "Network Issue Activity (rate/min)",
22812325
"type": "timeseries"
22822326
},
22832327
{

0 commit comments

Comments
 (0)