Skip to content

Commit 8c63a22

Browse files
authored
Add better OOM metrics to dashboards (#3975)
## Motivation We've had OOMs in the shards, and this metric doesn't seem to have caught them. ## Proposal Replace the graph with a graph using a metric that _does_ seem to catch when a pod was `OOMKilled`. ## Test Plan I used this instead and saw the pods that were `OOMKilled` show up there with the correct reason. ## Release Plan - Nothing to do / These changes follow the usual release cycle.
1 parent cce10df commit 8c63a22

File tree

1 file changed

+113
-10
lines changed
  • kubernetes/linera-validator/grafana-dashboards/linera

1 file changed

+113
-10
lines changed

kubernetes/linera-validator/grafana-dashboards/linera/general.json

Lines changed: 113 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1720,6 +1720,7 @@
17201720
"type": "prometheus",
17211721
"uid": "prometheus"
17221722
},
1723+
"description": "",
17231724
"fieldConfig": {
17241725
"defaults": {
17251726
"color": {
@@ -1803,7 +1804,7 @@
18031804
},
18041805
"disableTextWrap": false,
18051806
"editorMode": "code",
1806-
"expr": "sum(rate(container_oom_events_total{pod=~\"(proxy|shards)-.*\"}[1m])) by (pod)",
1807+
"expr": "sum(rate(kube_pod_container_status_last_terminated_reason{pod=~\"(proxy|shards)-.*\"}[1m])) by (pod, reason)",
18071808
"fullMetaSearch": false,
18081809
"includeNullMetadata": true,
18091810
"instant": false,
@@ -1813,7 +1814,7 @@
18131814
"useBackend": false
18141815
}
18151816
],
1816-
"title": "OOM events per second",
1817+
"title": "Container termination",
18171818
"type": "timeseries"
18181819
},
18191820
{
@@ -2777,6 +2778,7 @@
27772778
"type": "prometheus",
27782779
"uid": "prometheus"
27792780
},
2781+
"description": "",
27802782
"fieldConfig": {
27812783
"defaults": {
27822784
"color": {
@@ -2837,7 +2839,7 @@
28372839
"x": 0,
28382840
"y": 116
28392841
},
2840-
"id": 28,
2842+
"id": 45,
28412843
"options": {
28422844
"legend": {
28432845
"calcs": [],
@@ -2858,15 +2860,19 @@
28582860
"type": "prometheus",
28592861
"uid": "prometheus"
28602862
},
2863+
"disableTextWrap": false,
28612864
"editorMode": "code",
2862-
"expr": "sum(rate(container_fs_reads_total{container=\"scylla\"}[60s])) by (container)",
2865+
"expr": "sum(rate(kube_pod_container_status_last_terminated_reason{pod=~\"(scylla)-.*\"}[1m])) by (pod, reason)",
2866+
"fullMetaSearch": false,
2867+
"includeNullMetadata": true,
28632868
"instant": false,
2864-
"legendFormat": "{{container}}",
2869+
"legendFormat": "__auto",
28652870
"range": true,
2866-
"refId": "A"
2871+
"refId": "A",
2872+
"useBackend": false
28672873
}
28682874
],
2869-
"title": "ScyllaDB FS reads per second",
2875+
"title": "ScyllaDB Container termination",
28702876
"type": "timeseries"
28712877
},
28722878
{
@@ -3031,7 +3037,7 @@
30313037
"x": 0,
30323038
"y": 124
30333039
},
3034-
"id": 43,
3040+
"id": 28,
30353041
"options": {
30363042
"legend": {
30373043
"calcs": [],
@@ -3053,14 +3059,14 @@
30533059
"uid": "prometheus"
30543060
},
30553061
"editorMode": "code",
3056-
"expr": "sum(rate(container_fs_writes_total{container=\"scylla\"}[60s])) by (container)",
3062+
"expr": "sum(rate(container_fs_reads_total{container=\"scylla\"}[60s])) by (container)",
30573063
"instant": false,
30583064
"legendFormat": "{{container}}",
30593065
"range": true,
30603066
"refId": "A"
30613067
}
30623068
],
3063-
"title": "ScyllaDB FS writes per second",
3069+
"title": "ScyllaDB FS reads per second",
30643070
"type": "timeseries"
30653071
},
30663072
{
@@ -3225,6 +3231,103 @@
32253231
"x": 0,
32263232
"y": 132
32273233
},
3234+
"id": 43,
3235+
"options": {
3236+
"legend": {
3237+
"calcs": [],
3238+
"displayMode": "list",
3239+
"placement": "bottom",
3240+
"showLegend": true
3241+
},
3242+
"tooltip": {
3243+
"hideZeros": false,
3244+
"mode": "single",
3245+
"sort": "none"
3246+
}
3247+
},
3248+
"pluginVersion": "11.6.0-83314",
3249+
"targets": [
3250+
{
3251+
"datasource": {
3252+
"type": "prometheus",
3253+
"uid": "prometheus"
3254+
},
3255+
"editorMode": "code",
3256+
"expr": "sum(rate(container_fs_writes_total{container=\"scylla\"}[60s])) by (container)",
3257+
"instant": false,
3258+
"legendFormat": "{{container}}",
3259+
"range": true,
3260+
"refId": "A"
3261+
}
3262+
],
3263+
"title": "ScyllaDB FS writes per second",
3264+
"type": "timeseries"
3265+
},
3266+
{
3267+
"datasource": {
3268+
"type": "prometheus",
3269+
"uid": "prometheus"
3270+
},
3271+
"fieldConfig": {
3272+
"defaults": {
3273+
"color": {
3274+
"mode": "palette-classic"
3275+
},
3276+
"custom": {
3277+
"axisCenteredZero": false,
3278+
"axisColorMode": "text",
3279+
"axisLabel": "",
3280+
"axisPlacement": "auto",
3281+
"barAlignment": 0,
3282+
"drawStyle": "line",
3283+
"fillOpacity": 0,
3284+
"gradientMode": "none",
3285+
"hideFrom": {
3286+
"legend": false,
3287+
"tooltip": false,
3288+
"viz": false
3289+
},
3290+
"insertNulls": false,
3291+
"lineInterpolation": "smooth",
3292+
"lineWidth": 1,
3293+
"pointSize": 5,
3294+
"scaleDistribution": {
3295+
"type": "linear"
3296+
},
3297+
"showPoints": "auto",
3298+
"spanNulls": false,
3299+
"stacking": {
3300+
"group": "A",
3301+
"mode": "none"
3302+
},
3303+
"thresholdsStyle": {
3304+
"mode": "off"
3305+
}
3306+
},
3307+
"mappings": [],
3308+
"thresholds": {
3309+
"mode": "absolute",
3310+
"steps": [
3311+
{
3312+
"color": "green",
3313+
"value": null
3314+
},
3315+
{
3316+
"color": "red",
3317+
"value": 80
3318+
}
3319+
]
3320+
},
3321+
"unit": "none"
3322+
},
3323+
"overrides": []
3324+
},
3325+
"gridPos": {
3326+
"h": 8,
3327+
"w": 12,
3328+
"x": 0,
3329+
"y": 140
3330+
},
32283331
"id": 38,
32293332
"options": {
32303333
"legend": {

0 commit comments

Comments
 (0)