Skip to content

Commit e2e7782

Browse files
authored
Stop tracking pgfaults (#4033)
## Motivation

Regular page faults are normal when allocating memory.

## Proposal

Filter them out of the dashboards to reduce noise, so that we can see the major page faults, which are a much more important signal. Also improved some container termination dashboards.

## Test Plan

Generated this from the UI of a deployed network.

## Release Plan

- Nothing to do / These changes follow the usual release cycle.
1 parent f09c650 commit e2e7782

File tree

1 file changed

+134
-36
lines changed
  • kubernetes/linera-validator/grafana-dashboards/linera

1 file changed

+134
-36
lines changed

kubernetes/linera-validator/grafana-dashboards/linera/general.json

Lines changed: 134 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1741,7 +1741,7 @@
17411741
"viz": false
17421742
},
17431743
"insertNulls": false,
1744-
"lineInterpolation": "smooth",
1744+
"lineInterpolation": "linear",
17451745
"lineWidth": 1,
17461746
"pointSize": 5,
17471747
"scaleDistribution": {
@@ -1790,31 +1790,28 @@
17901790
"showLegend": true
17911791
},
17921792
"tooltip": {
1793-
"hideZeros": false,
17941793
"mode": "single",
17951794
"sort": "none"
17961795
}
17971796
},
1798-
"pluginVersion": "11.6.0-83314",
1797+
"pluginVersion": "10.1.1",
17991798
"targets": [
18001799
{
18011800
"datasource": {
18021801
"type": "prometheus",
18031802
"uid": "prometheus"
18041803
},
1805-
"disableTextWrap": false,
18061804
"editorMode": "code",
1807-
"expr": "sum(rate(kube_pod_container_status_last_terminated_reason{pod=~\"(proxy|shards)-.*\"}[1m])) by (pod, reason)",
1808-
"fullMetaSearch": false,
1809-
"includeNullMetadata": true,
1805+
"expr": "sum(rate(kube_pod_container_status_last_terminated_reason[1m])) by (pod, reason)",
1806+
"hide": false,
18101807
"instant": false,
1811-
"legendFormat": "__auto",
1808+
"legendFormat": "{{pod}} - {{reason}}",
18121809
"range": true,
1813-
"refId": "A",
1814-
"useBackend": false
1810+
"refId": "B"
18151811
}
18161812
],
18171813
"title": "Container termination",
1814+
"transformations": [],
18181815
"type": "timeseries"
18191816
},
18201817
{
@@ -1907,7 +1904,7 @@
19071904
},
19081905
"disableTextWrap": false,
19091906
"editorMode": "code",
1910-
"expr": "sum(rate(container_memory_failures_total{pod=~\"(proxy|shards)-.*\"}[60s])) by (pod, failure_type)",
1907+
"expr": "sum by(pod, failure_type) (rate(container_memory_failures_total{pod=~\"(proxy|shards)-.*\", failure_type!=\"pgfault\"}[60s]))",
19111908
"fullMetaSearch": false,
19121909
"includeNullMetadata": true,
19131910
"instant": false,
@@ -1925,6 +1922,7 @@
19251922
"type": "prometheus",
19261923
"uid": "prometheus"
19271924
},
1925+
"description": "Some useful codes to remember:\n137 - SIGKILL - can happen on OOMs\n143 - SIGTERM",
19281926
"fieldConfig": {
19291927
"defaults": {
19301928
"color": {
@@ -1974,8 +1972,7 @@
19741972
"value": 80
19751973
}
19761974
]
1977-
},
1978-
"unit": "none"
1975+
}
19791976
},
19801977
"overrides": []
19811978
},
@@ -1985,7 +1982,7 @@
19851982
"x": 12,
19861983
"y": 75
19871984
},
1988-
"id": 41,
1985+
"id": 46,
19891986
"options": {
19901987
"legend": {
19911988
"calcs": [],
@@ -1999,35 +1996,26 @@
19991996
"sort": "none"
20001997
}
20011998
},
2002-
"pluginVersion": "11.6.0-83314",
1999+
"pluginVersion": "10.1.1",
20032000
"targets": [
20042001
{
20052002
"datasource": {
20062003
"type": "prometheus",
20072004
"uid": "prometheus"
20082005
},
2009-
"editorMode": "code",
2010-
"expr": "sum(rate(container_fs_reads_total{container=\"linera-server\"}[60s])) by (pod)",
2006+
"disableTextWrap": false,
2007+
"editorMode": "builder",
2008+
"expr": "kube_pod_container_status_last_terminated_exitcode",
2009+
"fullMetaSearch": false,
2010+
"includeNullMetadata": true,
20112011
"instant": false,
20122012
"legendFormat": "{{pod}}",
20132013
"range": true,
2014-
"refId": "A"
2015-
},
2016-
{
2017-
"datasource": {
2018-
"type": "prometheus",
2019-
"uid": "prometheus"
2020-
},
2021-
"editorMode": "code",
2022-
"expr": "sum(rate(container_fs_reads_total{container=\"linera-server\"}[60s]))",
2023-
"hide": false,
2024-
"instant": false,
2025-
"legendFormat": "Total",
2026-
"range": true,
2027-
"refId": "B"
2014+
"refId": "A",
2015+
"useBackend": false
20282016
}
20292017
],
2030-
"title": "Shards FS reads per second",
2018+
"title": "Container termination exit code",
20312019
"type": "timeseries"
20322020
},
20332021
{
@@ -2205,7 +2193,7 @@
22052193
"x": 12,
22062194
"y": 83
22072195
},
2208-
"id": 31,
2196+
"id": 41,
22092197
"options": {
22102198
"legend": {
22112199
"calcs": [],
@@ -2227,7 +2215,7 @@
22272215
"uid": "prometheus"
22282216
},
22292217
"editorMode": "code",
2230-
"expr": "sum(rate(container_fs_writes_total{container=\"linera-server\"}[60s])) by (pod)",
2218+
"expr": "sum(rate(container_fs_reads_total{container=\"linera-server\"}[60s])) by (pod)",
22312219
"instant": false,
22322220
"legendFormat": "{{pod}}",
22332221
"range": true,
@@ -2239,15 +2227,15 @@
22392227
"uid": "prometheus"
22402228
},
22412229
"editorMode": "code",
2242-
"expr": "sum(rate(container_fs_writes_total{container=\"linera-server\"}[60s]))",
2230+
"expr": "sum(rate(container_fs_reads_total{container=\"linera-server\"}[60s]))",
22432231
"hide": false,
22442232
"instant": false,
22452233
"legendFormat": "Total",
22462234
"range": true,
22472235
"refId": "B"
22482236
}
22492237
],
2250-
"title": "Shards FS writes per second",
2238+
"title": "Shards FS reads per second",
22512239
"type": "timeseries"
22522240
},
22532241
{
@@ -2360,6 +2348,116 @@
23602348
"title": "Shards FS written bytes per second",
23612349
"type": "timeseries"
23622350
},
2351+
{
2352+
"datasource": {
2353+
"type": "prometheus",
2354+
"uid": "prometheus"
2355+
},
2356+
"fieldConfig": {
2357+
"defaults": {
2358+
"color": {
2359+
"mode": "palette-classic"
2360+
},
2361+
"custom": {
2362+
"axisCenteredZero": false,
2363+
"axisColorMode": "text",
2364+
"axisLabel": "",
2365+
"axisPlacement": "auto",
2366+
"barAlignment": 0,
2367+
"drawStyle": "line",
2368+
"fillOpacity": 0,
2369+
"gradientMode": "none",
2370+
"hideFrom": {
2371+
"legend": false,
2372+
"tooltip": false,
2373+
"viz": false
2374+
},
2375+
"insertNulls": false,
2376+
"lineInterpolation": "smooth",
2377+
"lineWidth": 1,
2378+
"pointSize": 5,
2379+
"scaleDistribution": {
2380+
"type": "linear"
2381+
},
2382+
"showPoints": "auto",
2383+
"spanNulls": false,
2384+
"stacking": {
2385+
"group": "A",
2386+
"mode": "none"
2387+
},
2388+
"thresholdsStyle": {
2389+
"mode": "off"
2390+
}
2391+
},
2392+
"mappings": [],
2393+
"thresholds": {
2394+
"mode": "absolute",
2395+
"steps": [
2396+
{
2397+
"color": "green",
2398+
"value": null
2399+
},
2400+
{
2401+
"color": "red",
2402+
"value": 80
2403+
}
2404+
]
2405+
},
2406+
"unit": "none"
2407+
},
2408+
"overrides": []
2409+
},
2410+
"gridPos": {
2411+
"h": 8,
2412+
"w": 12,
2413+
"x": 12,
2414+
"y": 91
2415+
},
2416+
"id": 31,
2417+
"options": {
2418+
"legend": {
2419+
"calcs": [],
2420+
"displayMode": "list",
2421+
"placement": "bottom",
2422+
"showLegend": true
2423+
},
2424+
"tooltip": {
2425+
"hideZeros": false,
2426+
"mode": "single",
2427+
"sort": "none"
2428+
}
2429+
},
2430+
"pluginVersion": "11.6.0-83314",
2431+
"targets": [
2432+
{
2433+
"datasource": {
2434+
"type": "prometheus",
2435+
"uid": "prometheus"
2436+
},
2437+
"editorMode": "code",
2438+
"expr": "sum(rate(container_fs_writes_total{container=\"linera-server\"}[60s])) by (pod)",
2439+
"instant": false,
2440+
"legendFormat": "{{pod}}",
2441+
"range": true,
2442+
"refId": "A"
2443+
},
2444+
{
2445+
"datasource": {
2446+
"type": "prometheus",
2447+
"uid": "prometheus"
2448+
},
2449+
"editorMode": "code",
2450+
"expr": "sum(rate(container_fs_writes_total{container=\"linera-server\"}[60s]))",
2451+
"hide": false,
2452+
"instant": false,
2453+
"legendFormat": "Total",
2454+
"range": true,
2455+
"refId": "B"
2456+
}
2457+
],
2458+
"title": "Shards FS writes per second",
2459+
"type": "timeseries"
2460+
},
23632461
{
23642462
"collapsed": false,
23652463
"gridPos": {

0 commit comments

Comments
 (0)