diff --git a/docs/node-mixin/.lint b/docs/node-mixin/.lint new file mode 100644 index 0000000000..c95289c4dc --- /dev/null +++ b/docs/node-mixin/.lint @@ -0,0 +1,58 @@ +--- +exclusions: + template-datasource-rule: + reason: using the not yet implemented new convention for dashboards with Loki and Prometheus datasources. + panel-datasource-rule: + reason: using the not yet implemented new convention for dashboards with Loki and Prometheus datasources. + target-job-rule: + reason: Job is hardcoded by the mixin. + entries: + - dashboard: Node Exporter / USE Method / Node + - dashboard: Node Exporter / Nodes + - dashboard: Node Exporter / MacOS + - dashboard: Node Exporter / USE Method / Multi-cluster + - dashboard: Node Exporter / USE Method / Cluster + template-job-rule: + reason: Job is hardcoded by the mixin. + entries: + - dashboard: Node Exporter / USE Method / Node + - dashboard: Node Exporter / Nodes + - dashboard: Node Exporter / MacOS + - dashboard: Node Exporter / USE Method / Multi-cluster + - dashboard: Node Exporter / USE Method / Cluster + target-instance-rule: + entries: + - dashboard: Node Exporter / USE Method / Multi-cluster + reason: Instances are aggregated for all clusters + - dashboard: Node Exporter / USE Method / Cluster + reason: Instances are aggregated for the whole cluster + - dashboard: Node Exporter / USE Method / Node + reason: Dashboard only allows selecting a single instance at a time. + - dashboard: Node Exporter / Nodes + reason: Dashboard only allows selecting a single instance at a time. + - dashboard: Node Exporter / MacOS + reason: Dashboard only allows selecting a single instance at a time. 
+ template-instance-rule: + entries: + - dashboard: Node Exporter / USE Method / Multi-cluster + reason: Instances are aggregated for all clusters + - dashboard: Node Exporter / USE Method / Cluster + reason: Instances are aggregated for the whole cluster + - dashboard: Node Exporter / Nodes + reason: Dashboard only allows selecting a single instance at a time. + - dashboard: Node Exporter / MacOS + reason: Ignoring mislabeling of instance template + - dashboard: Node Exporter / USE Method / Node + reason: Ignoring mislabeling of instance template + panel-units-rule: + entries: + - dashboard: Node Exporter / Nodes + reason: Units are indeed set for all but load average (which doesn't have a reasonable unit), but in the yaxis "format" property rather than in field config. The dashboard linter needs to be patched accordingly. + - dashboard: Node Exporter / MacOS + reason: Units are indeed set for all but load average (which doesn't have a reasonable unit), but in the yaxis "format" property rather than in field config. The dashboard linter needs to be patched accordingly. + - dashboard: Node Exporter / USE Method / Cluster + reason: Units are indeed set, but in the yaxis "format" property rather than in field config. The dashboard linter needs to be patched accordingly. + - dashboard: Node Exporter / USE Method / Multi-cluster + reason: Units are indeed set, but in the yaxis "format" property rather than in field config. The dashboard linter needs to be patched accordingly. + - dashboard: Node Exporter / USE Method / Node + reason: Units are indeed set, but in the yaxis "format" property rather than in field config. The dashboard linter needs to be patched accordingly. 
diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index 2382ac292d..4496573204 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -204,7 +204,7 @@ ||| % $._config, annotations: { summary: 'Node Exporter text file collector failed to scrape.', - description: 'Node Exporter text file collector failed to scrape.', + description: 'Node Exporter text file collector on {{ $labels.instance }} failed to scrape.', }, labels: { severity: 'warning', @@ -260,7 +260,7 @@ severity: 'critical', }, annotations: { - summary: 'RAID Array is degraded', + summary: 'RAID Array is degraded.', description: "RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.", }, }, @@ -273,7 +273,7 @@ severity: 'warning', }, annotations: { - summary: 'Failed device in RAID array', + summary: 'Failed device in RAID array.', description: "At least one device in RAID array on {{ $labels.instance }} failed. 
Array '{{ $labels.device }}' needs attention and possibly a disk swap.", }, }, diff --git a/docs/node-mixin/dashboards.jsonnet b/docs/node-mixin/dashboards.jsonnet index 9d913ed3f1..fb70fdeabe 100644 --- a/docs/node-mixin/dashboards.jsonnet +++ b/docs/node-mixin/dashboards.jsonnet @@ -1,6 +1,6 @@ local dashboards = (import 'mixin.libsonnet').grafanaDashboards; { - [name]: dashboards[name] + [name]: dashboards[name] + { uid: std.md5(name) }, for name in std.objectFields(dashboards) } diff --git a/docs/node-mixin/dashboards/dashboards.libsonnet b/docs/node-mixin/dashboards/dashboards.libsonnet index e6adbd4fa0..cb340b952a 100644 --- a/docs/node-mixin/dashboards/dashboards.libsonnet +++ b/docs/node-mixin/dashboards/dashboards.libsonnet @@ -1,2 +1,3 @@ (import 'node.libsonnet') + -(import 'use.libsonnet') +(import 'use.libsonnet') + +(import 'defaults.libsonnet') \ No newline at end of file diff --git a/docs/node-mixin/dashboards/defaults.libsonnet b/docs/node-mixin/dashboards/defaults.libsonnet new file mode 100644 index 0000000000..e9197bb0e8 --- /dev/null +++ b/docs/node-mixin/dashboards/defaults.libsonnet @@ -0,0 +1,8 @@ +{ + local grafanaDashboards = super.grafanaDashboards, + grafanaDashboards:: + { + [fname]: grafanaDashboards[fname] { uid: std.md5(fname) } + for fname in std.objectFields(grafanaDashboards) + }, +} diff --git a/docs/node-mixin/dashboards/use.libsonnet b/docs/node-mixin/dashboards/use.libsonnet index 1602c13480..d56ef4f8a7 100644 --- a/docs/node-mixin/dashboards/use.libsonnet +++ b/docs/node-mixin/dashboards/use.libsonnet @@ -25,6 +25,7 @@ local datasourceTemplate = { local CPUUtilisation = graphPanel.new( 'CPU Utilisation', + description='Total CPU utilisation percent.', datasource='$datasource', span=6, format='percentunit', @@ -38,6 +39,7 @@ local CPUSaturation = // average relates to the "CPU saturation" in the title. graphPanel.new( 'CPU Saturation (Load1 per CPU)', + description='System load average over the last minute. 
A measurement of how many processes are waiting for CPU cycles. The maximum number is the number of CPU cores for the node.', datasource='$datasource', span=6, format='percentunit', @@ -49,6 +51,7 @@ local CPUSaturation = local memoryUtilisation = graphPanel.new( 'Memory Utilisation', + description='Total memory utilisation percent.', datasource='$datasource', span=6, format='percentunit', @@ -60,6 +63,7 @@ local memoryUtilisation = local memorySaturation = graphPanel.new( 'Memory Saturation (Major Page Faults)', + description='Rate of major memory page faults.', datasource='$datasource', span=6, format='rds', @@ -71,6 +75,7 @@ local memorySaturation = local networkUtilisation = graphPanel.new( 'Network Utilisation (Bytes Receive/Transmit)', + description='Network Utilisation (Bytes Receive/Transmit)', datasource='$datasource', span=6, format='Bps', @@ -85,6 +90,7 @@ local networkUtilisation = local networkSaturation = graphPanel.new( 'Network Saturation (Drops Receive/Transmit)', + description='Network Saturation (Drops Receive/Transmit)', datasource='$datasource', span=6, format='Bps', @@ -99,6 +105,7 @@ local networkSaturation = local diskIOUtilisation = graphPanel.new( 'Disk IO Utilisation', + description='Disk total IO seconds.', datasource='$datasource', span=6, format='percentunit', @@ -110,6 +117,7 @@ local diskIOUtilisation = local diskIOSaturation = graphPanel.new( 'Disk IO Saturation', + description='Disk saturation (weighted seconds spent, 1 second rate)', datasource='$datasource', span=6, format='percentunit', @@ -121,6 +129,7 @@ local diskIOSaturation = local diskSpaceUtilisation = graphPanel.new( 'Disk Space Utilisation', + description='Total disk utilisation percent', datasource='$datasource', span=12, format='percentunit', @@ -453,7 +462,7 @@ local diskSpaceUtilisation = sum ( sum without (device) ( max without (fstype, mountpoint, instance, pod) (( - node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s) - 
node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s} + node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s} - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s} ) != 0) ) / scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s}))) diff --git a/docs/node-mixin/lib/prom-mixin.libsonnet b/docs/node-mixin/lib/prom-mixin.libsonnet index 6c4d990481..d8e4d9ff07 100644 --- a/docs/node-mixin/lib/prom-mixin.libsonnet +++ b/docs/node-mixin/lib/prom-mixin.libsonnet @@ -47,6 +47,7 @@ local table = grafana70.panel.table; local idleCPU = graphPanel.new( 'CPU Usage', + description='Total CPU utilisation percent.', datasource='$datasource', span=6, format='percentunit', @@ -69,6 +70,7 @@ local table = grafana70.panel.table; local systemLoad = graphPanel.new( 'Load Average', + description='System load average over the previous 1, 5, and 15 minute ranges. A measurement of how many processes are waiting for CPU cycles. 
The maximum number is the number of CPU cores for the node.', datasource='$datasource', span=6, format='short', @@ -83,6 +85,7 @@ local table = grafana70.panel.table; local memoryGraphPanelPrototype = graphPanel.new( 'Memory Usage', + description='Memory usage by category, measured in bytes.', datasource='$datasource', span=9, format='bytes', @@ -137,6 +140,7 @@ local table = grafana70.panel.table; local memoryGaugePanelPrototype = gaugePanel.new( title='Memory Usage', + description='Total memory utilisation by category, in bytes.', datasource='$datasource', ) .addThresholdStep('rgba(50, 172, 45, 0.97)') @@ -183,6 +187,7 @@ local table = grafana70.panel.table; local diskIO = graphPanel.new( 'Disk I/O', + description='Disk read/writes in bytes, and total IO seconds.', datasource='$datasource', span=6, min=0, @@ -224,6 +229,7 @@ local table = grafana70.panel.table; local diskSpaceUsage = table.new( title='Disk Space Usage', + description='Disk utilisation in percent, by mountpoint. Some duplication can occur if the same filesystem is mounted in multiple locations.', datasource='$datasource', ) .setFieldConfig(unit='decbytes') diff --git a/go.mod b/go.mod index a98d3de2d4..1761405d9c 100644 --- a/go.mod +++ b/go.mod @@ -22,7 +22,7 @@ require ( github.com/prometheus/client_golang v1.13.0 github.com/prometheus/client_model v0.2.0 github.com/prometheus/common v0.37.0 - github.com/prometheus/exporter-toolkit v0.8.1 + github.com/prometheus/exporter-toolkit v0.8.2 github.com/prometheus/procfs v0.8.0 github.com/safchain/ethtool v0.2.0 github.com/soundcloud/go-runit v0.0.0-20150630195641-06ad41a06c4a diff --git a/go.sum b/go.sum index f20b226111..5e555cfa65 100644 --- a/go.sum +++ b/go.sum @@ -306,8 +306,8 @@ github.com/prometheus/common v0.26.0/go.mod h1:M7rCNAaPfAosfx8veZJCuw84e35h3Cfd9 github.com/prometheus/common v0.32.1/go.mod h1:vu+V0TpY+O6vW9J44gczi3Ap/oXXR10b+M/gUGO4Hls= github.com/prometheus/common v0.37.0 h1:ccBbHCgIiT9uSoFY0vX8H3zsNR5eLt17/RQLUvn8pXE= 
github.com/prometheus/common v0.37.0/go.mod h1:phzohg0JFMnBEFGxTDbfu3QyL5GI8gTQJFhYO5B3mfA= -github.com/prometheus/exporter-toolkit v0.8.1 h1:TpKt8z55q1zF30BYaZKqh+bODY0WtByHDOhDA2M9pEs= -github.com/prometheus/exporter-toolkit v0.8.1/go.mod h1:00shzmJL7KxcsabLWcONwpyNEuWhREOnFqZW7vadFS0= +github.com/prometheus/exporter-toolkit v0.8.2 h1:sbJAfBXQFkG6sUkbwBun8MNdzW9+wd5YfPYofbmj0YM= +github.com/prometheus/exporter-toolkit v0.8.2/go.mod h1:00shzmJL7KxcsabLWcONwpyNEuWhREOnFqZW7vadFS0= github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU=