Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions docs/node-mixin/.lint
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
---
# Lint rule exclusions for the node-mixin dashboards.
# Each key under `exclusions` names a lint rule. A `reason` is given either
# once for the whole rule or individually for each dashboard under `entries`.
exclusions:
  template-datasource-rule:
    reason: using the not yet implemented new convention for dashboards with Loki and Prometheus datasources.
  panel-datasource-rule:
    reason: using the not yet implemented new convention for dashboards with Loki and Prometheus datasources.
  target-job-rule:
    reason: Job is hardcoded by the mixin.
    entries:
      - dashboard: Node Exporter / USE Method / Node
      - dashboard: Node Exporter / Nodes
      - dashboard: Node Exporter / MacOS
      - dashboard: Node Exporter / USE Method / Multi-cluster
      - dashboard: Node Exporter / USE Method / Cluster
  template-job-rule:
    reason: Job is hardcoded by the mixin.
    entries:
      - dashboard: Node Exporter / USE Method / Node
      - dashboard: Node Exporter / Nodes
      - dashboard: Node Exporter / MacOS
      - dashboard: Node Exporter / USE Method / Multi-cluster
      - dashboard: Node Exporter / USE Method / Cluster
  target-instance-rule:
    entries:
      - dashboard: Node Exporter / USE Method / Multi-cluster
        reason: Instances are aggregated for all clusters
      - dashboard: Node Exporter / USE Method / Cluster
        reason: Instances are aggregated for the whole cluster
      - dashboard: Node Exporter / USE Method / Node
        reason: Dashboard only allows selecting a single instance at a time.
      - dashboard: Node Exporter / Nodes
        reason: Dashboard only allows selecting a single instance at a time.
      - dashboard: Node Exporter / MacOS
        reason: Dashboard only allows selecting a single instance at a time.
  template-instance-rule:
    entries:
      - dashboard: Node Exporter / USE Method / Multi-cluster
        reason: Instances are aggregated for all clusters
      - dashboard: Node Exporter / USE Method / Cluster
        reason: Instances are aggregated for the whole cluster
      - dashboard: Node Exporter / Nodes
        reason: Dashboard only allows selecting a single instance at a time.
      - dashboard: Node Exporter / MacOS
        reason: Ignoring mislabeling of instance template
      - dashboard: Node Exporter / USE Method / Node
        reason: Ignoring mislabeling of instance template
  panel-units-rule:
    entries:
      - dashboard: Node Exporter / Nodes
        reason: Units are indeed set for all but load average (which doesn't have a reasonable unit), but in the yaxis "format" property rather than in field config. The dashboard linter needs to be patched accordingly.
      - dashboard: Node Exporter / MacOS
        reason: Units are indeed set for all but load average (which doesn't have a reasonable unit), but in the yaxis "format" property rather than in field config. The dashboard linter needs to be patched accordingly.
      - dashboard: Node Exporter / USE Method / Cluster
        reason: Units are indeed set, but in the yaxis "format" property rather than in field config. The dashboard linter needs to be patched accordingly.
      - dashboard: Node Exporter / USE Method / Multi-cluster
        reason: Units are indeed set, but in the yaxis "format" property rather than in field config. The dashboard linter needs to be patched accordingly.
      - dashboard: Node Exporter / USE Method / Node
        reason: Units are indeed set, but in the yaxis "format" property rather than in field config. The dashboard linter needs to be patched accordingly.
6 changes: 3 additions & 3 deletions docs/node-mixin/alerts/alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@
||| % $._config,
annotations: {
summary: 'Node Exporter text file collector failed to scrape.',
description: 'Node Exporter text file collector failed to scrape.',
description: 'Node Exporter text file collector on {{ $labels.instance }} failed to scrape.',
},
labels: {
severity: 'warning',
Expand Down Expand Up @@ -260,7 +260,7 @@
severity: 'critical',
},
annotations: {
summary: 'RAID Array is degraded',
summary: 'RAID Array is degraded.',
description: "RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.",
},
},
Expand All @@ -273,7 +273,7 @@
severity: 'warning',
},
annotations: {
summary: 'Failed device in RAID array',
summary: 'Failed device in RAID array.',
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.",
},
},
Expand Down
2 changes: 1 addition & 1 deletion docs/node-mixin/dashboards.jsonnet
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
local dashboards = (import 'mixin.libsonnet').grafanaDashboards;

{
[name]: dashboards[name]
[name]: dashboards[name] + { uid: std.md5(name) },
for name in std.objectFields(dashboards)
}
3 changes: 2 additions & 1 deletion docs/node-mixin/dashboards/dashboards.libsonnet
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
(import 'node.libsonnet') +
(import 'use.libsonnet')
(import 'use.libsonnet') +
(import 'defaults.libsonnet')
8 changes: 8 additions & 0 deletions docs/node-mixin/dashboards/defaults.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
// Mixin layer that re-declares the hidden `grafanaDashboards` field so that
// every dashboard inherited from the objects this is added to carries a `uid`
// equal to the MD5 hash of its field name.
{
  // Capture the inherited dashboards before the field is overridden below.
  local inheritedDashboards = super.grafanaDashboards,

  grafanaDashboards:: {
    [dashboardFile]: inheritedDashboards[dashboardFile] + { uid: std.md5(dashboardFile) }
    for dashboardFile in std.objectFields(inheritedDashboards)
  },
}
11 changes: 10 additions & 1 deletion docs/node-mixin/dashboards/use.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ local datasourceTemplate = {
local CPUUtilisation =
graphPanel.new(
'CPU Utilisation',
description='Total CPU utilisation percent.',
datasource='$datasource',
span=6,
format='percentunit',
Expand All @@ -38,6 +39,7 @@ local CPUSaturation =
// average relates to the "CPU saturation" in the title.
graphPanel.new(
'CPU Saturation (Load1 per CPU)',
description='System load average over the last minute. A measurement of how many processes are waiting for CPU cycles. The maximum number is the number of CPU cores for the node.',
datasource='$datasource',
span=6,
format='percentunit',
Expand All @@ -49,6 +51,7 @@ local CPUSaturation =
local memoryUtilisation =
graphPanel.new(
'Memory Utilisation',
description='Total memory utilisation in bytes.',
datasource='$datasource',
span=6,
format='percentunit',
Expand All @@ -60,6 +63,7 @@ local memoryUtilisation =
local memorySaturation =
graphPanel.new(
'Memory Saturation (Major Page Faults)',
description='Rate of major memory page faults.',
datasource='$datasource',
span=6,
format='rds',
Expand All @@ -71,6 +75,7 @@ local memorySaturation =
local networkUtilisation =
graphPanel.new(
'Network Utilisation (Bytes Receive/Transmit)',
description='Network Utilisation (Bytes Receive/Transmit)',
datasource='$datasource',
span=6,
format='Bps',
Expand All @@ -85,6 +90,7 @@ local networkUtilisation =
local networkSaturation =
graphPanel.new(
'Network Saturation (Drops Receive/Transmit)',
description='Network Saturation (Drops Receive/Transmit)',
datasource='$datasource',
span=6,
format='Bps',
Expand All @@ -99,6 +105,7 @@ local networkSaturation =
local diskIOUtilisation =
graphPanel.new(
'Disk IO Utilisation',
description='Disk total IO seconds.',
datasource='$datasource',
span=6,
format='percentunit',
Expand All @@ -110,6 +117,7 @@ local diskIOUtilisation =
local diskIOSaturation =
graphPanel.new(
'Disk IO Saturation',
description='Disk saturation (weighted seconds spent, 1 second rate)',
datasource='$datasource',
span=6,
format='percentunit',
Expand All @@ -121,6 +129,7 @@ local diskIOSaturation =
local diskSpaceUtilisation =
graphPanel.new(
'Disk Space Utilisation',
description='Total disk utilisation percent',
datasource='$datasource',
span=12,
format='percentunit',
Expand Down Expand Up @@ -453,7 +462,7 @@ local diskSpaceUtilisation =
sum (
sum without (device) (
max without (fstype, mountpoint, instance, pod) ((
node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s) - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s}
node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s} - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s}
) != 0)
)
/ scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s})))
Expand Down
6 changes: 6 additions & 0 deletions docs/node-mixin/lib/prom-mixin.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ local table = grafana70.panel.table;
local idleCPU =
graphPanel.new(
'CPU Usage',
description='Total CPU utilisation percent.',
datasource='$datasource',
span=6,
format='percentunit',
Expand All @@ -69,6 +70,7 @@ local table = grafana70.panel.table;
local systemLoad =
graphPanel.new(
'Load Average',
description='System load average over the previous 1, 5, and 15 minute ranges. A measurement of how many processes are waiting for CPU cycles. The maximum number is the number of CPU cores for the node.',
datasource='$datasource',
span=6,
format='short',
Expand All @@ -83,6 +85,7 @@ local table = grafana70.panel.table;
local memoryGraphPanelPrototype =
graphPanel.new(
'Memory Usage',
description='Memory usage by category, measured in bytes.',
datasource='$datasource',
span=9,
format='bytes',
Expand Down Expand Up @@ -137,6 +140,7 @@ local table = grafana70.panel.table;
local memoryGaugePanelPrototype =
gaugePanel.new(
title='Memory Usage',
description='Total memory utilisation by category, in bytes.',
datasource='$datasource',
)
.addThresholdStep('rgba(50, 172, 45, 0.97)')
Expand Down Expand Up @@ -183,6 +187,7 @@ local table = grafana70.panel.table;
local diskIO =
graphPanel.new(
'Disk I/O',
description='Disk read/writes in bytes, and total IO seconds.',
datasource='$datasource',
span=6,
min=0,
Expand Down Expand Up @@ -224,6 +229,7 @@ local table = grafana70.panel.table;
local diskSpaceUsage =
table.new(
title='Disk Space Usage',
description='Disk utilisation in percent, by mountpoint. Some duplication can occur if the same filesystem is mounted in multiple locations.',
datasource='$datasource',
)
.setFieldConfig(unit='decbytes')
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ require (
github.com/prometheus/client_golang v1.13.0
github.com/prometheus/client_model v0.2.0
github.com/prometheus/common v0.37.0
github.com/prometheus/exporter-toolkit v0.8.1
github.com/prometheus/exporter-toolkit v0.8.2
github.com/prometheus/procfs v0.8.0
github.com/safchain/ethtool v0.2.0
github.com/soundcloud/go-runit v0.0.0-20150630195641-06ad41a06c4a
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -306,8 +306,8 @@ github.com/prometheus/common v0.26.0/go.mod h1:M7rCNAaPfAosfx8veZJCuw84e35h3Cfd9
github.com/prometheus/common v0.32.1/go.mod h1:vu+V0TpY+O6vW9J44gczi3Ap/oXXR10b+M/gUGO4Hls=
github.com/prometheus/common v0.37.0 h1:ccBbHCgIiT9uSoFY0vX8H3zsNR5eLt17/RQLUvn8pXE=
github.com/prometheus/common v0.37.0/go.mod h1:phzohg0JFMnBEFGxTDbfu3QyL5GI8gTQJFhYO5B3mfA=
github.com/prometheus/exporter-toolkit v0.8.1 h1:TpKt8z55q1zF30BYaZKqh+bODY0WtByHDOhDA2M9pEs=
github.com/prometheus/exporter-toolkit v0.8.1/go.mod h1:00shzmJL7KxcsabLWcONwpyNEuWhREOnFqZW7vadFS0=
github.com/prometheus/exporter-toolkit v0.8.2 h1:sbJAfBXQFkG6sUkbwBun8MNdzW9+wd5YfPYofbmj0YM=
github.com/prometheus/exporter-toolkit v0.8.2/go.mod h1:00shzmJL7KxcsabLWcONwpyNEuWhREOnFqZW7vadFS0=
github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA=
github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU=
Expand Down