Skip to content

Commit bbd9afa

Browse files
authored
Windows memory and service signals fix (#1464)
* Fix memory and pagefile signals
* Remove windows_service_status from current prometheus source
* make fmt
1 parent 0818107 commit bbd9afa

File tree

7 files changed

+61
-41
lines changed

7 files changed

+61
-41
lines changed

windows-observ-lib/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ Grafana Agent/Alloy or combination of windows_exporter/promtail can be used in o
105105

106106
The following collectors should be enabled in windows_exporter/windows integration:
107107

108-
`enabled_collectors: cpu,cs,logical_disk,net,os,service,system,textfile,time,diskdrive`
108+
`enabled_collectors: cpu,logical_disk,net,os,service,system,textfile,time,diskdrive,pagefile,memory`
109109

110110
### Logs collection
111111

windows-observ-lib/alerts.libsonnet

Lines changed: 30 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -122,12 +122,17 @@
122122
annotations: {
123123
summary: 'High memory usage on Windows host.',
124124
description: |||
125-
Memory usage on host {{ $labels.%s }} is critically high, with {{ printf "%%.2f" $value }}%% of total memory used.
126-
This exceeds the threshold of %s%%.
127-
Current memory free: {{ with printf `windows_os_physical_memory_free_bytes{}` | query | first | value | humanize }}{{ . }}{{ end }}.
128-
Total memory: {{ with printf `windows_cs_physical_memory_bytes{}` | query | first | value | humanize }}{{ . }}{{ end }}.
125+
Memory usage on host {{ $labels.%(instanceLabel)s }} is critically high, with {{ printf "%%.2f" $value }}%% of total memory used.
126+
This exceeds the threshold of %(threshold)s%%.
127+
Current memory free: {{ with printf `%(memoryFree)s` | query | first | value | humanize }}{{ . }}{{ end }}.
128+
Total memory: {{ with printf `%(memoryTotal)s` | query | first | value | humanize }}{{ . }}{{ end }}.
129129
Consider investigating processes consuming high memory or increasing available memory.
130-
||| % [instanceLabel, config.alertMemoryUsageThresholdCritical],
130+
||| % {
131+
instanceLabel: instanceLabel,
132+
threshold: config.alertMemoryUsageThresholdCritical,
133+
memoryFree: signals.memory.memoryFree.asRuleExpression(),
134+
memoryTotal: signals.memory.memoryTotal.asRuleExpression(),
135+
},
131136
},
132137
},
133138
{
@@ -154,22 +159,6 @@
154159
||| % [instanceLabel, config.alertDiskUsageThresholdCritical, config.filteringSelector, config.filteringSelector],
155160
},
156161
},
157-
{
158-
alert: 'WindowsServiceNotHealthy',
159-
expr: |||
160-
(%s) > 0
161-
||| % [
162-
signals.services.serviceNotHealthy.asRuleExpression(),
163-
],
164-
'for': '5m',
165-
labels: {
166-
severity: 'critical',
167-
},
168-
annotations: {
169-
summary: 'Windows service is not healthy.',
170-
description: "Windows service {{ $labels.name }} is not in healthy state, currently in '{{ $labels.status }}'.",
171-
},
172-
},
173162
{
174163
alert: 'WindowsDiskDriveNotHealthy',
175164
expr: |||
@@ -239,7 +228,26 @@
239228
},
240229
},
241230
]
242-
+ if config.enableADDashboard then ADAlerts else [],
231+
+ if std.member(config.metricsSource, 'prometheus_pre_0_30') then
232+
[
233+
{
234+
alert: 'WindowsServiceNotHealthy',
235+
expr: |||
236+
(%s) > 0
237+
||| % [
238+
signals.services.serviceNotHealthy.asRuleExpression(),
239+
],
240+
'for': '5m',
241+
labels: {
242+
severity: 'critical',
243+
},
244+
annotations: {
245+
summary: 'Windows service is not healthy.',
246+
description: "Windows service {{ $labels.name }} is not in healthy state, currently in '{{ $labels.status }}'.",
247+
},
248+
},
249+
] else []
250+
+ if config.enableADDashboard then ADAlerts else [],
243251
},
244252
],
245253
},

windows-observ-lib/dashboards_out/overview

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -416,7 +416,7 @@
416416
"type": "prometheus",
417417
"uid": "${prometheus_datasource}"
418418
},
419-
"expr": "windows_os_paging_limit_bytes{job=~\"$job\",instance=~\"$instance\"}",
419+
"expr": "windows_os_paging_limit_bytes{job=~\"$job\",instance=~\"$instance\"}\nor\nwindows_pagefile_limit_bytes{job=~\"$job\",instance=~\"$instance\"}",
420420
"format": "time_series",
421421
"instant": false,
422422
"legendFormat": "{{instance}}: Page total",

windows-observ-lib/prometheus_rules_out/prometheus_alerts.yaml

Lines changed: 15 additions & 11 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

windows-observ-lib/signals/memory.libsonnet

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,9 @@ function(this)
8686
unit: 'bytes',
8787
sources: {
8888
prometheus: {
89+
expr: 'windows_pagefile_limit_bytes{%(queriesSelector)s}',
90+
},
91+
prometheus_pre_0_30: {
8992
expr: 'windows_os_paging_limit_bytes{%(queriesSelector)s}',
9093
},
9194
},

windows-observ-lib/signals/services.libsonnet

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ function(this)
99
aggLevel: 'none',
1010
aggFunction: 'avg',
1111
discoveryMetric: {
12-
prometheus: 'windows_service_status',
12+
prometheus: 'windows_service_state',
13+
prometheus_pre_0_30: 'windows_service_status',
1314
},
1415
signals: {
1516
serviceStatus: {
@@ -18,8 +19,10 @@ function(this)
1819
type: 'gauge',
1920
description: 'Windows service status',
2021
unit: 'short',
22+
//https://github.com/prometheus-community/windows_exporter/pull/1584
23+
optional: true,
2124
sources: {
22-
prometheus: {
25+
prometheus_pre_0_30: {
2326
expr: 'windows_service_status{%(queriesSelector)s}',
2427
legendCustomTemplate: '{{ name }}',
2528
valueMappings: [
@@ -46,10 +49,12 @@ function(this)
4649
name: 'Service not healthy',
4750
nameShort: 'Not healthy',
4851
type: 'gauge',
49-
description: 'Services not in healthy state',
52+
description: 'Service not in healthy state',
5053
unit: 'short',
54+
optional: true,
5155
sources: {
52-
prometheus: {
56+
57+
prometheus_pre_0_30: {
5358
expr: 'windows_service_status{status!~"starting|stopping|ok", %(queriesSelector)s}',
5459
legendCustomTemplate: '{{ name }} ({{ status }})',
5560
},

windows-observ-lib/tests/prometheus_alerts_test.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@ evaluation_interval: 15m
66
tests:
77
- interval: 1m
88
input_series:
9-
- series: 'windows_os_physical_memory_free_bytes{instance="host1"}'
9+
- series: 'windows_memory_physical_free_bytes{instance="host1"}'
1010
values: '10000000x15'
11-
- series: 'windows_cs_physical_memory_bytes{instance="host1"}'
11+
- series: 'windows_memory_physical_total_bytes{instance="host1"}'
1212
values: '1000000000x15'
1313
- series: 'windows_logical_disk_free_bytes{volume="C:", instance="host1"}'
1414
values: '10000000x15'

0 commit comments

Comments
 (0)