Skip to content

Commit b9f4106

Browse files
authored
Refactor windows observ lib to use signals (#1455)
* Refactor windows observ lib to use signals
* fmt
* Add .pint exclusion
* Update fleet
1 parent c4a4ab2 commit b9f4106

22 files changed

+1768
-783
lines changed

windows-observ-lib/.pint.hcl

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
//ignore smelly selectors only for specific disk alerts with legitimate volume filtering
2+
rule {
3+
match {
4+
kind = "alerting"
5+
name = "WindowsDiskAlmostOutOfSpace"
6+
}
7+
disable = ["promql/regexp"]
8+
}

windows-observ-lib/alerts.libsonnet

Lines changed: 188 additions & 169 deletions
Large diffs are not rendered by default.
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
local g = import './g.libsonnet';
2+
local commonlib = import 'common-lib/common/main.libsonnet';
3+
4+
{
5+
new(this): {
6+
local config = this.config,
7+
local signals = this.signals,
8+
9+
reboot: commonlib.annotations.reboot.new(
10+
title='Reboot',
11+
target=signals.system.bootTime.asTarget() +
12+
{
13+
expr: signals.system.bootTime.asPanelExpression() + '*1000 > $__from < $__to',
14+
},
15+
instanceLabels=std.join(',', config.instanceLabels),
16+
)
17+
+ commonlib.annotations.base.withTagKeys(std.join(',', config.groupLabels + config.instanceLabels)),
18+
19+
serviceFailed: if config.enableLokiLogs then
20+
commonlib.annotations.serviceFailed.new(
21+
title='Service failed',
22+
target=signals.logs.serviceFailedLogs.asTarget(),
23+
)
24+
+ commonlib.annotations.base.withTagKeys(std.join(',', config.groupLabels + config.instanceLabels + ['level']))
25+
+ commonlib.annotations.base.withTextFormat('{{message}}')
26+
else {},
27+
28+
criticalEvents: if config.enableLokiLogs then
29+
commonlib.annotations.fatal.new(
30+
title='Critical system event',
31+
target=signals.logs.criticalEventsLogs.asTarget(),
32+
)
33+
+ commonlib.annotations.base.withTagKeys(std.join(',', config.groupLabels + config.instanceLabels + ['level']))
34+
+ commonlib.annotations.base.withTextFormat('{{message}}')
35+
else {},
36+
},
37+
}

windows-observ-lib/config.libsonnet

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
{
2+
local this = self,
23
// any modular library should include as inputs:
34
// 'dashboardNamePrefix' - Use as prefix for all Dashboards and (optional) rule groups
45
// 'filteringSelector' - Static selector to apply to ALL dashboard variables of type query, panel queries, alerts and recording rules.
@@ -11,6 +12,7 @@
1112
dashboardTags: ['windows'],
1213
uid: 'windows',
1314
dashboardNamePrefix: '',
15+
metricsSource: 'prometheus', // metrics source for signals
1416

1517
// optional
1618
ignoreVolumes: 'HarddiskVolume.*',
@@ -22,21 +24,36 @@
2224
dashboardRefresh: '1m',
2325

2426
// optional Windows AD
25-
alertsHighPendingReplicationOperations: 50, // count
26-
alertsHighReplicationSyncRequestFailures: 0, // count
27-
alertsHighPasswordChanges: 25, // count
27+
alertsHighPendingReplicationOperations: '50', // count
28+
alertsHighReplicationSyncRequestFailures: '0', // count
29+
alertsHighPasswordChanges: '25', // count
2830
alertsMetricsDownJobName: 'integrations/windows_exporter',
2931
enableADDashboard: false,
3032

3133
// logs lib related
3234
enableLokiLogs: true,
3335
extraLogLabels: ['channel', 'source', 'keywords', 'level'],
3436
logsVolumeGroupBy: 'level',
37+
logsGroupLabels: this.groupLabels,
38+
logsInstanceLabels: this.instanceLabels,
39+
logsFilteringSelector: this.filteringSelector,
3540
showLogsVolume: true,
3641
logsExtraFilters:
3742
|||
3843
| label_format timestamp="{{__timestamp__}}"
3944
| drop channel_extracted,source_extracted,computer_extracted,level_extracted,keywords_extracted
4045
| line_format `{{ if eq "[[instance]]" ".*" }}{{ alignLeft 25 .instance}}|{{end}}{{alignLeft 12 .channel }}| {{ alignLeft 25 .source}}| {{ .message }}`
4146
|||,
47+
48+
signals+: {
49+
system: (import './signals/system.libsonnet')(this),
50+
cpu: (import './signals/cpu.libsonnet')(this),
51+
memory: (import './signals/memory.libsonnet')(this),
52+
disk: (import './signals/disk.libsonnet')(this),
53+
network: (import './signals/network.libsonnet')(this),
54+
services: (import './signals/services.libsonnet')(this),
55+
activeDirectory: (import './signals/activeDirectory.libsonnet')(this),
56+
alerts: (import './signals/alerts.libsonnet')(this),
57+
logs: (import './signals/logs.libsonnet')(this),
58+
},
4259
}

windows-observ-lib/dashboards_out/disks

Lines changed: 45 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,10 @@
126126
"uid": "${prometheus_datasource}"
127127
},
128128
"expr": "windows_logical_disk_free_bytes{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}",
129-
"legendFormat": "{{ volume }} available"
129+
"format": "time_series",
130+
"instant": false,
131+
"legendFormat": "{{ volume }} available",
132+
"refId": "Disk free"
130133
}
131134
],
132135
"title": "Filesystem space available",
@@ -254,6 +257,7 @@
254257
"expr": "windows_logical_disk_size_bytes{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}",
255258
"format": "table",
256259
"instant": true,
260+
"legendFormat": "{{volume}}: Total",
257261
"refId": "TOTAL"
258262
},
259263
{
@@ -427,23 +431,32 @@
427431
"uid": "${prometheus_datasource}"
428432
},
429433
"expr": "irate(windows_logical_disk_read_bytes_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
430-
"legendFormat": "{{ volume }} read"
434+
"format": "time_series",
435+
"instant": false,
436+
"legendFormat": "{{ volume }} read",
437+
"refId": "Disk read bytes"
431438
},
432439
{
433440
"datasource": {
434441
"type": "prometheus",
435442
"uid": "${prometheus_datasource}"
436443
},
437444
"expr": "irate(windows_logical_disk_write_bytes_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
438-
"legendFormat": "{{ volume }} written"
445+
"format": "time_series",
446+
"instant": false,
447+
"legendFormat": "{{ volume }} written",
448+
"refId": "Disk write bytes"
439449
},
440450
{
441451
"datasource": {
442452
"type": "prometheus",
443453
"uid": "${prometheus_datasource}"
444454
},
445455
"expr": "(1-clamp_max(irate(windows_logical_disk_idle_seconds_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]),1)) * 100",
446-
"legendFormat": "{{ volume }} io util"
456+
"format": "time_series",
457+
"instant": false,
458+
"legendFormat": "{{ volume }} io util",
459+
"refId": "Disk I/O utilization"
447460
}
448461
],
449462
"title": "Disk reads/writes",
@@ -507,16 +520,22 @@
507520
"type": "prometheus",
508521
"uid": "${prometheus_datasource}"
509522
},
510-
"expr": "irate(windows_logical_disk_reads_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n",
511-
"legendFormat": "{{ volume }} reads"
523+
"expr": "irate(windows_logical_disk_reads_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
524+
"format": "time_series",
525+
"instant": false,
526+
"legendFormat": "{{ volume }} reads",
527+
"refId": "Disk reads"
512528
},
513529
{
514530
"datasource": {
515531
"type": "prometheus",
516532
"uid": "${prometheus_datasource}"
517533
},
518-
"expr": "irate(windows_logical_disk_writes_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n",
519-
"legendFormat": "{{ volume }} writes"
534+
"expr": "irate(windows_logical_disk_writes_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
535+
"format": "time_series",
536+
"instant": false,
537+
"legendFormat": "{{ volume }} writes",
538+
"refId": "Disk writes"
520539
}
521540
],
522541
"title": "Disk I/O",
@@ -580,16 +599,22 @@
580599
"type": "prometheus",
581600
"uid": "${prometheus_datasource}"
582601
},
583-
"expr": "irate(windows_logical_disk_read_seconds_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n/\nirate(windows_logical_disk_reads_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n",
584-
"legendFormat": "{{ volume }} avg read time"
602+
"expr": "irate(windows_logical_disk_read_seconds_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]) / irate(windows_logical_disk_reads_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
603+
"format": "time_series",
604+
"instant": false,
605+
"legendFormat": "{{ volume }} avg read time",
606+
"refId": "Disk read time"
585607
},
586608
{
587609
"datasource": {
588610
"type": "prometheus",
589611
"uid": "${prometheus_datasource}"
590612
},
591-
"expr": "irate(windows_logical_disk_write_seconds_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n/\nirate(windows_logical_disk_writes_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n",
592-
"legendFormat": "{{ volume }} avg write time"
613+
"expr": "irate(windows_logical_disk_write_seconds_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]) / irate(windows_logical_disk_writes_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
614+
"format": "time_series",
615+
"instant": false,
616+
"legendFormat": "{{ volume }} avg write time",
617+
"refId": "Disk write time"
593618
}
594619
],
595620
"title": "Disk average wait time",
@@ -653,15 +678,21 @@
653678
"uid": "${prometheus_datasource}"
654679
},
655680
"expr": "irate(windows_logical_disk_avg_read_requests_queued{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
656-
"legendFormat": "{{ volume }} read queue"
681+
"format": "time_series",
682+
"instant": false,
683+
"legendFormat": "{{ volume }} read queue",
684+
"refId": "Disk read queue"
657685
},
658686
{
659687
"datasource": {
660688
"type": "prometheus",
661689
"uid": "${prometheus_datasource}"
662690
},
663691
"expr": "irate(windows_logical_disk_avg_write_requests_queued{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
664-
"legendFormat": "{{ volume }} write queue"
692+
"format": "time_series",
693+
"instant": false,
694+
"legendFormat": "{{ volume }} write queue",
695+
"refId": "Disk write queue"
665696
}
666697
],
667698
"title": "Disk average queue",

0 commit comments

Comments
 (0)