Skip to content

Commit 014f157

Browse files
committed
feat(monitoring): add Alertmanager and improve scraping on switches
1 parent f89ea8e commit 014f157

File tree

1 file changed

+198
-11
lines changed

1 file changed

+198
-11
lines changed

nix/nixos-modules/services/monitoring.nix

Lines changed: 198 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,67 @@
77
let
88
cfg = config.scale-network.services.monitoring;
99

10+
# Alerting rules for the Mimir ruler. The rule groups are serialised with
# builtins.toJSON and written to a store file named "rules.yml"; mimirRulesDir
# copies that file into the directory handed to the ruler's local storage.
# NOTE(review): the file content is JSON despite the .yml name — presumably the
# ruler accepts it because it parses the file as YAML; confirm.
mimirRules = pkgs.writeText "rules.yml" (
11+
builtins.toJSON {
12+
groups = [
13+
{
14+
name = "server-alerts";
15+
rules = [
# Every rule below has the same shape: severity "critical", raised once at
# least one target of the named job has reported up == 0 for 5m.
16+
{
17+
alert = "ServerDown";
18+
expr = ''count(up{job="integrations/unix"} == 0) > 0'';
19+
"for" = "5m";
20+
labels = {
21+
severity = "critical";
22+
};
23+
annotations = {
24+
summary = "One or more servers are down";
25+
};
26+
}
27+
{
28+
alert = "SwitchDown";
# The job label matches the job_name of the "switches_system" SNMP scrape.
29+
expr = ''count(up{job="switches_system"} == 0) > 0'';
30+
"for" = "5m";
31+
labels = {
32+
severity = "critical";
33+
};
34+
annotations = {
35+
summary = "One or more switches are down";
36+
};
37+
}
38+
{
39+
alert = "ApDown";
40+
expr = ''count(up{job="aps"} == 0) > 0'';
41+
"for" = "5m";
42+
labels = {
43+
severity = "critical";
44+
};
45+
annotations = {
46+
summary = "One or more aps are down";
47+
};
48+
}
49+
{
50+
alert = "PiDown";
51+
expr = ''count(up{job="pis"} == 0) > 0'';
52+
"for" = "5m";
53+
labels = {
54+
severity = "critical";
55+
};
56+
annotations = {
57+
summary = "One or more pis are down";
58+
};
59+
}
60+
];
61+
}
62+
];
63+
}
64+
);
65+
66+
# Directory derivation used as ruler_storage.local.directory. It contains a
# single file, anonymous/rules.yml, copied from mimirRules.
# NOTE(review): "anonymous" is presumably the tenant ID Mimir uses when
# multi-tenancy is disabled — confirm against the Mimir configuration.
mimirRulesDir = pkgs.runCommand "mimir-rules" { } ''
67+
mkdir -p $out/anonymous
68+
cp ${mimirRules} $out/anonymous/rules.yml
69+
'';
70+
1071
inherit (lib)
1172
types
1273
;
@@ -38,6 +99,7 @@ in
3899
14250 # tempo-jaeger-grpc
39100
14268 # tempo-jaeger-thrift-http
40101
3000 # grafana-http
102+
9093 # alertmanager-http
41103
3100 # loki-http
42104
3200 # mimir-http
43105
3300 # tempo-http
@@ -139,14 +201,23 @@ in
139201
};
140202
};
141203

204+
# Mimir ruler: evaluates the alerting rules every minute and sends the
# resulting alerts to the Alertmanager on 127.0.0.1:9093 (the port set in
# services.prometheus.alertmanager). The ruler API is enabled as well.
ruler = {
205+
alertmanager_url = "http://127.0.0.1:9093";
206+
enable_api = true;
207+
evaluation_interval = "1m";
208+
};
209+
142210
ruler_storage = {
143-
storage_prefix = "ruler";
211+
backend = "local";
212+
local = {
213+
directory = mimirRulesDir;
214+
};
144215
};
145216

146217
limits = {
147218
ingestion_burst_size = 1000000;
148219
ingestion_rate = 100000;
149-
max_global_series_per_user = 300000;
220+
max_global_series_per_user = 600000;
150221
};
151222

152223
alertmanager_storage = {
@@ -261,11 +332,40 @@ in
261332
access = "proxy";
262333
url = "http://127.0.0.1:3300";
263334
}
335+
{
336+
name = "Alertmanager";
337+
type = "alertmanager";
338+
access = "proxy";
339+
url = "http://127.0.0.1:9093";
340+
}
264341
];
265342
};
266343
};
267344
};
268345

346+
# Alertmanager instance that receives the alerts sent by the Mimir ruler
# (ruler.alertmanager_url points at this port) and routes them to receivers.
347+
services.prometheus.alertmanager = {
348+
enable = true;
349+
port = 9093;
# The external URL carries the /alertmanager/ path used by the nginx location,
# while the route prefix stays "/" so the service itself answers at the root
# of 127.0.0.1:9093, which is where the nginx proxyPass points.
350+
extraFlags = [
351+
"--web.external-url=https://${cfg.fqdn}/alertmanager/"
352+
"--web.route-prefix=/"
353+
];
354+
configuration = {
# Single catch-all route: every alert goes to the "default" receiver.
355+
route = {
356+
receiver = "default";
357+
group_wait = "30s";
358+
group_interval = "5m";
359+
repeat_interval = "4h";
360+
};
# NOTE(review): "default" only has a name here — no notification integration
# (email, webhook, ...) is configured in this block, so alerts appear to be
# visible in Alertmanager only; confirm this is intended.
361+
receivers = [
362+
{
363+
name = "default";
364+
}
365+
];
366+
};
367+
};
368+
269369
# Standalone SNMP exporter for Juniper switches
270370
services.prometheus.exporters.snmp = {
271371
configurationPath = ./snmp-alloy.yml;
@@ -311,14 +411,16 @@ in
311411
}
312412
313413
// Relabel switch targets for the standalone SNMP exporter.
314-
// The SNMP exporter is an HTTP service at 127.0.0.1:9116 that accepts
315-
// target, module, and auth as query parameters:
316-
// GET /snmp?target=[ipv6]:161&module=if_mib,system,jnxOperating&auth=Junitux
317-
discovery.relabel "switches" {
414+
// Each module gets its own relabel + scrape to avoid overwhelming the
415+
// SNMP exporter with 39 concurrent multi-module walks.
416+
discovery.relabel "switches_if_mib" {
318417
targets = discovery.file.switches.targets
319418
419+
// Strip brackets and port from IPv6 addresses (e.g. [2001:db8::1]:161 -> 2001:db8::1)
320420
rule {
321421
source_labels = ["__address__"]
422+
regex = "\\[(.+?)\\](?::\\d+)?"
423+
replacement = "$1"
322424
target_label = "__param_target"
323425
}
324426
@@ -329,7 +431,7 @@ in
329431
330432
rule {
331433
target_label = "__param_module"
332-
replacement = "if_mib,system,jnxOperating"
434+
replacement = "if_mib"
333435
}
334436
335437
rule {
@@ -343,14 +445,96 @@ in
343445
}
344446
}
345447
346-
// Scrape SNMP metrics from switches via the standalone SNMP exporter
347-
prometheus.scrape "switches" {
348-
targets = discovery.relabel.switches.output
448+
discovery.relabel "switches_system" {
449+
targets = discovery.file.switches.targets
450+
451+
rule {
452+
source_labels = ["__address__"]
453+
regex = "\\[(.+?)\\](?::\\d+)?"
454+
replacement = "$1"
455+
target_label = "__param_target"
456+
}
457+
458+
rule {
459+
source_labels = ["__param_target"]
460+
target_label = "instance"
461+
}
462+
463+
rule {
464+
target_label = "__param_module"
465+
replacement = "system"
466+
}
467+
468+
rule {
469+
target_label = "__param_auth"
470+
replacement = "Junitux"
471+
}
472+
473+
rule {
474+
target_label = "__address__"
475+
replacement = "127.0.0.1:9116"
476+
}
477+
}
478+
479+
discovery.relabel "switches_jnx_operating" {
480+
targets = discovery.file.switches.targets
481+
482+
rule {
483+
source_labels = ["__address__"]
484+
regex = "\\[(.+?)\\](?::\\d+)?"
485+
replacement = "$1"
486+
target_label = "__param_target"
487+
}
488+
489+
rule {
490+
source_labels = ["__param_target"]
491+
target_label = "instance"
492+
}
493+
494+
rule {
495+
target_label = "__param_module"
496+
replacement = "jnxOperating"
497+
}
498+
499+
rule {
500+
target_label = "__param_auth"
501+
replacement = "Junitux"
502+
}
503+
504+
rule {
505+
target_label = "__address__"
506+
replacement = "127.0.0.1:9116"
507+
}
508+
}
509+
510+
// Scrape SNMP system metrics (SNMPv2-MIB)
511+
prometheus.scrape "switches_system" {
512+
targets = discovery.relabel.switches_system.output
513+
forward_to = [prometheus.remote_write.mimir.receiver]
514+
scrape_interval = "30s"
515+
scrape_timeout = "15s"
516+
metrics_path = "/snmp"
517+
job_name = "switches_system"
518+
}
519+
520+
// Scrape SNMP Juniper operating metrics (JUNIPER-MIB)
521+
prometheus.scrape "switches_jnx_operating" {
522+
targets = discovery.relabel.switches_jnx_operating.output
523+
forward_to = [prometheus.remote_write.mimir.receiver]
524+
scrape_interval = "60s"
525+
scrape_timeout = "30s"
526+
metrics_path = "/snmp"
527+
job_name = "switches_jnx_operating"
528+
}
529+
530+
// Scrape SNMP interface metrics (IF-MIB)
531+
prometheus.scrape "switches_if_mib" {
532+
targets = discovery.relabel.switches_if_mib.output
349533
forward_to = [prometheus.remote_write.mimir.receiver]
350534
scrape_interval = "120s"
351535
scrape_timeout = "60s"
352536
metrics_path = "/snmp"
353-
job_name = "switches"
537+
job_name = "switches_if_mib"
354538
}
355539
'';
356540
};
@@ -400,6 +584,9 @@ in
400584
locations."/mimir/" = {
401585
proxyPass = "http://127.0.0.1:3200/";
402586
};
587+
# Reverse-proxy /alertmanager/ to the local Alertmanager listener on port 9093,
# the same path that Alertmanager's --web.external-url advertises.
locations."/alertmanager/" = {
588+
proxyPass = "http://127.0.0.1:9093/";
589+
};
403590
locations."/tempo/" = {
404591
proxyPass = "http://127.0.0.1:3300/";
405592
};

0 commit comments

Comments
 (0)