|
7 | 7 | let |
8 | 8 | cfg = config.scale-network.services.monitoring; |
9 | 9 |
|
| 10 | + mimirRules = pkgs.writeText "rules.yml" ( |
| 11 | + builtins.toJSON { |
| 12 | + groups = [ |
| 13 | + { |
| 14 | + name = "server-alerts"; |
| 15 | + rules = [ |
| 16 | + { |
| 17 | + alert = "ServerDown"; |
| 18 | + expr = ''count(up{job="integrations/unix"} == 0) > 0''; |
| 19 | + "for" = "5m"; |
| 20 | + labels = { |
| 21 | + severity = "critical"; |
| 22 | + }; |
| 23 | + annotations = { |
| 24 | + summary = "One or more servers are down"; |
| 25 | + }; |
| 26 | + } |
| 27 | + { |
| 28 | + alert = "SwitchDown"; |
| 29 | + expr = ''count(up{job="switches_system"} == 0) > 0''; |
| 30 | + "for" = "5m"; |
| 31 | + labels = { |
| 32 | + severity = "critical"; |
| 33 | + }; |
| 34 | + annotations = { |
| 35 | + summary = "One or more switches are down"; |
| 36 | + }; |
| 37 | + } |
| 38 | + { |
| 39 | + alert = "ApDown"; |
| 40 | + expr = ''count(up{job="aps"} == 0) > 0''; |
| 41 | + "for" = "5m"; |
| 42 | + labels = { |
| 43 | + severity = "critical"; |
| 44 | + }; |
| 45 | + annotations = { |
| 46 | + summary = "One or more aps are down"; |
| 47 | + }; |
| 48 | + } |
| 49 | + { |
| 50 | + alert = "PiDown"; |
| 51 | + expr = ''count(up{job="pis"} == 0) > 0''; |
| 52 | + "for" = "5m"; |
| 53 | + labels = { |
| 54 | + severity = "critical"; |
| 55 | + }; |
| 56 | + annotations = { |
| 57 | + summary = "One or more pis are down"; |
| 58 | + }; |
| 59 | + } |
| 60 | + ]; |
| 61 | + } |
| 62 | + ]; |
| 63 | + } |
| 64 | + ); |
| 65 | + |
| 66 | + mimirRulesDir = pkgs.runCommand "mimir-rules" { } '' |
| 67 | + mkdir -p $out/anonymous |
| 68 | + cp ${mimirRules} $out/anonymous/rules.yml |
| 69 | + ''; |
| 70 | + |
10 | 71 | inherit (lib) |
11 | 72 | types |
12 | 73 | ; |
|
38 | 99 | 14250 # tempo-jaeger-grpc |
39 | 100 | 14268 # tempo-jaeger-thrift-http |
40 | 101 | 3000 # grafana-http |
| 102 | + 9093 # alertmanager-http |
41 | 103 | 3100 # loki-http |
42 | 104 | 3200 # mimir-http |
43 | 105 | 3300 # tempo-http |
|
139 | 201 | }; |
140 | 202 | }; |
141 | 203 |
|
| 204 | + ruler = { |
| 205 | + alertmanager_url = "http://127.0.0.1:9093"; |
| 206 | + enable_api = true; |
| 207 | + evaluation_interval = "1m"; |
| 208 | + }; |
| 209 | + |
142 | 210 | ruler_storage = { |
143 | | - storage_prefix = "ruler"; |
| 211 | + backend = "local"; |
| 212 | + local = { |
| 213 | + directory = mimirRulesDir; |
| 214 | + }; |
144 | 215 | }; |
145 | 216 |
|
146 | 217 | limits = { |
147 | 218 | ingestion_burst_size = 1000000; |
148 | 219 | ingestion_rate = 100000; |
149 | | - max_global_series_per_user = 300000; |
| 220 | + max_global_series_per_user = 600000; |
150 | 221 | }; |
151 | 222 |
|
152 | 223 | alertmanager_storage = { |
|
261 | 332 | access = "proxy"; |
262 | 333 | url = "http://127.0.0.1:3300"; |
263 | 334 | } |
| 335 | + { |
| 336 | + name = "Alertmanager"; |
| 337 | + type = "alertmanager"; |
| 338 | + access = "proxy"; |
| 339 | + url = "http://127.0.0.1:9093"; |
| 340 | + } |
264 | 341 | ]; |
265 | 342 | }; |
266 | 343 | }; |
267 | 344 | }; |
268 | 345 |
|
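| 346 | + # Alertmanager receives alerts from the Mimir ruler; the only receiver ("default") has no notification integrations, so alerts are listed but not delivered anywhere |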
| 347 | + services.prometheus.alertmanager = { |
| 348 | + enable = true; |
| 349 | + port = 9093; |
| 350 | + extraFlags = [ |
| 351 | + "--web.external-url=https://${cfg.fqdn}/alertmanager/" |
| 352 | + "--web.route-prefix=/" |
| 353 | + ]; |
| 354 | + configuration = { |
| 355 | + route = { |
| 356 | + receiver = "default"; |
| 357 | + group_wait = "30s"; |
| 358 | + group_interval = "5m"; |
| 359 | + repeat_interval = "4h"; |
| 360 | + }; |
| 361 | + receivers = [ |
| 362 | + { |
| 363 | + name = "default"; |
| 364 | + } |
| 365 | + ]; |
| 366 | + }; |
| 367 | + }; |
| 368 | + |
269 | 369 | # Standalone SNMP exporter for Juniper switches |
270 | 370 | services.prometheus.exporters.snmp = { |
271 | 371 | configurationPath = ./snmp-alloy.yml; |
|
311 | 411 | } |
312 | 412 |
|
313 | 413 | // Relabel switch targets for the standalone SNMP exporter. |
314 | | - // The SNMP exporter is an HTTP service at 127.0.0.1:9116 that accepts |
315 | | - // target, module, and auth as query parameters: |
316 | | - // GET /snmp?target=[ipv6]:161&module=if_mib,system,jnxOperating&auth=Junitux |
317 | | - discovery.relabel "switches" { |
| 414 | + // Each module gets its own relabel + scrape to avoid overwhelming the |
| 415 | + // SNMP exporter with 39 concurrent multi-module walks. |
| 416 | + discovery.relabel "switches_if_mib" { |
318 | 417 | targets = discovery.file.switches.targets |
319 | 418 |
|
| 419 | + // Strip brackets and port from IPv6 addresses (e.g. [2001:db8::1]:161 -> 2001:db8::1). The regex must match the whole address, so an unbracketed address leaves __param_target unset. |
320 | 420 | rule { |
321 | 421 | source_labels = ["__address__"] |
| 422 | + regex = "\\[(.+?)\\](?::\\d+)?" |
| 423 | + replacement = "$1" |
322 | 424 | target_label = "__param_target" |
323 | 425 | } |
324 | 426 |
|
|
329 | 431 |
|
330 | 432 | rule { |
331 | 433 | target_label = "__param_module" |
332 | | - replacement = "if_mib,system,jnxOperating" |
| 434 | + replacement = "if_mib" |
333 | 435 | } |
334 | 436 |
|
335 | 437 | rule { |
|
343 | 445 | } |
344 | 446 | } |
345 | 447 |
|
346 | | - // Scrape SNMP metrics from switches via the standalone SNMP exporter |
347 | | - prometheus.scrape "switches" { |
348 | | - targets = discovery.relabel.switches.output |
| 448 | + discovery.relabel "switches_system" { |
| 449 | + targets = discovery.file.switches.targets |
| 450 | +
|
| 451 | + rule { |
| 452 | + source_labels = ["__address__"] |
| 453 | + regex = "\\[(.+?)\\](?::\\d+)?" |
| 454 | + replacement = "$1" |
| 455 | + target_label = "__param_target" |
| 456 | + } |
| 457 | +
|
| 458 | + rule { |
| 459 | + source_labels = ["__param_target"] |
| 460 | + target_label = "instance" |
| 461 | + } |
| 462 | +
|
| 463 | + rule { |
| 464 | + target_label = "__param_module" |
| 465 | + replacement = "system" |
| 466 | + } |
| 467 | +
|
| 468 | + rule { |
| 469 | + target_label = "__param_auth" |
| 470 | + replacement = "Junitux" |
| 471 | + } |
| 472 | +
|
| 473 | + rule { |
| 474 | + target_label = "__address__" |
| 475 | + replacement = "127.0.0.1:9116" |
| 476 | + } |
| 477 | + } |
| 478 | +
|
| 479 | + discovery.relabel "switches_jnx_operating" { |
| 480 | + targets = discovery.file.switches.targets |
| 481 | +
|
| 482 | + rule { |
| 483 | + source_labels = ["__address__"] |
| 484 | + regex = "\\[(.+?)\\](?::\\d+)?" |
| 485 | + replacement = "$1" |
| 486 | + target_label = "__param_target" |
| 487 | + } |
| 488 | +
|
| 489 | + rule { |
| 490 | + source_labels = ["__param_target"] |
| 491 | + target_label = "instance" |
| 492 | + } |
| 493 | +
|
| 494 | + rule { |
| 495 | + target_label = "__param_module" |
| 496 | + replacement = "jnxOperating" |
| 497 | + } |
| 498 | +
|
| 499 | + rule { |
| 500 | + target_label = "__param_auth" |
| 501 | + replacement = "Junitux" |
| 502 | + } |
| 503 | +
|
| 504 | + rule { |
| 505 | + target_label = "__address__" |
| 506 | + replacement = "127.0.0.1:9116" |
| 507 | + } |
| 508 | + } |
| 509 | +
|
| 510 | + // Scrape SNMP system metrics (SNMPv2-MIB) |
| 511 | + prometheus.scrape "switches_system" { |
| 512 | + targets = discovery.relabel.switches_system.output |
| 513 | + forward_to = [prometheus.remote_write.mimir.receiver] |
| 514 | + scrape_interval = "30s" |
| 515 | + scrape_timeout = "15s" |
| 516 | + metrics_path = "/snmp" |
| 517 | + job_name = "switches_system" |
| 518 | + } |
| 519 | +
|
| 520 | + // Scrape SNMP Juniper operating metrics (JUNIPER-MIB) |
| 521 | + prometheus.scrape "switches_jnx_operating" { |
| 522 | + targets = discovery.relabel.switches_jnx_operating.output |
| 523 | + forward_to = [prometheus.remote_write.mimir.receiver] |
| 524 | + scrape_interval = "60s" |
| 525 | + scrape_timeout = "30s" |
| 526 | + metrics_path = "/snmp" |
| 527 | + job_name = "switches_jnx_operating" |
| 528 | + } |
| 529 | +
|
| 530 | + // Scrape SNMP interface metrics (IF-MIB) |
| 531 | + prometheus.scrape "switches_if_mib" { |
| 532 | + targets = discovery.relabel.switches_if_mib.output |
349 | 533 | forward_to = [prometheus.remote_write.mimir.receiver] |
350 | 534 | scrape_interval = "120s" |
351 | 535 | scrape_timeout = "60s" |
352 | 536 | metrics_path = "/snmp" |
353 | | - job_name = "switches" |
| 537 | + job_name = "switches_if_mib" |
354 | 538 | } |
355 | 539 | ''; |
356 | 540 | }; |
|
400 | 584 | locations."/mimir/" = { |
401 | 585 | proxyPass = "http://127.0.0.1:3200/"; |
402 | 586 | }; |
| 587 | + locations."/alertmanager/" = { |
| 588 | + proxyPass = "http://127.0.0.1:9093/"; |
| 589 | + }; |
403 | 590 | locations."/tempo/" = { |
404 | 591 | proxyPass = "http://127.0.0.1:3300/"; |
405 | 592 | }; |
|
0 commit comments