
Commit c3e55fb

Merge alerts and rules in two separate rules
1 parent: eadacc9

File tree

3 files changed: +4 −247 lines changed

modules/workloads/infra/alerts.tf

Lines changed: 1 addition & 150 deletions
@@ -2,7 +2,7 @@
 # Alerting rules ###############################################################################################################################
 ################################################################################################################################################
 
-resource "aws_prometheus_rule_group_namespace" "nodenw" {
+resource "aws_prometheus_rule_group_namespace" "alerting_rules" {
   count = var.enable_alerting_rules ? 1 : 0
 
   name = "nodenw-rules"
@@ -19,18 +19,6 @@ groups:
       annotations:
         description: Network interface "{{ $labels.device }}" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
         summary: Network interface is often changing its status
-EOF
-}
-
-
-
-resource "aws_prometheus_rule_group_namespace" "nodeexporter" {
-  count = var.enable_alerting_rules ? 1 : 0
-
-  name = "nodeexporter-rules"
-  workspace_id = var.managed_prometheus_workspace_id
-  data = <<EOF
-groups:
   - name: nodeexp-01
     rules:
     - alert: NodeFilesystemSpaceFillingUp
@@ -208,16 +196,6 @@ groups:
       annotations:
         description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.
         summary: Kernel is predicted to exhaust file descriptors limit soon.
-EOF
-}
-
-resource "aws_prometheus_rule_group_namespace" "kubesyschdlr" {
-  count = var.enable_alerting_rules ? 1 : 0
-
-  name = "kubesyschdlr-rules"
-  workspace_id = var.managed_prometheus_workspace_id
-  data = <<EOF
-groups:
   - name: kubesysschdlr-01
     rules:
     - alert: KubeSchedulerDown
@@ -228,16 +206,6 @@ groups:
       annotations:
         description: KubeScheduler has disappeared from Prometheus target discovery.
         summary: Target disappeared from Prometheus target discovery.
-EOF
-}
-
-resource "aws_prometheus_rule_group_namespace" "kubesyskblt" {
-  count = var.enable_alerting_rules ? 1 : 0
-
-  name = "kubesyskblt-rules"
-  workspace_id = var.managed_prometheus_workspace_id
-  data = <<EOF
-groups:
   - name: kubesyskblt-01
     rules:
     - alert: KubeNodeNotReady
@@ -364,17 +332,6 @@ groups:
       annotations:
         description: Kubelet has disappeared from Prometheus target discovery.
         summary: Target disappeared from Prometheus target discovery.
-
-EOF
-}
-
-resource "aws_prometheus_rule_group_namespace" "kubesyskbpxy" {
-  count = var.enable_alerting_rules ? 1 : 0
-
-  name = "kubesyskbpxy-rules"
-  workspace_id = var.managed_prometheus_workspace_id
-  data = <<EOF
-groups:
   - name: kubesyspxy-01
     rules:
     - alert: KubeProxyDown
@@ -385,16 +342,6 @@ groups:
       annotations:
         description: KubeProxy has disappeared from Prometheus target discovery.
         summary: Target disappeared from Prometheus target discovery.
-EOF
-}
-
-resource "aws_prometheus_rule_group_namespace" "kubesys" {
-  count = var.enable_alerting_rules ? 1 : 0
-
-  name = "kubesys-rules"
-  workspace_id = var.managed_prometheus_workspace_id
-  data = <<EOF
-groups:
   - name: kubesys-01
     rules:
     - alert: KubeVersionMismatch
@@ -415,17 +362,6 @@ groups:
       annotations:
         description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.'
         summary: Kubernetes API server client is experiencing errors.
-
-EOF
-}
-
-resource "aws_prometheus_rule_group_namespace" "kubesyscm" {
-  count = var.enable_alerting_rules ? 1 : 0
-
-  name = "kubesyscm-rules"
-  workspace_id = var.managed_prometheus_workspace_id
-  data = <<EOF
-groups:
   - name: kubesyscm-01
     rules:
     - alert: KubeControllerManagerDown
@@ -436,16 +372,6 @@ groups:
       annotations:
         description: KubeControllerManager has disappeared from Prometheus target discovery.
         summary: Target disappeared from Prometheus target discovery.
-EOF
-}
-
-resource "aws_prometheus_rule_group_namespace" "kubesysapi" {
-  count = var.enable_alerting_rules ? 1 : 0
-
-  name = "kubesysapi-rules"
-  workspace_id = var.managed_prometheus_workspace_id
-  data = <<EOF
-groups:
   - name: kubesysapi-01
     rules:
     - alert: KubeClientCertificateExpiration
@@ -503,18 +429,6 @@ groups:
       annotations:
         description: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
         summary: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
-EOF
-}
-
-# Default limit of 10 needs to be raised
-
-resource "aws_prometheus_rule_group_namespace" "kubestorage" {
-  count = var.enable_alerting_rules ? 1 : 0
-
-  name = "kubestorage-rules"
-  workspace_id = var.managed_prometheus_workspace_id
-  data = <<EOF
-groups:
   - name: kubestg-01
     rules:
     - alert: KubePersistentVolumeFillingUp
@@ -565,16 +479,6 @@ groups:
      annotations:
         description: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.
         summary: PersistentVolume is having issues with provisioning.
-EOF
-}
-
-resource "aws_prometheus_rule_group_namespace" "kuberesources" {
-  count = var.enable_alerting_rules ? 1 : 0
-
-  name = "kuberesources-rules"
-  workspace_id = var.managed_prometheus_workspace_id
-  data = <<EOF
-groups:
   - name: kuberes-01
     rules:
     - alert: KubeCPUOvercommit
@@ -655,16 +559,6 @@ groups:
       annotations:
         description: The {{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.
         summary: Processes experience elevated CPU throttling.
-EOF
-}
-
-resource "aws_prometheus_rule_group_namespace" "kubeapps" {
-  count = var.enable_alerting_rules ? 1 : 0
-
-  name = "kubeapps-rules"
-  workspace_id = var.managed_prometheus_workspace_id
-  data = <<EOF
-groups:
   - name: kubeapps-01
     rules:
     - alert: KubePodCrashLooping
@@ -814,18 +708,6 @@ groups:
       annotations:
         description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes.
         summary: HPA is running at max replicas
-EOF
-}
-
-
-
-resource "aws_prometheus_rule_group_namespace" "kubestm" {
-  count = var.enable_alerting_rules ? 1 : 0
-
-  name = "kubestm-rules"
-  workspace_id = var.managed_prometheus_workspace_id
-  data = <<EOF
-groups:
   - name: kubestm-01
     rules:
     - alert: KubeStateMetricsListErrors
@@ -866,16 +748,6 @@ groups:
       annotations:
         description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.
         summary: kube-state-metrics shards are missing.
-EOF
-}
-
-resource "aws_prometheus_rule_group_namespace" "apislos" {
-  count = var.enable_alerting_rules ? 1 : 0
-
-  name = "api-slos"
-  workspace_id = var.managed_prometheus_workspace_id
-  data = <<EOF
-groups:
   - name: apislos-01
     rules:
     - alert: KubeAPIErrorBudgetBurn
@@ -924,17 +796,6 @@ groups:
       annotations:
         description: The API server is burning too much error budget.
         summary: The API server is burning too much error budget.
-EOF
-}
-
-
-resource "aws_prometheus_rule_group_namespace" "generic" {
-  count = var.enable_alerting_rules ? 1 : 0
-
-  name = "generic-rules"
-  workspace_id = var.managed_prometheus_workspace_id
-  data = <<EOF
-groups:
   - name: general-01
     rules:
     - alert: TargetDown
@@ -960,16 +821,6 @@ groups:
         severity: none
       annotations:
         description: This is an alert that is used to inhibit info alerts. By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with other alerts. This alert fires whenever there's a severity="info" alert, and stops firing when another alert with a severity of 'warning' or 'critical' starts firing on the same namespace. This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".
-EOF
-}
-
-resource "aws_prometheus_rule_group_namespace" "etcd" {
-  count = var.enable_alerting_rules ? 1 : 0
-
-  name = "etcd-rules"
-  workspace_id = var.managed_prometheus_workspace_id
-  data = <<EOF
-groups:
   - name: etcd-01
     rules:
     - alert: etcdInsufficientMembers
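
Taken together, these hunks delete each per-topic aws_prometheus_rule_group_namespace resource (nodeexporter, kubesyschdlr, kubesyskblt, kubesyskbpxy, kubesys, kubesyscm, kubesysapi, kubestorage, kuberesources, kubeapps, kubestm, apislos, generic, etcd) and fold its groups: entries into the heredoc of the single alerting_rules resource; the deleted comment "# Default limit of 10 needs to be raised" suggests a per-workspace quota on rule group namespaces motivated the consolidation. A minimal sketch of the merged shape; the rule bodies below are illustrative placeholders, since the expressions themselves are not part of this diff:

resource "aws_prometheus_rule_group_namespace" "alerting_rules" {
  count = var.enable_alerting_rules ? 1 : 0

  name         = "nodenw-rules"
  workspace_id = var.managed_prometheus_workspace_id

  # Every alert group that previously lived in its own namespace resource
  # becomes one more entry in a single groups: list.
  data = <<EOF
groups:
  - name: nodenw-01
    rules:
    - alert: NodeNetworkInterfaceFlapping    # expr/for are illustrative only
      expr: changes(node_network_up{job="node-exporter"}[2m]) > 2
      for: 2m
      labels:
        severity: warning
  - name: nodeexp-01
    rules:
    - alert: NodeFilesystemSpaceFillingUp    # expr/for are illustrative only
      expr: predict_linear(node_filesystem_avail_bytes[6h], 24*60*60) < 0
      for: 1h
      labels:
        severity: warning
EOF
}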

modules/workloads/infra/main.tf

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ resource "helm_release" "prometheus_node_exporter" {
 }
 
 module "helm_addon" {
-  source = "github.com/aws-observability/terraform-aws-eks-blueprints/modules/kubernetes-addons/helm-addon"
+  source = "github.com/aws-ia/terraform-aws-eks-blueprints/modules/kubernetes-addons/helm-addon"
 
   helm_config = merge(
     {
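
The only change here is the module source: the diff repoints terraform-aws-eks-blueprints from the aws-observability GitHub organization to aws-ia, the repository's current home. A bare GitHub source like this tracks the repository's default branch, so pinning a release is worth considering when adopting the new path. A sketch using standard Terraform module-source syntax ("//" marks the subdirectory, ?ref= pins a tag); the tag and the merge() contents are illustrative, not taken from this module:

module "helm_addon" {
  # ?ref= pins the module to a tag so upstream changes cannot alter plans;
  # v4.32.1 is an illustrative version, not verified against this repository.
  source = "github.com/aws-ia/terraform-aws-eks-blueprints//modules/kubernetes-addons/helm-addon?ref=v4.32.1"

  helm_config = merge(
    {
      # Hypothetical values; the real arguments continue unchanged below
      # this hunk in main.tf.
      name      = "prometheus-node-exporter"
      namespace = "prometheus-node-exporter"
    },
    var.helm_config # hypothetical variable, for illustration
  )
}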
