Skip to content

Commit 0eaf527

Browse files
authored
Merge pull request #14 from aws-observability/infra-dashboard-curation
Infra dashboard curation
2 parents 7aa1dcc + 0432593 commit 0eaf527

30 files changed

+19547
-30880
lines changed

examples/eks-cluster-with-vpc/main.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ module "eks_blueprints" {
5252
managed_node_groups = {
5353
mg_5 = {
5454
node_group_name = "managed-ondemand"
55-
instance_types = ["m5.large"]
55+
instance_types = ["t3.xlarge"]
5656
min_size = 2
5757
subnet_ids = module.vpc.private_subnets
5858
}

modules/workloads/infra/alerts.tf

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ groups:
211211
description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.
212212
summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.
213213
- alert: KubeletPodStartUpLatencyHigh
214-
expr: histogram_quantile(0.99, sum by(cluster, instance, le) (rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet",metrics_path="/metrics"}[5m]))) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet",metrics_path="/metrics"} > 60
214+
expr: histogram_quantile(0.99, sum by(cluster, instance, le) (rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet"}[5m]))) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet"} > 60
215215
for: 15m
216216
labels:
217217
severity: warning
@@ -263,7 +263,7 @@ groups:
263263
description: Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes).
264264
summary: Kubelet has failed to renew its server certificate.
265265
- alert: KubeletDown
266-
expr: absent(up{job="kubelet",metrics_path="/metrics"} == 1)
266+
expr: absent(up{job="kubelet"} == 1)
267267
for: 15m
268268
labels:
269269
severity: critical
@@ -350,31 +350,31 @@ groups:
350350
description: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
351351
summary: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
352352
- alert: KubePersistentVolumeFillingUp
353-
expr: (kubelet_volume_stats_available_bytes{job="kubelet",metrics_path="/metrics",namespace=~".*"} / kubelet_volume_stats_capacity_bytes{job="kubelet",metrics_path="/metrics",namespace=~".*"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet",metrics_path="/metrics",namespace=~".*"} > 0 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
353+
expr: (kubelet_volume_stats_available_bytes{job="kubelet",namespace=~".*"} / kubelet_volume_stats_capacity_bytes{job="kubelet",namespace=~".*"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet",namespace=~".*"} > 0 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
354354
for: 1m
355355
labels:
356356
severity: critical
357357
annotations:
358358
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.
359359
summary: PersistentVolume is filling up.
360360
- alert: KubePersistentVolumeFillingUp
361-
expr: (kubelet_volume_stats_available_bytes{job="kubelet",metrics_path="/metrics",namespace=~".*"} / kubelet_volume_stats_capacity_bytes{job="kubelet",metrics_path="/metrics",namespace=~".*"}) < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet",metrics_path="/metrics",namespace=~".*"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet",metrics_path="/metrics",namespace=~".*"}[6h], 4 * 24 * 3600) < 0 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
361+
expr: (kubelet_volume_stats_available_bytes{job="kubelet",namespace=~".*"} / kubelet_volume_stats_capacity_bytes{job="kubelet",namespace=~".*"}) < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet",namespace=~".*"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet",namespace=~".*"}[6h], 4 * 24 * 3600) < 0 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
362362
for: 1h
363363
labels:
364364
severity: warning
365365
annotations:
366366
description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days.
367367
summary: PersistentVolume is filling up.
368368
- alert: KubePersistentVolumeInodesFillingUp
369-
expr: (kubelet_volume_stats_inodes_free{job="kubelet",metrics_path="/metrics",namespace=~".*"} / kubelet_volume_stats_inodes{job="kubelet",metrics_path="/metrics",namespace=~".*"}) < 0.03 and kubelet_volume_stats_inodes_used{job="kubelet",metrics_path="/metrics",namespace=~".*"} > 0 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
369+
expr: (kubelet_volume_stats_inodes_free{job="kubelet",namespace=~".*"} / kubelet_volume_stats_inodes{job="kubelet",namespace=~".*"}) < 0.03 and kubelet_volume_stats_inodes_used{job="kubelet",namespace=~".*"} > 0 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
370370
for: 1m
371371
labels:
372372
severity: critical
373373
annotations:
374374
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} only has {{ $value | humanizePercentage }} free inodes.
375375
summary: PersistentVolumeInodes is filling up.
376376
- alert: KubePersistentVolumeInodesFillingUp
377-
expr: (kubelet_volume_stats_inodes_free{job="kubelet",metrics_path="/metrics",namespace=~".*"} / kubelet_volume_stats_inodes{job="kubelet",metrics_path="/metrics",namespace=~".*"}) < 0.15 and kubelet_volume_stats_inodes_used{job="kubelet",metrics_path="/metrics",namespace=~".*"} > 0 and predict_linear(kubelet_volume_stats_inodes_free{job="kubelet",metrics_path="/metrics",namespace=~".*"}[6h], 4 * 24 * 3600) < 0 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
377+
expr: (kubelet_volume_stats_inodes_free{job="kubelet",namespace=~".*"} / kubelet_volume_stats_inodes{job="kubelet",namespace=~".*"}) < 0.15 and kubelet_volume_stats_inodes_used{job="kubelet",namespace=~".*"} > 0 and predict_linear(kubelet_volume_stats_inodes_free{job="kubelet",namespace=~".*"}[6h], 4 * 24 * 3600) < 0 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
378378
for: 1h
379379
labels:
380380
severity: warning

modules/workloads/infra/dashboards.tf

Lines changed: 1 addition & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,6 @@ resource "grafana_dashboard" "workloads" {
44
config_json = file("${path.module}/dashboards/workloads.json")
55
}
66

7-
resource "grafana_dashboard" "scheduler" {
8-
count = var.enable_dashboards ? 1 : 0
9-
folder = var.dashboards_folder_id
10-
config_json = file("${path.module}/dashboards/scheduler.json")
11-
}
12-
13-
resource "grafana_dashboard" "proxy" {
14-
count = var.enable_dashboards ? 1 : 0
15-
folder = var.dashboards_folder_id
16-
config_json = file("${path.module}/dashboards/proxy.json")
17-
}
18-
197
resource "grafana_dashboard" "podnetwork" {
208
count = var.enable_dashboards ? 1 : 0
219
folder = var.dashboards_folder_id
@@ -28,36 +16,12 @@ resource "grafana_dashboard" "pods" {
2816
config_json = file("${path.module}/dashboards/pods.json")
2917
}
3018

31-
resource "grafana_dashboard" "pv" {
32-
count = var.enable_dashboards ? 1 : 0
33-
folder = var.dashboards_folder_id
34-
config_json = file("${path.module}/dashboards/pesistentvolumes.json")
35-
}
36-
3719
resource "grafana_dashboard" "nodes" {
3820
count = var.enable_dashboards ? 1 : 0
3921
folder = var.dashboards_folder_id
4022
config_json = file("${path.module}/dashboards/nodes.json")
4123
}
4224

43-
resource "grafana_dashboard" "necluster" {
44-
count = var.enable_dashboards ? 1 : 0
45-
folder = var.dashboards_folder_id
46-
config_json = file("${path.module}/dashboards/nodeexpoter-use-cluster.json")
47-
}
48-
49-
resource "grafana_dashboard" "nenodeuse" {
50-
count = var.enable_dashboards ? 1 : 0
51-
folder = var.dashboards_folder_id
52-
config_json = file("${path.module}/dashboards/nodeexporter-use-node.json")
53-
}
54-
55-
resource "grafana_dashboard" "nenode" {
56-
count = var.enable_dashboards ? 1 : 0
57-
folder = var.dashboards_folder_id
58-
config_json = file("${path.module}/dashboards/nodeexporter-nodes.json")
59-
}
60-
6125
resource "grafana_dashboard" "nwworload" {
6226
count = var.enable_dashboards ? 1 : 0
6327
folder = var.dashboards_folder_id
@@ -94,24 +58,6 @@ resource "grafana_dashboard" "kubelet" {
9458
config_json = file("${path.module}/dashboards/kubelet.json")
9559
}
9660

97-
resource "grafana_dashboard" "etcd" {
98-
count = var.enable_dashboards ? 1 : 0
99-
folder = var.dashboards_folder_id
100-
config_json = file("${path.module}/dashboards/etcd.json")
101-
}
102-
103-
resource "grafana_dashboard" "coredns" {
104-
count = var.enable_dashboards ? 1 : 0
105-
folder = var.dashboards_folder_id
106-
config_json = file("${path.module}/dashboards/coredns.json")
107-
}
108-
109-
resource "grafana_dashboard" "controller" {
110-
count = var.enable_dashboards ? 1 : 0
111-
folder = var.dashboards_folder_id
112-
config_json = file("${path.module}/dashboards/controller.json")
113-
}
114-
11561
resource "grafana_dashboard" "clusternw" {
11662
count = var.enable_dashboards ? 1 : 0
11763
folder = var.dashboards_folder_id
@@ -122,10 +68,4 @@ resource "grafana_dashboard" "cluster" {
12268
count = var.enable_dashboards ? 1 : 0
12369
folder = var.dashboards_folder_id
12470
config_json = file("${path.module}/dashboards/cluster.json")
125-
}
126-
127-
resource "grafana_dashboard" "apis" {
128-
count = var.enable_dashboards ? 1 : 0
129-
folder = var.dashboards_folder_id
130-
config_json = file("${path.module}/dashboards/apiserver.json")
131-
}
71+
}

0 commit comments

Comments (0)