Skip to content

Commit 8a1ba7a

Browse files
authored
Merge pull request #54 from worldcoin/INFRA-5561-monitor-system-workload-by-default
INFRA-5561 Monitor system workload option
2 parents 1a6489a + 77d8412 commit 8a1ba7a

File tree

2 files changed

+32
-4
lines changed

2 files changed

+32
-4
lines changed

datadog.tf

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,13 @@
1+
# Datadog filter strings shared by the monitoring module call and the OOM monitor in this file.
locals {
2+
# Cluster-wide scope: selects everything tagged kube_cluster_name:<var.cluster_name>.
all_filter_str = format("kube_cluster_name:%s", var.cluster_name)
3+
# Cluster-wide scope narrowed to a hard-coded list of system namespaces.
system_filter_str = format("%s AND kube_namespace IN (kube-system, argocd, cloudflared-tunnel, node-problem-detector, policy-reporter, karpenter, kube-ops, podsteward, cluster-autoscaler, cluster-monitoring, keda, kyverno, kyverno-policy-reporter, prometheus, teleport-agent, traefik, traefik-internal, wiz)", local.all_filter_str)
4+
5+
# Scope interpolated into the OOM monitor query: system namespaces only when
# var.monitor_system_workload_only is true, otherwise the whole cluster.
# NOTE(review): the query this replaces filtered on cluster_name:<name>, while this
# scope uses kube_cluster_name — confirm oom_kill.oom_process.count carries that tag.
oom_filter_str = var.monitor_system_workload_only ? local.system_filter_str : local.all_filter_str
6+
# Per-monitor filter overrides handed to module.datadog_monitoring. Each is null when
# the toggle is off — presumably the module then falls back to filter_str; verify
# against the module's variable defaults.
# The deployment override also appends a kube_deployment:* clause — presumably to
# restrict matches to series that have a kube_deployment tag; TODO confirm.
deployment_multiple_restarts_filter_override = var.monitor_system_workload_only ? "${local.system_filter_str} AND kube_deployment:*" : null
7+
replicaset_unavailable_filter_override = var.monitor_system_workload_only ? local.system_filter_str : null
8+
replicaset_incomplete_filter_override = var.monitor_system_workload_only ? local.system_filter_str : null
9+
}
10+
111
module "datadog_monitoring" {
212
count = var.monitoring_enabled ? 1 : 0
313

@@ -6,11 +16,18 @@ module "datadog_monitoring" {
616
notification_channel = var.monitoring_notification_channel
717
service = format("EKS %s", var.cluster_name)
818
env = var.environment
9-
filter_str = format("kube_cluster_name:%s", var.cluster_name)
19+
filter_str = local.all_filter_str
1020
additional_tags = [
11-
"CreatedBy:terraform"
21+
"CreatedBy:terraform",
22+
"service:k8s",
23+
"team:infrastructure",
24+
"env:${var.environment}",
1225
]
1326

27+
deployment_multiple_restarts_filter_override = local.deployment_multiple_restarts_filter_override
28+
replicaset_unavailable_filter_override = local.replicaset_unavailable_filter_override
29+
replicaset_incomplete_filter_override = local.replicaset_incomplete_filter_override
30+
1431
# don't alert on cpu overbooking
1532
cpu_limits_low_perc_enabled = false
1633
cpu_requests_low_perc_enabled = false
@@ -32,7 +49,7 @@ resource "datadog_monitor" "oom" {
3249

3350
name = "OOM kill detected on ${var.cluster_name}"
3451
type = "metric alert"
35-
query = "sum(last_4h):sum:oom_kill.oom_process.count{cluster_name:${var.cluster_name}} by {kube_namespace,kube_container_name}.as_count() >= 1"
52+
query = "sum(last_4h):sum:oom_kill.oom_process.count{${local.oom_filter_str}} by {kube_namespace,kube_container_name}.as_count() >= 1"
3653

3754
on_missing_data = "default"
3855
group_retention_duration = "24h"
@@ -56,7 +73,10 @@ Notify: ${var.monitoring_notification_channel}
5673
EOT
5774

5875
tags = [
59-
"CreatedBy:terraform"
76+
"CreatedBy:terraform",
77+
"service:k8s",
78+
"team:infrastructure",
79+
"env:${var.environment}",
6080
]
6181
}
6282

@@ -76,6 +96,8 @@ resource "datadog_synthetics_test" "cluster_monitoring" {
7696
tags = [
7797
"CreatedBy:terraform",
7898
"env:${var.environment}",
99+
"service:k8s",
100+
"team:infrastructure",
79101
]
80102

81103
request_definition {

variables.tf

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,12 @@ variable "monitoring_enabled" {
151151
default = true
152152
}
153153

154+
# Toggle read by the locals in datadog.tf. When true, the OOM monitor query and the
# deployment/replicaset filter overrides are scoped to the hard-coded list of system
# namespaces. When false (the default), the OOM query uses the cluster-wide filter and
# the overrides are null.
variable "monitor_system_workload_only" {
155+
description = "Monitor system workloads only."
156+
type = bool
157+
default = false
158+
}
159+
154160
variable "monitoring_external_enabled" {
155161
description = "Whether to enable external monitoring (Datadog Synthetics)."
156162
type = bool

0 commit comments

Comments
 (0)