# Datadog scope/filter strings shared by the monitors declared in this file.
# When var.monitor_system_workload_only is true, the OOM monitor and the three
# override values below are narrowed to the listed system namespaces; otherwise
# the OOM monitor uses the cluster-wide scope and the overrides are null.
1+ locals {
# Cluster-wide scope: matches on the kube_cluster_name tag for var.cluster_name.
2+ all_filter_str = format (" kube_cluster_name:%s" , var. cluster_name )
# Cluster scope further restricted to a hard-coded list of namespaces.
# NOTE(review): the namespace list is maintained by hand here — confirm it stays
# in sync with what is actually deployed on the cluster.
3+ system_filter_str = format (" %s AND kube_namespace IN (kube-system, argocd, cloudflared-tunnel, node-problem-detector, policy-reporter, karpenter, kube-ops, podsteward, cluster-autoscaler, cluster-monitoring, keda, kyverno, kyverno-policy-reporter, prometheus, teleport-agent, traefik, traefik-internal, wiz)" , local. all_filter_str )
4+
# Scope interpolated into the OOM-kill monitor query.
5+ oom_filter_str = var. monitor_system_workload_only ? local. system_filter_str : local. all_filter_str
# Per-monitor filter overrides passed to the datadog_monitoring module.
# null presumably makes the module fall back to its filter_str — TODO confirm
# against the module's variable handling.
# The deployment override additionally requires a kube_deployment tag to be present.
6+ deployment_multiple_restarts_filter_override = var. monitor_system_workload_only ? " ${ local . system_filter_str } AND kube_deployment:*" : null
7+ replicaset_unavailable_filter_override = var. monitor_system_workload_only ? local. system_filter_str : null
8+ replicaset_incomplete_filter_override = var. monitor_system_workload_only ? local. system_filter_str : null
9+ }
10+
111module "datadog_monitoring" {
212 count = var. monitoring_enabled ? 1 : 0
313
@@ -6,11 +16,18 @@ module "datadog_monitoring" {
616 notification_channel = var. monitoring_notification_channel
717 service = format (" EKS %s" , var. cluster_name )
818 env = var. environment
9- filter_str = format ( " kube_cluster_name:%s " , var . cluster_name )
19+ filter_str = local . all_filter_str
1020 additional_tags = [
11- " CreatedBy:terraform"
21+ " CreatedBy:terraform" ,
22+ " service:k8s" ,
23+ " team:infrastructure" ,
24+ " env:${ var . environment } " ,
1225 ]
1326
27+ deployment_multiple_restarts_filter_override = local. deployment_multiple_restarts_filter_override
28+ replicaset_unavailable_filter_override = local. replicaset_unavailable_filter_override
29+ replicaset_incomplete_filter_override = local. replicaset_incomplete_filter_override
30+
1431 # don't alert on cpu overbooking
1532 cpu_limits_low_perc_enabled = false
1633 cpu_requests_low_perc_enabled = false
@@ -32,7 +49,7 @@ resource "datadog_monitor" "oom" {
3249
3350 name = " OOM kill detected on ${ var . cluster_name } "
3451 type = " metric alert"
35- query = " sum(last_4h):sum:oom_kill.oom_process.count{cluster_name: ${ var . cluster_name } } by {kube_namespace,kube_container_name}.as_count() >= 1"
52+ query = " sum(last_4h):sum:oom_kill.oom_process.count{${ local . oom_filter_str } } by {kube_namespace,kube_container_name}.as_count() >= 1"
3653
3754 on_missing_data = " default"
3855 group_retention_duration = " 24h"
@@ -56,7 +73,10 @@ Notify: ${var.monitoring_notification_channel}
5673EOT
5774
5875 tags = [
59- " CreatedBy:terraform"
76+ " CreatedBy:terraform" ,
77+ " service:k8s" ,
78+ " team:infrastructure" ,
79+ " env:${ var . environment } " ,
6080 ]
6181}
6282
@@ -76,6 +96,8 @@ resource "datadog_synthetics_test" "cluster_monitoring" {
7696 tags = [
7797 " CreatedBy:terraform" ,
7898 " env:${ var . environment } " ,
99+ " service:k8s" ,
100+ " team:infrastructure" ,
79101 ]
80102
81103 request_definition {
0 commit comments