diff --git a/integrations.tf b/integrations.tf index 4cdd3a3..6de4c15 100644 --- a/integrations.tf +++ b/integrations.tf @@ -16,7 +16,12 @@ resource "datadog_integration_aws_account" "pytorch" { lambda_forwarder {} } metrics_config { - namespace_filters {} + namespace_filters { + exclude_only = [ + "AWS/SQS", + "AWS/ElasticMapReduce", + ] + } } resources_config {} traces_config { diff --git a/monitors.tf b/monitors.tf index 3837027..5ae8a1b 100644 --- a/monitors.tf +++ b/monitors.tf @@ -28,33 +28,30 @@ resource "datadog_monitor" "ci_retry_deadletter" { resource "datadog_monitor" "all_queues_anomaly" { - name = "Queue **{{queuename.name}}** has a high number of visible messages" - message = <<-MSG - The number of visible messages in `{{queuename.name}}` is outside of the typical range. - MSG - priority = 5 - - type = "query alert" - query = <<-QUERY - avg(last_1w): - anomalies( - avg:aws.sqs.approximate_number_of_messages_visible{project:pytorch/pytorch} by {queuename,region}, - 'basic', 2, direction='both', interval=3600, alert_window='last_1d', count_default_zero='true' - ) >= 1 - QUERY - - include_tags = true - on_missing_data = "default" + evaluation_delay = 900 require_full_window = false - + monitor_thresholds { + critical = 1 + critical_recovery = 0 + warning = 0.9 + } monitor_threshold_windows { recovery_window = "last_15m" trigger_window = "last_1d" } - - monitor_thresholds { - critical = "1" - critical_recovery = "0" - warning = "0.9" - } + name = "Queue **{{queuename.name}}** has a high number of visible messages" + type = "query alert" + priority = 5 + query = <= 1 +EOT + message = <