Skip to content

Commit a9c5253

Browse files
authored
VED-973: Slack Delta Alerts (#1075)
* setup delta error alarm and metrics
* change account level terraform
1 parent 4132301 commit a9c5253

File tree

9 files changed

+67
-36
lines changed

9 files changed

+67
-36
lines changed

infrastructure/account/batch_processor_errors_slack_chatbot.tf

Lines changed: 0 additions & 24 deletions
This file was deleted.
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
resource "aws_chatbot_slack_channel_configuration" "imms_system_alert_errors" {
2+
configuration_name = "${var.environment}-imms-system-alert-errors-slack-channel-config"
3+
iam_role_arn = aws_iam_role.imms_system_alert_errors_chatbot.arn
4+
slack_channel_id = var.environment == "prod" ? "C09EA0HE202" : "C09E48NDP18"
5+
slack_team_id = "TJ00QR03U"
6+
sns_topic_arns = [aws_sns_topic.imms_system_alert_errors.arn]
7+
}
8+
9+
resource "aws_iam_role" "imms_system_alert_errors_chatbot" {
10+
name = "${var.environment}-imms-system-alert-errors-chatbot-channel-role"
11+
assume_role_policy = jsonencode({
12+
Version = "2012-10-17"
13+
Statement = [
14+
{
15+
Action = "sts:AssumeRole"
16+
Effect = "Allow"
17+
Sid = "AssumeChatbotRole"
18+
Principal = {
19+
Service = "chatbot.amazonaws.com"
20+
}
21+
},
22+
]
23+
})
24+
}

infrastructure/account/batch_processor_errors_sns_topic.tf renamed to infrastructure/account/imms_alerts_errors_sns_topic.tf

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1-
resource "aws_sns_topic" "batch_processor_errors" {
2-
name = "${var.environment}-batch-processor-errors"
1+
resource "aws_sns_topic" "imms_system_alert_errors" {
2+
name = "${var.environment}-imms-system-alert-errors"
33
kms_master_key_id = aws_kms_key.error_alerts_sns_encryption_key.arn
44
}
55

6-
resource "aws_sns_topic_policy" "batch_processor_errors_topic_policy" {
7-
arn = aws_sns_topic.batch_processor_errors.arn
6+
resource "aws_sns_topic_policy" "imms_system_alert_errors_topic_policy" {
7+
arn = aws_sns_topic.imms_system_alert_errors.arn
88
policy = jsonencode({
99
Version = "2012-10-17",
1010
Statement = [
@@ -15,7 +15,7 @@ resource "aws_sns_topic_policy" "batch_processor_errors_topic_policy" {
1515
Service = "cloudwatch.amazonaws.com"
1616
},
1717
Action = "SNS:Publish",
18-
Resource = aws_sns_topic.batch_processor_errors.arn
18+
Resource = aws_sns_topic.imms_system_alert_errors.arn
1919
}
2020
]
2121
})

infrastructure/account/kms.tf

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -216,8 +216,8 @@ resource "aws_kms_key" "error_alerts_sns_encryption_key" {
216216
})
217217
}
218218

219-
resource "aws_kms_alias" "batch_processor_errors_sns_encryption_key" {
220-
name = "alias/${var.environment}-batch-processor-errors-imms-sns-encryption"
219+
resource "aws_kms_alias" "imms_system_alert_errors_sns_encryption_key" {
220+
name = "alias/${var.environment}-imms-alert-errors-sns-encryption"
221221
target_key_id = aws_kms_key.error_alerts_sns_encryption_key.key_id
222222
}
223223

infrastructure/instance/batch_processor_filter_lambda.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,6 @@ resource "aws_cloudwatch_metric_alarm" "batch_processor_filter_error_alarm" {
331331
statistic = "Sum"
332332
threshold = 1
333333
alarm_description = "This sets off an alarm for any error logs found in the batch processor filter Lambda function"
334-
alarm_actions = [data.aws_sns_topic.batch_processor_errors.arn]
334+
alarm_actions = [data.aws_sns_topic.imms_system_alert_errors.arn]
335335
treat_missing_data = "notBreaching"
336336
}

infrastructure/instance/delta.tf

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,3 +178,34 @@ resource "aws_cloudwatch_log_group" "delta_lambda" {
178178
name = "/aws/lambda/${local.short_prefix}-${local.function_name}"
179179
retention_in_days = 30
180180
}
181+
182+
183+
resource "aws_cloudwatch_log_metric_filter" "delta_error_logs" {
184+
count = var.error_alarm_notifications_enabled ? 1 : 0
185+
186+
name = "${local.short_prefix}-DeltaErrorLogsFilter"
187+
pattern = "%\\[ERROR\\]%"
188+
log_group_name = aws_cloudwatch_log_group.delta_lambda.name
189+
190+
metric_transformation {
191+
name = "${local.short_prefix}-DeltaErrorLogs"
192+
namespace = "${local.short_prefix}-DeltaLambda"
193+
value = "1"
194+
}
195+
}
196+
197+
resource "aws_cloudwatch_metric_alarm" "delta_error_alarm" {
198+
count = var.error_alarm_notifications_enabled ? 1 : 0
199+
200+
alarm_name = "${local.short_prefix}-delta-lambda-error"
201+
comparison_operator = "GreaterThanOrEqualToThreshold"
202+
evaluation_periods = 1
203+
metric_name = "${local.short_prefix}-DeltaErrorLogs"
204+
namespace = "${local.short_prefix}-DeltaLambda"
205+
period = 120
206+
statistic = "Sum"
207+
threshold = 1
208+
alarm_description = "This sets off an alarm for any error logs found in the delta Lambda function"
209+
alarm_actions = [data.aws_sns_topic.imms_system_alert_errors.arn]
210+
treat_missing_data = "notBreaching"
211+
}

infrastructure/instance/ecs_batch_processor_config.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,6 @@ resource "aws_cloudwatch_metric_alarm" "record_processor_task_error_alarm" {
397397
statistic = "Sum"
398398
threshold = 1
399399
alarm_description = "This sets off an alarm for any error logs found in the record processor ECS task"
400-
alarm_actions = [data.aws_sns_topic.batch_processor_errors.arn]
400+
alarm_actions = [data.aws_sns_topic.imms_system_alert_errors.arn]
401401
treat_missing_data = "notBreaching"
402402
}

infrastructure/instance/file_name_processor.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -401,6 +401,6 @@ resource "aws_cloudwatch_metric_alarm" "file_name_processor_error_alarm" {
401401
statistic = "Sum"
402402
threshold = 1
403403
alarm_description = "This sets off an alarm for any error logs found in the file name processor Lambda function"
404-
alarm_actions = [data.aws_sns_topic.batch_processor_errors.arn]
404+
alarm_actions = [data.aws_sns_topic.imms_system_alert_errors.arn]
405405
treat_missing_data = "notBreaching"
406406
}

infrastructure/instance/main.tf

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,6 @@ data "aws_route53_zone" "project_zone" {
108108
name = local.project_domain_name
109109
}
110110

111-
data "aws_sns_topic" "batch_processor_errors" {
112-
name = "${var.environment}-batch-processor-errors"
111+
data "aws_sns_topic" "imms_system_alert_errors" {
112+
name = "${var.environment}-imms-system-alert-errors"
113113
}

0 commit comments

Comments
 (0)