From 97208e16794d4d499b40cf83f3a1d96a2d7de080 Mon Sep 17 00:00:00 2001 From: Ayush Date: Thu, 26 Sep 2024 11:51:55 +0530 Subject: [PATCH 1/8] add ec2 alarms --- .gitignore | 3 ++ README.md | 1 - custom.tf | 96 ++++++++++++++++++++++++++++++++++++++++++++++++ main.tf | 10 ++++- variables.tf | 6 +-- wrappers/main.tf | 1 - 6 files changed, 108 insertions(+), 9 deletions(-) create mode 100644 custom.tf diff --git a/.gitignore b/.gitignore index 397af322..58ccfdb1 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,6 @@ override.tf.json # Ignore CLI configuration files .terraformrc terraform.rc + +.idea/ +.vscode/ diff --git a/README.md b/README.md index d9f0fa28..b2eb364e 100644 --- a/README.md +++ b/README.md @@ -243,7 +243,6 @@ No modules. | [placement\_group](#input\_placement\_group) | The Placement Group to start the instance in | `string` | `null` | no | | [private\_dns\_name\_options](#input\_private\_dns\_name\_options) | Customize the private DNS name options of the instance | `map(string)` | `{}` | no | | [private\_ip](#input\_private\_ip) | Private IP address to associate with the instance in a VPC | `string` | `null` | no | -| [putin\_khuylo](#input\_putin\_khuylo) | Do you agree that Putin doesn't respect Ukrainian sovereignty and territorial integrity? More info: https://en.wikipedia.org/wiki/Putin_khuylo! | `bool` | `true` | no | | [root\_block\_device](#input\_root\_block\_device) | Customize details about the root block device of the instance. See Block Devices below for details | `list(any)` | `[]` | no | | [secondary\_private\_ips](#input\_secondary\_private\_ips) | A list of secondary private IPv4 addresses to assign to the instance's primary network interface (eth0) in a VPC. Can only be assigned to the primary network interface (eth0) attached at instance creation, not a pre-existing network interface i.e. 
referenced in a `network_interface block` | `list(string)` | `null` | no | | [source\_dest\_check](#input\_source\_dest\_check) | Controls if traffic is routed to the instance when the destination address does not match the instance. Used for NAT or VPNs | `bool` | `null` | no | diff --git a/custom.tf b/custom.tf new file mode 100644 index 00000000..20fcab86 --- /dev/null +++ b/custom.tf @@ -0,0 +1,96 @@ +variable "alarm_info_sns_topic_arn" { + type = string + description = "The ARN of the SNS topic to notify when on info alerts" +} + +variable "alarm_sns_topic_arn" { + type = string + description = "The ARN of the SNS topic to notify when on critical alerts" +} + +variable "environment" { + type = string + description = "The environment tag to apply to all resources. eg: production, testing, staging, etc" + + validation { + condition = var.environment == null || can(regex("^(production|testing|staging|development)$", var.environment)) + error_message = "environment must be amongst production, testing, staging, development." + } +} + +variable "service" { + type = string + description = "Service hosted on this instance. eg: squadstack, metabase, grafana, etc" + + validation { + condition = var.service == null || can(regex("^[a-z-]+$", var.service)) + error_message = "service must be lowercase alphabets with hyphens only." + } +} + +variable "service_component" { + type = string + default = null + description = "[optional] Service Group within a service. eg: app, api, celery, etc" + + validation { + condition = var.service_component == null || can(regex("^[a-z0-9-]+$", var.service_component)) + error_message = "If provided, service_component must be lowercase alphanumeric with hyphens only." + } +} + +variable "owner_team" { + type = string + description = "Owner of this ec2. 
eg: platform, supply, demand, ds, etc" + + validation { + condition = var.owner_team == null || can(regex("^(platform|supply|demand|integrations|data-science)$", var.owner_team)) + error_message = "owner_team must be amongst platform, supply, demand, integrations, data-science." + } +} + +resource "aws_cloudwatch_metric_alarm" "ec2_cpuutilization_alert_info" { + alarm_name = "${var.name}_high_cpu_alert" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "5" + datapoints_to_alarm = "4" + treat_missing_data = "missing" + metric_name = "CPUUtilization" + namespace = "AWS/EC2" + period = "120" + statistic = "Average" + threshold = "85" + alarm_description = "This metric monitors ec2 CPU Utilization" + alarm_actions = [var.alarm_info_sns_topic_arn] + unit = "Percent" + dimensions = { + InstanceId = try( + aws_instance.this[0].id, + aws_instance.ignore_ami[0].id, + ) + } +} + + + +resource "aws_cloudwatch_metric_alarm" "ec2_cpuutilization_alert_warning" { + alarm_name = "${var.name}_critical_cpu_alert" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "15" + datapoints_to_alarm = "12" + treat_missing_data = "breaching" + metric_name = "CPUUtilization" + namespace = "AWS/EC2" + period = "120" + statistic = "Maximum" + threshold = "95" + alarm_description = "This metric monitors ec2 CPU Utilization" + alarm_actions = [var.alarm_sns_topic_arn] + unit = "Percent" + dimensions = { + InstanceId = try( + aws_instance.this[0].id, + aws_instance.ignore_ami[0].id, + ) + } +} diff --git a/main.tf b/main.tf index 749e67d4..eb0d8369 100644 --- a/main.tf +++ b/main.tf @@ -1,7 +1,7 @@ data "aws_partition" "current" {} locals { - create = var.create && var.putin_khuylo + create = var.create is_t_instance_type = replace(var.instance_type, "/^t(2|3|3a|4g){1}\\..*$/", "1") == "1" ? 
true : false @@ -188,7 +188,13 @@ resource "aws_instance" "this" { delete = try(var.timeouts.delete, null) } - tags = merge({ "Name" = var.name }, var.instance_tags, var.tags) + tags = merge({ + "Name" = var.name, + "Environment" = var.environment, + "Service" = var.service, + "ServiceComponent" = var.service_component, + "OwnerTeam" = var.owner_team + }, var.instance_tags, var.tags) volume_tags = var.enable_volume_tags ? merge({ "Name" = var.name }, var.volume_tags) : null } diff --git a/variables.tf b/variables.tf index 38a1b5b2..d8df78ca 100644 --- a/variables.tf +++ b/variables.tf @@ -351,11 +351,7 @@ variable "disable_api_stop" { default = null } -variable "putin_khuylo" { - description = "Do you agree that Putin doesn't respect Ukrainian sovereignty and territorial integrity? More info: https://en.wikipedia.org/wiki/Putin_khuylo!" - type = bool - default = true -} + ################################################################################ # IAM Role / Instance Profile diff --git a/wrappers/main.tf b/wrappers/main.tf index 9ba0cdb4..c6b517dd 100644 --- a/wrappers/main.tf +++ b/wrappers/main.tf @@ -56,7 +56,6 @@ module "wrapper" { placement_group = try(each.value.placement_group, var.defaults.placement_group, null) private_dns_name_options = try(each.value.private_dns_name_options, var.defaults.private_dns_name_options, {}) private_ip = try(each.value.private_ip, var.defaults.private_ip, null) - putin_khuylo = try(each.value.putin_khuylo, var.defaults.putin_khuylo, true) root_block_device = try(each.value.root_block_device, var.defaults.root_block_device, []) secondary_private_ips = try(each.value.secondary_private_ips, var.defaults.secondary_private_ips, null) source_dest_check = try(each.value.source_dest_check, var.defaults.source_dest_check, null) From f3f920ca763811cb2c263db099f7d99555822fdf Mon Sep 17 00:00:00 2001 From: Ayush Date: Thu, 24 Oct 2024 01:25:52 +0530 Subject: [PATCH 2/8] add disaster recovery tags --- custom.tf | 13 +++++++++++++ 1 
file changed, 13 insertions(+) diff --git a/custom.tf b/custom.tf index 20fcab86..6da50ebb 100644 --- a/custom.tf +++ b/custom.tf @@ -39,6 +39,19 @@ variable "service_component" { } } +variable "backup_for_disaster_recovery" { + type = bool + default = false + description = "If we need to keep backup of this instance's ami in other region" +} + +variable "backup_frequency_days" { + type = number + default = 7 + description = "The number of days after which the backup should be taken of the ami" +} + + variable "owner_team" { type = string description = "Owner of this ec2. eg: platform, supply, demand, ds, etc" From 41ea7b5a3c9299fcabae10fd98e4da6abf6a9a4c Mon Sep 17 00:00:00 2001 From: Ayush Date: Thu, 24 Oct 2024 01:38:19 +0530 Subject: [PATCH 3/8] add ssm access tag --- custom.tf | 5 +++++ main.tf | 5 ++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/custom.tf b/custom.tf index 6da50ebb..f1d9cb35 100644 --- a/custom.tf +++ b/custom.tf @@ -51,6 +51,11 @@ variable "backup_frequency_days" { description = "The number of days after which the backup should be taken of the ami" } +variable "ssm_access_type" { + type = string + default = "" + description = "ssm access type" +} variable "owner_team" { type = string diff --git a/main.tf b/main.tf index eb0d8369..df0f1884 100644 --- a/main.tf +++ b/main.tf @@ -193,7 +193,10 @@ resource "aws_instance" "this" { "Environment" = var.environment, "Service" = var.service, "ServiceComponent" = var.service_component, - "OwnerTeam" = var.owner_team + "OwnerTeam" = var.owner_team, + env_type_ssm = var.ssm_access_type, + disaster-recovery-backup = var.backup_for_disaster_recovery, + backup-frequency = var.backup_frequency_days }, var.instance_tags, var.tags) volume_tags = var.enable_volume_tags ? 
merge({ "Name" = var.name }, var.volume_tags) : null } From 4e2a344f3f2a8b46ede260f23a9f1c563ab9a053 Mon Sep 17 00:00:00 2001 From: Ayush Date: Thu, 24 Oct 2024 19:39:46 +0530 Subject: [PATCH 4/8] ignore some things --- custom.tf | 5 +++++ main.tf | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/custom.tf b/custom.tf index f1d9cb35..785f2ffe 100644 --- a/custom.tf +++ b/custom.tf @@ -8,6 +8,11 @@ variable "alarm_sns_topic_arn" { description = "The ARN of the SNS topic to notify when on critical alerts" } +variable "ignore_in_lifecycle" { + default = [] + description = "ignore instance type changes" +} + variable "environment" { type = string description = "The environment tag to apply to all resources. eg: production, testing, staging, etc" diff --git a/main.tf b/main.tf index df0f1884..765783dc 100644 --- a/main.tf +++ b/main.tf @@ -188,6 +188,10 @@ resource "aws_instance" "this" { delete = try(var.timeouts.delete, null) } + lifecycle { + ignore_changes = var.ignore_in_lifecycle + } + tags = merge({ "Name" = var.name, "Environment" = var.environment, From 0df6efe25df5b6aacbb15013716b614a56a58eda Mon Sep 17 00:00:00 2001 From: Ayush Date: Thu, 24 Oct 2024 19:49:45 +0530 Subject: [PATCH 5/8] ignore instance_type & key --- custom.tf | 5 ----- main.tf | 5 ++++- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/custom.tf b/custom.tf index 785f2ffe..f1d9cb35 100644 --- a/custom.tf +++ b/custom.tf @@ -8,11 +8,6 @@ variable "alarm_sns_topic_arn" { description = "The ARN of the SNS topic to notify when on critical alerts" } -variable "ignore_in_lifecycle" { - default = [] - description = "ignore instance type changes" -} - variable "environment" { type = string description = "The environment tag to apply to all resources. 
eg: production, testing, staging, etc" diff --git a/main.tf b/main.tf index 765783dc..c651f8c2 100644 --- a/main.tf +++ b/main.tf @@ -189,7 +189,10 @@ resource "aws_instance" "this" { } lifecycle { - ignore_changes = var.ignore_in_lifecycle + ignore_changes = [ + key_name, + instance_type, + ] } tags = merge({ From 3df407ba8ce8839ec50658b68cdb414816d5baab Mon Sep 17 00:00:00 2001 From: Ayush Date: Fri, 25 Oct 2024 11:54:01 +0530 Subject: [PATCH 6/8] require imdsv2 tokens --- variables.tf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/variables.tf b/variables.tf index d8df78ca..a95dbd81 100644 --- a/variables.tf +++ b/variables.tf @@ -160,7 +160,8 @@ variable "metadata_options" { default = { "http_endpoint" = "enabled" "http_put_response_hop_limit" = 1 - "http_tokens" = "optional" + "http_tokens" = "required" + "instance_metadata_tags" = "disabled" } } From 05b35d303f37e9ea815456154af14253709201d3 Mon Sep 17 00:00:00 2001 From: sandal jain Date: Fri, 14 Mar 2025 01:45:15 +0530 Subject: [PATCH 7/8] fixing alarms --- custom.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/custom.tf b/custom.tf index f1d9cb35..9c56a174 100644 --- a/custom.tf +++ b/custom.tf @@ -72,7 +72,7 @@ resource "aws_cloudwatch_metric_alarm" "ec2_cpuutilization_alert_info" { comparison_operator = "GreaterThanOrEqualToThreshold" evaluation_periods = "5" datapoints_to_alarm = "4" - treat_missing_data = "missing" + treat_missing_data = "ignore" metric_name = "CPUUtilization" namespace = "AWS/EC2" period = "120" @@ -96,7 +96,7 @@ resource "aws_cloudwatch_metric_alarm" "ec2_cpuutilization_alert_warning" { comparison_operator = "GreaterThanOrEqualToThreshold" evaluation_periods = "15" datapoints_to_alarm = "12" - treat_missing_data = "breaching" + treat_missing_data = "ignore" metric_name = "CPUUtilization" namespace = "AWS/EC2" period = "120" From d04c6ae9e2438c0d96e3ef81debf2e505ee724dc Mon Sep 17 00:00:00 2001 From: sandal jain Date: Thu, 20 Mar 2025 
11:46:26 +0530 Subject: [PATCH 8/8] adding alarms for RAM & disk --- custom.tf | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/custom.tf b/custom.tf index 9c56a174..52df05cd 100644 --- a/custom.tf +++ b/custom.tf @@ -112,3 +112,95 @@ resource "aws_cloudwatch_metric_alarm" "ec2_cpuutilization_alert_warning" { ) } } + +resource "aws_cloudwatch_metric_alarm" "ec2_memory_utilization_alert_info" { + alarm_name = "${var.name}_high_memory_alert" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "5" + datapoints_to_alarm = "4" + treat_missing_data = "ignore" + metric_name = "MemoryUtilization" + namespace = "CWAgent" + period = "120" + statistic = "Average" + threshold = "85" + alarm_description = "This metric monitors EC2 memory utilization" + alarm_actions = [var.alarm_info_sns_topic_arn] + unit = "Percent" + dimensions = { + InstanceId = try( + aws_instance.this[0].id, + aws_instance.ignore_ami[0].id + ) + } +} + +resource "aws_cloudwatch_metric_alarm" "ec2_memory_utilization_alert_warning" { + alarm_name = "${var.name}_critical_memory_alert" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "15" + datapoints_to_alarm = "12" + treat_missing_data = "ignore" + metric_name = "MemoryUtilization" + namespace = "CWAgent" + period = "120" + statistic = "Maximum" + threshold = "95" + alarm_description = "This metric monitors EC2 memory utilization" + alarm_actions = [var.alarm_sns_topic_arn] + unit = "Percent" + dimensions = { + InstanceId = try( + aws_instance.this[0].id, + aws_instance.ignore_ami[0].id, + ) + } +} + +resource "aws_cloudwatch_metric_alarm" "ec2_disk_utilization_alert_info" { + alarm_name = "${var.name}_high_disk_alert" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "5" + datapoints_to_alarm = "4" + treat_missing_data = "ignore" + metric_name = "DiskSpaceUtilization" + namespace = "CWAgent" + period = "120" + 
statistic = "Average" + threshold = "85" + alarm_description = "This metric monitors EC2 disk space utilization" + alarm_actions = [var.alarm_info_sns_topic_arn] + unit = "Percent" + dimensions = { + InstanceId = try( + aws_instance.this[0].id, + aws_instance.ignore_ami[0].id, + ) + MountPath = "/" + Filesystem = "ext4" /* NOTE(review): "/" and "ext4" are hard-coded; presumably the alarm only sees data when the published metric carries exactly these dimension values - confirm for instances whose root filesystem is not ext4 */ + } +} + +resource "aws_cloudwatch_metric_alarm" "ec2_disk_utilization_alert_warning" { /* Critical disk alarm: Maximum of DiskSpaceUtilization >= 95 with datapoints_to_alarm = 12 out of evaluation_periods = 15 (period = 120); notifies var.alarm_sns_topic_arn */ + alarm_name = "${var.name}_critical_disk_alert" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "15" + datapoints_to_alarm = "12" + treat_missing_data = "ignore" + metric_name = "DiskSpaceUtilization" /* NOTE(review): confirm the agent on these hosts really publishes this metric name with MountPath/Filesystem dimensions under "CWAgent"; with treat_missing_data = "ignore" a mismatch would presumably go unnoticed - TODO verify */ + namespace = "CWAgent" + period = "120" + statistic = "Maximum" + threshold = "95" + alarm_description = "This metric monitors EC2 disk space utilization" + alarm_actions = [var.alarm_sns_topic_arn] + unit = "Percent" + dimensions = { + InstanceId = try( + aws_instance.this[0].id, + aws_instance.ignore_ami[0].id, + ) + MountPath = "/" + Filesystem = "ext4" /* NOTE(review): same hard-coded root mount and filesystem type as the info alarm - confirm */ + } +}