Skip to content

Commit 3a9e082

Browse files
committed
Add CloudWatch monitoring
We need to know when our Archivematica instances go down, so this commit adds CloudWatch alarms that fire when the number of running pods in an Archivematica namespace drops below the expected count.
1 parent 1c84fa5 commit 3a9e082

23 files changed

+318
-61
lines changed

archivematica/prod_cluster/--addon-name

Whitespace-only changes.
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# SNS topic that receives notifications for the prod Archivematica alarm.
# It is the target of both alarm_actions and ok_actions on
# aws_cloudwatch_metric_alarm.prod_archivematica_pods.
resource "aws_sns_topic" "prod_archivematica_alarm_topic" {
2+
name = "prod-archivematica-alarm"
3+
}
4+
5+
# Subscribes var.alert_email to the prod alarm topic using the "email" protocol,
# so anything published to the topic is delivered to that address.
# NOTE(review): email subscriptions presumably stay pending until the recipient
# confirms them -- verify the mailbox owner accepts the confirmation after apply.
resource "aws_sns_topic_subscription" "prod_archivematica_alarm_subscription" {
6+
topic_arn = aws_sns_topic.prod_archivematica_alarm_topic.arn
7+
protocol = "email"
8+
endpoint = var.alert_email
9+
}
10+
11+
# Alarm on the number of running pods in the "archivematica-prod" namespace.
#
# Watches the ContainerInsights metric namespace_number_of_running_pods for the
# cluster local.cluster_name and goes into ALARM when the 60-second Average is
# below the threshold for 2 evaluation periods. Both the ALARM and the return
# to OK transitions publish to the prod alarm SNS topic.
resource "aws_cloudwatch_metric_alarm" "prod_archivematica_pods" {
12+
alarm_name = "prod-archivematica-running-pods"
13+
alarm_description = "Alert if the archivematica-prod namespace has fewer running pods than expected"
14+
15+
namespace = "ContainerInsights"
16+
metric_name = "namespace_number_of_running_pods"
17+
statistic = "Average"
18+
period = 60
19+
evaluation_periods = 2
20+
comparison_operator = "LessThanThreshold"
21+
# 7 equals the sum of the replicas visible in this commit's prod deployments
# (archivematica 1 + mcp-client 4 + gearman 1 + redis 1).
# NOTE(review): this is hard-coded; update it whenever a replica count changes
# or another workload is added to the namespace.
threshold = 7
22+
23+
dimensions = {
24+
ClusterName = local.cluster_name
25+
# Must match the name of kubernetes_namespace.archivematica_prod.
Namespace = "archivematica-prod"
26+
}
27+
28+
# Missing datapoints count as breaching, so the alarm also fires when the
# metric stops being reported at all.
treat_missing_data = "breaching"
29+
30+
alarm_actions = [aws_sns_topic.prod_archivematica_alarm_topic.arn]
31+
ok_actions = [aws_sns_topic.prod_archivematica_alarm_topic.arn]
32+
}

archivematica/prod_cluster/archivematica_deployment.tf

Lines changed: 30 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,16 @@
1+
# Dedicated namespace for the prod Archivematica workloads. Other resources in
# this module set `namespace = kubernetes_namespace.archivematica_prod.metadata[0].name`
# so they are created inside it, and the CloudWatch pod-count alarm filters on
# the same name ("archivematica-prod") through its Namespace dimension.
# NOTE(review): adding a namespace to resources that already exist presumably
# forces Terraform to replace them -- check the plan before applying,
# especially for the persistent volume claims.
resource "kubernetes_namespace" "archivematica_prod" {
2+
metadata {
3+
name = "archivematica-prod"
4+
}
5+
}
6+
17
data "kubernetes_resource" "archivematica_prod" {
28
kind = "Deployment"
39
api_version = "apps/v1"
4-
metadata { name = "archivematica-prod" }
10+
metadata {
11+
name = "archivematica-prod"
12+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
13+
}
514
}
615

716
resource "kubernetes_deployment" "archivematica_prod" {
@@ -11,6 +20,7 @@ resource "kubernetes_deployment" "archivematica_prod" {
1120
App = "archivematica-prod"
1221
Environment = "prod"
1322
}
23+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
1424
}
1525
spec {
1626
replicas = 1
@@ -565,7 +575,10 @@ resource "kubernetes_deployment" "archivematica_prod" {
565575
data "kubernetes_resource" "mcp_client_prod" {
566576
kind = "Deployment"
567577
api_version = "apps/v1"
568-
metadata { name = "archivematica-mcp-client-prod" }
578+
metadata {
579+
name = "archivematica-mcp-client-prod"
580+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
581+
}
569582
}
570583

571584
resource "kubernetes_deployment" "mcp_client_prod" {
@@ -575,6 +588,7 @@ resource "kubernetes_deployment" "mcp_client_prod" {
575588
App = "archivematica-prod"
576589
Environment = "prod"
577590
}
591+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
578592
}
579593
spec {
580594
replicas = 4
@@ -701,7 +715,8 @@ resource "kubernetes_deployment" "mcp_client_prod" {
701715

702716
resource "kubernetes_service" "archivematica_dashboard_service_prod" {
703717
metadata {
704-
name = "archivematica-dashboard-prod"
718+
name = "archivematica-dashboard-prod"
719+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
705720
}
706721
spec {
707722
type = "ClusterIP"
@@ -717,7 +732,8 @@ resource "kubernetes_service" "archivematica_dashboard_service_prod" {
717732

718733
resource "kubernetes_service" "archivematica_storage_service_prod" {
719734
metadata {
720-
name = "archivematica-storage-prod"
735+
name = "archivematica-storage-prod"
736+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
721737
}
722738
spec {
723739
type = "ClusterIP"
@@ -733,7 +749,8 @@ resource "kubernetes_service" "archivematica_storage_service_prod" {
733749

734750
resource "kubernetes_persistent_volume_claim" "archivematica_prod_pipeline_data_pvc" {
735751
metadata {
736-
name = "prod-pipeline-data"
752+
name = "prod-pipeline-data"
753+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
737754
}
738755
spec {
739756
access_modes = ["ReadWriteOnce"]
@@ -749,7 +766,8 @@ resource "kubernetes_persistent_volume_claim" "archivematica_prod_pipeline_data_
749766

750767
resource "kubernetes_persistent_volume_claim" "archivematica_prod_staging_data_pvc" {
751768
metadata {
752-
name = "prod-staging-data"
769+
name = "prod-staging-data"
770+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
753771
}
754772
spec {
755773
access_modes = ["ReadWriteOnce"]
@@ -765,7 +783,8 @@ resource "kubernetes_persistent_volume_claim" "archivematica_prod_staging_data_p
765783

766784
resource "kubernetes_persistent_volume_claim" "archivematica_prod_location_data_pvc" {
767785
metadata {
768-
name = "prod-location-data"
786+
name = "prod-location-data"
787+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
769788
}
770789
spec {
771790
access_modes = ["ReadWriteOnce"]
@@ -781,7 +800,8 @@ resource "kubernetes_persistent_volume_claim" "archivematica_prod_location_data_
781800

782801
resource "kubernetes_persistent_volume_claim" "archivematica_prod_transfer_share_pvc" {
783802
metadata {
784-
name = "prod-transfer-share"
803+
name = "prod-transfer-share"
804+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
785805
}
786806
spec {
787807
access_modes = ["ReadWriteOnce"]
@@ -797,7 +817,8 @@ resource "kubernetes_persistent_volume_claim" "archivematica_prod_transfer_share
797817

798818
resource "kubernetes_persistent_volume_claim" "archivematica_prod_storage_share_pvc" {
799819
metadata {
800-
name = "prod-storage-share"
820+
name = "prod-storage-share"
821+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
801822
}
802823
spec {
803824
access_modes = ["ReadWriteOnce"]

archivematica/prod_cluster/eks-cluster.tf

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,12 @@ module "eks" {
4343
service_account_role_arn = module.ebs_csi_irsa.iam_role_arn
4444
resolve_conflicts = "OVERWRITE"
4545
}
46+
amazon-cloudwatch-observability = {
47+
most_recent = true
48+
resolve_conflicts_on_create = "OVERWRITE"
49+
resolve_conflicts = "OVERWRITE"
50+
service_account_role_arn = module.cloudwatch_observability_irsa.iam_role_arn
51+
}
4652
}
4753

4854
eks_managed_node_groups = {
@@ -73,20 +79,42 @@ module "eks" {
7379
}
7480

7581
module "ebs_csi_irsa" {
76-
source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
82+
source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
7783
version = "5.60.0"
7884

79-
role_name_prefix = "${local.cluster_name}-ebs-csi-"
85+
role_name_prefix = "${local.cluster_name}-ebs-csi-"
8086
attach_ebs_csi_policy = true
8187

8288
oidc_providers = {
8389
main = {
84-
provider_arn = module.eks.oidc_provider_arn
90+
provider_arn = module.eks.oidc_provider_arn
8591
namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"]
8692
}
8793
}
8894
}
8995

96+
// IAM role for the CloudWatch observability service accounts (IRSA).
//
// Uses the iam-role-for-service-accounts-eks submodule (pinned to 5.60.0) with
// the CloudWatch observability policy attached. The resulting iam_role_arn is
// passed to the amazon-cloudwatch-observability cluster addon as its
// service_account_role_arn.
module "cloudwatch_observability_irsa" {
97+
source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
98+
version = "5.60.0"
99+
100+
// We normally avoid abbreviations, but this prefix has a length limit that we
101+
// would exceed if we spelled out "observability", hence "o11y".
102+
role_name_prefix = "${local.cluster_name}-o11y-"
103+
104+
attach_cloudwatch_observability_policy = true
105+
106+
oidc_providers = {
107+
main = {
108+
provider_arn = module.eks.oidc_provider_arn
109+
110+
// Entries appear to be "<namespace>:<service account>" pairs that may assume
// the role -- TODO confirm these match the accounts the addon creates.
namespace_service_accounts = [
111+
"amazon-cloudwatch:cloudwatch-agent",
112+
"amazon-cloudwatch:adot-collector",
113+
]
114+
}
115+
}
116+
}
117+
90118
resource "kubernetes_cluster_role_binding" "eks_admins_cluster_admin" {
91119
metadata {
92120
name = "eks-admins-cluster-admin"

archivematica/prod_cluster/gearman_deployment.tf

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ resource "kubernetes_deployment" "archivematica_gearman_prod" {
55
App = "archivematica-prod"
66
Environment = "prod"
77
}
8+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
89
}
910
spec {
1011
replicas = 1
@@ -45,7 +46,8 @@ resource "kubernetes_deployment" "archivematica_gearman_prod" {
4546

4647
resource "kubernetes_service" "archivematica_gearman_prod" {
4748
metadata {
48-
name = "archivematica-gearman-prod"
49+
name = "archivematica-gearman-prod"
50+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
4951
}
5052
spec {
5153
selector = {

archivematica/prod_cluster/ingress.tf

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
resource "kubernetes_ingress_v1" "archivematica_dashboard_ingress_prod" {
22
metadata {
3-
name = "archivematica-dashboard-ingress-dev"
3+
name = "archivematica-dashboard-ingress-prod"
44
annotations = {
55
"alb.ingress.kubernetes.io/scheme" = "internet-facing"
66
"alb.ingress.kubernetes.io/subnets" = "${var.subnet_ids[0]},${var.subnet_ids[1]},${var.subnet_ids[2]}"
@@ -11,6 +11,7 @@ resource "kubernetes_ingress_v1" "archivematica_dashboard_ingress_prod" {
1111
"alb.ingress.kubernetes.io/target-type" = "ip"
1212
"alb.ingress.kubernetes.io/inbound-cidrs" = join(",", var.whitelisted_cidrs)
1313
}
14+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
1415
}
1516
spec {
1617
ingress_class_name = "alb"
@@ -44,6 +45,7 @@ resource "kubernetes_ingress_v1" "archivematica_storage_ingress_prod" {
4445
"alb.ingress.kubernetes.io/target-type" = "ip"
4546
"alb.ingress.kubernetes.io/inbound-cidrs" = join(",", var.whitelisted_cidrs)
4647
}
48+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
4749
}
4850
spec {
4951
ingress_class_name = "alb"
@@ -77,6 +79,7 @@ resource "kubernetes_ingress_v1" "archivematica_dashboard_internal_ingress_prod"
7779
"alb.ingress.kubernetes.io/target-type" = "ip"
7880
"alb.ingress.kubernetes.io/security-groups" = var.security_group_id
7981
}
82+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
8083
}
8184
spec {
8285
ingress_class_name = "alb"
@@ -110,6 +113,7 @@ resource "kubernetes_ingress_v1" "archivematica_storage_internal_ingress_prod" {
110113
"alb.ingress.kubernetes.io/target-type" = "ip"
111114
"alb.ingress.kubernetes.io/security-groups" = var.security_group_id
112115
}
116+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
113117
}
114118
spec {
115119
ingress_class_name = "alb"

archivematica/prod_cluster/redis_deployment.tf

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ resource "kubernetes_deployment" "archivematica_redis_prod" {
55
App = "archivematica-prod"
66
Environment = "prod"
77
}
8+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
89
}
910
spec {
1011
replicas = 1
@@ -54,7 +55,8 @@ resource "kubernetes_deployment" "archivematica_redis_prod" {
5455

5556
resource "kubernetes_service" "archivematica_redis_prod" {
5657
metadata {
57-
name = "archivematica-redis-prod"
58+
name = "archivematica-redis-prod"
59+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
5860
}
5961
spec {
6062
selector = {
@@ -70,7 +72,8 @@ resource "kubernetes_service" "archivematica_redis_prod" {
7072

7173
resource "kubernetes_persistent_volume_claim" "archivematica_redis_pvc_prod" {
7274
metadata {
73-
name = "archivematica-redis-pvc-prod"
75+
name = "archivematica-redis-pvc-prod"
76+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
7477
}
7578
spec {
7679
access_modes = ["ReadWriteOnce"]

archivematica/prod_cluster/secrets.tf

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
resource "kubernetes_secret" "prod-archivematica-secrets" {
22
metadata {
3-
name = "prod-archivematica-secrets"
3+
name = "prod-archivematica-secrets"
4+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
45
}
56

67
data = {

archivematica/prod_cluster/variables.tf

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,3 +77,9 @@ variable "image_overrides" {
7777
type = map(string)
7878
default = {}
7979
}
80+
81+
# Address that receives alarm notifications. It is used as the `endpoint` of
# the email SNS subscription on the Archivematica alarm topic. Defaults to the
# shared engineering mailbox; override it per environment if alerts should go
# elsewhere.
variable "alert_email" {
82+
description = "The email to which to send Cloudwatch alerts"
83+
type = string
84+
default = "engineering@permanent.org"
85+
}
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# SNS topic that receives notifications for the dev Archivematica alarm.
# It is the target of both alarm_actions and ok_actions on
# aws_cloudwatch_metric_alarm.dev_archivematica_pods.
resource "aws_sns_topic" "dev_archivematica_alarm_topic" {
2+
name = "dev-archivematica-alarm"
3+
}
4+
5+
# Subscribes var.alert_email to the dev alarm topic using the "email" protocol.
# NOTE(review): the declaration of var.alert_email for this module is not
# visible in this view -- confirm it exists alongside this file.
# NOTE(review): email subscriptions presumably stay pending until the recipient
# confirms them -- verify the confirmation is accepted after apply.
resource "aws_sns_topic_subscription" "dev_archivematica_alarm_subscription" {
6+
topic_arn = aws_sns_topic.dev_archivematica_alarm_topic.arn
7+
protocol = "email"
8+
endpoint = var.alert_email
9+
}
10+
11+
# Alarm on the number of running pods in the "archivematica-dev" namespace.
#
# Watches the ContainerInsights metric namespace_number_of_running_pods for the
# cluster local.cluster_name and goes into ALARM when the 60-second Average is
# below the threshold for 2 evaluation periods. Both the ALARM and the return
# to OK transitions publish to the dev alarm SNS topic.
resource "aws_cloudwatch_metric_alarm" "dev_archivematica_pods" {
12+
alarm_name = "dev-archivematica-running-pods"
13+
alarm_description = "Alert if the archivematica-dev namespace has fewer running pods than expected"
14+
15+
namespace = "ContainerInsights"
16+
metric_name = "namespace_number_of_running_pods"
17+
statistic = "Average"
18+
period = 60
19+
evaluation_periods = 2
20+
comparison_operator = "LessThanThreshold"
21+
# NOTE(review): same value as the prod alarm. The dev replica counts are not
# visible here -- confirm 7 is the expected number of running dev pods,
# otherwise this alarm will either never fire or never clear.
threshold = 7
22+
23+
dimensions = {
24+
ClusterName = local.cluster_name
25+
Namespace = "archivematica-dev"
26+
}
27+
28+
# Missing datapoints count as breaching, so the alarm also fires when the
# metric stops being reported at all.
treat_missing_data = "breaching"
29+
30+
alarm_actions = [aws_sns_topic.dev_archivematica_alarm_topic.arn]
31+
ok_actions = [aws_sns_topic.dev_archivematica_alarm_topic.arn]
32+
}

0 commit comments

Comments
 (0)