Skip to content

Commit 1886b40

Browse files
committed
Add CloudWatch monitoring
We need to know when our Archivematica instances go down, so this commit adds CloudWatch alarms that fire if they drop below the expected number of running pods.
1 parent 908e257 commit 1886b40

24 files changed

+1129
-39
lines changed

archivematica/prod_cluster/--addon-name

Whitespace-only changes.
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# SNS topic that receives notifications from the prod Archivematica
# CloudWatch alarm (it is referenced by that alarm's alarm_actions and
# ok_actions, and by the email subscription below).
resource "aws_sns_topic" "prod_archivematica_alarm_topic" {
2+
name = "prod-archivematica-alarm"
3+
}
4+
5+
# Email subscription that forwards messages published to the prod
# Archivematica alarm topic to the address given by var.alert_email.
# NOTE(review): SNS email subscriptions normally have to be confirmed by the
# recipient before delivery starts -- confirm this has been done after apply.
resource "aws_sns_topic_subscription" "prod_archivematica_alarm_subscription" {
6+
topic_arn = aws_sns_topic.prod_archivematica_alarm_topic.arn
7+
protocol = "email"
8+
endpoint = var.alert_email
9+
}
10+
11+
# CloudWatch alarm on the number of running pods in the archivematica-prod
# namespace, using the ContainerInsights metric
# namespace_number_of_running_pods for this cluster. Both the ALARM and OK
# transitions are published to the prod Archivematica alarm SNS topic.
resource "aws_cloudwatch_metric_alarm" "prod_archivematica_pods" {
12+
alarm_name = "prod-archivematica-running-pods"
13+
alarm_description = "Alert if the archivematica-prod namespace has fewer running pods than expected"
14+
15+
namespace = "ContainerInsights"
16+
metric_name = "namespace_number_of_running_pods"
17+
statistic = "Average"
18+
# The average is evaluated over 60-second periods; two evaluation periods
# are considered before the alarm changes state.
period = 60
19+
evaluation_periods = 2
20+
comparison_operator = "LessThanThreshold"
21+
# NOTE(review): 7 appears to be the sum of the replica counts of the
# deployments placed in this namespace (archivematica 1 + mcp-client 4 +
# gearman 1 + redis 1) -- confirm, and update this value whenever a
# replica count changes.
threshold = 7
22+
23+
dimensions = {
24+
ClusterName = local.cluster_name
25+
Namespace = "archivematica-prod"
26+
}
27+
28+
# Missing datapoints are treated as breaching, so the alarm also fires if
# the metric stops being reported at all.
treat_missing_data = "breaching"
29+
30+
# Notify the same topic when entering ALARM and when recovering to OK.
alarm_actions = [aws_sns_topic.prod_archivematica_alarm_topic.arn]
31+
ok_actions = [aws_sns_topic.prod_archivematica_alarm_topic.arn]
32+
}

archivematica/prod_cluster/archivematica_deployment.tf

Lines changed: 36 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,17 @@
1+
# Dedicated Kubernetes namespace for the prod Archivematica resources.
# Other resources reference it as
# kubernetes_namespace.archivematica_prod.metadata[0].name rather than
# repeating the literal name.
resource "kubernetes_namespace" "archivematica_prod" {
2+
metadata {
3+
name = "archivematica-prod"
4+
}
5+
}
6+
17
data "kubernetes_resource" "archivematica_prod" {
28
count = local.need_images ? 1 : 0
39
kind = "Deployment"
410
api_version = "apps/v1"
5-
metadata { name = "archivematica-prod" }
11+
metadata {
12+
name = "archivematica-prod"
13+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
14+
}
615
}
716

817
resource "kubernetes_deployment" "archivematica_prod" {
@@ -12,6 +21,7 @@ resource "kubernetes_deployment" "archivematica_prod" {
1221
App = "archivematica-prod"
1322
Environment = "prod"
1423
}
24+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
1525
}
1626
spec {
1727
replicas = 1
@@ -25,6 +35,9 @@ resource "kubernetes_deployment" "archivematica_prod" {
2535
labels = {
2636
App = "archivematica-prod"
2737
}
38+
annotations = {
39+
"instrumentation.opentelemetry.io/inject-python" = "false"
40+
}
2841
}
2942
spec {
3043
security_context {
@@ -567,7 +580,10 @@ data "kubernetes_resource" "mcp_client_prod" {
567580
count = local.need_images ? 1 : 0
568581
kind = "Deployment"
569582
api_version = "apps/v1"
570-
metadata { name = "archivematica-mcp-client-prod" }
583+
metadata {
584+
name = "archivematica-mcp-client-prod"
585+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
586+
}
571587
}
572588

573589
resource "kubernetes_deployment" "mcp_client_prod" {
@@ -577,6 +593,7 @@ resource "kubernetes_deployment" "mcp_client_prod" {
577593
App = "archivematica-prod"
578594
Environment = "prod"
579595
}
596+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
580597
}
581598
spec {
582599
replicas = 4
@@ -590,6 +607,9 @@ resource "kubernetes_deployment" "mcp_client_prod" {
590607
labels = {
591608
App = "archivematica-mcp-client-prod"
592609
}
610+
annotations = {
611+
"instrumentation.opentelemetry.io/inject-python" = "false"
612+
}
593613
}
594614
spec {
595615
container {
@@ -703,7 +723,8 @@ resource "kubernetes_deployment" "mcp_client_prod" {
703723

704724
resource "kubernetes_service" "archivematica_dashboard_service_prod" {
705725
metadata {
706-
name = "archivematica-dashboard-prod"
726+
name = "archivematica-dashboard-prod"
727+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
707728
}
708729
spec {
709730
type = "ClusterIP"
@@ -719,7 +740,8 @@ resource "kubernetes_service" "archivematica_dashboard_service_prod" {
719740

720741
resource "kubernetes_service" "archivematica_storage_service_prod" {
721742
metadata {
722-
name = "archivematica-storage-prod"
743+
name = "archivematica-storage-prod"
744+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
723745
}
724746
spec {
725747
type = "ClusterIP"
@@ -735,7 +757,8 @@ resource "kubernetes_service" "archivematica_storage_service_prod" {
735757

736758
resource "kubernetes_persistent_volume_claim" "archivematica_prod_pipeline_data_pvc" {
737759
metadata {
738-
name = "prod-pipeline-data"
760+
name = "prod-pipeline-data"
761+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
739762
}
740763
spec {
741764
access_modes = ["ReadWriteOnce"]
@@ -751,7 +774,8 @@ resource "kubernetes_persistent_volume_claim" "archivematica_prod_pipeline_data_
751774

752775
resource "kubernetes_persistent_volume_claim" "archivematica_prod_staging_data_pvc" {
753776
metadata {
754-
name = "prod-staging-data"
777+
name = "prod-staging-data"
778+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
755779
}
756780
spec {
757781
access_modes = ["ReadWriteOnce"]
@@ -767,7 +791,8 @@ resource "kubernetes_persistent_volume_claim" "archivematica_prod_staging_data_p
767791

768792
resource "kubernetes_persistent_volume_claim" "archivematica_prod_location_data_pvc" {
769793
metadata {
770-
name = "prod-location-data"
794+
name = "prod-location-data"
795+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
771796
}
772797
spec {
773798
access_modes = ["ReadWriteOnce"]
@@ -783,7 +808,8 @@ resource "kubernetes_persistent_volume_claim" "archivematica_prod_location_data_
783808

784809
resource "kubernetes_persistent_volume_claim" "archivematica_prod_transfer_share_pvc" {
785810
metadata {
786-
name = "prod-transfer-share"
811+
name = "prod-transfer-share"
812+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
787813
}
788814
spec {
789815
access_modes = ["ReadWriteOnce"]
@@ -799,7 +825,8 @@ resource "kubernetes_persistent_volume_claim" "archivematica_prod_transfer_share
799825

800826
resource "kubernetes_persistent_volume_claim" "archivematica_prod_storage_share_pvc" {
801827
metadata {
802-
name = "prod-storage-share"
828+
name = "prod-storage-share"
829+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
803830
}
804831
spec {
805832
access_modes = ["ReadWriteOnce"]

archivematica/prod_cluster/eks-cluster.tf

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,12 @@ module "eks" {
4343
service_account_role_arn = module.ebs_csi_irsa.iam_role_arn
4444
resolve_conflicts = "OVERWRITE"
4545
}
46+
amazon-cloudwatch-observability = {
47+
most_recent = true
48+
resolve_conflicts_on_create = "OVERWRITE"
49+
resolve_conflicts = "OVERWRITE"
50+
service_account_role_arn = module.cloudwatch_observability_irsa.iam_role_arn
51+
}
4652
}
4753

4854
eks_managed_node_groups = {
@@ -87,6 +93,28 @@ module "ebs_csi_irsa" {
8793
}
8894
}
8995

96+
// IAM role for Kubernetes service accounts, with the CloudWatch observability
// policy attached. Its ARN is passed to the amazon-cloudwatch-observability
// EKS addon as service_account_role_arn.
module "cloudwatch_observability_irsa" {
97+
source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
98+
version = "5.60.0"
99+
100+
// It is generally our practice to avoid abbreviations, but this actually does have a
101+
// length limit we'd run into if we spelled out "observability"
102+
role_name_prefix = "${local.cluster_name}-o11y-"
103+
104+
attach_cloudwatch_observability_policy = true
105+
106+
oidc_providers = {
107+
main = {
108+
provider_arn = module.eks.oidc_provider_arn
109+
110+
// Service accounts, in "namespace:name" form, that are allowed to use
// this role through the cluster's OIDC provider.
namespace_service_accounts = [
111+
"amazon-cloudwatch:cloudwatch-agent",
112+
"amazon-cloudwatch:adot-collector",
113+
]
114+
}
115+
}
116+
}
117+
90118
resource "kubernetes_cluster_role_binding" "eks_admins_cluster_admin" {
91119
metadata {
92120
name = "eks-admins-cluster-admin"

archivematica/prod_cluster/gearman_deployment.tf

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ resource "kubernetes_deployment" "archivematica_gearman_prod" {
55
App = "archivematica-prod"
66
Environment = "prod"
77
}
8+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
89
}
910
spec {
1011
replicas = 1
@@ -45,7 +46,8 @@ resource "kubernetes_deployment" "archivematica_gearman_prod" {
4546

4647
resource "kubernetes_service" "archivematica_gearman_prod" {
4748
metadata {
48-
name = "archivematica-gearman-prod"
49+
name = "archivematica-gearman-prod"
50+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
4951
}
5052
spec {
5153
selector = {

archivematica/prod_cluster/ingress.tf

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
resource "kubernetes_ingress_v1" "archivematica_dashboard_ingress_prod" {
22
metadata {
3-
name = "archivematica-dashboard-ingress-dev"
3+
name = "archivematica-dashboard-ingress-prod"
44
annotations = {
55
"alb.ingress.kubernetes.io/scheme" = "internet-facing"
66
"alb.ingress.kubernetes.io/subnets" = "${var.subnet_ids[0]},${var.subnet_ids[1]},${var.subnet_ids[2]}"
@@ -11,6 +11,7 @@ resource "kubernetes_ingress_v1" "archivematica_dashboard_ingress_prod" {
1111
"alb.ingress.kubernetes.io/target-type" = "ip"
1212
"alb.ingress.kubernetes.io/inbound-cidrs" = join(",", var.whitelisted_cidrs)
1313
}
14+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
1415
}
1516
spec {
1617
ingress_class_name = "alb"
@@ -44,6 +45,7 @@ resource "kubernetes_ingress_v1" "archivematica_storage_ingress_prod" {
4445
"alb.ingress.kubernetes.io/target-type" = "ip"
4546
"alb.ingress.kubernetes.io/inbound-cidrs" = join(",", var.whitelisted_cidrs)
4647
}
48+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
4749
}
4850
spec {
4951
ingress_class_name = "alb"
@@ -77,6 +79,7 @@ resource "kubernetes_ingress_v1" "archivematica_dashboard_internal_ingress_prod"
7779
"alb.ingress.kubernetes.io/target-type" = "ip"
7880
"alb.ingress.kubernetes.io/security-groups" = var.security_group_id
7981
}
82+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
8083
}
8184
spec {
8285
ingress_class_name = "alb"
@@ -110,6 +113,7 @@ resource "kubernetes_ingress_v1" "archivematica_storage_internal_ingress_prod" {
110113
"alb.ingress.kubernetes.io/target-type" = "ip"
111114
"alb.ingress.kubernetes.io/security-groups" = var.security_group_id
112115
}
116+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
113117
}
114118
spec {
115119
ingress_class_name = "alb"

archivematica/prod_cluster/redis_deployment.tf

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ resource "kubernetes_deployment" "archivematica_redis_prod" {
55
App = "archivematica-prod"
66
Environment = "prod"
77
}
8+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
89
}
910
spec {
1011
replicas = 1
@@ -54,7 +55,8 @@ resource "kubernetes_deployment" "archivematica_redis_prod" {
5455

5556
resource "kubernetes_service" "archivematica_redis_prod" {
5657
metadata {
57-
name = "archivematica-redis-prod"
58+
name = "archivematica-redis-prod"
59+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
5860
}
5961
spec {
6062
selector = {
@@ -70,7 +72,8 @@ resource "kubernetes_service" "archivematica_redis_prod" {
7072

7173
resource "kubernetes_persistent_volume_claim" "archivematica_redis_pvc_prod" {
7274
metadata {
73-
name = "archivematica-redis-pvc-prod"
75+
name = "archivematica-redis-pvc-prod"
76+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
7477
}
7578
spec {
7679
access_modes = ["ReadWriteOnce"]

archivematica/prod_cluster/secrets.tf

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
resource "kubernetes_secret" "prod-archivematica-secrets" {
22
metadata {
3-
name = "prod-archivematica-secrets"
3+
name = "prod-archivematica-secrets"
4+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
45
}
56

67
data = {

archivematica/prod_cluster/variables.tf

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,3 +77,9 @@ variable "image_overrides" {
7777
type = map(string)
7878
default = {}
7979
}
80+
81+
# Address used as the endpoint of the email subscription on the prod
# Archivematica alarm SNS topic. Override the default to route alerts
# elsewhere.
variable "alert_email" {
82+
description = "The email to which to send Cloudwatch alerts"
83+
type = string
84+
default = "engineering@permanent.org"
85+
}

0 commit comments

Comments
 (0)