Skip to content

Commit ee27638

Browse files
authored
Merge pull request #214 from PermanentOrg/add_monitoring
Add Cloudwatch monitoring
2 parents 2db841d + 1886b40 commit ee27638

24 files changed

+1129
-39
lines changed

archivematica/prod_cluster/--addon-name

Whitespace-only changes.
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
// SNS topic named "prod-archivematica-alarm". In this file it is the target of
// the email subscription and is listed in both alarm_actions and ok_actions of
// the prod_archivematica_pods CloudWatch alarm.
resource "aws_sns_topic" "prod_archivematica_alarm_topic" {
2+
name = "prod-archivematica-alarm"
3+
}
4+
5+
// Email subscription on the alarm topic; the recipient address is taken from
// var.alert_email.
// NOTE(review): SNS email subscriptions normally have to be confirmed by the
// recipient before any notification is delivered -- confirm this was done for
// the production address after the first apply.
resource "aws_sns_topic_subscription" "prod_archivematica_alarm_subscription" {
6+
topic_arn = aws_sns_topic.prod_archivematica_alarm_topic.arn
7+
protocol = "email"
8+
endpoint = var.alert_email
9+
}
10+
11+
// Alarm on the ContainerInsights metric "namespace_number_of_running_pods" for
// the "archivematica-prod" namespace of the cluster named by local.cluster_name.
// It compares the 60-second average against the threshold over 2 evaluation
// periods and notifies the alarm SNS topic on both ALARM and OK transitions.
resource "aws_cloudwatch_metric_alarm" "prod_archivematica_pods" {
12+
alarm_name = "prod-archivematica-running-pods"
13+
alarm_description = "Alert if the archivematica-prod namespace has fewer running pods than expected"
14+
15+
namespace = "ContainerInsights"
16+
metric_name = "namespace_number_of_running_pods"
17+
statistic = "Average"
18+
period = 60
19+
evaluation_periods = 2
20+
comparison_operator = "LessThanThreshold"
21+
// 7 matches the replica counts visible in this change: archivematica (1) +
// mcp-client (4) + gearman (1) + redis (1). NOTE(review): update this value
// whenever a deployment in the namespace is added, removed, or rescaled.
threshold = 7
22+
23+
// The Namespace value is a literal and must stay in sync with the name set on
// kubernetes_namespace.archivematica_prod.
dimensions = {
24+
ClusterName = local.cluster_name
25+
Namespace = "archivematica-prod"
26+
}
27+
28+
// Missing datapoints count as breaching, so the alarm also fires if the metric
// stops being reported at all.
treat_missing_data = "breaching"
29+
30+
// The same topic is notified when the alarm fires and when it recovers.
alarm_actions = [aws_sns_topic.prod_archivematica_alarm_topic.arn]
31+
ok_actions = [aws_sns_topic.prod_archivematica_alarm_topic.arn]
32+
}

archivematica/prod_cluster/archivematica_deployment.tf

Lines changed: 36 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,17 @@
1+
// Dedicated "archivematica-prod" namespace. The deployments, services, PVCs,
// ingresses and secret touched by this change set their metadata.namespace to
// kubernetes_namespace.archivematica_prod.metadata[0].name, so they are placed
// in this namespace. The pod-count CloudWatch alarm filters on the same
// namespace name as a literal string.
resource "kubernetes_namespace" "archivematica_prod" {
2+
metadata {
3+
name = "archivematica-prod"
4+
}
5+
}
6+
17
data "kubernetes_resource" "archivematica_prod" {
28
count = local.need_images ? 1 : 0
39
kind = "Deployment"
410
api_version = "apps/v1"
5-
metadata { name = "archivematica-prod" }
11+
metadata {
12+
name = "archivematica-prod"
13+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
14+
}
615
}
716

817
resource "kubernetes_deployment" "archivematica_prod" {
@@ -12,6 +21,7 @@ resource "kubernetes_deployment" "archivematica_prod" {
1221
App = "archivematica-prod"
1322
Environment = "prod"
1423
}
24+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
1525
}
1626
spec {
1727
replicas = 1
@@ -25,6 +35,9 @@ resource "kubernetes_deployment" "archivematica_prod" {
2535
labels = {
2636
App = "archivematica-prod"
2737
}
38+
annotations = {
39+
"instrumentation.opentelemetry.io/inject-python" = "false"
40+
}
2841
}
2942
spec {
3043
security_context {
@@ -567,7 +580,10 @@ data "kubernetes_resource" "mcp_client_prod" {
567580
count = local.need_images ? 1 : 0
568581
kind = "Deployment"
569582
api_version = "apps/v1"
570-
metadata { name = "archivematica-mcp-client-prod" }
583+
metadata {
584+
name = "archivematica-mcp-client-prod"
585+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
586+
}
571587
}
572588

573589
resource "kubernetes_deployment" "mcp_client_prod" {
@@ -577,6 +593,7 @@ resource "kubernetes_deployment" "mcp_client_prod" {
577593
App = "archivematica-prod"
578594
Environment = "prod"
579595
}
596+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
580597
}
581598
spec {
582599
replicas = 4
@@ -590,6 +607,9 @@ resource "kubernetes_deployment" "mcp_client_prod" {
590607
labels = {
591608
App = "archivematica-mcp-client-prod"
592609
}
610+
annotations = {
611+
"instrumentation.opentelemetry.io/inject-python" = "false"
612+
}
593613
}
594614
spec {
595615
container {
@@ -703,7 +723,8 @@ resource "kubernetes_deployment" "mcp_client_prod" {
703723

704724
resource "kubernetes_service" "archivematica_dashboard_service_prod" {
705725
metadata {
706-
name = "archivematica-dashboard-prod"
726+
name = "archivematica-dashboard-prod"
727+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
707728
}
708729
spec {
709730
type = "ClusterIP"
@@ -719,7 +740,8 @@ resource "kubernetes_service" "archivematica_dashboard_service_prod" {
719740

720741
resource "kubernetes_service" "archivematica_storage_service_prod" {
721742
metadata {
722-
name = "archivematica-storage-prod"
743+
name = "archivematica-storage-prod"
744+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
723745
}
724746
spec {
725747
type = "ClusterIP"
@@ -735,7 +757,8 @@ resource "kubernetes_service" "archivematica_storage_service_prod" {
735757

736758
resource "kubernetes_persistent_volume_claim" "archivematica_prod_pipeline_data_pvc" {
737759
metadata {
738-
name = "prod-pipeline-data"
760+
name = "prod-pipeline-data"
761+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
739762
}
740763
spec {
741764
access_modes = ["ReadWriteOnce"]
@@ -751,7 +774,8 @@ resource "kubernetes_persistent_volume_claim" "archivematica_prod_pipeline_data_
751774

752775
resource "kubernetes_persistent_volume_claim" "archivematica_prod_staging_data_pvc" {
753776
metadata {
754-
name = "prod-staging-data"
777+
name = "prod-staging-data"
778+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
755779
}
756780
spec {
757781
access_modes = ["ReadWriteOnce"]
@@ -767,7 +791,8 @@ resource "kubernetes_persistent_volume_claim" "archivematica_prod_staging_data_p
767791

768792
resource "kubernetes_persistent_volume_claim" "archivematica_prod_location_data_pvc" {
769793
metadata {
770-
name = "prod-location-data"
794+
name = "prod-location-data"
795+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
771796
}
772797
spec {
773798
access_modes = ["ReadWriteOnce"]
@@ -783,7 +808,8 @@ resource "kubernetes_persistent_volume_claim" "archivematica_prod_location_data_
783808

784809
resource "kubernetes_persistent_volume_claim" "archivematica_prod_transfer_share_pvc" {
785810
metadata {
786-
name = "prod-transfer-share"
811+
name = "prod-transfer-share"
812+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
787813
}
788814
spec {
789815
access_modes = ["ReadWriteOnce"]
@@ -799,7 +825,8 @@ resource "kubernetes_persistent_volume_claim" "archivematica_prod_transfer_share
799825

800826
resource "kubernetes_persistent_volume_claim" "archivematica_prod_storage_share_pvc" {
801827
metadata {
802-
name = "prod-storage-share"
828+
name = "prod-storage-share"
829+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
803830
}
804831
spec {
805832
access_modes = ["ReadWriteOnce"]

archivematica/prod_cluster/eks-cluster.tf

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,12 @@ module "eks" {
4343
service_account_role_arn = module.ebs_csi_irsa.iam_role_arn
4444
resolve_conflicts = "OVERWRITE"
4545
}
46+
amazon-cloudwatch-observability = {
47+
most_recent = true
48+
resolve_conflicts_on_create = "OVERWRITE"
49+
resolve_conflicts = "OVERWRITE"
50+
service_account_role_arn = module.cloudwatch_observability_irsa.iam_role_arn
51+
}
4652
}
4753

4854
eks_managed_node_groups = {
@@ -87,6 +93,28 @@ module "ebs_csi_irsa" {
8793
}
8894
}
8995

96+
// IAM role for Kubernetes service accounts, created with the
// iam-role-for-service-accounts-eks submodule (pinned to 5.60.0) with the
// CloudWatch observability policy attached. Its iam_role_arn output is passed
// as service_account_role_arn to the amazon-cloudwatch-observability addon in
// module.eks.
module "cloudwatch_observability_irsa" {
97+
source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
98+
version = "5.60.0"
99+
100+
// It is generally our practice to avoid abbreviations, but this actually does have a
101+
// length limit we'd run into if we spelled out "observability"
102+
role_name_prefix = "${local.cluster_name}-o11y-"
103+
104+
attach_cloudwatch_observability_policy = true
105+
106+
// Trust the cluster's OIDC provider for the service accounts listed below;
// each entry is written as "<namespace>:<service account name>".
oidc_providers = {
107+
main = {
108+
provider_arn = module.eks.oidc_provider_arn
109+
110+
namespace_service_accounts = [
111+
"amazon-cloudwatch:cloudwatch-agent",
112+
"amazon-cloudwatch:adot-collector",
113+
]
114+
}
115+
}
116+
}
117+
90118
resource "kubernetes_cluster_role_binding" "eks_admins_cluster_admin" {
91119
metadata {
92120
name = "eks-admins-cluster-admin"

archivematica/prod_cluster/gearman_deployment.tf

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ resource "kubernetes_deployment" "archivematica_gearman_prod" {
55
App = "archivematica-prod"
66
Environment = "prod"
77
}
8+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
89
}
910
spec {
1011
replicas = 1
@@ -45,7 +46,8 @@ resource "kubernetes_deployment" "archivematica_gearman_prod" {
4546

4647
resource "kubernetes_service" "archivematica_gearman_prod" {
4748
metadata {
48-
name = "archivematica-gearman-prod"
49+
name = "archivematica-gearman-prod"
50+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
4951
}
5052
spec {
5153
selector = {

archivematica/prod_cluster/ingress.tf

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
resource "kubernetes_ingress_v1" "archivematica_dashboard_ingress_prod" {
22
metadata {
3-
name = "archivematica-dashboard-ingress-dev"
3+
name = "archivematica-dashboard-ingress-prod"
44
annotations = {
55
"alb.ingress.kubernetes.io/scheme" = "internet-facing"
66
"alb.ingress.kubernetes.io/subnets" = "${var.subnet_ids[0]},${var.subnet_ids[1]},${var.subnet_ids[2]}"
@@ -11,6 +11,7 @@ resource "kubernetes_ingress_v1" "archivematica_dashboard_ingress_prod" {
1111
"alb.ingress.kubernetes.io/target-type" = "ip"
1212
"alb.ingress.kubernetes.io/inbound-cidrs" = join(",", var.whitelisted_cidrs)
1313
}
14+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
1415
}
1516
spec {
1617
ingress_class_name = "alb"
@@ -44,6 +45,7 @@ resource "kubernetes_ingress_v1" "archivematica_storage_ingress_prod" {
4445
"alb.ingress.kubernetes.io/target-type" = "ip"
4546
"alb.ingress.kubernetes.io/inbound-cidrs" = join(",", var.whitelisted_cidrs)
4647
}
48+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
4749
}
4850
spec {
4951
ingress_class_name = "alb"
@@ -77,6 +79,7 @@ resource "kubernetes_ingress_v1" "archivematica_dashboard_internal_ingress_prod"
7779
"alb.ingress.kubernetes.io/target-type" = "ip"
7880
"alb.ingress.kubernetes.io/security-groups" = var.security_group_id
7981
}
82+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
8083
}
8184
spec {
8285
ingress_class_name = "alb"
@@ -110,6 +113,7 @@ resource "kubernetes_ingress_v1" "archivematica_storage_internal_ingress_prod" {
110113
"alb.ingress.kubernetes.io/target-type" = "ip"
111114
"alb.ingress.kubernetes.io/security-groups" = var.security_group_id
112115
}
116+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
113117
}
114118
spec {
115119
ingress_class_name = "alb"

archivematica/prod_cluster/redis_deployment.tf

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ resource "kubernetes_deployment" "archivematica_redis_prod" {
55
App = "archivematica-prod"
66
Environment = "prod"
77
}
8+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
89
}
910
spec {
1011
replicas = 1
@@ -54,7 +55,8 @@ resource "kubernetes_deployment" "archivematica_redis_prod" {
5455

5556
resource "kubernetes_service" "archivematica_redis_prod" {
5657
metadata {
57-
name = "archivematica-redis-prod"
58+
name = "archivematica-redis-prod"
59+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
5860
}
5961
spec {
6062
selector = {
@@ -70,7 +72,8 @@ resource "kubernetes_service" "archivematica_redis_prod" {
7072

7173
resource "kubernetes_persistent_volume_claim" "archivematica_redis_pvc_prod" {
7274
metadata {
73-
name = "archivematica-redis-pvc-prod"
75+
name = "archivematica-redis-pvc-prod"
76+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
7477
}
7578
spec {
7679
access_modes = ["ReadWriteOnce"]

archivematica/prod_cluster/secrets.tf

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
resource "kubernetes_secret" "prod-archivematica-secrets" {
22
metadata {
3-
name = "prod-archivematica-secrets"
3+
name = "prod-archivematica-secrets"
4+
namespace = kubernetes_namespace.archivematica_prod.metadata[0].name
45
}
56

67
data = {

archivematica/prod_cluster/variables.tf

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,3 +77,9 @@ variable "image_overrides" {
7777
type = map(string)
7878
default = {}
7979
}
80+
81+
// Recipient address for alarm notifications; used as the endpoint of the
// prod_archivematica_alarm_subscription SNS email subscription. Override the
// default to send alerts to a different address.
variable "alert_email" {
82+
description = "The email to which to send Cloudwatch alerts"
83+
type = string
84+
default = "engineering@permanent.org"
85+
}

0 commit comments

Comments
 (0)