
Commit 1c20c4b

fix: Upgrade to newest provider version
1 parent 8b5982d commit 1c20c4b

File tree: 10 files changed, +147 -111 lines

README.md

Lines changed: 55 additions & 0 deletions
@@ -134,6 +134,61 @@ All we need to is to run these commands:

Now all containers should be up and running.

### Upgrading to 1.15+

In this version the Terraform providers were upgraded to newer versions, which introduces role name changes among other things. This means that after the upgrade you can expect certain kube-system pods to end up in a crash loop.

This happens because the role names that the infra creates have changed; they now use a prefix and a suffix.

AWS authenticates the service accounts of certain kube-system pods, such as aws-load-balancer-controller, against these roles, but after this change that role mapping breaks.
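
To see which role a service account currently points at, a minimal sketch (assuming the controller runs in the default kube-system namespace):

```bash
# show the service account, including the eks.amazonaws.com/role-arn annotation
kubectl get serviceaccount aws-load-balancer-controller -n kube-system -o yaml
```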

There are ways to fix that manually:
* Apply the application again after applying the infra. This should fix the role names for two pods.
* Go to the service account of the aws-load-balancer-controller pod.
* The service account's annotations contain a forward mapping to the role ARN it needs to assume on the cloud.
* Update that annotation.

Example:

```yaml
apiVersion: v1
automountServiceAccountToken: true
kind: ServiceAccount
metadata:
  annotations:
    # this role ARN must correspond to the (renamed) role that the infra creates
    eks.amazonaws.com/role-arn: arn:aws:iam::1234567889:role/datafold-lb-controller-2025082013431968900000001
  labels:
    app.kubernetes.io/component: controller
    app.kubernetes.io/name: aws-load-balancer-controller
  name: aws-load-balancer-controller
  namespace: kube-system
```

Check Kubernetes for any failing pods in the kube-system namespace; these may need updating in the same way if they remain stuck in CrashLoopBackOff.

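A minimal sketch of the manual fix with kubectl (the role ARN below is a placeholder; use the one the infra now creates):

```bash
# look for pods stuck in CrashLoopBackOff
kubectl get pods -n kube-system

# point the service account at the renamed role (placeholder ARN)
kubectl annotate serviceaccount aws-load-balancer-controller -n kube-system \
  --overwrite eks.amazonaws.com/role-arn=arn:aws:iam::<account-id>:role/<deployment>-lb-controller-<suffix>

# restart the controller so it picks up the new role mapping
kubectl rollout restart deployment aws-load-balancer-controller -n kube-system
```
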
* In the newest version of Amazon Linux 3, Datadog cannot determine the local hostname, which it needs for tagging. Updating to the most recent Datadog operator solves this issue:

```bash
> helm repo add datadog https://helm.datadoghq.com
> helm repo update datadog
> helm upgrade datafold-datadog-operator datadog/datadog-operator
```

* The default version of Kubernetes is now 1.33. Nodes will be replaced if you execute this upgrade.
* The AWS LB controller must make calls to the instance metadata service (IMDS). Doing this from a pod means the IMDS hop limit needs to be increased to 2. This avoids having explicit VPC IDs or regions in the configuration of the LB controller, but comes with a limited security impact (see the CLI sketch after the links below):

https://aws.amazon.com/blogs/security/defense-in-depth-open-firewalls-reverse-proxies-ssrf-vulnerabilities-ec2-instance-metadata-service/

https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html#imds-considerations

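For reference, a minimal sketch of checking or raising the hop limit on an already running node with the AWS CLI (the instance ID is a placeholder; nodes created after the upgrade get this setting from the Terraform change in main.tf):

```bash
# check the current IMDS hop limit of a node
aws ec2 describe-instances --instance-ids <instance-id> \
  --query 'Reservations[].Instances[].MetadataOptions.HttpPutResponseHopLimit'

# raise the hop limit to 2 on a node launched before the upgrade
aws ec2 modify-instance-metadata-options --instance-id <instance-id> \
  --http-put-response-hop-limit 2
```
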
<!-- BEGIN_TF_DOCS -->

## Requirements

main.tf

Lines changed: 9 additions & 2 deletions
```diff
@@ -102,6 +102,9 @@ locals {
         }
       }
     }
+    metadata_options = {
+      http_put_response_hop_limit = 2
+    }
   }, var.managed_node_grp1)
   second_node_pool = merge(
     {
@@ -125,6 +128,9 @@ locals {
         }
       }
     }
+    metadata_options = {
+      http_put_response_hop_limit = 2
+    }
   }, var.managed_node_grp2)
   third_node_pool = merge(
     {
@@ -148,6 +154,9 @@ locals {
         }
       }
     }
+    metadata_options = {
+      http_put_response_hop_limit = 2
+    }
   }, var.managed_node_grp3)
   managed_node_groups = merge(
     {"${var.deployment_name}-k8s": local.default_node_pool},
@@ -182,9 +191,7 @@ module "eks" {
   lb_security_group_id = local.lb_security_group_id
   db_security_group_id = local.db_security_group_id
   self_managed_node_grp_instance_type = var.self_managed_node_grp_instance_type
-  self_managed_node_grp_default = var.self_managed_node_grp_default
   self_managed_node_grps = var.self_managed_node_grps
-  managed_node_grp_default = var.managed_node_grp_default
   managed_node_grps = local.managed_node_groups
   k8s_api_access_roles = var.k8s_api_access_roles
```

modules/eks/main.tf

Lines changed: 12 additions & 20 deletions
```diff
@@ -1,9 +1,9 @@
 data "aws_caller_identity" "current" {}
 
 module "ebs_csi_irsa_role" {
-  source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
+  source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts"
 
-  role_name = "${var.deployment_name}-ebs-csi-controller"
+  name = "${var.deployment_name}-ebs-csi-controller"
   attach_ebs_csi_policy = true
 
   oidc_providers = {
@@ -15,9 +15,9 @@ module "ebs_csi_irsa_role" {
 }
 
 module "k8s_load_balancer_controller_role" {
-  source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
+  source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts"
 
-  role_name = "${var.deployment_name}-lb-controller"
+  name = "${var.deployment_name}-lb-controller"
   attach_load_balancer_controller_policy = true
 
   oidc_providers = {
@@ -29,9 +29,9 @@ module "k8s_load_balancer_controller_role" {
 }
 
 module "cluster_autoscaler_role" {
-  source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
+  source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts"
 
-  role_name = "${var.deployment_name}-cluster-autoscaler"
+  name = "${var.deployment_name}-cluster-autoscaler"
   attach_cluster_autoscaler_policy = true
   cluster_autoscaler_cluster_names = [module.eks.cluster_name]
 
@@ -47,18 +47,18 @@ module "eks" {
   # https://github.com/terraform-aws-modules/terraform-aws-eks/tree/master/docs
 
   source = "terraform-aws-modules/eks/aws"
-  version = "~> 20.37.1"
+  version = "~> 21.1.0"
   # version = var.eks_module_version
 
-  cluster_name = var.deployment_name
-  cluster_version = var.k8s_cluster_version
+  name = var.deployment_name
+  kubernetes_version = var.k8s_cluster_version
 
-  cluster_endpoint_public_access = true
-  cluster_endpoint_public_access_cidrs = var.k8s_public_access_cidrs
+  endpoint_public_access = true
+  endpoint_public_access_cidrs = var.k8s_public_access_cidrs
 
   enable_irsa = true
 
-  cluster_addons = {
+  addons = {
     coredns = {
       most_recent = true
     },
@@ -91,15 +91,7 @@ module "eks" {
   authentication_mode = "API"
 
   # Self Managed Node Group(s)
-  self_managed_node_group_defaults = var.self_managed_node_grp_default
-
   self_managed_node_groups = var.self_managed_node_grps
-
-  # EKS Managed Node Group(s)
-  eks_managed_node_group_defaults = {
-    instance_types = var.managed_node_grp_default
-  }
-
   eks_managed_node_groups = var.managed_node_grps
 
   # access_entries = {
```
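
Because of the module source and argument renames above, a typical rollout is sketched below (assuming Terraform is run from the directory that instantiates these modules; review the plan carefully, since node groups will be replaced):

```bash
# fetch the upgraded module versions referenced above
terraform init -upgrade

# review the role renames and node replacements before applying
terraform plan

# apply once the plan looks as expected
terraform apply
```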

modules/eks/outputs.tf

Lines changed: 15 additions & 15 deletions
```diff
@@ -3,11 +3,11 @@ output "cluster_name" {
 }
 
 output "k8s_load_balancer_controller_role_arn" {
-  value = module.k8s_load_balancer_controller_role.iam_role_arn
+  value = module.k8s_load_balancer_controller_role.arn
 }
 
 output "cluster_scaler_role_arn" {
-  value = module.cluster_autoscaler_role.iam_role_arn
+  value = module.cluster_autoscaler_role.arn
 }
 
 output "control_plane_security_group_id" {
@@ -20,7 +20,7 @@ output "cluster_endpoint" {
 
 # dfshell
 output "dfshell_role_arn" {
-  value = module.dfshell_role[0].iam_role_arn
+  value = module.dfshell_role[0].arn
   description = "The ARN of the AWS Bedrock role"
 }
 output "dfshell_service_account_name" {
@@ -30,7 +30,7 @@ output "dfshell_service_account_name" {
 
 # worker_portal
 output "worker_portal_role_arn" {
-  value = module.worker_portal_role[0].iam_role_arn
+  value = module.worker_portal_role[0].arn
   description = "The ARN of the AWS Bedrock role"
 }
 output "worker_portal_service_account_name" {
@@ -40,7 +40,7 @@ output "worker_portal_service_account_name" {
 
 # operator
 output "operator_role_arn" {
-  value = module.operator_role[0].iam_role_arn
+  value = module.operator_role[0].arn
   description = "The ARN of the AWS Bedrock role"
 }
 output "operator_service_account_name" {
@@ -50,7 +50,7 @@ output "operator_service_account_name" {
 
 # server
 output "server_role_arn" {
-  value = module.server_role[0].iam_role_arn
+  value = module.server_role[0].arn
   description = "The ARN of the AWS Bedrock role"
 }
 output "server_service_account_name" {
@@ -60,7 +60,7 @@ output "server_service_account_name" {
 
 # scheduler
 output "scheduler_role_arn" {
-  value = module.scheduler_role[0].iam_role_arn
+  value = module.scheduler_role[0].arn
   description = "The ARN of the AWS Bedrock role"
 }
 output "scheduler_service_account_name" {
@@ -70,7 +70,7 @@ output "scheduler_service_account_name" {
 
 # worker, worker1, worker2 etc.
 output "worker_role_arn" {
-  value = module.worker_role[0].iam_role_arn
+  value = module.worker_role[0].arn
   description = "The ARN of the AWS Bedrock role"
 }
 output "worker_service_account_name" {
@@ -80,7 +80,7 @@ output "worker_service_account_name" {
 
 # worker_catalog
 output "worker_catalog_role_arn" {
-  value = module.worker_catalog_role[0].iam_role_arn
+  value = module.worker_catalog_role[0].arn
   description = "The ARN of the AWS Bedrock role"
 }
 output "worker_catalog_service_account_name" {
@@ -90,7 +90,7 @@ output "worker_catalog_service_account_name" {
 
 # worker_interactive
 output "worker_interactive_role_arn" {
-  value = module.worker_interactive_role[0].iam_role_arn
+  value = module.worker_interactive_role[0].arn
   description = "The ARN of the AWS Bedrock role"
 }
 output "worker_interactive_service_account_name" {
@@ -100,7 +100,7 @@ output "worker_interactive_service_account_name" {
 
 # worker_singletons
 output "worker_singletons_role_arn" {
-  value = module.worker_singletons_role[0].iam_role_arn
+  value = module.worker_singletons_role[0].arn
   description = "The ARN of the AWS Bedrock role"
 }
 output "worker_singletons_service_account_name" {
@@ -110,7 +110,7 @@ output "worker_singletons_service_account_name" {
 
 # worker_lineage
 output "worker_lineage_role_arn" {
-  value = module.worker_lineage_role[0].iam_role_arn
+  value = module.worker_lineage_role[0].arn
   description = "The ARN of the AWS Bedrock role"
 }
 output "worker_lineage_service_account_name" {
@@ -120,7 +120,7 @@ output "worker_lineage_service_account_name" {
 
 # worker_monitor
 output "worker_monitor_role_arn" {
-  value = module.worker_monitor_role[0].iam_role_arn
+  value = module.worker_monitor_role[0].arn
   description = "The ARN of the AWS Bedrock role"
 }
 output "worker_monitor_service_account_name" {
@@ -130,7 +130,7 @@ output "worker_monitor_service_account_name" {
 
 # storage_worker
 output "storage_worker_role_arn" {
-  value = module.storage_worker_role[0].iam_role_arn
+  value = module.storage_worker_role[0].arn
   description = "The ARN of the AWS Bedrock role"
 }
 output "storage_worker_service_account_name" {
@@ -140,6 +140,6 @@ output "storage_worker_service_account_name" {
 
 # Clickhouse backup
 output "clickhouse_backup_role_name" {
-  value = module.clickhouse_backup_role.iam_role_arn
+  value = module.clickhouse_backup_role.arn
   description = "The name of the role for clickhouse backups"
 }
```
