
Commit 1c20c4b

fix: Upgrade to newest provider version
1 parent 8b5982d commit 1c20c4b

File tree: 10 files changed, +147 -111 lines

README.md

Lines changed: 55 additions & 0 deletions
@@ -134,6 +134,61 @@ All we need to is to run these commands:

Now all containers should be up and running.

### Upgrading to 1.15+

In this version the Terraform providers were upgraded to newer versions, which introduces role name changes among other things. This means that after the upgrade you can expect certain kube-system pods to end up in a crash loop.

This happens because the role names that the infra creates have changed; they now use a prefix and a suffix.

AWS authenticates the service accounts of certain kube-system pods, such as aws-load-balancer-controller, against these roles, but after this change that role mapping breaks.
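
To see which role a service account currently points at, a minimal sketch (assuming the controller runs in the default kube-system namespace):

```bash
# show the service account, including the eks.amazonaws.com/role-arn annotation
kubectl get serviceaccount aws-load-balancer-controller -n kube-system -o yaml
```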

There are ways to fix that manually:
* Apply the application again after applying the infra. This should fix the role names for two pods.
* Go to the service account of the aws-load-balancer-controller pod.
* The service account's annotations contain a forward mapping to the role ARN it needs to assume on the cloud.
* Update that annotation.

Example:

```yaml
apiVersion: v1
automountServiceAccountToken: true
kind: ServiceAccount
metadata:
  annotations:
    # this role ARN must correspond to the (renamed) role that the infra creates
    eks.amazonaws.com/role-arn: arn:aws:iam::1234567889:role/datafold-lb-controller-2025082013431968900000001
  labels:
    app.kubernetes.io/component: controller
    app.kubernetes.io/name: aws-load-balancer-controller
  name: aws-load-balancer-controller
  namespace: kube-system
```

Check Kubernetes for any failing pods in the kube-system namespace; these may need updating in the same way if they remain stuck in CrashLoopBackOff.

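A minimal sketch of the manual fix with kubectl (the role ARN below is a placeholder; use the one the infra now creates):

```bash
# look for pods stuck in CrashLoopBackOff
kubectl get pods -n kube-system

# point the service account at the renamed role (placeholder ARN)
kubectl annotate serviceaccount aws-load-balancer-controller -n kube-system \
  --overwrite eks.amazonaws.com/role-arn=arn:aws:iam::<account-id>:role/<deployment>-lb-controller-<suffix>

# restart the controller so it picks up the new role mapping
kubectl rollout restart deployment aws-load-balancer-controller -n kube-system
```
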
* In the newest version of Amazon Linux 3, Datadog cannot determine the local hostname, which it needs for tagging. Updating to the most recent Datadog operator solves this issue:

```bash
> helm repo add datadog https://helm.datadoghq.com
> helm repo update datadog
> helm upgrade datafold-datadog-operator datadog/datadog-operator
```

* The default version of Kubernetes is now 1.33. Nodes will be replaced if you execute this upgrade.
* The AWS LB controller must make calls to the instance metadata service (IMDS). Doing this from a pod means the IMDS hop limit needs to be increased to 2. This avoids having explicit VPC IDs or regions in the configuration of the LB controller, but comes with a limited security impact (see the CLI sketch after the links below):

https://aws.amazon.com/blogs/security/defense-in-depth-open-firewalls-reverse-proxies-ssrf-vulnerabilities-ec2-instance-metadata-service/

https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html#imds-considerations

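For reference, a minimal sketch of checking or raising the hop limit on an already running node with the AWS CLI (the instance ID is a placeholder; nodes created after the upgrade get this setting from the Terraform change in main.tf):

```bash
# check the current IMDS hop limit of a node
aws ec2 describe-instances --instance-ids <instance-id> \
  --query 'Reservations[].Instances[].MetadataOptions.HttpPutResponseHopLimit'

# raise the hop limit to 2 on a node launched before the upgrade
aws ec2 modify-instance-metadata-options --instance-id <instance-id> \
  --http-put-response-hop-limit 2
```
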
<!-- BEGIN_TF_DOCS -->

## Requirements

main.tf

Lines changed: 9 additions & 2 deletions
```diff
@@ -102,6 +102,9 @@ locals {
         }
       }
     }
+    metadata_options = {
+      http_put_response_hop_limit = 2
+    }
   }, var.managed_node_grp1)
   second_node_pool = merge(
     {
@@ -125,6 +128,9 @@ locals {
         }
       }
     }
+    metadata_options = {
+      http_put_response_hop_limit = 2
+    }
   }, var.managed_node_grp2)
   third_node_pool = merge(
     {
@@ -148,6 +154,9 @@ locals {
         }
       }
     }
+    metadata_options = {
+      http_put_response_hop_limit = 2
+    }
   }, var.managed_node_grp3)
   managed_node_groups = merge(
     {"${var.deployment_name}-k8s": local.default_node_pool},
@@ -182,9 +191,7 @@ module "eks" {
   lb_security_group_id = local.lb_security_group_id
   db_security_group_id = local.db_security_group_id
   self_managed_node_grp_instance_type = var.self_managed_node_grp_instance_type
-  self_managed_node_grp_default = var.self_managed_node_grp_default
   self_managed_node_grps = var.self_managed_node_grps
-  managed_node_grp_default = var.managed_node_grp_default
   managed_node_grps = local.managed_node_groups
   k8s_api_access_roles = var.k8s_api_access_roles
```

modules/eks/main.tf

Lines changed: 12 additions & 20 deletions
```diff
@@ -1,9 +1,9 @@
 data "aws_caller_identity" "current" {}
 
 module "ebs_csi_irsa_role" {
-  source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
+  source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts"
 
-  role_name = "${var.deployment_name}-ebs-csi-controller"
+  name = "${var.deployment_name}-ebs-csi-controller"
   attach_ebs_csi_policy = true
 
   oidc_providers = {
@@ -15,9 +15,9 @@ module "ebs_csi_irsa_role" {
 }
 
 module "k8s_load_balancer_controller_role" {
-  source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
+  source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts"
 
-  role_name = "${var.deployment_name}-lb-controller"
+  name = "${var.deployment_name}-lb-controller"
   attach_load_balancer_controller_policy = true
 
   oidc_providers = {
@@ -29,9 +29,9 @@ module "k8s_load_balancer_controller_role" {
 }
 
 module "cluster_autoscaler_role" {
-  source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
+  source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts"
 
-  role_name = "${var.deployment_name}-cluster-autoscaler"
+  name = "${var.deployment_name}-cluster-autoscaler"
   attach_cluster_autoscaler_policy = true
   cluster_autoscaler_cluster_names = [module.eks.cluster_name]
 
@@ -47,18 +47,18 @@ module "eks" {
   # https://github.com/terraform-aws-modules/terraform-aws-eks/tree/master/docs
 
   source = "terraform-aws-modules/eks/aws"
-  version = "~> 20.37.1"
+  version = "~> 21.1.0"
   # version = var.eks_module_version
 
-  cluster_name = var.deployment_name
-  cluster_version = var.k8s_cluster_version
+  name = var.deployment_name
+  kubernetes_version = var.k8s_cluster_version
 
-  cluster_endpoint_public_access = true
-  cluster_endpoint_public_access_cidrs = var.k8s_public_access_cidrs
+  endpoint_public_access = true
+  endpoint_public_access_cidrs = var.k8s_public_access_cidrs
 
   enable_irsa = true
 
-  cluster_addons = {
+  addons = {
     coredns = {
       most_recent = true
     },
@@ -91,15 +91,7 @@ module "eks" {
   authentication_mode = "API"
 
   # Self Managed Node Group(s)
-  self_managed_node_group_defaults = var.self_managed_node_grp_default
-
   self_managed_node_groups = var.self_managed_node_grps
-
-  # EKS Managed Node Group(s)
-  eks_managed_node_group_defaults = {
-    instance_types = var.managed_node_grp_default
-  }
-
   eks_managed_node_groups = var.managed_node_grps
 
   # access_entries = {
```
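
Because of the module source and argument renames above, a typical rollout is sketched below (assuming Terraform is run from the directory that instantiates these modules; review the plan carefully, since node groups will be replaced):

```bash
# fetch the upgraded module versions referenced above
terraform init -upgrade

# review the role renames and node replacements before applying
terraform plan

# apply once the plan looks as expected
terraform apply
```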

modules/eks/outputs.tf

Lines changed: 15 additions & 15 deletions
```diff
@@ -3,11 +3,11 @@ output "cluster_name" {
 }
 
 output "k8s_load_balancer_controller_role_arn" {
-  value = module.k8s_load_balancer_controller_role.iam_role_arn
+  value = module.k8s_load_balancer_controller_role.arn
 }
 
 output "cluster_scaler_role_arn" {
-  value = module.cluster_autoscaler_role.iam_role_arn
+  value = module.cluster_autoscaler_role.arn
 }
 
 output "control_plane_security_group_id" {
@@ -20,7 +20,7 @@ output "cluster_endpoint" {
 
 # dfshell
 output "dfshell_role_arn" {
-  value = module.dfshell_role[0].iam_role_arn
+  value = module.dfshell_role[0].arn
   description = "The ARN of the AWS Bedrock role"
 }
 output "dfshell_service_account_name" {
@@ -30,7 +30,7 @@ output "dfshell_service_account_name" {
 
 # worker_portal
 output "worker_portal_role_arn" {
-  value = module.worker_portal_role[0].iam_role_arn
+  value = module.worker_portal_role[0].arn
   description = "The ARN of the AWS Bedrock role"
 }
 output "worker_portal_service_account_name" {
@@ -40,7 +40,7 @@ output "worker_portal_service_account_name" {
 
 # operator
 output "operator_role_arn" {
-  value = module.operator_role[0].iam_role_arn
+  value = module.operator_role[0].arn
   description = "The ARN of the AWS Bedrock role"
 }
 output "operator_service_account_name" {
@@ -50,7 +50,7 @@ output "operator_service_account_name" {
 
 # server
 output "server_role_arn" {
-  value = module.server_role[0].iam_role_arn
+  value = module.server_role[0].arn
   description = "The ARN of the AWS Bedrock role"
 }
 output "server_service_account_name" {
@@ -60,7 +60,7 @@ output "server_service_account_name" {
 
 # scheduler
 output "scheduler_role_arn" {
-  value = module.scheduler_role[0].iam_role_arn
+  value = module.scheduler_role[0].arn
   description = "The ARN of the AWS Bedrock role"
 }
 output "scheduler_service_account_name" {
@@ -70,7 +70,7 @@ output "scheduler_service_account_name" {
 
 # worker, worker1, worker2 etc.
 output "worker_role_arn" {
-  value = module.worker_role[0].iam_role_arn
+  value = module.worker_role[0].arn
   description = "The ARN of the AWS Bedrock role"
 }
 output "worker_service_account_name" {
@@ -80,7 +80,7 @@ output "worker_service_account_name" {
 
 # worker_catalog
 output "worker_catalog_role_arn" {
-  value = module.worker_catalog_role[0].iam_role_arn
+  value = module.worker_catalog_role[0].arn
   description = "The ARN of the AWS Bedrock role"
 }
 output "worker_catalog_service_account_name" {
@@ -90,7 +90,7 @@ output "worker_catalog_service_account_name" {
 
 # worker_interactive
 output "worker_interactive_role_arn" {
-  value = module.worker_interactive_role[0].iam_role_arn
+  value = module.worker_interactive_role[0].arn
   description = "The ARN of the AWS Bedrock role"
 }
 output "worker_interactive_service_account_name" {
@@ -100,7 +100,7 @@ output "worker_interactive_service_account_name" {
 
 # worker_singletons
 output "worker_singletons_role_arn" {
-  value = module.worker_singletons_role[0].iam_role_arn
+  value = module.worker_singletons_role[0].arn
   description = "The ARN of the AWS Bedrock role"
 }
 output "worker_singletons_service_account_name" {
@@ -110,7 +110,7 @@ output "worker_singletons_service_account_name" {
 
 # worker_lineage
 output "worker_lineage_role_arn" {
-  value = module.worker_lineage_role[0].iam_role_arn
+  value = module.worker_lineage_role[0].arn
   description = "The ARN of the AWS Bedrock role"
 }
 output "worker_lineage_service_account_name" {
@@ -120,7 +120,7 @@ output "worker_lineage_service_account_name" {
 
 # worker_monitor
 output "worker_monitor_role_arn" {
-  value = module.worker_monitor_role[0].iam_role_arn
+  value = module.worker_monitor_role[0].arn
   description = "The ARN of the AWS Bedrock role"
 }
 output "worker_monitor_service_account_name" {
@@ -130,7 +130,7 @@ output "worker_monitor_service_account_name" {
 
 # storage_worker
 output "storage_worker_role_arn" {
-  value = module.storage_worker_role[0].iam_role_arn
+  value = module.storage_worker_role[0].arn
   description = "The ARN of the AWS Bedrock role"
 }
 output "storage_worker_service_account_name" {
@@ -140,6 +140,6 @@ output "storage_worker_service_account_name" {
 
 # Clickhouse backup
 output "clickhouse_backup_role_name" {
-  value = module.clickhouse_backup_role.iam_role_arn
+  value = module.clickhouse_backup_role.arn
   description = "The name of the role for clickhouse backups"
 }
```
