Skip to content

Commit 3abf44e

Browse files
Adam-D-Lewis, claude, viniciusdc
authored
Migrate AWS EKS AMI from Amazon Linux 2 to Amazon Linux 2023 (#3166)
Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: Vinicius D. Cerutti <51954708+viniciusdc@users.noreply.github.com>
1 parent 423dfc9 commit 3abf44e

File tree

14 files changed

+123
-10
lines changed

14 files changed

+123
-10
lines changed

src/_nebari/stages/infrastructure/__init__.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -169,8 +169,8 @@ class AzureInputVars(schema.Base):
169169

170170

171171
class AWSAmiTypes(str, enum.Enum):
172-
AL2_x86_64 = "AL2_x86_64"
173-
AL2_x86_64_GPU = "AL2_x86_64_GPU"
172+
AL2023_x86_64_STANDARD = "AL2023_x86_64_STANDARD"
173+
AL2023_x86_64_NVIDIA = "AL2023_x86_64_NVIDIA"
174174
CUSTOM = "CUSTOM"
175175

176176

@@ -219,19 +219,19 @@ def construct_aws_ami_type(
219219
220220
Returns the AMI type (str) determined by the following rules:
221221
- Returns "CUSTOM" if a `launch_template` is provided and it includes a valid `ami_id`.
222-
- Returns "AL2_x86_64_GPU" if `gpu_enabled` is True and no valid
222+
- Returns "AL2023_x86_64_NVIDIA" if `gpu_enabled` is True and no valid
223223
`launch_template` is provided (None).
224-
- Returns "AL2_x86_64" as the default AMI type if `gpu_enabled` is False and no
224+
- Returns "AL2023_x86_64_STANDARD" as the default AMI type if `gpu_enabled` is False and no
225225
valid `launch_template` is provided (None).
226226
"""
227227

228228
if launch_template and getattr(launch_template, "ami_id", None):
229229
return "CUSTOM"
230230

231231
if gpu_enabled:
232-
return "AL2_x86_64_GPU"
232+
return "AL2023_x86_64_NVIDIA"
233233

234-
return "AL2_x86_64"
234+
return "AL2023_x86_64_STANDARD"
235235

236236

237237
class AWSInputVars(schema.Base):

src/_nebari/stages/infrastructure/template/aws/main.tf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ module "kubernetes" {
8787
tags = local.additional_tags
8888
region = var.region
8989
kubernetes_version = var.kubernetes_version
90+
environment = var.environment
9091

9192
cluster_subnets = local.subnet_ids
9293
cluster_security_groups = [local.security_group_id]

src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/autoscaling.tf

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,11 @@ data "aws_iam_policy_document" "worker_autoscaling" {
1414
"autoscaling:DescribeAutoScalingInstances",
1515
"autoscaling:DescribeLaunchConfigurations",
1616
"autoscaling:DescribeTags",
17+
"ec2:DescribeImages",
18+
"ec2:DescribeInstanceTypes",
1719
"ec2:DescribeLaunchTemplateVersions",
20+
"ec2:GetInstanceTypesFromInstanceRequirements",
21+
"eks:DescribeNodegroup",
1822
]
1923

2024
resources = ["*"]

src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/locals.tf

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,7 @@ locals {
88
node_group_policies = concat([
99
"arn:${local.partition}:iam::aws:policy/AmazonEKSWorkerNodePolicy",
1010
"arn:${local.partition}:iam::aws:policy/AmazonEKS_CNI_Policy",
11-
"arn:${local.partition}:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy",
12-
aws_iam_policy.worker_autoscaling.arn
11+
"arn:${local.partition}:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy"
1312
], var.node_group_additional_policies)
1413

1514
gpu_node_group_names = [for node_group in var.node_groups : node_group.name if node_group.gpu == true]

src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,3 +206,71 @@ resource "aws_iam_openid_connect_provider" "oidc_provider" {
206206
var.tags
207207
)
208208
}
209+
210+
# IAM role for EBS CSI driver using IRSA
211+
resource "aws_iam_role" "ebs_csi_driver" {
212+
name = "${var.name}-ebs-csi-driver"
213+
214+
# Trust policy - allows the Kubernetes service account to assume this role via OIDC
215+
assume_role_policy = jsonencode({
216+
Version = "2012-10-17"
217+
Statement = [{
218+
Effect = "Allow"
219+
Principal = {
220+
Federated = aws_iam_openid_connect_provider.oidc_provider.arn
221+
}
222+
Action = "sts:AssumeRoleWithWebIdentity"
223+
Condition = {
224+
StringEquals = {
225+
"${replace(aws_eks_cluster.main.identity[0].oidc[0].issuer, "https://", "")}:sub" = "system:serviceaccount:kube-system:ebs-csi-controller-sa"
226+
"${replace(aws_eks_cluster.main.identity[0].oidc[0].issuer, "https://", "")}:aud" = "sts.amazonaws.com"
227+
}
228+
}
229+
}]
230+
})
231+
232+
tags = merge(
233+
{ Name = "${var.name}-ebs-csi-driver" },
234+
var.tags
235+
)
236+
}
237+
238+
# Attach the AWS managed policy for EBS CSI driver
239+
resource "aws_iam_role_policy_attachment" "ebs_csi_driver" {
240+
role = aws_iam_role.ebs_csi_driver.name
241+
policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy"
242+
}
243+
244+
# IAM role for Cluster Autoscaler using IRSA
245+
resource "aws_iam_role" "cluster_autoscaler" {
246+
name = "${var.name}-cluster-autoscaler"
247+
248+
# Trust policy - allows the Kubernetes service account to assume this role via OIDC
249+
assume_role_policy = jsonencode({
250+
Version = "2012-10-17"
251+
Statement = [{
252+
Effect = "Allow"
253+
Principal = {
254+
Federated = aws_iam_openid_connect_provider.oidc_provider.arn
255+
}
256+
Action = "sts:AssumeRoleWithWebIdentity"
257+
Condition = {
258+
StringEquals = {
259+
"${replace(aws_eks_cluster.main.identity[0].oidc[0].issuer, "https://", "")}:sub" = "system:serviceaccount:${var.environment}:cluster-autoscaler"
260+
"${replace(aws_eks_cluster.main.identity[0].oidc[0].issuer, "https://", "")}:aud" = "sts.amazonaws.com"
261+
}
262+
}
263+
}]
264+
})
265+
266+
tags = merge(
267+
{ Name = "${var.name}-cluster-autoscaler" },
268+
var.tags
269+
)
270+
}
271+
272+
# Attach the autoscaling policy to Cluster Autoscaler role
273+
resource "aws_iam_role_policy_attachment" "cluster_autoscaler" {
274+
role = aws_iam_role.cluster_autoscaler.name
275+
policy_arn = aws_iam_policy.worker_autoscaling.arn
276+
}

src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/outputs.tf

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,11 @@ output "oidc_provider_arn" {
2323
value = aws_iam_openid_connect_provider.oidc_provider.arn
2424
}
2525

26+
output "cluster_autoscaler_role_arn" {
27+
description = "IAM role ARN for Cluster Autoscaler (IRSA)"
28+
value = aws_iam_role.cluster_autoscaler.arn
29+
}
30+
2631
# https://github.com/terraform-aws-modules/terraform-aws-eks/blob/16f46db94b7158fd762d9133119206aaa7cf6d63/examples/self_managed_node_group/main.tf
2732
output "kubeconfig" {
2833
description = "Kubernetes connection configuration kubeconfig"

src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,3 +94,8 @@ variable "permissions_boundary" {
9494
type = string
9595
default = null
9696
}
97+
98+
variable "environment" {
99+
description = "Namespace/environment for Kubernetes resources (used in IRSA trust policies)"
100+
type = string
101+
}

src/_nebari/stages/infrastructure/template/aws/outputs.tf

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,3 +34,8 @@ output "oidc_provider_arn" {
3434
description = "The ARN of the OIDC Provider"
3535
value = module.kubernetes.oidc_provider_arn
3636
}
37+
38+
output "cluster_autoscaler_role_arn" {
39+
description = "IAM role ARN for Cluster Autoscaler (IRSA)"
40+
value = module.kubernetes.cluster_autoscaler_role_arn
41+
}

src/_nebari/stages/infrastructure/template/aws/versions.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ terraform {
22
required_providers {
33
aws = {
44
source = "hashicorp/aws"
5-
version = "5.33.0"
5+
version = "6.18.0"
66
}
77
}
88
required_version = ">= 1.0"

src/_nebari/stages/kubernetes_initialize/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ class InputVars(schema.Base):
4545
external_container_reg: Optional[ExtContainerReg] = None
4646
gpu_enabled: bool = False
4747
gpu_node_group_names: List[str] = []
48+
cluster_autoscaler_role_arn: Optional[str] = None
4849

4950

5051
class InputSchema(schema.Base):
@@ -94,6 +95,13 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]):
9495
if self.config.amazon_web_services.node_groups[group].gpu
9596
]
9697
input_vars.aws_region = self.config.amazon_web_services.region
98+
# Get the Cluster Autoscaler IAM role ARN from infrastructure stage output
99+
if "stages/02-infrastructure" in stage_outputs:
100+
input_vars.cluster_autoscaler_role_arn = (
101+
stage_outputs["stages/02-infrastructure"]
102+
.get("cluster_autoscaler_role_arn", {})
103+
.get("value", "")
104+
)
97105

98106
return input_vars.model_dump()
99107

0 commit comments

Comments
 (0)