From 4754db97a36bc561e53ba0f4dbf1292c79af6bea Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 23 Oct 2025 14:45:45 +0000 Subject: [PATCH 1/3] Migrate AWS EKS AMI from Amazon Linux 2 to Amazon Linux 2023 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AWS is deprecating Amazon Linux 2 (AL2) AMIs for EKS after November 26, 2025. Kubernetes 1.32 is the last version that will support AL2 AMIs. From version 1.33 onwards, only AL2023 and Bottlerocket AMIs will be available. This change updates the default AMI types for EKS node groups: - AL2_x86_64 → AL2023_x86_64 - AL2_x86_64_GPU → AL2023_x86_64_GPU References: - https://docs.aws.amazon.com/eks/latest/userguide/eks-ami-deprecation-faqs.html 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- src/_nebari/stages/infrastructure/__init__.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/_nebari/stages/infrastructure/__init__.py b/src/_nebari/stages/infrastructure/__init__.py index fb1d45d53..451eb0461 100644 --- a/src/_nebari/stages/infrastructure/__init__.py +++ b/src/_nebari/stages/infrastructure/__init__.py @@ -169,8 +169,8 @@ class AzureInputVars(schema.Base): class AWSAmiTypes(str, enum.Enum): - AL2_x86_64 = "AL2_x86_64" - AL2_x86_64_GPU = "AL2_x86_64_GPU" + AL2023_x86_64 = "AL2023_x86_64" + AL2023_x86_64_GPU = "AL2023_x86_64_GPU" CUSTOM = "CUSTOM" @@ -219,9 +219,9 @@ def construct_aws_ami_type( Returns the AMI type (str) determined by the following rules: - Returns "CUSTOM" if a `launch_template` is provided and it includes a valid `ami_id`. - - Returns "AL2_x86_64_GPU" if `gpu_enabled` is True and no valid + - Returns "AL2023_x86_64_GPU" if `gpu_enabled` is True and no valid `launch_template` is provided (None). - - Returns "AL2_x86_64" as the default AMI type if `gpu_enabled` is False and no + - Returns "AL2023_x86_64" as the default AMI type if `gpu_enabled` is False and no valid `launch_template` is provided (None). """ @@ -229,9 +229,9 @@ def construct_aws_ami_type( return "CUSTOM" if gpu_enabled: - return "AL2_x86_64_GPU" + return "AL2023_x86_64_GPU" - return "AL2_x86_64" + return "AL2023_x86_64" class AWSInputVars(schema.Base): From 25698ed4938aa8a113bdde33253442ee0cabb7a9 Mon Sep 17 00:00:00 2001 From: Adam Lewis <23342526+Adam-D-Lewis@users.noreply.github.com> Date: Mon, 27 Oct 2025 14:23:17 -0500 Subject: [PATCH 2/3] correct AMI types and bump aws terraform version --- src/_nebari/stages/infrastructure/__init__.py | 12 ++++++------ .../stages/infrastructure/template/aws/versions.tf | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/_nebari/stages/infrastructure/__init__.py b/src/_nebari/stages/infrastructure/__init__.py index 451eb0461..802fe3153 100644 --- a/src/_nebari/stages/infrastructure/__init__.py +++ b/src/_nebari/stages/infrastructure/__init__.py @@ -169,8 +169,8 @@ class AzureInputVars(schema.Base): class AWSAmiTypes(str, enum.Enum): - AL2023_x86_64 = "AL2023_x86_64" - AL2023_x86_64_GPU = "AL2023_x86_64_GPU" + AL2023_x86_64_STANDARD = "AL2023_x86_64_STANDARD" + AL2023_x86_64_NVIDIA = "AL2023_x86_64_NVIDIA" CUSTOM = "CUSTOM" @@ -219,9 +219,9 @@ def construct_aws_ami_type( Returns the AMI type (str) determined by the following rules: - Returns "CUSTOM" if a `launch_template` is provided and it includes a valid `ami_id`. - - Returns "AL2023_x86_64_GPU" if `gpu_enabled` is True and no valid + - Returns "AL2023_x86_64_NVIDIA" if `gpu_enabled` is True and no valid `launch_template` is provided (None). - - Returns "AL2023_x86_64" as the default AMI type if `gpu_enabled` is False and no + - Returns "AL2023_x86_64_STANDARD" as the default AMI type if `gpu_enabled` is False and no valid `launch_template` is provided (None). """ @@ -229,9 +229,9 @@ def construct_aws_ami_type( return "CUSTOM" if gpu_enabled: - return "AL2023_x86_64_GPU" + return "AL2023_x86_64_NVIDIA" - return "AL2023_x86_64" + return "AL2023_x86_64_STANDARD" class AWSInputVars(schema.Base): diff --git a/src/_nebari/stages/infrastructure/template/aws/versions.tf b/src/_nebari/stages/infrastructure/template/aws/versions.tf index 68c0faf27..d349ed31b 100644 --- a/src/_nebari/stages/infrastructure/template/aws/versions.tf +++ b/src/_nebari/stages/infrastructure/template/aws/versions.tf @@ -2,7 +2,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = "5.33.0" + version = "6.18.0" } } required_version = ">= 1.0" From 8b5aa3aa49dc71af4a926e7da9e887e9800d546e Mon Sep 17 00:00:00 2001 From: Adam Lewis <23342526+Adam-D-Lewis@users.noreply.github.com> Date: Mon, 27 Oct 2025 18:09:09 -0500 Subject: [PATCH 3/3] add autoscaler IRSA --- .../infrastructure/template/aws/main.tf | 1 + .../aws/modules/kubernetes/autoscaling.tf | 4 ++ .../template/aws/modules/kubernetes/locals.tf | 3 +- .../template/aws/modules/kubernetes/main.tf | 68 +++++++++++++++++++ .../aws/modules/kubernetes/outputs.tf | 5 ++ .../aws/modules/kubernetes/variables.tf | 5 ++ .../infrastructure/template/aws/outputs.tf | 5 ++ .../stages/kubernetes_initialize/__init__.py | 8 +++ .../kubernetes_initialize/template/main.tf | 1 + .../modules/cluster-autoscaler/main.tf | 8 ++- .../modules/cluster-autoscaler/variables.tf | 5 ++ .../template/variables.tf | 6 ++ 12 files changed, 116 insertions(+), 3 deletions(-) diff --git a/src/_nebari/stages/infrastructure/template/aws/main.tf b/src/_nebari/stages/infrastructure/template/aws/main.tf index ec0cbb660..9d1f51b56 100644 --- a/src/_nebari/stages/infrastructure/template/aws/main.tf +++ b/src/_nebari/stages/infrastructure/template/aws/main.tf @@ -87,6 +87,7 @@ module "kubernetes" { tags = local.additional_tags region = var.region kubernetes_version = var.kubernetes_version + environment = var.environment cluster_subnets = local.subnet_ids cluster_security_groups = [local.security_group_id] diff --git a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/autoscaling.tf b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/autoscaling.tf index 1d642208f..7989b0195 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/autoscaling.tf +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/autoscaling.tf @@ -14,7 +14,11 @@ data "aws_iam_policy_document" "worker_autoscaling" { "autoscaling:DescribeAutoScalingInstances", "autoscaling:DescribeLaunchConfigurations", "autoscaling:DescribeTags", + "ec2:DescribeImages", + "ec2:DescribeInstanceTypes", "ec2:DescribeLaunchTemplateVersions", + "ec2:GetInstanceTypesFromInstanceRequirements", + "eks:DescribeNodegroup", ] resources = ["*"] diff --git a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/locals.tf b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/locals.tf index f260091dc..90d683cb9 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/locals.tf +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/locals.tf @@ -8,8 +8,7 @@ locals { node_group_policies = concat([ "arn:${local.partition}:iam::aws:policy/AmazonEKSWorkerNodePolicy", "arn:${local.partition}:iam::aws:policy/AmazonEKS_CNI_Policy", - "arn:${local.partition}:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy", - aws_iam_policy.worker_autoscaling.arn + "arn:${local.partition}:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy" ], var.node_group_additional_policies) gpu_node_group_names = [for node_group in var.node_groups : node_group.name if node_group.gpu == true] diff --git a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf index b0db3ac4a..f48768f05 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf @@ -206,3 +206,71 @@ resource "aws_iam_openid_connect_provider" "oidc_provider" { var.tags ) } + +# IAM role for EBS CSI driver using IRSA +resource "aws_iam_role" "ebs_csi_driver" { + name = "${var.name}-ebs-csi-driver" + + # Trust policy - allows the Kubernetes service account to assume this role via OIDC + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Principal = { + Federated = aws_iam_openid_connect_provider.oidc_provider.arn + } + Action = "sts:AssumeRoleWithWebIdentity" + Condition = { + StringEquals = { + "${replace(aws_eks_cluster.main.identity[0].oidc[0].issuer, "https://", "")}:sub" = "system:serviceaccount:kube-system:ebs-csi-controller-sa" + "${replace(aws_eks_cluster.main.identity[0].oidc[0].issuer, "https://", "")}:aud" = "sts.amazonaws.com" + } + } + }] + }) + + tags = merge( + { Name = "${var.name}-ebs-csi-driver" }, + var.tags + ) +} + +# Attach the AWS managed policy for EBS CSI driver +resource "aws_iam_role_policy_attachment" "ebs_csi_driver" { + role = aws_iam_role.ebs_csi_driver.name + policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy" +} + +# IAM role for Cluster Autoscaler using IRSA +resource "aws_iam_role" "cluster_autoscaler" { + name = "${var.name}-cluster-autoscaler" + + # Trust policy - allows the Kubernetes service account to assume this role via OIDC + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Principal = { + Federated = aws_iam_openid_connect_provider.oidc_provider.arn + } + Action = "sts:AssumeRoleWithWebIdentity" + Condition = { + StringEquals = { + "${replace(aws_eks_cluster.main.identity[0].oidc[0].issuer, "https://", "")}:sub" = "system:serviceaccount:${var.environment}:cluster-autoscaler" + "${replace(aws_eks_cluster.main.identity[0].oidc[0].issuer, "https://", "")}:aud" = "sts.amazonaws.com" + } + } + }] + }) + + tags = merge( + { Name = "${var.name}-cluster-autoscaler" }, + var.tags + ) +} + +# Attach the autoscaling policy to Cluster Autoscaler role +resource "aws_iam_role_policy_attachment" "cluster_autoscaler" { + role = aws_iam_role.cluster_autoscaler.name + policy_arn = aws_iam_policy.worker_autoscaling.arn +} diff --git a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/outputs.tf b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/outputs.tf index 48994bc7a..cb5613a91 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/outputs.tf +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/outputs.tf @@ -23,6 +23,11 @@ output "oidc_provider_arn" { value = aws_iam_openid_connect_provider.oidc_provider.arn } +output "cluster_autoscaler_role_arn" { + description = "IAM role ARN for Cluster Autoscaler (IRSA)" + value = aws_iam_role.cluster_autoscaler.arn +} + # https://github.com/terraform-aws-modules/terraform-aws-eks/blob/16f46db94b7158fd762d9133119206aaa7cf6d63/examples/self_managed_node_group/main.tf output "kubeconfig" { description = "Kubernetes connection configuration kubeconfig" diff --git a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf index bea46a468..e5486591b 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf @@ -94,3 +94,8 @@ variable "permissions_boundary" { type = string default = null } + +variable "environment" { + description = "Namespace/environment for Kubernetes resources (used in IRSA trust policies)" + type = string +} diff --git a/src/_nebari/stages/infrastructure/template/aws/outputs.tf b/src/_nebari/stages/infrastructure/template/aws/outputs.tf index 9c1113949..355b8ca9f 100644 --- a/src/_nebari/stages/infrastructure/template/aws/outputs.tf +++ b/src/_nebari/stages/infrastructure/template/aws/outputs.tf @@ -34,3 +34,8 @@ output "oidc_provider_arn" { description = "The ARN of the OIDC Provider" value = module.kubernetes.oidc_provider_arn } + +output "cluster_autoscaler_role_arn" { + description = "IAM role ARN for Cluster Autoscaler (IRSA)" + value = module.kubernetes.cluster_autoscaler_role_arn +} diff --git a/src/_nebari/stages/kubernetes_initialize/__init__.py b/src/_nebari/stages/kubernetes_initialize/__init__.py index 491c41b76..2aede705d 100644 --- a/src/_nebari/stages/kubernetes_initialize/__init__.py +++ b/src/_nebari/stages/kubernetes_initialize/__init__.py @@ -45,6 +45,7 @@ class InputVars(schema.Base): external_container_reg: Optional[ExtContainerReg] = None gpu_enabled: bool = False gpu_node_group_names: List[str] = [] + cluster_autoscaler_role_arn: Optional[str] = None class InputSchema(schema.Base): @@ -94,6 +95,13 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]): if self.config.amazon_web_services.node_groups[group].gpu ] input_vars.aws_region = self.config.amazon_web_services.region + # Get the Cluster Autoscaler IAM role ARN from infrastructure stage output + if "stages/02-infrastructure" in stage_outputs: + input_vars.cluster_autoscaler_role_arn = ( + stage_outputs["stages/02-infrastructure"] + .get("cluster_autoscaler_role_arn", {}) + .get("value", "") + ) return input_vars.model_dump() diff --git a/src/_nebari/stages/kubernetes_initialize/template/main.tf b/src/_nebari/stages/kubernetes_initialize/template/main.tf index 402c68fb3..edc9b8995 100644 --- a/src/_nebari/stages/kubernetes_initialize/template/main.tf +++ b/src/_nebari/stages/kubernetes_initialize/template/main.tf @@ -14,6 +14,7 @@ module "kubernetes-autoscaling" { aws_region = var.aws_region cluster-name = local.cluster_name + iam_role_arn = var.cluster_autoscaler_role_arn } module "traefik-crds" { diff --git a/src/_nebari/stages/kubernetes_initialize/template/modules/cluster-autoscaler/main.tf b/src/_nebari/stages/kubernetes_initialize/template/modules/cluster-autoscaler/main.tf index c07edd70d..cad276a7e 100644 --- a/src/_nebari/stages/kubernetes_initialize/template/modules/cluster-autoscaler/main.tf +++ b/src/_nebari/stages/kubernetes_initialize/template/modules/cluster-autoscaler/main.tf @@ -4,12 +4,18 @@ resource "helm_release" "autoscaler" { repository = "https://kubernetes.github.io/autoscaler" chart = "cluster-autoscaler" - version = "9.19.0" + version = "9.52.1" values = concat([ jsonencode({ rbac = { create = true + serviceAccount = { + name = "cluster-autoscaler" + annotations = { + "eks.amazonaws.com/role-arn" = var.iam_role_arn + } + } } cloudProvider = "aws" diff --git a/src/_nebari/stages/kubernetes_initialize/template/modules/cluster-autoscaler/variables.tf b/src/_nebari/stages/kubernetes_initialize/template/modules/cluster-autoscaler/variables.tf index a7169abee..b6f57e17a 100644 --- a/src/_nebari/stages/kubernetes_initialize/template/modules/cluster-autoscaler/variables.tf +++ b/src/_nebari/stages/kubernetes_initialize/template/modules/cluster-autoscaler/variables.tf @@ -18,3 +18,8 @@ variable "overrides" { type = list(string) default = [] } + +variable "iam_role_arn" { + description = "IAM role ARN for Cluster Autoscaler (IRSA)" + type = string +} diff --git a/src/_nebari/stages/kubernetes_initialize/template/variables.tf b/src/_nebari/stages/kubernetes_initialize/template/variables.tf index f169f5bcf..aded2479c 100644 --- a/src/_nebari/stages/kubernetes_initialize/template/variables.tf +++ b/src/_nebari/stages/kubernetes_initialize/template/variables.tf @@ -30,3 +30,9 @@ variable "gpu_enabled" { variable "gpu_node_group_names" { description = "Names of node groups with GPU" } + +variable "cluster_autoscaler_role_arn" { + description = "IAM role ARN for Cluster Autoscaler (IRSA)" + type = string + default = "" +}