diff --git a/src/_nebari/stages/infrastructure/__init__.py b/src/_nebari/stages/infrastructure/__init__.py index fb1d45d530..802fe3153b 100644 --- a/src/_nebari/stages/infrastructure/__init__.py +++ b/src/_nebari/stages/infrastructure/__init__.py @@ -169,8 +169,8 @@ class AzureInputVars(schema.Base): class AWSAmiTypes(str, enum.Enum): - AL2_x86_64 = "AL2_x86_64" - AL2_x86_64_GPU = "AL2_x86_64_GPU" + AL2023_x86_64_STANDARD = "AL2023_x86_64_STANDARD" + AL2023_x86_64_NVIDIA = "AL2023_x86_64_NVIDIA" CUSTOM = "CUSTOM" @@ -219,9 +219,9 @@ def construct_aws_ami_type( Returns the AMI type (str) determined by the following rules: - Returns "CUSTOM" if a `launch_template` is provided and it includes a valid `ami_id`. - - Returns "AL2_x86_64_GPU" if `gpu_enabled` is True and no valid + - Returns "AL2023_x86_64_NVIDIA" if `gpu_enabled` is True and no valid `launch_template` is provided (None). - - Returns "AL2_x86_64" as the default AMI type if `gpu_enabled` is False and no + - Returns "AL2023_x86_64_STANDARD" as the default AMI type if `gpu_enabled` is False and no valid `launch_template` is provided (None). 
""" @@ -229,9 +229,9 @@ def construct_aws_ami_type( return "CUSTOM" if gpu_enabled: - return "AL2_x86_64_GPU" + return "AL2023_x86_64_NVIDIA" - return "AL2_x86_64" + return "AL2023_x86_64_STANDARD" class AWSInputVars(schema.Base): diff --git a/src/_nebari/stages/infrastructure/template/aws/main.tf b/src/_nebari/stages/infrastructure/template/aws/main.tf index ec0cbb6606..9d1f51b569 100644 --- a/src/_nebari/stages/infrastructure/template/aws/main.tf +++ b/src/_nebari/stages/infrastructure/template/aws/main.tf @@ -87,6 +87,7 @@ module "kubernetes" { tags = local.additional_tags region = var.region kubernetes_version = var.kubernetes_version + environment = var.environment cluster_subnets = local.subnet_ids cluster_security_groups = [local.security_group_id] diff --git a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/autoscaling.tf b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/autoscaling.tf index 1d642208fb..7989b01950 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/autoscaling.tf +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/autoscaling.tf @@ -14,7 +14,11 @@ data "aws_iam_policy_document" "worker_autoscaling" { "autoscaling:DescribeAutoScalingInstances", "autoscaling:DescribeLaunchConfigurations", "autoscaling:DescribeTags", + "ec2:DescribeImages", + "ec2:DescribeInstanceTypes", "ec2:DescribeLaunchTemplateVersions", + "ec2:GetInstanceTypesFromInstanceRequirements", + "eks:DescribeNodegroup", ] resources = ["*"] diff --git a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/locals.tf b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/locals.tf index f260091dcb..90d683cb93 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/locals.tf +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/locals.tf @@ -8,8 +8,7 @@ locals { node_group_policies = concat([ 
"arn:${local.partition}:iam::aws:policy/AmazonEKSWorkerNodePolicy", "arn:${local.partition}:iam::aws:policy/AmazonEKS_CNI_Policy", - "arn:${local.partition}:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy", - aws_iam_policy.worker_autoscaling.arn + "arn:${local.partition}:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy" ], var.node_group_additional_policies) gpu_node_group_names = [for node_group in var.node_groups : node_group.name if node_group.gpu == true] diff --git a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf index b0db3ac4a1..f48768f058 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf @@ -206,3 +206,71 @@ resource "aws_iam_openid_connect_provider" "oidc_provider" { var.tags ) } + +# IAM role for EBS CSI driver using IRSA +resource "aws_iam_role" "ebs_csi_driver" { + name = "${var.name}-ebs-csi-driver" + + # Trust policy - allows the Kubernetes service account to assume this role via OIDC + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Principal = { + Federated = aws_iam_openid_connect_provider.oidc_provider.arn + } + Action = "sts:AssumeRoleWithWebIdentity" + Condition = { + StringEquals = { + "${replace(aws_eks_cluster.main.identity[0].oidc[0].issuer, "https://", "")}:sub" = "system:serviceaccount:kube-system:ebs-csi-controller-sa" + "${replace(aws_eks_cluster.main.identity[0].oidc[0].issuer, "https://", "")}:aud" = "sts.amazonaws.com" + } + } + }] + }) + + tags = merge( + { Name = "${var.name}-ebs-csi-driver" }, + var.tags + ) +} + +# Attach the AWS managed policy for EBS CSI driver (partition-aware ARN, matching node_group_policies in locals.tf) +resource "aws_iam_role_policy_attachment" "ebs_csi_driver" { + role = aws_iam_role.ebs_csi_driver.name + policy_arn = 
"arn:${local.partition}:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy" +} + +# IAM 
role for Cluster Autoscaler using IRSA +resource "aws_iam_role" "cluster_autoscaler" { + name = "${var.name}-cluster-autoscaler" + + # Trust policy - allows the Kubernetes service account to assume this role via OIDC + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Principal = { + Federated = aws_iam_openid_connect_provider.oidc_provider.arn + } + Action = "sts:AssumeRoleWithWebIdentity" + Condition = { + StringEquals = { + "${replace(aws_eks_cluster.main.identity[0].oidc[0].issuer, "https://", "")}:sub" = "system:serviceaccount:${var.environment}:cluster-autoscaler" + "${replace(aws_eks_cluster.main.identity[0].oidc[0].issuer, "https://", "")}:aud" = "sts.amazonaws.com" + } + } + }] + }) + + tags = merge( + { Name = "${var.name}-cluster-autoscaler" }, + var.tags + ) +} + +# Attach the autoscaling policy to Cluster Autoscaler role +resource "aws_iam_role_policy_attachment" "cluster_autoscaler" { + role = aws_iam_role.cluster_autoscaler.name + policy_arn = aws_iam_policy.worker_autoscaling.arn +} diff --git a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/outputs.tf b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/outputs.tf index 48994bc7a2..cb5613a919 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/outputs.tf +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/outputs.tf @@ -23,6 +23,11 @@ output "oidc_provider_arn" { value = aws_iam_openid_connect_provider.oidc_provider.arn } +output "cluster_autoscaler_role_arn" { + description = "IAM role ARN for Cluster Autoscaler (IRSA)" + value = aws_iam_role.cluster_autoscaler.arn +} + # https://github.com/terraform-aws-modules/terraform-aws-eks/blob/16f46db94b7158fd762d9133119206aaa7cf6d63/examples/self_managed_node_group/main.tf output "kubeconfig" { description = "Kubernetes connection configuration kubeconfig" diff --git 
a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf index bea46a468d..e5486591b1 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf @@ -94,3 +94,8 @@ variable "permissions_boundary" { type = string default = null } + +variable "environment" { + description = "Namespace/environment for Kubernetes resources (used in IRSA trust policies)" + type = string +} diff --git a/src/_nebari/stages/infrastructure/template/aws/outputs.tf b/src/_nebari/stages/infrastructure/template/aws/outputs.tf index 9c11139498..355b8ca9f3 100644 --- a/src/_nebari/stages/infrastructure/template/aws/outputs.tf +++ b/src/_nebari/stages/infrastructure/template/aws/outputs.tf @@ -34,3 +34,8 @@ output "oidc_provider_arn" { description = "The ARN of the OIDC Provider" value = module.kubernetes.oidc_provider_arn } + +output "cluster_autoscaler_role_arn" { + description = "IAM role ARN for Cluster Autoscaler (IRSA)" + value = module.kubernetes.cluster_autoscaler_role_arn +} diff --git a/src/_nebari/stages/infrastructure/template/aws/versions.tf b/src/_nebari/stages/infrastructure/template/aws/versions.tf index 68c0faf27b..d349ed31b7 100644 --- a/src/_nebari/stages/infrastructure/template/aws/versions.tf +++ b/src/_nebari/stages/infrastructure/template/aws/versions.tf @@ -2,7 +2,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = "5.33.0" + version = "6.18.0" } } required_version = ">= 1.0" diff --git a/src/_nebari/stages/kubernetes_initialize/__init__.py b/src/_nebari/stages/kubernetes_initialize/__init__.py index 491c41b767..2aede705df 100644 --- a/src/_nebari/stages/kubernetes_initialize/__init__.py +++ b/src/_nebari/stages/kubernetes_initialize/__init__.py @@ -45,6 +45,7 @@ class InputVars(schema.Base): external_container_reg: 
Optional[ExtContainerReg] = None gpu_enabled: bool = False gpu_node_group_names: List[str] = [] + cluster_autoscaler_role_arn: Optional[str] = None class InputSchema(schema.Base): @@ -94,6 +95,13 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]): if self.config.amazon_web_services.node_groups[group].gpu ] input_vars.aws_region = self.config.amazon_web_services.region + # Get the Cluster Autoscaler IAM role ARN from infrastructure stage output + if "stages/02-infrastructure" in stage_outputs: + input_vars.cluster_autoscaler_role_arn = ( + stage_outputs["stages/02-infrastructure"] + .get("cluster_autoscaler_role_arn", {}) + .get("value", "") + ) return input_vars.model_dump() diff --git a/src/_nebari/stages/kubernetes_initialize/template/main.tf b/src/_nebari/stages/kubernetes_initialize/template/main.tf index 402c68fb3f..edc9b89955 100644 --- a/src/_nebari/stages/kubernetes_initialize/template/main.tf +++ b/src/_nebari/stages/kubernetes_initialize/template/main.tf @@ -14,6 +14,7 @@ module "kubernetes-autoscaling" { aws_region = var.aws_region cluster-name = local.cluster_name + iam_role_arn = var.cluster_autoscaler_role_arn } module "traefik-crds" { diff --git a/src/_nebari/stages/kubernetes_initialize/template/modules/cluster-autoscaler/main.tf b/src/_nebari/stages/kubernetes_initialize/template/modules/cluster-autoscaler/main.tf index c07edd70dd..cad276a7ef 100644 --- a/src/_nebari/stages/kubernetes_initialize/template/modules/cluster-autoscaler/main.tf +++ b/src/_nebari/stages/kubernetes_initialize/template/modules/cluster-autoscaler/main.tf @@ -4,12 +4,18 @@ resource "helm_release" "autoscaler" { repository = "https://kubernetes.github.io/autoscaler" chart = "cluster-autoscaler" - version = "9.19.0" + version = "9.52.1" values = concat([ jsonencode({ rbac = { create = true + serviceAccount = { + name = "cluster-autoscaler" + annotations = { + "eks.amazonaws.com/role-arn" = var.iam_role_arn + } + } } cloudProvider = "aws" diff --git 
a/src/_nebari/stages/kubernetes_initialize/template/modules/cluster-autoscaler/variables.tf b/src/_nebari/stages/kubernetes_initialize/template/modules/cluster-autoscaler/variables.tf index a7169abeea..b6f57e17ac 100644 --- a/src/_nebari/stages/kubernetes_initialize/template/modules/cluster-autoscaler/variables.tf +++ b/src/_nebari/stages/kubernetes_initialize/template/modules/cluster-autoscaler/variables.tf @@ -18,3 +18,8 @@ variable "overrides" { type = list(string) default = [] } + +variable "iam_role_arn" { + description = "IAM role ARN for Cluster Autoscaler (IRSA)" + type = string +} diff --git a/src/_nebari/stages/kubernetes_initialize/template/variables.tf b/src/_nebari/stages/kubernetes_initialize/template/variables.tf index f169f5bcf2..aded2479c8 100644 --- a/src/_nebari/stages/kubernetes_initialize/template/variables.tf +++ b/src/_nebari/stages/kubernetes_initialize/template/variables.tf @@ -30,3 +30,9 @@ variable "gpu_enabled" { variable "gpu_node_group_names" { description = "Names of node groups with GPU" } + +variable "cluster_autoscaler_role_arn" { + description = "IAM role ARN for Cluster Autoscaler (IRSA)" + type = string + default = "" +}