Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions src/_nebari/stages/infrastructure/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,8 +169,8 @@ class AzureInputVars(schema.Base):


class AWSAmiTypes(str, enum.Enum):
AL2_x86_64 = "AL2_x86_64"
AL2_x86_64_GPU = "AL2_x86_64_GPU"
AL2023_x86_64_STANDARD = "AL2023_x86_64_STANDARD"
AL2023_x86_64_NVIDIA = "AL2023_x86_64_NVIDIA"
CUSTOM = "CUSTOM"


Expand Down Expand Up @@ -219,19 +219,19 @@ def construct_aws_ami_type(

Returns the AMI type (str) determined by the following rules:
- Returns "CUSTOM" if a `launch_template` is provided and it includes a valid `ami_id`.
- Returns "AL2_x86_64_GPU" if `gpu_enabled` is True and no valid
- Returns "AL2023_x86_64_NVIDIA" if `gpu_enabled` is True and no valid
`launch_template` is provided (None).
- Returns "AL2_x86_64" as the default AMI type if `gpu_enabled` is False and no
- Returns "AL2023_x86_64_STANDARD" as the default AMI type if `gpu_enabled` is False and no
valid `launch_template` is provided (None).
"""

if launch_template and getattr(launch_template, "ami_id", None):
return "CUSTOM"

if gpu_enabled:
return "AL2_x86_64_GPU"
return "AL2023_x86_64_NVIDIA"

return "AL2_x86_64"
return "AL2023_x86_64_STANDARD"


Comment on lines 228 to 236
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you check that NVIDIA works with the new AMIs?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently waiting for a quota increase to be approved, since we're now on the OT Nebari dev AWS account instead of QS.

class AWSInputVars(schema.Base):
Expand Down
1 change: 1 addition & 0 deletions src/_nebari/stages/infrastructure/template/aws/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ module "kubernetes" {
tags = local.additional_tags
region = var.region
kubernetes_version = var.kubernetes_version
environment = var.environment

cluster_subnets = local.subnet_ids
cluster_security_groups = [local.security_group_id]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,11 @@ data "aws_iam_policy_document" "worker_autoscaling" {
"autoscaling:DescribeAutoScalingInstances",
"autoscaling:DescribeLaunchConfigurations",
"autoscaling:DescribeTags",
"ec2:DescribeImages",
"ec2:DescribeInstanceTypes",
"ec2:DescribeLaunchTemplateVersions",
"ec2:GetInstanceTypesFromInstanceRequirements",
"eks:DescribeNodegroup",
]

resources = ["*"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@ locals {
node_group_policies = concat([
"arn:${local.partition}:iam::aws:policy/AmazonEKSWorkerNodePolicy",
"arn:${local.partition}:iam::aws:policy/AmazonEKS_CNI_Policy",
"arn:${local.partition}:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy",
aws_iam_policy.worker_autoscaling.arn
Copy link
Member Author

@Adam-D-Lewis Adam-D-Lewis Oct 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This policy is no longer needed on the node role; it is attached to the dedicated Cluster Autoscaler IRSA role instead.

"arn:${local.partition}:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy"
], var.node_group_additional_policies)

gpu_node_group_names = [for node_group in var.node_groups : node_group.name if node_group.gpu == true]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -206,3 +206,71 @@ resource "aws_iam_openid_connect_provider" "oidc_provider" {
var.tags
)
}

# IAM role for EBS CSI driver using IRSA (IAM Roles for Service Accounts).
# The role is named "<var.name>-ebs-csi-driver" and can only be assumed through
# the cluster's OIDC provider by the `ebs-csi-controller-sa` service account in
# the `kube-system` namespace.
resource "aws_iam_role" "ebs_csi_driver" {
  name = "${var.name}-ebs-csi-driver"

  # Trust policy - allows the Kubernetes service account to assume this role via OIDC
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Effect = "Allow"
      Principal = {
        # Federated principal: the OIDC identity provider registered for this cluster
        Federated = aws_iam_openid_connect_provider.oidc_provider.arn
      }
      Action = "sts:AssumeRoleWithWebIdentity"
      Condition = {
        StringEquals = {
          # Condition keys are built from the cluster's OIDC issuer URL with the
          # "https://" scheme stripped, suffixed with ":sub" / ":aud".
          # ":sub" pins the exact namespace and service account allowed to assume the role.
          "${replace(aws_eks_cluster.main.identity[0].oidc[0].issuer, "https://", "")}:sub" = "system:serviceaccount:kube-system:ebs-csi-controller-sa"
          # ":aud" requires the token audience to be STS.
          "${replace(aws_eks_cluster.main.identity[0].oidc[0].issuer, "https://", "")}:aud" = "sts.amazonaws.com"
        }
      }
    }]
  })

  # A Name tag is merged with the caller-supplied tags; on a key collision
  # the value from var.tags wins because it is the later argument to merge().
  tags = merge(
    { Name = "${var.name}-ebs-csi-driver" },
    var.tags
  )
}

# Attach the AWS managed policy for the EBS CSI driver to the IRSA role.
# The ARN partition comes from `local.partition`, as the node group policy ARNs
# in this module do, instead of a hard-coded "aws", so the attachment also
# resolves in other partitions such as aws-us-gov and aws-cn.
resource "aws_iam_role_policy_attachment" "ebs_csi_driver" {
  role       = aws_iam_role.ebs_csi_driver.name
  policy_arn = "arn:${local.partition}:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy"
}

# IAM role for Cluster Autoscaler using IRSA (IAM Roles for Service Accounts).
# The role is named "<var.name>-cluster-autoscaler" and can only be assumed
# through the cluster's OIDC provider by the `cluster-autoscaler` service
# account in the namespace given by `var.environment`.
resource "aws_iam_role" "cluster_autoscaler" {
  name = "${var.name}-cluster-autoscaler"

  # Trust policy - allows the Kubernetes service account to assume this role via OIDC
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Effect = "Allow"
      Principal = {
        # Federated principal: the OIDC identity provider registered for this cluster
        Federated = aws_iam_openid_connect_provider.oidc_provider.arn
      }
      Action = "sts:AssumeRoleWithWebIdentity"
      Condition = {
        StringEquals = {
          # Condition keys are built from the cluster's OIDC issuer URL with the
          # "https://" scheme stripped, suffixed with ":sub" / ":aud".
          # ":sub" pins the namespace (var.environment) and service account name;
          # a service account with a different name or namespace cannot assume the role.
          "${replace(aws_eks_cluster.main.identity[0].oidc[0].issuer, "https://", "")}:sub" = "system:serviceaccount:${var.environment}:cluster-autoscaler"
          # ":aud" requires the token audience to be STS.
          "${replace(aws_eks_cluster.main.identity[0].oidc[0].issuer, "https://", "")}:aud" = "sts.amazonaws.com"
        }
      }
    }]
  })

  # A Name tag is merged with the caller-supplied tags; on a key collision
  # the value from var.tags wins because it is the later argument to merge().
  tags = merge(
    { Name = "${var.name}-cluster-autoscaler" },
    var.tags
  )
}

# Attach the autoscaling policy to Cluster Autoscaler role.
# This gives the IRSA role the permissions declared in the
# `aws_iam_policy.worker_autoscaling` policy of this module.
resource "aws_iam_role_policy_attachment" "cluster_autoscaler" {
  role = aws_iam_role.cluster_autoscaler.name
  policy_arn = aws_iam_policy.worker_autoscaling.arn
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,11 @@ output "oidc_provider_arn" {
value = aws_iam_openid_connect_provider.oidc_provider.arn
}

# ARN of the `aws_iam_role.cluster_autoscaler` IRSA role, exported so callers
# of this module can annotate the Cluster Autoscaler service account with it.
output "cluster_autoscaler_role_arn" {
  description = "IAM role ARN for Cluster Autoscaler (IRSA)"
  value = aws_iam_role.cluster_autoscaler.arn
}

# https://github.com/terraform-aws-modules/terraform-aws-eks/blob/16f46db94b7158fd762d9133119206aaa7cf6d63/examples/self_managed_node_group/main.tf
output "kubeconfig" {
description = "Kubernetes connection configuration kubeconfig"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,8 @@ variable "permissions_boundary" {
type = string
default = null
}

# Required (no default). The value is interpolated into the Cluster Autoscaler
# IRSA trust policy subject, "system:serviceaccount:<environment>:cluster-autoscaler",
# so it must equal the namespace of that service account.
variable "environment" {
  description = "Namespace/environment for Kubernetes resources (used in IRSA trust policies)"
  type = string
}
5 changes: 5 additions & 0 deletions src/_nebari/stages/infrastructure/template/aws/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,8 @@ output "oidc_provider_arn" {
description = "The ARN of the OIDC Provider"
value = module.kubernetes.oidc_provider_arn
}

# Re-exports the `kubernetes` module's Cluster Autoscaler role ARN as a
# stage-level output.
# NOTE(review): the kubernetes_initialize stage appears to read this as
# stage_outputs["stages/02-infrastructure"]["cluster_autoscaler_role_arn"]["value"],
# so keep the output name in sync with that lookup.
output "cluster_autoscaler_role_arn" {
  description = "IAM role ARN for Cluster Autoscaler (IRSA)"
  value = module.kubernetes.cluster_autoscaler_role_arn
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ terraform {
required_providers {
aws = {
source = "hashicorp/aws"
version = "5.33.0"
version = "6.18.0"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice!

}
}
required_version = ">= 1.0"
Expand Down
8 changes: 8 additions & 0 deletions src/_nebari/stages/kubernetes_initialize/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ class InputVars(schema.Base):
external_container_reg: Optional[ExtContainerReg] = None
gpu_enabled: bool = False
gpu_node_group_names: List[str] = []
cluster_autoscaler_role_arn: Optional[str] = None


class InputSchema(schema.Base):
Expand Down Expand Up @@ -94,6 +95,13 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]):
if self.config.amazon_web_services.node_groups[group].gpu
]
input_vars.aws_region = self.config.amazon_web_services.region
# Get the Cluster Autoscaler IAM role ARN from infrastructure stage output
if "stages/02-infrastructure" in stage_outputs:
input_vars.cluster_autoscaler_role_arn = (
stage_outputs["stages/02-infrastructure"]
.get("cluster_autoscaler_role_arn", {})
.get("value", "")
)

return input_vars.model_dump()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ module "kubernetes-autoscaling" {

aws_region = var.aws_region
cluster-name = local.cluster_name
iam_role_arn = var.cluster_autoscaler_role_arn
}

module "traefik-crds" {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,18 @@ resource "helm_release" "autoscaler" {

repository = "https://kubernetes.github.io/autoscaler"
chart = "cluster-autoscaler"
version = "9.19.0"
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also took the opportunity to update the chart version here.

version = "9.52.1"

values = concat([
jsonencode({
rbac = {
create = true
serviceAccount = {
name = "cluster-autoscaler"
annotations = {
"eks.amazonaws.com/role-arn" = var.iam_role_arn
}
}
}

cloudProvider = "aws"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,8 @@ variable "overrides" {
type = list(string)
default = []
}

# Required (no default). The value is set as the `eks.amazonaws.com/role-arn`
# annotation on the `cluster-autoscaler` service account in the Helm release
# values of this module.
variable "iam_role_arn" {
  description = "IAM role ARN for Cluster Autoscaler (IRSA)"
  type = string
}
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,9 @@ variable "gpu_enabled" {
variable "gpu_node_group_names" {
description = "Names of node groups with GPU"
}

# Forwarded to the `kubernetes-autoscaling` module as `iam_role_arn`.
# Defaults to an empty string, so the stage can still be planned when no role
# ARN is supplied.
# NOTE(review): an empty value would produce an empty role-arn annotation on
# the service account — confirm that is acceptable.
variable "cluster_autoscaler_role_arn" {
  description = "IAM role ARN for Cluster Autoscaler (IRSA)"
  type = string
  default = ""
}
Loading