Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/_nebari/stages/infrastructure/template/aws/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ module "kubernetes" {
tags = local.additional_tags
region = var.region
kubernetes_version = var.kubernetes_version
environment = var.environment

cluster_subnets = local.subnet_ids
cluster_security_groups = [local.security_group_id]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,11 @@ data "aws_iam_policy_document" "worker_autoscaling" {
"autoscaling:DescribeAutoScalingInstances",
"autoscaling:DescribeLaunchConfigurations",
"autoscaling:DescribeTags",
"ec2:DescribeImages",
"ec2:DescribeInstanceTypes",
"ec2:DescribeLaunchTemplateVersions",
"ec2:GetInstanceTypesFromInstanceRequirements",
"eks:DescribeNodegroup",
]

resources = ["*"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@ locals {
node_group_policies = concat([
"arn:${local.partition}:iam::aws:policy/AmazonEKSWorkerNodePolicy",
"arn:${local.partition}:iam::aws:policy/AmazonEKS_CNI_Policy",
"arn:${local.partition}:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy",
aws_iam_policy.worker_autoscaling.arn
Copy link
Member Author

@Adam-D-Lewis Adam-D-Lewis Oct 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This policy no longer needs to be attached to the node role.

"arn:${local.partition}:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy"
], var.node_group_additional_policies)

gpu_node_group_names = [for node_group in var.node_groups : node_group.name if node_group.gpu == true]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -206,3 +206,71 @@ resource "aws_iam_openid_connect_provider" "oidc_provider" {
var.tags
)
}

# IAM role for the EBS CSI driver, assumed via IRSA (IAM Roles for Service Accounts)
resource "aws_iam_role" "ebs_csi_driver" {
  name = "${var.name}-ebs-csi-driver"

  # Apply the module's optional permissions boundary. The variable defaults to
  # null, in which case no boundary is set and behavior is unchanged.
  permissions_boundary = var.permissions_boundary

  # Trust policy: only web-identity tokens issued by this cluster's OIDC
  # provider may assume the role, and only when the token subject is the
  # kube-system/ebs-csi-controller-sa service account and the audience is STS.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Effect = "Allow"
      Principal = {
        Federated = aws_iam_openid_connect_provider.oidc_provider.arn
      }
      Action = "sts:AssumeRoleWithWebIdentity"
      Condition = {
        StringEquals = {
          # Condition keys are "<issuer host/path>:sub" and ":aud", so the
          # "https://" scheme is stripped from the cluster's issuer URL.
          "${replace(aws_eks_cluster.main.identity[0].oidc[0].issuer, "https://", "")}:sub" = "system:serviceaccount:kube-system:ebs-csi-controller-sa"
          "${replace(aws_eks_cluster.main.identity[0].oidc[0].issuer, "https://", "")}:aud" = "sts.amazonaws.com"
        }
      }
    }]
  })

  tags = merge(
    { Name = "${var.name}-ebs-csi-driver" },
    var.tags
  )
}

# Attach the AWS managed policy for the EBS CSI driver to its IRSA role
resource "aws_iam_role_policy_attachment" "ebs_csi_driver" {
  role = aws_iam_role.ebs_csi_driver.name

  # Build the ARN from local.partition rather than a hard-coded "aws" so the
  # managed policy also resolves in other partitions (e.g. aws-us-gov, aws-cn),
  # matching how the module's node group policy ARNs are constructed.
  policy_arn = "arn:${local.partition}:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy"
}

# IAM role for the Cluster Autoscaler, assumed via IRSA (IAM Roles for Service Accounts)
resource "aws_iam_role" "cluster_autoscaler" {
  name = "${var.name}-cluster-autoscaler"

  # Apply the module's optional permissions boundary. The variable defaults to
  # null, in which case no boundary is set and behavior is unchanged.
  permissions_boundary = var.permissions_boundary

  # Trust policy: only web-identity tokens issued by this cluster's OIDC
  # provider may assume the role, and only when the token subject is the
  # "cluster-autoscaler" service account in the var.environment namespace and
  # the audience is STS.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Effect = "Allow"
      Principal = {
        Federated = aws_iam_openid_connect_provider.oidc_provider.arn
      }
      Action = "sts:AssumeRoleWithWebIdentity"
      Condition = {
        StringEquals = {
          # Condition keys are "<issuer host/path>:sub" and ":aud", so the
          # "https://" scheme is stripped from the cluster's issuer URL.
          "${replace(aws_eks_cluster.main.identity[0].oidc[0].issuer, "https://", "")}:sub" = "system:serviceaccount:${var.environment}:cluster-autoscaler"
          "${replace(aws_eks_cluster.main.identity[0].oidc[0].issuer, "https://", "")}:aud" = "sts.amazonaws.com"
        }
      }
    }]
  })

  tags = merge(
    { Name = "${var.name}-cluster-autoscaler" },
    var.tags
  )
}

# Grant the Cluster Autoscaler IRSA role the worker autoscaling policy
resource "aws_iam_role_policy_attachment" "cluster_autoscaler" {
  policy_arn = aws_iam_policy.worker_autoscaling.arn
  role       = aws_iam_role.cluster_autoscaler.name
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,11 @@ output "oidc_provider_arn" {
value = aws_iam_openid_connect_provider.oidc_provider.arn
}

# Exposes the ARN of the Cluster Autoscaler IRSA role to callers of this module
output "cluster_autoscaler_role_arn" {
  value       = aws_iam_role.cluster_autoscaler.arn
  description = "IAM role ARN for Cluster Autoscaler (IRSA)"
}

# https://github.com/terraform-aws-modules/terraform-aws-eks/blob/16f46db94b7158fd762d9133119206aaa7cf6d63/examples/self_managed_node_group/main.tf
output "kubeconfig" {
description = "Kubernetes connection configuration kubeconfig"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,8 @@ variable "permissions_boundary" {
type = string
default = null
}

# Required input: interpolated as the namespace segment of the service account
# subject in the Cluster Autoscaler IRSA trust policy
variable "environment" {
  type        = string
  description = "Namespace/environment for Kubernetes resources (used in IRSA trust policies)"
}
5 changes: 5 additions & 0 deletions src/_nebari/stages/infrastructure/template/aws/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,8 @@ output "oidc_provider_arn" {
description = "The ARN of the OIDC Provider"
value = module.kubernetes.oidc_provider_arn
}

# Re-exports the kubernetes module's Cluster Autoscaler role ARN as a stage output
output "cluster_autoscaler_role_arn" {
  value       = module.kubernetes.cluster_autoscaler_role_arn
  description = "IAM role ARN for Cluster Autoscaler (IRSA)"
}
8 changes: 8 additions & 0 deletions src/_nebari/stages/kubernetes_initialize/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ class InputVars(schema.Base):
external_container_reg: Optional[ExtContainerReg] = None
gpu_enabled: bool = False
gpu_node_group_names: List[str] = []
cluster_autoscaler_role_arn: Optional[str] = None


class InputSchema(schema.Base):
Expand Down Expand Up @@ -94,6 +95,13 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]):
if self.config.amazon_web_services.node_groups[group].gpu
]
input_vars.aws_region = self.config.amazon_web_services.region
# Get the Cluster Autoscaler IAM role ARN from infrastructure stage output
if "stages/02-infrastructure" in stage_outputs:
input_vars.cluster_autoscaler_role_arn = (
stage_outputs["stages/02-infrastructure"]
.get("cluster_autoscaler_role_arn", {})
.get("value", "")
)

return input_vars.model_dump()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ module "kubernetes-autoscaling" {

aws_region = var.aws_region
cluster-name = local.cluster_name
iam_role_arn = var.cluster_autoscaler_role_arn
}

module "traefik-crds" {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,18 @@ resource "helm_release" "autoscaler" {

repository = "https://kubernetes.github.io/autoscaler"
chart = "cluster-autoscaler"
version = "9.19.0"
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also took the opportunity to update the version here

version = "9.52.1"

values = concat([
jsonencode({
rbac = {
create = true
serviceAccount = {
name = "cluster-autoscaler"
annotations = {
"eks.amazonaws.com/role-arn" = var.iam_role_arn
}
}
}

cloudProvider = "aws"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,8 @@ variable "overrides" {
type = list(string)
default = []
}

# Required input: role ARN for the Cluster Autoscaler (IRSA); no default is provided
variable "iam_role_arn" {
  type        = string
  description = "IAM role ARN for Cluster Autoscaler (IRSA)"
}
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,9 @@ variable "gpu_enabled" {
# Names of the GPU-enabled node groups. No type constraint is declared here.
# NOTE(review): presumably a list of strings supplied by the stage's Python
# input vars — confirm before adding a type constraint.
variable "gpu_node_group_names" {
  description = "Names of node groups with GPU"
}

# Optional input: falls back to an empty string when the caller sets no value
variable "cluster_autoscaler_role_arn" {
  type        = string
  default     = ""
  description = "IAM role ARN for Cluster Autoscaler (IRSA)"
}
Loading