From 570fdacd912343f711f2a4ee7b121cb01b4272b8 Mon Sep 17 00:00:00 2001 From: Bryant Biggs Date: Fri, 12 Sep 2025 12:47:53 -0500 Subject: [PATCH 1/7] feat: Update provider and module versions for Neuron EFA pattern --- .pre-commit-config.yaml | 8 ++++---- patterns/aws-neuron-efa/eks.tf | 29 +++++++++------------------- patterns/aws-neuron-efa/helm.tf | 32 ++++++++++++++++++++++++------- patterns/aws-neuron-efa/main.tf | 29 ++++------------------------ patterns/ipv6-eks-cluster/main.tf | 2 +- 5 files changed, 43 insertions(+), 57 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 47a70b4f2f..a7e12a05e5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,16 +1,16 @@ repos: - repo: https://github.com/streetsidesoftware/cspell-cli - rev: v9.0.1 + rev: v9.2.0 hooks: - id: cspell args: [--exclude, 'ADOPTERS.md', --exclude, '.pre-commit-config.yaml', --exclude, '.gitignore', --exclude, '*.drawio', --exclude, 'mkdocs.yml', --exclude, '.helmignore', --exclude, '.github/workflows/*', --exclude, 'patterns/istio-multi-cluster/*', --exclude, 'patterns/blue-green-upgrade/*', --exclude, '/patterns/vpc-lattice/cross-cluster-pod-communication/*', --exclude, 'patterns/bottlerocket/*', --exclude, 'patterns/nvidia-gpu-efa/generate-efa-nccl-test.sh'] - repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks - rev: v2.14.0 + rev: v2.15.0 hooks: - id: pretty-format-yaml args: [--autofix, --indent, '2', --offset, '2', --preserve-quotes] - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v5.0.0 + rev: v6.0.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer @@ -19,7 +19,7 @@ repos: - id: detect-aws-credentials args: [--allow-missing-credentials] - repo: https://github.com/antonbabenko/pre-commit-terraform - rev: v1.99.4 + rev: v1.100.0 hooks: - id: terraform_fmt - id: terraform_docs diff --git a/patterns/aws-neuron-efa/eks.tf b/patterns/aws-neuron-efa/eks.tf index 4533922640..4430f1f5a5 100644 --- 
a/patterns/aws-neuron-efa/eks.tf +++ b/patterns/aws-neuron-efa/eks.tf @@ -4,22 +4,17 @@ module "eks" { source = "terraform-aws-modules/eks/aws" - version = "~> 20.34" + version = "~> 21.0" - cluster_name = local.name - cluster_version = "1.32" + name = local.name + kubernetes_version = "1.33" # Give the Terraform identity admin access to the cluster # which will allow it to deploy resources into the cluster enable_cluster_creator_admin_permissions = true - cluster_endpoint_public_access = true + endpoint_public_access = true - # These will become the default in the next major version of the module - bootstrap_self_managed_addons = false - enable_irsa = false - enable_security_groups_for_pods = false - - cluster_addons = { + addons = { coredns = {} eks-node-monitoring-agent = {} eks-pod-identity-agent = { @@ -32,19 +27,9 @@ module "eks" { } } - # Add security group rules on the node group security group to - # allow EFA traffic - enable_efa_support = true - vpc_id = module.vpc.vpc_id subnet_ids = module.vpc.private_subnets - eks_managed_node_group_defaults = { - node_repair_config = { - enabled = true - } - } - eks_managed_node_groups = { neuron-efa = { # The EKS AL2023 Neuron AMI provides all of the necessary components @@ -69,6 +54,10 @@ module "eks" { } ] + node_repair_config = { + enabled = true + } + min_size = 2 max_size = 2 desired_size = 2 diff --git a/patterns/aws-neuron-efa/helm.tf b/patterns/aws-neuron-efa/helm.tf index 6f1ca1f1fd..138bea0eb9 100644 --- a/patterns/aws-neuron-efa/helm.tf +++ b/patterns/aws-neuron-efa/helm.tf @@ -1,5 +1,27 @@ data "aws_ecrpublic_authorization_token" "token" { - provider = aws.ecr + region = "us-east-1" +} + +provider "helm" { + kubernetes = { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + + exec = { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + # This requires the awscli to be installed locally where Terraform is 
executed + args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] + } + } + # Public ECR credentials; `registries` is a provider-level argument, not part of `kubernetes` + registries = [ + { + url = "oci://public.ecr.aws/neuron" + username = data.aws_ecrpublic_authorization_token.token.user_name + password = data.aws_ecrpublic_authorization_token.token.password + } + ] } ################################################################################ @@ -10,15 +32,11 @@ resource "helm_release" "neuron" { name = "neuron" repository = "oci://public.ecr.aws/neuron" chart = "neuron-helm-chart" - version = "1.1.1" + version = "1.2.0" namespace = "neuron" create_namespace = true wait = false - # Public ECR - repository_username = data.aws_ecrpublic_authorization_token.token.user_name - repository_password = data.aws_ecrpublic_authorization_token.token.password - values = [ <<-EOT nodeSelector: @@ -33,7 +51,7 @@ resource "helm_release" "aws_efa_device_plugin" { name = "aws-efa-k8s-device-plugin" repository = "https://aws.github.io/eks-charts" chart = "aws-efa-k8s-device-plugin" - version = "v0.5.7" + version = "v0.5.17" namespace = "kube-system" wait = false diff --git a/patterns/aws-neuron-efa/main.tf b/patterns/aws-neuron-efa/main.tf index 228380d1cc..19c95fb080 100644 --- a/patterns/aws-neuron-efa/main.tf +++ b/patterns/aws-neuron-efa/main.tf @@ -1,14 +1,14 @@ terraform { - required_version = ">= 1.3" + required_version = ">= 1.5.7" required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.34, < 6.0" + version = ">= 6.9" } helm = { source = "hashicorp/helm" - version = ">= 2.9, < 3.0" + version = ">= 3.0" } } @@ -24,27 +24,6 @@ provider "aws" { region = local.region } -# This provider is required for Public ECR. 
Public ECR is only available in us-east-1 -# If your region is same as us-east-1 then you can just use one aws provider -provider "aws" { - alias = "ecr" - region = "us-east-1" -} - -provider "helm" { - kubernetes { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - - exec { - api_version = "client.authentication.k8s.io/v1beta1" - command = "aws" - # This requires the awscli to be installed locally where Terraform is executed - args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] - } - } -} - ################################################################################ # Common data/locals ################################################################################ @@ -85,7 +64,7 @@ output "configure_kubectl" { module "vpc" { source = "terraform-aws-modules/vpc/aws" - version = "~> 5.0" + version = "~> 6.0" name = local.name cidr = local.vpc_cidr diff --git a/patterns/ipv6-eks-cluster/main.tf b/patterns/ipv6-eks-cluster/main.tf index 45596a7821..3c37724b3f 100644 --- a/patterns/ipv6-eks-cluster/main.tf +++ b/patterns/ipv6-eks-cluster/main.tf @@ -97,4 +97,4 @@ module "vpc" { } tags = local.tags -} \ No newline at end of file +} From 88447b95cc28cb45a4c99fdccd39ea6bdbd9e09b Mon Sep 17 00:00:00 2001 From: Bryant Biggs Date: Fri, 12 Sep 2025 12:51:19 -0500 Subject: [PATCH 2/7] feat: Update provider and module versions for ML capacity block pattern --- patterns/aws-neuron-efa/eks.tf | 1 + patterns/ml-capacity-block/eks.tf | 30 ++++++++++-------------------- patterns/ml-capacity-block/helm.tf | 18 ++++++++++++++++-- patterns/ml-capacity-block/main.tf | 22 ++++------------------ 4 files changed, 31 insertions(+), 40 deletions(-) diff --git a/patterns/aws-neuron-efa/eks.tf b/patterns/aws-neuron-efa/eks.tf index 4430f1f5a5..28876b4420 100644 --- a/patterns/aws-neuron-efa/eks.tf +++ b/patterns/aws-neuron-efa/eks.tf @@ -66,6 +66,7 @@ module "eks" { # 1. 
Create a placement group to place the instances close to one another # 2. Ignore subnets that reside in AZs that do not support the instance type # 3. Expose all of the available EFA interfaces on the launch template + # 4. Add security group w/ rules to the node group to allow EFA traffic enable_efa_support = true labels = { diff --git a/patterns/ml-capacity-block/eks.tf b/patterns/ml-capacity-block/eks.tf index 1b9b674c25..b0b6acd5c3 100644 --- a/patterns/ml-capacity-block/eks.tf +++ b/patterns/ml-capacity-block/eks.tf @@ -16,22 +16,17 @@ variable "capacity_reservation_id" { module "eks" { source = "terraform-aws-modules/eks/aws" - version = "~> 20.34" + version = "~> 21.0" - cluster_name = local.name - cluster_version = "1.32" + name = local.name + kubernetes_version = "1.33" # Give the Terraform identity admin access to the cluster # which will allow it to deploy resources into the cluster enable_cluster_creator_admin_permissions = true - cluster_endpoint_public_access = true + endpoint_public_access = true - # These will become the default in the next major version of the module - bootstrap_self_managed_addons = false - enable_irsa = false - enable_security_groups_for_pods = false - - cluster_addons = { + addons = { coredns = {} eks-node-monitoring-agent = {} eks-pod-identity-agent = { @@ -44,19 +39,9 @@ module "eks" { } } - # Add security group rules on the node group security group to - # allow EFA traffic - enable_efa_support = true - vpc_id = module.vpc.vpc_id subnet_ids = module.vpc.private_subnets - eks_managed_node_group_defaults = { - node_repair_config = { - enabled = true - } - } - eks_managed_node_groups = { cbr = { # The EKS AL2023 NVIDIA AMI provides all of the necessary components @@ -81,6 +66,10 @@ module "eks" { } ] + node_repair_config = { + enabled = true + } + min_size = 2 max_size = 2 desired_size = 2 @@ -89,6 +78,7 @@ module "eks" { # 1. Create a placement group to place the instances close to one another # 2. 
Ignore subnets that reside in AZs that do not support the instance type # 3. Expose all of the available EFA interfaces on the launch template + # 4. Add security group w/ rules to the node group to allow EFA traffic enable_efa_support = true labels = { diff --git a/patterns/ml-capacity-block/helm.tf b/patterns/ml-capacity-block/helm.tf index 8d11e73158..44246fe7f5 100644 --- a/patterns/ml-capacity-block/helm.tf +++ b/patterns/ml-capacity-block/helm.tf @@ -1,3 +1,17 @@ +provider "helm" { + kubernetes = { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + + exec = { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + # This requires the awscli to be installed locally where Terraform is executed + args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] + } + } +} + ################################################################################ # Helm charts ################################################################################ @@ -6,7 +20,7 @@ resource "helm_release" "nvidia_device_plugin" { name = "nvidia-device-plugin" repository = "https://nvidia.github.io/k8s-device-plugin" chart = "nvidia-device-plugin" - version = "0.17.1" + version = "0.17.4" namespace = "nvidia-device-plugin" create_namespace = true wait = false @@ -16,7 +30,7 @@ resource "helm_release" "aws_efa_device_plugin" { name = "aws-efa-k8s-device-plugin" repository = "https://aws.github.io/eks-charts" chart = "aws-efa-k8s-device-plugin" - version = "v0.5.7" + version = "v0.5.17" namespace = "kube-system" wait = false diff --git a/patterns/ml-capacity-block/main.tf b/patterns/ml-capacity-block/main.tf index 2ea32bf3f7..619e20382f 100644 --- a/patterns/ml-capacity-block/main.tf +++ b/patterns/ml-capacity-block/main.tf @@ -1,14 +1,14 @@ terraform { - required_version = ">= 1.3" + required_version = ">= 1.5.7" required_providers { aws = { source = "hashicorp/aws" - version = ">= 
5.34, < 6.0" + version = ">= 6.9" } helm = { source = "hashicorp/helm" - version = ">= 2.9, < 3.0" + version = ">= 3.0" } } @@ -24,20 +24,6 @@ provider "aws" { region = local.region } -provider "helm" { - kubernetes { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - - exec { - api_version = "client.authentication.k8s.io/v1beta1" - command = "aws" - # This requires the awscli to be installed locally where Terraform is executed - args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] - } - } -} - ################################################################################ # Common data/locals ################################################################################ @@ -78,7 +64,7 @@ output "configure_kubectl" { module "vpc" { source = "terraform-aws-modules/vpc/aws" - version = "~> 5.0" + version = "~> 6.0" name = local.name cidr = local.vpc_cidr From c8df196813324665b6c1b0e947d050e37c8d2cd4 Mon Sep 17 00:00:00 2001 From: Bryant Biggs Date: Fri, 12 Sep 2025 12:58:17 -0500 Subject: [PATCH 3/7] feat: Update provider and module versions for ML container cache pattern --- patterns/ml-container-cache/cache_builder.tf | 6 +-- patterns/ml-container-cache/eks.tf | 50 +++++++++---------- patterns/ml-container-cache/helm.tf | 16 +++++- patterns/ml-container-cache/main.tf | 22 ++------ patterns/ml-container-cache/pod-cached.yaml | 2 +- patterns/ml-container-cache/pod-uncached.yaml | 2 +- 6 files changed, 47 insertions(+), 51 deletions(-) diff --git a/patterns/ml-container-cache/cache_builder.tf b/patterns/ml-container-cache/cache_builder.tf index 5178a9d891..e35ec8c14f 100644 --- a/patterns/ml-container-cache/cache_builder.tf +++ b/patterns/ml-container-cache/cache_builder.tf @@ -1,13 +1,13 @@ module "ebs_snapshot_builder" { source = "clowdhaus/ebs-snapshot-builder/aws" - version = "~> 1.1" + version = "~> 2.0" name = local.name # Images to cache public_images = [ - 
"nvcr.io/nvidia/k8s-device-plugin:v0.17.1", # 120 MB compressed / 351 MB decompressed - "nvcr.io/nvidia/pytorch:25.02-py3", # 9.5 GB compressed / 20.4 GB decompressed + "nvcr.io/nvidia/k8s-device-plugin:v0.17.4", # 120 MB compressed / 351 MB decompressed + "nvcr.io/nvidia/pytorch:25.08-py3", # 9.5 GB compressed / 20.4 GB decompressed ] # AZs where EBS fast snapshot restore will be enabled diff --git a/patterns/ml-container-cache/eks.tf b/patterns/ml-container-cache/eks.tf index 84e58a6840..89d60c83e9 100644 --- a/patterns/ml-container-cache/eks.tf +++ b/patterns/ml-container-cache/eks.tf @@ -14,22 +14,17 @@ data "aws_ssm_parameter" "snapshot_id" { module "eks" { source = "terraform-aws-modules/eks/aws" - version = "~> 20.34" + version = "~> 21.0" - cluster_name = local.name - cluster_version = "1.32" + name = local.name + kubernetes_version = "1.33" # Give the Terraform identity admin access to the cluster # which will allow it to deploy resources into the cluster enable_cluster_creator_admin_permissions = true - cluster_endpoint_public_access = true + endpoint_public_access = true - # These will become the default in the next major version of the module - bootstrap_self_managed_addons = false - enable_irsa = false - enable_security_groups_for_pods = false - - cluster_addons = { + addons = { coredns = {} eks-node-monitoring-agent = {} eks-pod-identity-agent = { @@ -45,36 +40,37 @@ module "eks" { vpc_id = module.vpc.vpc_id subnet_ids = module.vpc.private_subnets - eks_managed_node_group_defaults = { - node_repair_config = { - enabled = true - } - } - eks_managed_node_groups = { gpu = { - # The EKS AL2 GPU AMI provides all of the necessary components + # The EKS AL2023 NVIDIA AMI provides all of the necessary components # for accelerated workloads w/ EFA - ami_type = "AL2_x86_64_GPU" + ami_type = "AL2023_x86_64_NVIDIA" instance_types = ["g6e.xlarge"] + node_repair_config = { + enabled = true + } + min_size = 1 max_size = 1 desired_size = 1 - pre_bootstrap_user_data = 
<<-EOT - # Mount the second volume for containerd persistent data - # This volume contains the cached images and layers + cloudinit_pre_nodeadm = [{ + content = <<-EOT + # Mount the second volume for containerd persistent data + # This volume contains the cached images and layers - systemctl stop containerd kubelet + systemctl stop containerd kubelet - rm -rf /var/lib/containerd/* - echo '/dev/${local.dev_name} /var/lib/containerd xfs defaults 0 0' >> /etc/fstab - mount -a + rm -rf /var/lib/containerd/* + echo '/dev/${local.dev_name} /var/lib/containerd xfs defaults 0 0' >> /etc/fstab + mount -a - systemctl restart containerd kubelet + systemctl restart containerd kubelet - EOT + EOT + content_type = "text/x-shellscript; charset=\"us-ascii\"" + }] # Mount a second volume for containerd persistent data # using the snapshot that contains the cached images and layers diff --git a/patterns/ml-container-cache/helm.tf b/patterns/ml-container-cache/helm.tf index 7b43ce458c..30d0e1f146 100644 --- a/patterns/ml-container-cache/helm.tf +++ b/patterns/ml-container-cache/helm.tf @@ -1,3 +1,17 @@ +provider "helm" { + kubernetes = { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + + exec = { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + # This requires the awscli to be installed locally where Terraform is executed + args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] + } + } +} + ################################################################################ # Helm charts ################################################################################ @@ -6,7 +20,7 @@ resource "helm_release" "nvidia_device_plugin" { name = "nvidia-device-plugin" repository = "https://nvidia.github.io/k8s-device-plugin" chart = "nvidia-device-plugin" - version = "0.17.1" # Matches image that is cached + version = "0.17.4" # Matches image that is cached namespace = 
"nvidia-device-plugin" create_namespace = true wait = false diff --git a/patterns/ml-container-cache/main.tf b/patterns/ml-container-cache/main.tf index 565f4f881c..f03c4e759c 100644 --- a/patterns/ml-container-cache/main.tf +++ b/patterns/ml-container-cache/main.tf @@ -1,14 +1,14 @@ terraform { - required_version = ">= 1.3" + required_version = ">= 1.5.7" required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.34, < 6.0" + version = ">= 6.9" } helm = { source = "hashicorp/helm" - version = ">= 2.9, < 3.0" + version = ">= 3.0" } } @@ -24,20 +24,6 @@ provider "aws" { region = local.region } -provider "helm" { - kubernetes { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - - exec { - api_version = "client.authentication.k8s.io/v1beta1" - command = "aws" - # This requires the awscli to be installed locally where Terraform is executed - args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] - } - } -} - ################################################################################ # Common data/locals ################################################################################ @@ -78,7 +64,7 @@ output "configure_kubectl" { module "vpc" { source = "terraform-aws-modules/vpc/aws" - version = "~> 5.0" + version = "~> 6.0" name = local.name cidr = local.vpc_cidr diff --git a/patterns/ml-container-cache/pod-cached.yaml b/patterns/ml-container-cache/pod-cached.yaml index 73f092a9a9..342dff0dcd 100644 --- a/patterns/ml-container-cache/pod-cached.yaml +++ b/patterns/ml-container-cache/pod-cached.yaml @@ -5,7 +5,7 @@ metadata: spec: containers: - name: example - image: nvcr.io/nvidia/pytorch:25.02-py3 + image: nvcr.io/nvidia/pytorch:25.08-py3 imagePullPolicy: IfNotPresent command: ['python3'] args: ['-c', 'import torch; print(torch.cuda.is_available()); print(torch.cuda.device_count())'] diff --git a/patterns/ml-container-cache/pod-uncached.yaml 
b/patterns/ml-container-cache/pod-uncached.yaml index 22d88ae81f..0442f9f8d3 100644 --- a/patterns/ml-container-cache/pod-uncached.yaml +++ b/patterns/ml-container-cache/pod-uncached.yaml @@ -5,7 +5,7 @@ metadata: spec: containers: - name: example - image: nvcr.io/nvidia/pytorch:25.02-py3 + image: nvcr.io/nvidia/pytorch:25.08-py3 imagePullPolicy: IfNotPresent command: ['python3'] args: ['-c', 'import torch; print(torch.cuda.is_available()); print(torch.cuda.device_count())'] From fc79c09c377507082de98cf88d42adae0ecf4279 Mon Sep 17 00:00:00 2001 From: Bryant Biggs Date: Fri, 12 Sep 2025 13:03:05 -0500 Subject: [PATCH 4/7] feat: Update provider and module versions for multi-node vLLM pattern --- patterns/multi-node-vllm/ecr.tf | 2 +- patterns/multi-node-vllm/eks.tf | 30 +++++++++----------------- patterns/multi-node-vllm/helm.tf | 34 +++++++++++++++++++++++++++--- patterns/multi-node-vllm/main.tf | 36 ++++---------------------------- 4 files changed, 46 insertions(+), 56 deletions(-) diff --git a/patterns/multi-node-vllm/ecr.tf b/patterns/multi-node-vllm/ecr.tf index 45a91dea7c..f0ae5b9921 100644 --- a/patterns/multi-node-vllm/ecr.tf +++ b/patterns/multi-node-vllm/ecr.tf @@ -4,7 +4,7 @@ module "ecr" { source = "terraform-aws-modules/ecr/aws" - version = "~> 1.6" + version = "~> 3.1" repository_name = local.name diff --git a/patterns/multi-node-vllm/eks.tf b/patterns/multi-node-vllm/eks.tf index bed9594dad..28f66a0b85 100644 --- a/patterns/multi-node-vllm/eks.tf +++ b/patterns/multi-node-vllm/eks.tf @@ -4,22 +4,17 @@ module "eks" { source = "terraform-aws-modules/eks/aws" - version = "~> 20.34" + version = "~> 21.0" - cluster_name = local.name - cluster_version = "1.32" + name = local.name + kubernetes_version = "1.33" # Gives Terraform identity admin access to cluster which will # allow deploying resources into the cluster enable_cluster_creator_admin_permissions = true - cluster_endpoint_public_access = true + endpoint_public_access = true - # These will become the 
default in the next major version of the module - bootstrap_self_managed_addons = false - enable_irsa = false - enable_security_groups_for_pods = false - - cluster_addons = { + addons = { coredns = {} eks-node-monitoring-agent = {} eks-pod-identity-agent = { @@ -32,19 +27,9 @@ module "eks" { } } - # Add security group rules on the node group security group to - # allow EFA traffic - enable_efa_support = true - vpc_id = module.vpc.vpc_id subnet_ids = module.vpc.private_subnets - eks_managed_node_group_defaults = { - node_repair_config = { - enabled = true - } - } - eks_managed_node_groups = { g6e = { # The EKS AL2023 NVIDIA AMI provides all of the necessary components @@ -52,6 +37,10 @@ module "eks" { ami_type = "AL2023_x86_64_NVIDIA" instance_types = ["g6e.8xlarge"] + node_repair_config = { + enabled = true + } + min_size = 2 max_size = 5 desired_size = 2 @@ -77,6 +66,7 @@ module "eks" { # 1. Create a placement group to place the instances close to one another # 2. Ignore subnets that reside in AZs that do not support the instance type # 3. Expose all of the available EFA interfaces on the launch template + # 4. 
Add security group w/ rules to the node group to allow EFA traffic enable_efa_support = true subnet_ids = [element(module.vpc.private_subnets, 2)] diff --git a/patterns/multi-node-vllm/helm.tf b/patterns/multi-node-vllm/helm.tf index e809377c5c..d8fb15c2d9 100644 --- a/patterns/multi-node-vllm/helm.tf +++ b/patterns/multi-node-vllm/helm.tf @@ -1,3 +1,17 @@ +provider "helm" { + kubernetes = { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + + exec = { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + # This requires the awscli to be installed locally where Terraform is executed + args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] + } + } +} + ################################################################################ # Device Plugin(s) ################################################################################ @@ -6,7 +20,7 @@ resource "helm_release" "nvidia_device_plugin" { name = "nvidia-device-plugin" repository = "https://nvidia.github.io/k8s-device-plugin" chart = "nvidia-device-plugin" - version = "0.17.1" + version = "0.17.4" namespace = "nvidia-device-plugin" create_namespace = true wait = false @@ -16,7 +30,7 @@ resource "helm_release" "aws_efa_device_plugin" { name = "aws-efa-k8s-device-plugin" repository = "https://aws.github.io/eks-charts" chart = "aws-efa-k8s-device-plugin" - version = "v0.5.7" + version = "v0.5.17" namespace = "kube-system" wait = false @@ -36,8 +50,22 @@ resource "helm_release" "aws_efa_device_plugin" { # LWS (LeaderWorkerSet) ################################################################################ +provider "kubectl" { + apply_retry_count = 5 + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + load_config_file = false + + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + # This requires the 
awscli to be installed locally where Terraform is executed + args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] + } +} + locals { - lws_version = "v0.5.1" + lws_version = "v0.6.3" } data "http" "lws" { diff --git a/patterns/multi-node-vllm/main.tf b/patterns/multi-node-vllm/main.tf index 5763d4dfb2..32f497bc11 100644 --- a/patterns/multi-node-vllm/main.tf +++ b/patterns/multi-node-vllm/main.tf @@ -1,14 +1,14 @@ terraform { - required_version = ">= 1.3" + required_version = ">= 1.5.7" required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.34, < 6.0" + version = ">= 6.10" } helm = { source = "hashicorp/helm" - version = ">= 2.9, < 3.0" + version = ">= 3.0" } http = { source = "hashicorp/http" @@ -37,34 +37,6 @@ provider "aws" { } -provider "helm" { - kubernetes { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - - exec { - api_version = "client.authentication.k8s.io/v1beta1" - command = "aws" - # This requires the awscli to be installed locally where Terraform is executed - args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] - } - } -} - -provider "kubectl" { - apply_retry_count = 5 - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - load_config_file = false - - exec { - api_version = "client.authentication.k8s.io/v1beta1" - command = "aws" - # This requires the awscli to be installed locally where Terraform is executed - args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] - } -} - ################################################################################ # Common data/locals ################################################################################ @@ -105,7 +77,7 @@ output "configure_kubectl" { module "vpc" { source = "terraform-aws-modules/vpc/aws" - version = "~> 5.0" + version = "~> 6.0" name = local.name cidr = local.vpc_cidr 
From b5c98d332e19652eabd22264fd9c67e6aaf221c7 Mon Sep 17 00:00:00 2001 From: Bryant Biggs Date: Fri, 12 Sep 2025 13:05:03 -0500 Subject: [PATCH 5/7] feat: Update provider and module versions for NVIDIA GPU w/ EFA pattern --- patterns/nvidia-gpu-efa/eks.tf | 30 ++++++++++-------------------- patterns/nvidia-gpu-efa/helm.tf | 18 ++++++++++++++++-- patterns/nvidia-gpu-efa/main.tf | 22 ++++------------------ 3 files changed, 30 insertions(+), 40 deletions(-) diff --git a/patterns/nvidia-gpu-efa/eks.tf b/patterns/nvidia-gpu-efa/eks.tf index 0f98b8ef8d..2ff3c06fee 100644 --- a/patterns/nvidia-gpu-efa/eks.tf +++ b/patterns/nvidia-gpu-efa/eks.tf @@ -4,22 +4,17 @@ module "eks" { source = "terraform-aws-modules/eks/aws" - version = "~> 20.34" + version = "~> 21.0" - cluster_name = local.name - cluster_version = "1.32" + name = local.name + kubernetes_version = "1.33" # Give the Terraform identity admin access to the cluster # which will allow it to deploy resources into the cluster enable_cluster_creator_admin_permissions = true - cluster_endpoint_public_access = true + endpoint_public_access = true - # These will become the default in the next major version of the module - bootstrap_self_managed_addons = false - enable_irsa = false - enable_security_groups_for_pods = false - - cluster_addons = { + addons = { coredns = {} eks-node-monitoring-agent = {} eks-pod-identity-agent = { @@ -32,19 +27,9 @@ module "eks" { } } - # Add security group rules on the node group security group to - # allow EFA traffic - enable_efa_support = true - vpc_id = module.vpc.vpc_id subnet_ids = module.vpc.private_subnets - eks_managed_node_group_defaults = { - node_repair_config = { - enabled = true - } - } - eks_managed_node_groups = { nvidia-efa = { # The EKS AL2023 NVIDIA AMI provides all of the necessary components @@ -69,6 +54,10 @@ module "eks" { } ] + node_repair_config = { + enabled = true + } + min_size = 2 max_size = 2 desired_size = 2 @@ -77,6 +66,7 @@ module "eks" { # 1. 
Create a placement group to place the instances close to one another # 2. Ignore subnets that reside in AZs that do not support the instance type # 3. Expose all of the available EFA interfaces on the launch template + # 4. Add security group w/ rules to the node group to allow EFA traffic enable_efa_support = true labels = { diff --git a/patterns/nvidia-gpu-efa/helm.tf b/patterns/nvidia-gpu-efa/helm.tf index 8d11e73158..44246fe7f5 100644 --- a/patterns/nvidia-gpu-efa/helm.tf +++ b/patterns/nvidia-gpu-efa/helm.tf @@ -1,3 +1,17 @@ +provider "helm" { + kubernetes = { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + + exec = { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + # This requires the awscli to be installed locally where Terraform is executed + args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] + } + } +} + ################################################################################ # Helm charts ################################################################################ @@ -6,7 +20,7 @@ resource "helm_release" "nvidia_device_plugin" { name = "nvidia-device-plugin" repository = "https://nvidia.github.io/k8s-device-plugin" chart = "nvidia-device-plugin" - version = "0.17.1" + version = "0.17.4" namespace = "nvidia-device-plugin" create_namespace = true wait = false @@ -16,7 +30,7 @@ resource "helm_release" "aws_efa_device_plugin" { name = "aws-efa-k8s-device-plugin" repository = "https://aws.github.io/eks-charts" chart = "aws-efa-k8s-device-plugin" - version = "v0.5.7" + version = "v0.5.17" namespace = "kube-system" wait = false diff --git a/patterns/nvidia-gpu-efa/main.tf b/patterns/nvidia-gpu-efa/main.tf index e247ea4950..9653594bad 100644 --- a/patterns/nvidia-gpu-efa/main.tf +++ b/patterns/nvidia-gpu-efa/main.tf @@ -1,14 +1,14 @@ terraform { - required_version = ">= 1.3" + required_version = ">= 1.5.7" required_providers { 
aws = { source = "hashicorp/aws" - version = ">= 5.34, < 6.0" + version = ">= 6.0" } helm = { source = "hashicorp/helm" - version = ">= 2.9, < 3.0" + version = ">= 3.0" } } @@ -24,20 +24,6 @@ provider "aws" { region = local.region } -provider "helm" { - kubernetes { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - - exec { - api_version = "client.authentication.k8s.io/v1beta1" - command = "aws" - # This requires the awscli to be installed locally where Terraform is executed - args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] - } - } -} - ################################################################################ # Common data/locals ################################################################################ @@ -78,7 +64,7 @@ output "configure_kubectl" { module "vpc" { source = "terraform-aws-modules/vpc/aws" - version = "~> 5.0" + version = "~> 6.0" name = local.name cidr = local.vpc_cidr From 405d441de3593d2b8609bc7227d65daed69cdcf6 Mon Sep 17 00:00:00 2001 From: Bryant Biggs Date: Fri, 12 Sep 2025 13:07:01 -0500 Subject: [PATCH 6/7] feat: Update provider and module versions for targeted ODCR pattern --- patterns/targeted-odcr/eks.tf | 30 ++++++++++-------------------- patterns/targeted-odcr/helm.tf | 18 ++++++++++++++++-- patterns/targeted-odcr/main.tf | 22 ++++------------------ 3 files changed, 30 insertions(+), 40 deletions(-) diff --git a/patterns/targeted-odcr/eks.tf b/patterns/targeted-odcr/eks.tf index b816e050bc..e7e44499b7 100644 --- a/patterns/targeted-odcr/eks.tf +++ b/patterns/targeted-odcr/eks.tf @@ -13,22 +13,17 @@ variable "capacity_reservation_arns" { module "eks" { source = "terraform-aws-modules/eks/aws" - version = "~> 20.34" + version = "~> 21.0" - cluster_name = local.name - cluster_version = "1.32" + name = local.name + kubernetes_version = "1.33" # Give the Terraform identity admin access to the cluster # which will allow it to deploy 
resources into the cluster enable_cluster_creator_admin_permissions = true - cluster_endpoint_public_access = true + endpoint_public_access = true - # These will become the default in the next major version of the module - bootstrap_self_managed_addons = false - enable_irsa = false - enable_security_groups_for_pods = false - - cluster_addons = { + addons = { coredns = {} eks-node-monitoring-agent = {} eks-pod-identity-agent = { @@ -41,19 +36,9 @@ module "eks" { } } - # Add security group rules on the node group security group to - # allow EFA traffic - enable_efa_support = true - vpc_id = module.vpc.vpc_id subnet_ids = module.vpc.private_subnets - eks_managed_node_group_defaults = { - node_repair_config = { - enabled = true - } - } - eks_managed_node_groups = { odcr = { # The EKS AL2023 NVIDIA AMI provides all of the necessary components @@ -78,6 +63,10 @@ module "eks" { } ] + node_repair_config = { + enabled = true + } + min_size = 2 max_size = 2 desired_size = 2 @@ -86,6 +75,7 @@ module "eks" { # 1. Create a placement group to place the instances close to one another # 2. Ignore subnets that reside in AZs that do not support the instance type # 3. Expose all of the available EFA interfaces on the launch template + # 4. 
Add security group w/ rules to the node group to allow EFA traffic enable_efa_support = true min_size = 4 diff --git a/patterns/targeted-odcr/helm.tf b/patterns/targeted-odcr/helm.tf index 8d11e73158..44246fe7f5 100644 --- a/patterns/targeted-odcr/helm.tf +++ b/patterns/targeted-odcr/helm.tf @@ -1,3 +1,17 @@ +provider "helm" { + kubernetes = { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + + exec = { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + # This requires the awscli to be installed locally where Terraform is executed + args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] + } + } +} + ################################################################################ # Helm charts ################################################################################ @@ -6,7 +20,7 @@ resource "helm_release" "nvidia_device_plugin" { name = "nvidia-device-plugin" repository = "https://nvidia.github.io/k8s-device-plugin" chart = "nvidia-device-plugin" - version = "0.17.1" + version = "0.17.4" namespace = "nvidia-device-plugin" create_namespace = true wait = false @@ -16,7 +30,7 @@ resource "helm_release" "aws_efa_device_plugin" { name = "aws-efa-k8s-device-plugin" repository = "https://aws.github.io/eks-charts" chart = "aws-efa-k8s-device-plugin" - version = "v0.5.7" + version = "v0.5.17" namespace = "kube-system" wait = false diff --git a/patterns/targeted-odcr/main.tf b/patterns/targeted-odcr/main.tf index 50ffcd1071..9660cbbd66 100644 --- a/patterns/targeted-odcr/main.tf +++ b/patterns/targeted-odcr/main.tf @@ -1,14 +1,14 @@ terraform { - required_version = ">= 1.3" + required_version = ">= 1.5.7" required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.34, < 6.0" + version = ">= 6.0" } helm = { source = "hashicorp/helm" - version = ">= 2.9, < 3.0" + version = ">= 3.0" } } @@ -24,20 +24,6 @@ provider "aws" { region = local.region } 
-provider "helm" { - kubernetes { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - - exec { - api_version = "client.authentication.k8s.io/v1beta1" - command = "aws" - # This requires the awscli to be installed locally where Terraform is executed - args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] - } - } -} - ################################################################################ # Common data/locals ################################################################################ @@ -78,7 +64,7 @@ output "configure_kubectl" { module "vpc" { source = "terraform-aws-modules/vpc/aws" - version = "~> 5.0" + version = "~> 6.0" name = local.name cidr = local.vpc_cidr From bbe8ae9dce40db1383430aa768cacc896d5107e9 Mon Sep 17 00:00:00 2001 From: Bryant Biggs Date: Fri, 12 Sep 2025 13:25:57 -0500 Subject: [PATCH 7/7] fix: Update docs line highlighting --- .github/workflows/publish-docs.yml | 6 +++--- mkdocs.yml | 1 + patterns/aws-neuron-efa/README.md | 4 ++-- patterns/ml-capacity-block/README.md | 2 +- patterns/ml-container-cache/README.md | 2 +- patterns/multi-node-vllm/README.md | 4 ++-- patterns/nvidia-gpu-efa/README.md | 4 ++-- patterns/targeted-odcr/README.md | 2 +- 8 files changed, 13 insertions(+), 12 deletions(-) diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml index 43f2aa9cd2..693a66c1ef 100644 --- a/.github/workflows/publish-docs.yml +++ b/.github/workflows/publish-docs.yml @@ -35,9 +35,9 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install mkdocs-material==9.5.21 \ - mkdocs-include-markdown-plugin==6.0.6 \ - mkdocs-awesome-pages-plugin==2.9.2 + python -m pip install mkdocs-material==9.6.19 \ + mkdocs-include-markdown-plugin==7.1.7 \ + mkdocs-awesome-pages-plugin==2.10.1 - name: git config run: | diff --git a/mkdocs.yml b/mkdocs.yml index 3a2983d501..f53b279d99 100644 --- 
a/mkdocs.yml +++ b/mkdocs.yml @@ -57,6 +57,7 @@ markdown_extensions: - codehilite - footnotes - md_in_html + - pymdownx.blocks.caption - pymdownx.critic - pymdownx.details - pymdownx.highlight: diff --git a/patterns/aws-neuron-efa/README.md b/patterns/aws-neuron-efa/README.md index 29c888286e..30c6e3afc0 100644 --- a/patterns/aws-neuron-efa/README.md +++ b/patterns/aws-neuron-efa/README.md @@ -19,13 +19,13 @@ The following components are demonstrated in this pattern: ### Cluster -```terraform hl_lines="35-37 49-95" +```terraform hl_lines="34-85" {% include "../../patterns/aws-neuron-efa/eks.tf" %} ``` ### Device Plugins -```terraform hl_lines="9-50" +```terraform hl_lines="31-68" {% include "../../patterns/aws-neuron-efa/helm.tf" %} ``` diff --git a/patterns/ml-capacity-block/README.md b/patterns/ml-capacity-block/README.md index 4921bd86e4..795d81b2c9 100644 --- a/patterns/ml-capacity-block/README.md +++ b/patterns/ml-capacity-block/README.md @@ -13,7 +13,7 @@ This pattern demonstrates how to consume/utilize ML capacity block reservations ## Code -```terraform hl_lines="5-11 108-122" +```terraform hl_lines="5-11 98-112" {% include "../../patterns/ml-capacity-block/eks.tf" %} ``` diff --git a/patterns/ml-container-cache/README.md b/patterns/ml-container-cache/README.md index ebe3549ab0..21106bb227 100644 --- a/patterns/ml-container-cache/README.md +++ b/patterns/ml-container-cache/README.md @@ -45,7 +45,7 @@ When the PyTorch image is not present on the EBS volume, it takes roughly 6 minu ### Cluster -```terraform hl_lines="5-9 65-77 79-91" +```terraform hl_lines="5-9 58-73 75-87" {% include "../../patterns/ml-container-cache/eks.tf" %} ``` diff --git a/patterns/multi-node-vllm/README.md b/patterns/multi-node-vllm/README.md index ae4cdc4a72..b9c270e3f3 100644 --- a/patterns/multi-node-vllm/README.md +++ b/patterns/multi-node-vllm/README.md @@ -22,13 +22,13 @@ The following components are demonstrated in this pattern: ### Cluster -```terraform hl_lines="35-37 49-96" 
+```terraform hl_lines="34-86" {% include "../../patterns/multi-node-vllm/eks.tf" %} ``` ### Helm Charts -```terraform hl_lines="39-56" +```terraform hl_lines="67-84" {% include "../../patterns/multi-node-vllm/helm.tf" %} ``` diff --git a/patterns/nvidia-gpu-efa/README.md b/patterns/nvidia-gpu-efa/README.md index 5a555f5ade..dded3e07e7 100644 --- a/patterns/nvidia-gpu-efa/README.md +++ b/patterns/nvidia-gpu-efa/README.md @@ -19,13 +19,13 @@ The following components are demonstrated in this pattern: ### Cluster -```terraform hl_lines="35-37 49-95" +```terraform hl_lines="34-85" {% include "../../patterns/nvidia-gpu-efa/eks.tf" %} ``` ### Device Plugins -```terraform hl_lines="5-33" +```terraform hl_lines="19-47" {% include "../../patterns/nvidia-gpu-efa/helm.tf" %} ``` diff --git a/patterns/targeted-odcr/README.md b/patterns/targeted-odcr/README.md index af33fefbb4..56236ea1f4 100644 --- a/patterns/targeted-odcr/README.md +++ b/patterns/targeted-odcr/README.md @@ -18,7 +18,7 @@ This pattern demonstrates how to consume/utilize on-demand capacity reservations ## Code -```terraform hl_lines="5-8 109-119 139-162" +```terraform hl_lines="5-8 99-109 129-152" {% include "../../patterns/targeted-odcr/eks.tf" %} ```