Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/antonbabenko/pre-commit-terraform
rev: v1.96.1
rev: v1.96.2
hooks:
- id: terraform_fmt
- id: terraform_docs
Expand Down
2 changes: 2 additions & 0 deletions modules/eks-managed-node-group/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -132,9 +132,11 @@ module "eks_managed_node_group" {
| <a name="input_disable_api_termination"></a> [disable\_api\_termination](#input\_disable\_api\_termination) | If true, enables EC2 instance termination protection | `bool` | `null` | no |
| <a name="input_disk_size"></a> [disk\_size](#input\_disk\_size) | Disk size in GiB for nodes. Defaults to `20`. Only valid when `use_custom_launch_template` = `false` | `number` | `null` | no |
| <a name="input_ebs_optimized"></a> [ebs\_optimized](#input\_ebs\_optimized) | If true, the launched EC2 instance(s) will be EBS-optimized | `bool` | `null` | no |
| <a name="input_efa_indices"></a> [efa\_indices](#input\_efa\_indices) | The indices of the network interfaces that should be EFA-enabled. Only valid when `enable_efa_support` = `true` | `list(number)` | <pre>[<br/> 0<br/>]</pre> | no |
| <a name="input_elastic_gpu_specifications"></a> [elastic\_gpu\_specifications](#input\_elastic\_gpu\_specifications) | The elastic GPU to attach to the instance | `any` | `{}` | no |
| <a name="input_elastic_inference_accelerator"></a> [elastic\_inference\_accelerator](#input\_elastic\_inference\_accelerator) | Configuration block containing an Elastic Inference Accelerator to attach to the instance | `map(string)` | `{}` | no |
| <a name="input_enable_bootstrap_user_data"></a> [enable\_bootstrap\_user\_data](#input\_enable\_bootstrap\_user\_data) | Determines whether the bootstrap configurations are populated within the user data template. Only valid when using a custom AMI via `ami_id` | `bool` | `false` | no |
| <a name="input_enable_efa_only"></a> [enable\_efa\_only](#input\_enable\_efa\_only) | Determines whether to enable EFA (`false`, default) or EFA and EFA-only (`true`) network interfaces. Note: requires vpc-cni version `v1.18.4` or later | `bool` | `false` | no |
| <a name="input_enable_efa_support"></a> [enable\_efa\_support](#input\_enable\_efa\_support) | Determines whether to enable Elastic Fabric Adapter (EFA) support | `bool` | `false` | no |
| <a name="input_enable_monitoring"></a> [enable\_monitoring](#input\_enable\_monitoring) | Enables/disables detailed monitoring | `bool` | `true` | no |
| <a name="input_enclave_options"></a> [enclave\_options](#input\_enclave\_options) | Enable Nitro Enclaves on launched instances | `map(string)` | `{}` | no |
Expand Down
3 changes: 2 additions & 1 deletion modules/eks-managed-node-group/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,14 @@ locals {
efa_instance_type = try(element(var.instance_types, 0), "")
num_network_cards = try(data.aws_ec2_instance_type.this[0].maximum_network_cards, 0)

# Primary network interface must be EFA, remaining can be EFA or EFA-only
efa_network_interfaces = [
for i in range(local.num_network_cards) : {
associate_public_ip_address = false
delete_on_termination = true
device_index = i == 0 ? 0 : 1
network_card_index = i
interface_type = "efa"
interface_type = var.enable_efa_only ? contains(concat([0], var.efa_indices), i) ? "efa" : "efa-only" : "efa"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

contains(concat([0], var.efa_indices), i) looks almost like magic but I guess it works as expected. :)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ya, hopefully on the next breaking change some of this can be simplified by changing the defaults 😅

}
]

Expand Down
13 changes: 13 additions & 0 deletions modules/eks-managed-node-group/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,19 @@ variable "enable_efa_support" {
default = false
}

# TODO - make this true by default at next breaking change (remove variable, only pass indices)
variable "enable_efa_only" {
description = "Determines whether to enable EFA (`false`, default) or EFA and EFA-only (`true`) network interfaces. Note: requires vpc-cni version `v1.18.4` or later"
type = bool
default = false
}

variable "efa_indices" {
description = "The indices of the network interfaces that should be EFA-enabled. Only valid when `enable_efa_support` = `true`"
type = list(number)
default = [0]
}

variable "network_interfaces" {
description = "Customize network interfaces to be attached at instance boot time"
type = list(any)
Expand Down
2 changes: 2 additions & 0 deletions modules/self-managed-node-group/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,10 @@ module "self_managed_node_group" {
| <a name="input_desired_size_type"></a> [desired\_size\_type](#input\_desired\_size\_type) | The unit of measurement for the value specified for `desired_size`. Supported for attribute-based instance type selection only. Valid values: `units`, `vcpu`, `memory-mib` | `string` | `null` | no |
| <a name="input_disable_api_termination"></a> [disable\_api\_termination](#input\_disable\_api\_termination) | If true, enables EC2 instance termination protection | `bool` | `null` | no |
| <a name="input_ebs_optimized"></a> [ebs\_optimized](#input\_ebs\_optimized) | If true, the launched EC2 instance will be EBS-optimized | `bool` | `null` | no |
| <a name="input_efa_indices"></a> [efa\_indices](#input\_efa\_indices) | The indices of the network interfaces that should be EFA-enabled. Only valid when `enable_efa_support` = `true` | `list(number)` | <pre>[<br/> 0<br/>]</pre> | no |
| <a name="input_elastic_gpu_specifications"></a> [elastic\_gpu\_specifications](#input\_elastic\_gpu\_specifications) | The elastic GPU to attach to the instance | `any` | `{}` | no |
| <a name="input_elastic_inference_accelerator"></a> [elastic\_inference\_accelerator](#input\_elastic\_inference\_accelerator) | Configuration block containing an Elastic Inference Accelerator to attach to the instance | `map(string)` | `{}` | no |
| <a name="input_enable_efa_only"></a> [enable\_efa\_only](#input\_enable\_efa\_only) | Determines whether to enable EFA (`false`, default) or EFA and EFA-only (`true`) network interfaces. Note: requires vpc-cni version `v1.18.4` or later | `bool` | `false` | no |
| <a name="input_enable_efa_support"></a> [enable\_efa\_support](#input\_enable\_efa\_support) | Determines whether to enable Elastic Fabric Adapter (EFA) support | `bool` | `false` | no |
| <a name="input_enable_monitoring"></a> [enable\_monitoring](#input\_enable\_monitoring) | Enables/disables detailed monitoring | `bool` | `true` | no |
| <a name="input_enabled_metrics"></a> [enabled\_metrics](#input\_enabled\_metrics) | A list of metrics to collect. The allowed values are `GroupDesiredCapacity`, `GroupInServiceCapacity`, `GroupPendingCapacity`, `GroupMinSize`, `GroupMaxSize`, `GroupInServiceInstances`, `GroupPendingInstances`, `GroupStandbyInstances`, `GroupStandbyCapacity`, `GroupTerminatingCapacity`, `GroupTerminatingInstances`, `GroupTotalCapacity`, `GroupTotalInstances` | `list(string)` | `[]` | no |
Expand Down
5 changes: 3 additions & 2 deletions modules/self-managed-node-group/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ module "user_data" {
################################################################################

data "aws_ec2_instance_type" "this" {
count = local.enable_efa_support ? 1 : 0
count = var.create && var.enable_efa_support ? 1 : 0

instance_type = var.instance_type
}
Expand All @@ -101,13 +101,14 @@ locals {
instance_type_provided = var.instance_type != ""
num_network_cards = try(data.aws_ec2_instance_type.this[0].maximum_network_cards, 0)

# Primary network interface must be EFA, remaining can be EFA or EFA-only
efa_network_interfaces = [
for i in range(local.num_network_cards) : {
associate_public_ip_address = false
delete_on_termination = true
device_index = i == 0 ? 0 : 1
network_card_index = i
interface_type = "efa"
interface_type = var.enable_efa_only ? contains(concat([0], var.efa_indices), i) ? "efa" : "efa-only" : "efa"
}
]

Expand Down
13 changes: 13 additions & 0 deletions modules/self-managed-node-group/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,19 @@ variable "enable_efa_support" {
default = false
}

# TODO - make this true by default at next breaking change (remove variable, only pass indices)
variable "enable_efa_only" {
description = "Determines whether to enable EFA (`false`, default) or EFA and EFA-only (`true`) network interfaces. Note: requires vpc-cni version `v1.18.4` or later"
type = bool
default = false
}

variable "efa_indices" {
description = "The indices of the network interfaces that should be EFA-enabled. Only valid when `enable_efa_support` = `true`"
type = list(number)
default = [0]
}

variable "metadata_options" {
description = "Customize the metadata options for the instance"
type = map(string)
Expand Down
4 changes: 4 additions & 0 deletions node_groups.tf
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,8 @@ module "eks_managed_node_group" {
metadata_options = try(each.value.metadata_options, var.eks_managed_node_group_defaults.metadata_options, local.metadata_options)
enable_monitoring = try(each.value.enable_monitoring, var.eks_managed_node_group_defaults.enable_monitoring, true)
enable_efa_support = try(each.value.enable_efa_support, var.eks_managed_node_group_defaults.enable_efa_support, false)
enable_efa_only = try(each.value.enable_efa_only, var.eks_managed_node_group_defaults.enable_efa_only, false)
efa_indices = try(each.value.efa_indices, var.eks_managed_node_group_defaults.efa_indices, [0])
create_placement_group = try(each.value.create_placement_group, var.eks_managed_node_group_defaults.create_placement_group, false)
placement = try(each.value.placement, var.eks_managed_node_group_defaults.placement, {})
placement_group_az = try(each.value.placement_group_az, var.eks_managed_node_group_defaults.placement_group_az, null)
Expand Down Expand Up @@ -526,6 +528,8 @@ module "self_managed_node_group" {
metadata_options = try(each.value.metadata_options, var.self_managed_node_group_defaults.metadata_options, local.metadata_options)
enable_monitoring = try(each.value.enable_monitoring, var.self_managed_node_group_defaults.enable_monitoring, true)
enable_efa_support = try(each.value.enable_efa_support, var.self_managed_node_group_defaults.enable_efa_support, false)
enable_efa_only = try(each.value.enable_efa_only, var.self_managed_node_group_defaults.enable_efa_only, false)
efa_indices = try(each.value.efa_indices, var.self_managed_node_group_defaults.efa_indices, [0])
network_interfaces = try(each.value.network_interfaces, var.self_managed_node_group_defaults.network_interfaces, [])
placement = try(each.value.placement, var.self_managed_node_group_defaults.placement, {})
maintenance_options = try(each.value.maintenance_options, var.self_managed_node_group_defaults.maintenance_options, {})
Expand Down
101 changes: 71 additions & 30 deletions tests/eks-managed-node-group/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ module "eks" {
control_plane_subnet_ids = module.vpc.intra_subnets

eks_managed_node_group_defaults = {
ami_type = "AL2_x86_64"
ami_type = "AL2023_x86_64_STANDARD"
instance_types = ["m6i.large", "m5.large", "m5n.large", "m5zn.large"]
}

Expand Down Expand Up @@ -184,7 +184,7 @@ module "eks" {

# Use a custom AMI
custom_ami = {
ami_type = "AL2_ARM_64"
ami_type = "AL2023_ARM_64_STANDARD"
# Current default AMI used by managed node groups - pseudo "custom"
ami_id = data.aws_ami.eks_default_arm.image_id

Expand All @@ -211,13 +211,28 @@ module "eks" {
ami_id = data.aws_ami.eks_default.image_id
enable_bootstrap_user_data = true

pre_bootstrap_user_data = <<-EOT
export FOO=bar
EOT

post_bootstrap_user_data = <<-EOT
echo "you are free little kubelet!"
EOT
cloudinit_pre_nodeadm = [{
content = <<-EOT
---
apiVersion: node.eks.aws/v1alpha1
kind: NodeConfig
spec:
kubelet:
config:
shutdownGracePeriod: 30s
featureGates:
DisableKubeletCloudCredentialProviders: true
EOT
content_type = "application/node.eks.aws"
}]

# This is only possible with a custom AMI or self-managed node group
cloudinit_post_nodeadm = [{
content = <<-EOT
echo "All done"
EOT
content_type = "text/x-shellscript; charset=\"us-ascii\""
}]

capacity_type = "SPOT"
force_update_version = true
Expand All @@ -227,14 +242,6 @@ module "eks" {
GithubOrg = "terraform-aws-modules"
}

taints = [
{
key = "dedicated"
value = "gpuGroup"
effect = "NO_SCHEDULE"
}
]

update_config = {
max_unavailable_percentage = 33 # or set `max_unavailable`
}
Expand Down Expand Up @@ -306,19 +313,53 @@ module "eks" {
# Can be enabled when appropriate for testing/validation
create = false

ami_type = "AL2_x86_64_GPU"
instance_types = ["trn1n.32xlarge"]
# The EKS AL2023 NVIDIA AMI provides all of the necessary components
# for accelerated workloads w/ EFA
ami_type = "AL2023_x86_64_NVIDIA"
instance_types = ["p5e.48xlarge"]

enable_efa_support = true
pre_bootstrap_user_data = <<-EOT
# Mount NVME instance store volumes since they are typically
# available on instances that support EFA
setup-local-disks raid0
EOT
# Mount instance store volumes in RAID-0 for kubelet and containerd
# https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
cloudinit_pre_nodeadm = [
{
content_type = "application/node.eks.aws"
content = <<-EOT
---
apiVersion: node.eks.aws/v1alpha1
kind: NodeConfig
spec:
instance:
localStorage:
strategy: RAID0
EOT
}
]

# This will:
# 1. Create a placement group to place the instances close to one another
# 2. Ignore subnets that reside in AZs that do not support the instance type
# 3. Expose all of the available EFA interfaces on the launch template
enable_efa_support = true
enable_efa_only = true
efa_indices = [0, 4, 8, 12]

min_size = 2
max_size = 2
desired_size = 2
min_size = 1
max_size = 1
desired_size = 1

labels = {
"vpc.amazonaws.com/efa.present" = "true"
"nvidia.com/gpu.present" = "true"
}

taints = {
# Ensure only GPU workloads are scheduled on this node group
gpu = {
key = "nvidia.com/gpu"
value = "true"
effect = "NO_SCHEDULE"
}
}
}
}

Expand Down Expand Up @@ -532,7 +573,7 @@ data "aws_ami" "eks_default" {

filter {
name = "name"
values = ["amazon-eks-node-${local.cluster_version}-v*"]
values = ["amazon-eks-node-al2023-x86_64-standard-${local.cluster_version}-v*"]
}
}

Expand All @@ -542,7 +583,7 @@ data "aws_ami" "eks_default_arm" {

filter {
name = "name"
values = ["amazon-eks-arm64-node-${local.cluster_version}-v*"]
values = ["amazon-eks-node-al2023-arm64-standard-${local.cluster_version}-v*"]
}
}

Expand Down
Loading