Skip to content

Commit a762974

Browse files
committed
aws: enable GPU AMI support for GPU instances
Add support for GPU-optimized AMIs when using GPU instance types. This includes:

- AWS Deep Learning AMI with pre-installed NVIDIA drivers, CUDA, and ML frameworks
- NVIDIA Deep Learning AMI option for NGC containers
- Custom GPU AMI support for specialized images
- Automatic detection of GPU instance types
- Conditional display of GPU AMI options only for GPU instances
- Update terraform.tfvars template to use GPU AMI when configured
- Add defconfig for AWS G6e.2xlarge GPU instance with Deep Learning AMI

The system automatically detects when you select a GPU instance family (like G6E) and provides appropriate GPU-optimized AMI options including the AWS Deep Learning AMI with all necessary drivers and frameworks pre-installed.

Generated-by: Claude AI
Signed-off-by: Luis Chamberlain <[email protected]>
1 parent 786475a commit a762974

File tree

5 files changed

+60
-2
lines changed

5 files changed

+60
-2
lines changed

defconfigs/aws-gpu-g6e-ai

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# AWS G6e.2xlarge GPU instance with Deep Learning AMI for AI/ML workloads
2+
# This configuration sets up an AWS G6e.2xlarge instance with NVIDIA L40S GPU
3+
# optimized for machine learning, AI inference, and GPU-accelerated workloads
4+
5+
# Cloud provider configuration
6+
CONFIG_KDEVOPS_ENABLE_TERRAFORM=y
7+
CONFIG_TERRAFORM=y
8+
CONFIG_TERRAFORM_AWS=y
9+
10+
# AWS Dynamic configuration (required for G6E instance family and GPU AMIs)
11+
CONFIG_TERRAFORM_AWS_USE_DYNAMIC_CONFIG=y
12+
13+
# AWS Instance configuration - G6E family with NVIDIA L40S GPU
14+
# G6E.2XLARGE specifications:
15+
# - 8 vCPUs (3rd Gen AMD EPYC processors)
16+
# - 64 GiB system RAM
17+
# - 1x NVIDIA L40S Tensor Core GPU
18+
# - 48 GB GPU memory
19+
# - Up to 20 Gbps network performance
20+
# - Up to 5 Gbps EBS bandwidth
21+
CONFIG_TERRAFORM_AWS_INSTANCE_TYPE_G6E=y
22+
CONFIG_TERRAFORM_AWS_INSTANCE_G6E_2XLARGE=y
23+
24+
# AWS Region - US East (N. Virginia) - primary availability for G6E
25+
CONFIG_TERRAFORM_AWS_REGION_US_EAST_1=y
26+
27+
# GPU-optimized Deep Learning AMI
28+
# Includes: NVIDIA drivers 535+, CUDA 12.x, cuDNN, PyTorch (single-framework PyTorch image; TensorFlow and MXNet are not part of this AMI)
29+
CONFIG_TERRAFORM_AWS_USE_GPU_AMI=y
30+
CONFIG_TERRAFORM_AWS_GPU_AMI_DEEP_LEARNING=y
31+
CONFIG_TERRAFORM_AWS_GPU_AMI_NAME="Deep Learning OSS Nvidia Driver AMI GPU PyTorch*Ubuntu 22.04*"
32+
CONFIG_TERRAFORM_AWS_GPU_AMI_OWNER="amazon"
33+
34+
# Storage configuration optimized for ML workloads
35+
# 200 GB for datasets, models, and experiment artifacts
36+
CONFIG_TERRAFORM_AWS_DATA_VOLUME_SIZE=200
37+
38+
# Note: After provisioning, the instance will have:
39+
# - Jupyter notebook server ready for ML experiments
40+
# - Pre-installed deep learning frameworks
41+
# - NVIDIA GPU drivers and CUDA toolkit
42+
# - Docker with NVIDIA Container Toolkit for containerized ML workloads

playbooks/roles/gen_tfvars/templates/aws/terraform.tfvars.j2

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,13 @@
11
aws_profile = "{{ terraform_aws_profile }}"
22
aws_region = "{{ terraform_aws_region }}"
33
aws_availability_zone = "{{ terraform_aws_av_zone }}"
4+
{% if terraform_aws_use_gpu_ami is defined and terraform_aws_use_gpu_ami %}
5+
aws_name_search = "{{ terraform_aws_gpu_ami_name }}"
6+
aws_ami_owner = "{{ terraform_aws_gpu_ami_owner }}"
7+
{% else %}
48
aws_name_search = "{{ terraform_aws_ns }}"
59
aws_ami_owner = "{{ terraform_aws_ami_owner }}"
10+
{% endif %}
611
aws_instance_type = "{{ terraform_aws_instance_type }}"
712
aws_ebs_volumes_per_instance = "{{ terraform_aws_ebs_volumes_per_instance }}"
813
aws_ebs_volume_size = {{ terraform_aws_ebs_volume_size }}

scripts/aws_api.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -956,7 +956,7 @@ def generate_gpu_amis_kconfig() -> str:
956956
config TERRAFORM_AWS_GPU_AMI_NAME
957957
string
958958
output yaml
959-
default "Deep Learning AMI GPU TensorFlow*"
959+
default "Deep Learning OSS Nvidia Driver AMI GPU PyTorch*Ubuntu 22.04*"
960960
help
961961
AMI name pattern for AWS Deep Learning AMI.
962962
@@ -1061,7 +1061,7 @@ def generate_default_gpu_amis_kconfig() -> str:
10611061
config TERRAFORM_AWS_GPU_AMI_NAME
10621062
string
10631063
output yaml
1064-
default "Deep Learning AMI GPU TensorFlow*"
1064+
default "Deep Learning OSS Nvidia Driver AMI GPU PyTorch*Ubuntu 22.04*"
10651065
10661066
config TERRAFORM_AWS_GPU_AMI_OWNER
10671067
string

scripts/dynamic-cloud-kconfig.Makefile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ dynamic_aws_kconfig_touch:
4545
$(Q)touch $(AWS_KCONFIG_DIR)/Kconfig.gpu-amis.generated
4646
$(Q)touch $(AWS_KCONFIG_DIR)/Kconfig.compute.static
4747
$(Q)touch $(AWS_KCONFIG_DIR)/Kconfig.location.static
48+
$(Q)touch $(AWS_KCONFIG_DIR)/Kconfig.gpu-amis.static
4849
$(Q)for family in $(AWS_INSTANCE_TYPE_FAMILIES); do \
4950
touch $(AWS_INSTANCE_TYPES_DIR)/Kconfig.$$family.generated; \
5051
touch $(AWS_INSTANCE_TYPES_DIR)/Kconfig.$$family.static; \
@@ -117,6 +118,11 @@ cloud-update:
117118
sed -i 's/Kconfig\.\([^.]*\)\.generated/Kconfig.\1.static/g' $(AWS_KCONFIG_DIR)/Kconfig.location.static; \
118119
echo " Created $(AWS_KCONFIG_DIR)/Kconfig.location.static"; \
119120
fi
121+
$(Q)if [ -f $(AWS_KCONFIG_DIR)/Kconfig.gpu-amis.generated ]; then \
122+
cp $(AWS_KCONFIG_DIR)/Kconfig.gpu-amis.generated $(AWS_KCONFIG_DIR)/Kconfig.gpu-amis.static; \
123+
sed -i 's/Kconfig\.\([^.]*\)\.generated/Kconfig.\1.static/g' $(AWS_KCONFIG_DIR)/Kconfig.gpu-amis.static; \
124+
echo " Created $(AWS_KCONFIG_DIR)/Kconfig.gpu-amis.static"; \
125+
fi
120126
# AWS instance type families
121127
$(Q)for file in $(AWS_INSTANCE_TYPES_DIR)/Kconfig.*.generated; do \
122128
if [ -f "$$file" ]; then \

terraform/aws/kconfigs/Kconfig.compute

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,3 +76,8 @@ source "terraform/aws/kconfigs/distros/Kconfig.fedora"
7676
source "terraform/aws/kconfigs/distros/Kconfig.rhel"
7777
source "terraform/aws/kconfigs/distros/Kconfig.sles"
7878
source "terraform/aws/kconfigs/distros/Kconfig.custom"
79+
80+
# Include the GPU AMI configuration (the .static copy written by 'make cloud-update') when dynamic config is enabled
81+
if TERRAFORM_AWS_USE_DYNAMIC_CONFIG
82+
source "terraform/aws/kconfigs/Kconfig.gpu-amis.static"
83+
endif

0 commit comments

Comments
 (0)