Skip to content

Commit db5c59c

Browse files
authored
feat: Add pattern to demonstrate how to cache large/ML container images to reduce time to start pods (#2010)
1 parent 37e2ad9 commit db5c59c

File tree

13 files changed

+431
-6
lines changed

13 files changed

+431
-6
lines changed

.github/scripts/mkdocs-hooks.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,23 +10,36 @@ def on_page_markdown(markdown, **kwargs):
1010

1111
def on_files(files, config, **kwargs):
1212
# Add targeted-odcr screenshots to the generated build
13+
path = 'patterns/targeted-odcr/assets/'
1314
for odcr_file in [1, 2]:
1415
files.append(
1516
File(
16-
src_dir='./patterns/targeted-odcr/assets/',
17-
dest_dir=os.path.join(config.site_dir, 'patterns/targeted-odcr/assets/'),
17+
src_dir=f'./{path}',
18+
dest_dir=os.path.join(config.site_dir, path),
1819
path=f'odcr-screenshot{odcr_file}.png',
1920
use_directory_urls=True
2021
)
2122
)
2223

24+
path = 'patterns/kubecost/assets/'
2325
files.append(
2426
File(
25-
src_dir='./patterns/kubecost/assets/',
26-
dest_dir=os.path.join(config.site_dir, 'patterns/kubecost/assets/'),
27+
src_dir=f'./{path}',
28+
dest_dir=os.path.join(config.site_dir, path),
2729
path='screenshot.png',
2830
use_directory_urls=True
2931
)
3032
)
3133

34+
for svg in ['cached.svg', 'uncached.svg', 'state-machine.png']:
35+
files.append(
36+
File(
37+
src_dir=f'./patterns/ml-container-cache/assets/',
38+
dest_dir=os.path.join(config.site_dir, 'patterns/machine-learning/ml-container-cache/assets/'),
39+
path=svg,
40+
use_directory_urls=True
41+
)
42+
)
43+
44+
3245
return files

.github/workflows/pre-commit.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@ on:
1111

1212
env:
1313
TERRAFORM_VERSION: 1.3.10
14-
TERRAFORM_DOCS_VERSION: v0.16.0
14+
TERRAFORM_DOCS_VERSION: v0.19.0
15+
TFLINT_VERSION: v0.53.0
1516
TF_PLUGIN_CACHE_DIR: ${{ github.workspace }}/.terraform.d/plugin-cache
16-
TFLINT_VERSION: v0.50.2
1717

1818
concurrency:
1919
group: '${{ github.workflow }} @ ${{ github.event.pull_request.head.label || github.head_ref || github.ref }}'
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
title: ML Container Cache
3+
---
4+
5+
{%
6+
include-markdown "../../../patterns/ml-container-cache/README.md"
7+
%}
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
# EKS Cluster w/ Cached ML Images
2+
3+
This pattern demonstrates how to cache images on an EBS volume snapshot that will be used by nodes in an EKS cluster. The solution is composed primarily of the following components:
4+
5+
1. An AWS Step Function implementation has been provided that demonstrates an example process for creating EBS volume snapshots that are pre-populated with the selected container images. As part of this process, EBS Fast Snapshot Restore is enabled by default for the snapshots created to avoid the [EBS volume initialization time penalty](https://aws.amazon.com/blogs/storage/addressing-i-o-latency-when-restoring-amazon-ebs-volumes-from-ebs-snapshots/). The Step Function state machine diagram is captured below for reference.
6+
2. The node group demonstrates how to mount the generated EBS volume snapshot at the `/var/lib/containerd` location in order for containerd to utilize the pre-populated images. The snapshot ID is referenced via an SSM parameter data source which was populated by the Step Function cache builder; any new images created by the cache builder will automatically update the SSM parameter used by the node group.
7+
8+
The main benefit of caching, or pre-pulling, container images onto an EBS volume snapshot is faster time to start pods/containers on new nodes, especially for larger (multi-gigabyte) images that are common with machine-learning workloads. This process avoids the time and resources it takes to pull and unpack container images from remote registries. Instead, those images are already present in the location that containerd expects, allowing for faster pod startup times.
9+
10+
### Cache Builder State Machine
11+
12+
<p align="center">
13+
<img src="assets/state-machine.png" alt="cache builder state machine" >
14+
</p>
15+
16+
## Results
17+
18+
The following results use the PyTorch [nvcr.io/nvidia/pytorch:24.08-py3](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/tags) image which is 9.5 GB compressed and 20.4 GB decompressed on disk.
19+
20+
Pod start up time duration is captured via pod events using [ktime](https://github.com/clowdhaus/ktime).
21+
22+
### Cached
23+
24+
With the PyTorch image already present on the EBS volume, the pod starts up in less than 5 seconds:
25+
26+
<p align="center">
27+
<img src="assets/cached.svg" alt="cached image startup time" width="80%">
28+
</p>
29+
30+
### Uncached
31+
32+
When the PyTorch image is not present on the EBS volume, it takes roughly 5.5 minutes (334 seconds in the capture below) for the image to be pulled, unpacked, and the pod to start.
33+
34+
<p align="center">
35+
<img src="assets/uncached.svg" alt="uncached image startup time" width="80%">
36+
</p>
37+
38+
## Code
39+
40+
### Cache Builder
41+
42+
```terraform hl_lines="7-11 13-14"
43+
{% include "../../patterns/ml-container-cache/cache_builder.tf" %}
44+
```
45+
46+
### Cluster
47+
48+
```terraform hl_lines="5-9 52-64 66-78"
49+
{% include "../../patterns/ml-container-cache/eks.tf" %}
50+
```
51+
52+
## Deploy
53+
54+
See [here](https://aws-ia.github.io/terraform-aws-eks-blueprints/getting-started/#prerequisites) for the prerequisites and steps to deploy this pattern.
55+
56+
1. First, deploy the Step Function state machine that will create the EBS volume snapshots with the cached images.
57+
58+
```sh
59+
terraform init
60+
terraform apply -target=module.ebs_snapshot_builder -target=module.vpc --auto-approve
61+
```
62+
63+
2. Once the cache builder resources have been provisioned, execute the state machine by either navigating to the state machine within the AWS console and clicking `Start execution` (with the defaults or by passing in values to override the default values), or by using the provided output from the Terraform output value `start_execution_command` to start the state machine using the awscli. For example, the output looks similar to the following:
64+
65+
```hcl
66+
start_execution_command = <<EOT
67+
aws stepfunctions start-execution \
68+
--region us-west-2 \
69+
--state-machine-arn arn:aws:states:us-west-2:111111111111:stateMachine:cache-builder \
70+
--input "{\"SnapshotDescription\":\"ML container image cache\",\"SnapshotName\":\"ml-container-cache\"}"
71+
72+
EOT
73+
```
74+
75+
3. Once the state machine execution has completed successfully and created an EBS snapshot volume, provision the cluster and node group that will utilize the cached images.
76+
77+
```sh
78+
terraform apply --auto-approve
79+
```
80+
81+
4. Once the EKS cluster and node group have been provisioned, you can deploy the provided example pod that will use a cached image to verify the time it takes for the pod to reach a ready state.
82+
83+
```sh
84+
kubectl apply -f pod-cached.yaml
85+
```
86+
87+
You can contrast this with the time it takes for a pod that is not cached on a node by using the provided `pod-uncached.yaml` file. This works by simply using a pod that doesn't have a toleration for nodes that contain NVIDIA GPUs, which is where the cached images are provided in this example.
88+
89+
```sh
90+
kubectl apply -f pod-uncached.yaml
91+
```
92+
93+
You can also do the same steps above but using the small utility CLI [ktime](https://github.com/clowdhaus/ktime), which can either collect the pod events to measure the time duration to reach a ready state, or it can deploy a pod manifest and return the same:
94+
95+
```sh
96+
ktime apply -f pod-cached.yaml
97+
-- or --
98+
ktime apply -f pod-uncached.yaml
99+
```
100+
101+
## Destroy
102+
103+
```sh
104+
terraform destroy --auto-approve
105+
```

patterns/ml-container-cache/assets/cached.svg

Lines changed: 1 addition & 0 deletions
Loading
157 KB
Loading

patterns/ml-container-cache/assets/uncached.svg

Lines changed: 1 addition & 0 deletions
Loading
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
module "ebs_snapshot_builder" {
2+
source = "clowdhaus/ebs-snapshot-builder/aws"
3+
version = "~> 1.1"
4+
5+
name = local.name
6+
7+
# Images to cache
8+
public_images = [
9+
"nvcr.io/nvidia/k8s-device-plugin:v0.16.2", # 120 MB compressed / 351 MB decompressed
10+
"nvcr.io/nvidia/pytorch:24.08-py3", # 9.5 GB compressed / 20.4 GB decompressed
11+
]
12+
13+
# AZs where EBS fast snapshot restore will be enabled
14+
fsr_availability_zone_names = local.azs
15+
16+
vpc_id = module.vpc.vpc_id
17+
subnet_id = element(module.vpc.private_subnets, 0)
18+
19+
tags = local.tags
20+
}

patterns/ml-container-cache/eks.tf

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
locals {
2+
dev_name = "xvdb"
3+
}
4+
5+
# SSM parameter where the `cache-builder` stores the generated snapshot ID
6+
# This will be used to reference the snapshot when creating the EKS node group
7+
data "aws_ssm_parameter" "snapshot_id" {
8+
name = module.ebs_snapshot_builder.ssm_parameter_name
9+
}
10+
11+
################################################################################
12+
# Cluster
13+
################################################################################
14+
15+
module "eks" {
16+
source = "terraform-aws-modules/eks/aws"
17+
version = "~> 20.24"
18+
19+
cluster_name = local.name
20+
cluster_version = "1.31"
21+
22+
# Give the Terraform identity admin access to the cluster
23+
# which will allow it to deploy resources into the cluster
24+
enable_cluster_creator_admin_permissions = true
25+
cluster_endpoint_public_access = true
26+
27+
cluster_addons = {
28+
coredns = {}
29+
eks-pod-identity-agent = {}
30+
kube-proxy = {}
31+
vpc-cni = {}
32+
}
33+
34+
vpc_id = module.vpc.vpc_id
35+
subnet_ids = module.vpc.private_subnets
36+
37+
eks_managed_node_group_defaults = {
38+
ebs_optimized = true
39+
}
40+
41+
eks_managed_node_groups = {
42+
gpu = {
43+
# The EKS AL2 GPU AMI provides all of the necessary components
44+
# for accelerated workloads w/ EFA
45+
ami_type = "AL2_x86_64_GPU"
46+
instance_types = ["g6e.xlarge"]
47+
48+
min_size = 1
49+
max_size = 1
50+
desired_size = 1
51+
52+
pre_bootstrap_user_data = <<-EOT
53+
# Mount the second volume for containerd persistent data
54+
# This volume contains the cached images and layers
55+
56+
systemctl stop containerd kubelet
57+
58+
rm -rf /var/lib/containerd/*
59+
echo '/dev/${local.dev_name} /var/lib/containerd xfs defaults 0 0' >> /etc/fstab
60+
mount -a
61+
62+
systemctl restart containerd kubelet
63+
64+
EOT
65+
66+
# Mount a second volume for containerd persistent data
67+
# using the snapshot that contains the cached images and layers
68+
block_device_mappings = {
69+
(local.dev_name) = {
70+
device_name = "/dev/${local.dev_name}"
71+
ebs = {
72+
# Snapshot ID from the cache builder
73+
snapshot_id = nonsensitive(data.aws_ssm_parameter.snapshot_id.value)
74+
volume_size = 64
75+
volume_type = "gp3"
76+
}
77+
}
78+
}
79+
80+
labels = {
81+
"nvidia.com/gpu.present" = "true"
82+
"ml-container-cache" = "true"
83+
}
84+
85+
taints = {
86+
# Ensure only GPU workloads are scheduled on this node group
87+
gpu = {
88+
key = "nvidia.com/gpu"
89+
value = "true"
90+
effect = "NO_SCHEDULE"
91+
}
92+
}
93+
}
94+
95+
# This node group is for core addons such as CoreDNS
96+
default = {
97+
instance_types = ["m5.large"]
98+
99+
min_size = 1
100+
max_size = 2
101+
desired_size = 2
102+
103+
# Not required - increased to demonstrate pulling the un-cached
104+
# image since the default volume size is too small for the image used
105+
block_device_mappings = {
106+
"xvda" = {
107+
device_name = "/dev/xvda"
108+
ebs = {
109+
volume_size = 64
110+
volume_type = "gp3"
111+
}
112+
}
113+
}
114+
}
115+
}
116+
117+
tags = local.tags
118+
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
################################################################################
2+
# Helm charts
3+
################################################################################
4+
5+
resource "helm_release" "nvidia_device_plugin" {
6+
name = "nvidia-device-plugin"
7+
repository = "https://nvidia.github.io/k8s-device-plugin"
8+
chart = "nvidia-device-plugin"
9+
version = "0.16.2" # Matches image that is cached
10+
namespace = "nvidia-device-plugin"
11+
create_namespace = true
12+
wait = false
13+
14+
values = [
15+
<<-EOT
16+
affinity:
17+
nodeAffinity:
18+
requiredDuringSchedulingIgnoredDuringExecution:
19+
nodeSelectorTerms:
20+
- matchExpressions:
21+
- key: 'nvidia.com/gpu.present'
22+
operator: In
23+
values:
24+
- 'true'
25+
EOT
26+
]
27+
}

0 commit comments

Comments
 (0)