Skip to content
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
90 commits
Select commit Hold shift + click to select a range
d27e523
pt 2.8 training ec2
jinyan-li1 Aug 10, 2025
ac530dd
fix typo
jinyan-li1 Aug 10, 2025
1793c2a
increase cpu image baseline size, add libcudnn9-dev-cuda-12 dependency
jinyan-li1 Aug 11, 2025
f94d72d
add test and update dockerfiles
jinyan-li1 Aug 11, 2025
4e7caea
bump fastai version and change CUDA version
jinyan-li1 Aug 11, 2025
26fd863
update buildspecs and confest to cu129
jinyan-li1 Aug 11, 2025
b5cde70
modify dockerfile gpu and add logging in efa test
jinyan-li1 Aug 12, 2025
d2632ed
add back libcudnn9-headers-cuda-12
jinyan-li1 Aug 12, 2025
ca1d99b
remove version pins, update cudnn header path check, change efa test …
jinyan-li1 Aug 13, 2025
4079253
pin opencv-python version and simplify efa test logging
jinyan-li1 Aug 13, 2025
e9da304
comment out some tests and change test_path to only run test_efa
jinyan-li1 Aug 13, 2025
bcd8969
fix syntax
jinyan-li1 Aug 13, 2025
915eea2
uncomment training log
jinyan-li1 Aug 13, 2025
2d98fb1
change log validation
jinyan-li1 Aug 13, 2025
454b693
change LD_LIBRARY path and rebuild
jinyan-li1 Aug 13, 2025
93b6143
Remove nvjpeg patching script and rebuild with normal test path
jinyan-li1 Aug 14, 2025
7d5851d
Merge branch 'master' into pt-2.8-currency
jinyan-li1 Aug 14, 2025
c333ecb
change TE version to 2.4
jinyan-li1 Aug 14, 2025
4c534ef
update flash attention to 2.8.3 and reorganize dockerfile
jinyan-li1 Aug 14, 2025
c3557ca
fix dockerfile
jinyan-li1 Aug 14, 2025
5733c55
revert dockerfile organization changes
jinyan-li1 Aug 14, 2025
e5ad021
build with PR base dlc
jinyan-li1 Aug 15, 2025
8cc73fe
fix FLASH_ATTN_VERSION arg
jinyan-li1 Aug 15, 2025
e7eeb33
fix arg
jinyan-li1 Aug 15, 2025
a5c1d14
increase image size baselines
jinyan-li1 Aug 15, 2025
190005b
rerun test LD_LIBRARY_PATH added back, skip build
jinyan-li1 Aug 15, 2025
7c227a1
rebuild image with updated dockerfile
jinyan-li1 Aug 15, 2025
0eb5e9d
fix redundancy in dockerfile
jinyan-li1 Aug 15, 2025
2a58ed9
move GDRCOPY_VERSION arg to common
jinyan-li1 Aug 16, 2025
9ab58d2
update cudnn_version.h location in test path, add telemetry script back
jinyan-li1 Aug 18, 2025
f64a296
move oss compliance into a script and modify dockerfile
jinyan-li1 Aug 19, 2025
2743af5
move some packages to ec2 and sm stages, remove awscli and boto3 sinc…
jinyan-li1 Aug 19, 2025
ba58253
rebuild ec2 image with base image updated
jinyan-li1 Aug 19, 2025
94f3e4e
build sm image and run tests
jinyan-li1 Aug 19, 2025
fa82f11
update comments and rebuild ec2 image
jinyan-li1 Aug 19, 2025
0d999e5
rebuild sm image with updated toml config
jinyan-li1 Aug 19, 2025
61d1c0b
build ec2 image with base from PR with libsqlite3-dev added
jinyan-li1 Aug 20, 2025
1cdff14
build ec2 image with safety check test and ecr scan allowlist feature…
jinyan-li1 Aug 20, 2025
dc5d8b0
Merge branch 'master' into pt-2.8-currency
jinyan-li1 Aug 20, 2025
4d0e610
rebuild sm image with PR base image
jinyan-li1 Aug 20, 2025
ad2b613
uncomment efa test log, add dgl to dockfiles and rebuild sm image
jinyan-li1 Aug 21, 2025
03dc0ab
remove dgl from dockerfiles, skip dgl and smdebug tests
jinyan-li1 Aug 21, 2025
0490e9d
Merge branch 'master' into pt-2.8-currency
jinyan-li1 Aug 21, 2025
5c3b37e
skip smddp and smppy tests, allowlist protobuf CVE 77740 - blocked by…
jinyan-li1 Aug 21, 2025
74be69b
add logging in test_safety to continue when there is an exception
jinyan-li1 Aug 22, 2025
fa5a04e
keep debug log, rebuild sm image with released base dlc
jinyan-li1 Aug 22, 2025
8fa869d
rebuild ec2 image with released base dlc
jinyan-li1 Aug 22, 2025
aa94d04
add CVE 77740 to ignore safety ids for test
jinyan-li1 Aug 22, 2025
f0e7d72
rebuild sm image with updated toml config
jinyan-li1 Aug 22, 2025
0a1ce87
remove loggings
jinyan-li1 Aug 22, 2025
8fce327
fix confest for sm tests
jinyan-li1 Aug 23, 2025
dd85d05
Merge branch 'master' into pt-2.8-currency
jinyan-li1 Aug 23, 2025
7008e2c
Revert config changes
jinyan-li1 Aug 23, 2025
4c76dc8
Merge branch 'master' into pt-2.8-currency
jinyan-li1 Aug 25, 2025
d329022
Merge branch 'master' into pt-2.8-currency
jinyan-li1 Aug 25, 2025
07f14cf
remove allowlist files and update gpu dockerfile
jinyan-li1 Aug 25, 2025
04df973
rebuild ec2 image without allowlist
jinyan-li1 Aug 25, 2025
90fb983
rebuild sm image without allowlist
jinyan-li1 Aug 25, 2025
5e53acf
install TE from pypi instead of git, rebuild ec2 image
jinyan-li1 Aug 26, 2025
e899808
change gdrcopy installation and rebuild ec2 image
jinyan-li1 Aug 26, 2025
ede5e9b
change TE version to 2.5 and reogranize cpu dockerfile, rebuild ec2 i…
jinyan-li1 Aug 26, 2025
1593492
remove skip_smdebug_v1_test
jinyan-li1 Aug 26, 2025
e42556e
reorganize cpu dockerfile
jinyan-li1 Aug 26, 2025
f317415
remove unnecessary fixtures and skip tests directly, comment out efa …
jinyan-li1 Aug 26, 2025
fb99b12
unpin opencv-python and rebuild ec2 image
jinyan-li1 Aug 26, 2025
bccd0ed
Merge branch 'master' into pt-2.8-currency
jinyan-li1 Aug 26, 2025
d96f804
pin opencv-python and rebuild ec2 image
jinyan-li1 Aug 26, 2025
98fd21a
formatting and rebuild sm image
jinyan-li1 Aug 26, 2025
9071542
pin thinc
jinyan-li1 Aug 27, 2025
284cc8c
rebuild ec2 image
jinyan-li1 Aug 27, 2025
c6c8db2
rebuild sm image
jinyan-li1 Aug 27, 2025
23724d7
Revert config changes
jinyan-li1 Aug 27, 2025
97a334c
reogranize gpu dockerfile
jinyan-li1 Aug 27, 2025
3ee468c
rebuild ec2 image
jinyan-li1 Aug 27, 2025
b50bbba
rebuild sm image
jinyan-li1 Aug 27, 2025
6804bb8
set MAX_JOBS to 18
jinyan-li1 Aug 27, 2025
3bef50f
fix license file access issue and rebuild ec2 image
jinyan-li1 Aug 27, 2025
7fed818
rebuild sm image
jinyan-li1 Aug 27, 2025
0ae5d9c
Revert config file
jinyan-li1 Aug 28, 2025
d391144
Merge branch 'master' into pt-2.8-currency
jinyan-li1 Aug 28, 2025
8afc1bd
use flash attn precompiled wheel
jinyan-li1 Aug 28, 2025
9640b16
Rebuild ec2 image
jinyan-li1 Aug 28, 2025
4827448
rebuild sm image
jinyan-li1 Aug 28, 2025
60d1f51
Revert toml changes
jinyan-li1 Aug 28, 2025
9a127a9
Merge branch 'master' into pt-2.8-currency
jinyan-li1 Aug 28, 2025
721b88e
resolve diff with master
jinyan-li1 Aug 28, 2025
258b61e
run ec2 tests again
jinyan-li1 Aug 28, 2025
15cd9d0
formatting and revert config changes
jinyan-li1 Aug 28, 2025
53e4e55
Merge branch 'master' into pt-2.8-currency
jinyan-li1 Aug 29, 2025
f7c88bb
Merge branch 'master' into pt-2.8-currency
jinyan-li1 Aug 29, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions dlc_developer_config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,12 @@ deep_canary_mode = false
[build]
# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
# available frameworks - ["base", "vllm", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
build_frameworks = []
build_frameworks = ["pytorch"]


# By default we build both training and inference containers. Set true/false values to determine which to build.
build_training = true
build_inference = true
build_inference = false

# Set do_build to "false" to skip builds and test the latest image built by this PR
# Note: at least one build is required to set do_build to "false"
Expand All @@ -65,13 +65,13 @@ ecs_tests = true
eks_tests = true
ec2_tests = true
# Set it to true if you are preparing a Benchmark related PR
ec2_benchmark_tests = false
ec2_benchmark_tests = true

### Set ec2_tests_on_heavy_instances = true to be able to run any EC2 tests that use large/expensive instance types by
### default. If false, these types of tests will be skipped while other tests will run as usual.
### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true.
### Off by default (set to false)
ec2_tests_on_heavy_instances = false
ec2_tests_on_heavy_instances = true
### SM specific tests
### On by default
sagemaker_local_tests = true
Expand Down Expand Up @@ -119,7 +119,7 @@ use_scheduler = false
### TRAINING PR JOBS ###

# Standard Framework Training
dlc-pr-pytorch-training = ""
dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-ec2.yml"
dlc-pr-tensorflow-2-training = ""
dlc-pr-autogluon-training = ""

Expand Down
72 changes: 72 additions & 0 deletions pytorch/training/buildspec-2-8-ec2.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Build specification for the PyTorch 2.8.0 training images that use the `ec2`
# Dockerfile target: one CPU image and one GPU (cu129) image.
#
# Conventions used throughout this file:
#   * `&NAME` defines an anchor and `*NAME` reuses its value.
#   * `<<: *NAME` merges the anchored mapping into the current mapping
#     (shallow merge; keys written explicitly in the mapping take precedence).
#   * `!join` is an application-defined tag; presumably it concatenates the
#     listed items into a single string - confirm against the buildspec loader.
#   * `<set-$VAR-in-environment>` values are placeholders; presumably the build
#     tooling substitutes them from the environment - confirm.
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
prod_account_id: &PROD_ACCOUNT_ID 763104351884
region: &REGION <set-$REGION-in-environment>
framework: &FRAMEWORK pytorch
version: &VERSION 2.8.0
short_version: &SHORT_VERSION "2.8"
arch_type: x86
# autopatch_build: "True"

# Repository coordinates shared by every image below via `<<: *TRAINING_REPOSITORY`.
repository_info:
  training_repository: &TRAINING_REPOSITORY
    image_type: &TRAINING_IMAGE_TYPE training
    root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
    # PR repository: name is prefixed with "pr-" and hosted under ACCOUNT_ID.
    repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
    # Release repository: same name without the "pr-" prefix, hosted under PROD_ACCOUNT_ID.
    release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
    release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]

# Named `source`/`target` file pairs shared by every image via `<<: *TRAINING_CONTEXT`.
# NOTE(review): presumably each `source` is copied into the Docker build context
# under the name given by `target` - confirm against the build tooling.
context:
  training_context: &TRAINING_CONTEXT
    start_cuda_compat:
      source: docker/build_artifacts/start_cuda_compat.sh
      target: start_cuda_compat.sh
    dockerd_entrypoint:
      source: docker/build_artifacts/dockerd_entrypoint.sh
      target: dockerd_entrypoint.sh
    changehostname:
      source: docker/build_artifacts/changehostname.c
      target: changehostname.c
    start_with_right_hostname:
      source: docker/build_artifacts/start_with_right_hostname.sh
      target: start_with_right_hostname.sh
    example_mnist_file:
      source: docker/build_artifacts/mnist.py
      target: mnist.py
    deep_learning_container:
      source: ../../src/deep_learning_container.py
      target: deep_learning_container.py

# One entry per image. Anchors such as &DEVICE_TYPE are redefined in each entry;
# an alias resolves to the nearest preceding definition, so each `tag` and
# `docker_file` is assembled from its own entry's values.
images:
  BuildEC2CPUPTTrainPy3DockerImage:
    <<: *TRAINING_REPOSITORY
    build: &PYTORCH_CPU_TRAINING_PY3 false
    # NOTE(review): unit of the baseline is not stated here (presumably MB) - confirm.
    image_size_baseline: 7200
    device_type: &DEVICE_TYPE cpu
    python_version: &DOCKER_PYTHON_VERSION py3
    tag_python_version: &TAG_PYTHON_VERSION py312
    os_version: &OS_VERSION ubuntu22.04
    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
    # skip_build: "False"
    # Joins to docker/2.8/py3/Dockerfile.cpu
    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
    target: ec2
    context:
      <<: *TRAINING_CONTEXT
  BuildEC2GPUPTTrainPy3cu129DockerImage:
    <<: *TRAINING_REPOSITORY
    build: &PYTORCH_GPU_TRAINING_PY3 false
    # NOTE(review): unit of the baseline is not stated here (presumably MB) - confirm.
    image_size_baseline: 24000
    device_type: &DEVICE_TYPE gpu
    python_version: &DOCKER_PYTHON_VERSION py3
    tag_python_version: &TAG_PYTHON_VERSION py312
    cuda_version: &CUDA_VERSION cu129
    os_version: &OS_VERSION ubuntu22.04
    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
    # skip_build: "False"
    # Joins to docker/2.8/py3/cu129/Dockerfile.gpu
    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
                         *DEVICE_TYPE ]
    target: ec2
    context:
      <<: *TRAINING_CONTEXT
72 changes: 72 additions & 0 deletions pytorch/training/buildspec-2-8-sm.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Build specification for the PyTorch 2.8.0 training images that use the
# `sagemaker` Dockerfile target: one CPU image and one GPU (cu129) image.
#
# Conventions used throughout this file:
#   * `&NAME` defines an anchor and `*NAME` reuses its value.
#   * `<<: *NAME` merges the anchored mapping into the current mapping
#     (shallow merge; keys written explicitly in the mapping take precedence).
#   * `!join` is an application-defined tag; presumably it concatenates the
#     listed items into a single string - confirm against the buildspec loader.
#   * `<set-$VAR-in-environment>` values are placeholders; presumably the build
#     tooling substitutes them from the environment - confirm.
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
prod_account_id: &PROD_ACCOUNT_ID 763104351884
region: &REGION <set-$REGION-in-environment>
framework: &FRAMEWORK pytorch
version: &VERSION 2.8.0
short_version: &SHORT_VERSION "2.8"
arch_type: x86
# autopatch_build: "True"

# Repository coordinates shared by every image below via `<<: *TRAINING_REPOSITORY`.
repository_info:
  training_repository: &TRAINING_REPOSITORY
    image_type: &TRAINING_IMAGE_TYPE training
    root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
    # PR repository: name is prefixed with "pr-" and hosted under ACCOUNT_ID.
    repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
    # Release repository: same name without the "pr-" prefix, hosted under PROD_ACCOUNT_ID.
    release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
    release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]

# Named `source`/`target` file pairs shared by every image via `<<: *TRAINING_CONTEXT`.
# NOTE(review): presumably each `source` is copied into the Docker build context
# under the name given by `target` - confirm against the build tooling.
context:
  training_context: &TRAINING_CONTEXT
    start_cuda_compat:
      source: docker/build_artifacts/start_cuda_compat.sh
      target: start_cuda_compat.sh
    dockerd_entrypoint:
      source: docker/build_artifacts/dockerd_entrypoint.sh
      target: dockerd_entrypoint.sh
    changehostname:
      source: docker/build_artifacts/changehostname.c
      target: changehostname.c
    start_with_right_hostname:
      source: docker/build_artifacts/start_with_right_hostname.sh
      target: start_with_right_hostname.sh
    example_mnist_file:
      source: docker/build_artifacts/mnist.py
      target: mnist.py
    deep_learning_container:
      source: ../../src/deep_learning_container.py
      target: deep_learning_container.py

# One entry per image. Anchors such as &DEVICE_TYPE are redefined in each entry;
# an alias resolves to the nearest preceding definition, so each `tag` and
# `docker_file` is assembled from its own entry's values.
images:
  BuildSageMakerCPUPTTrainPy3DockerImage:
    <<: *TRAINING_REPOSITORY
    build: &PYTORCH_CPU_TRAINING_PY3 false
    # NOTE(review): unit of the baseline is not stated here (presumably MB) - confirm.
    image_size_baseline: 7200
    device_type: &DEVICE_TYPE cpu
    python_version: &DOCKER_PYTHON_VERSION py3
    tag_python_version: &TAG_PYTHON_VERSION py312
    os_version: &OS_VERSION ubuntu22.04
    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
    # skip_build: "False"
    # Joins to docker/2.8/py3/Dockerfile.cpu
    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
    target: sagemaker
    context:
      <<: *TRAINING_CONTEXT
  BuildSageMakerGPUPTTrainPy3DockerImage:
    <<: *TRAINING_REPOSITORY
    build: &PYTORCH_GPU_TRAINING_PY3 false
    # NOTE(review): unit of the baseline is not stated here (presumably MB) - confirm.
    image_size_baseline: 24000
    device_type: &DEVICE_TYPE gpu
    python_version: &DOCKER_PYTHON_VERSION py3
    tag_python_version: &TAG_PYTHON_VERSION py312
    cuda_version: &CUDA_VERSION cu129
    os_version: &OS_VERSION ubuntu22.04
    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
    # skip_build: "False"
    # Joins to docker/2.8/py3/cu129/Dockerfile.gpu
    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
                         *DEVICE_TYPE ]
    target: sagemaker
    context:
      <<: *TRAINING_CONTEXT
2 changes: 1 addition & 1 deletion pytorch/training/buildspec.yml
Original file line number Diff line number Diff line change
@@ -1 +1 @@
buildspec_pointer: buildspec-2-7-sm.yml
buildspec_pointer: buildspec-2-8-ec2.yml
Loading
Loading