Skip to content
Open
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
8dbd698
Add PyTorch 2.10 Training DLC with CUDA 13.0 and Python 3.13
Feb 9, 2026
ca9fd66
Configure build for PyTorch 2.10 training EC2 images
Feb 9, 2026
9eaae59
fix: add setuptools for pkg_resources in Python 3.13 (OSS compliance)
Feb 10, 2026
f333b46
Merge remote-tracking branch 'upstream/master' into pytorch-2.10-curr…
Feb 10, 2026
a67b5b1
fix: move setuptools install to EC2/SageMaker stages for pkg_resources
Feb 10, 2026
4e2c069
fix: pin setuptools to 81.0.0 for pkg_resources compatibility
Feb 10, 2026
2d9b14e
fix: pin setuptools to 80.10.1 (pkg_resources removed in 81+)
Feb 11, 2026
20a7fe6
fix: pin setuptools to 81.0.0 and remove redundant installs
Feb 11, 2026
73dfc44
Fix torch 2.10 version pinning and increase CPU image size baseline
Feb 11, 2026
3b9e8b3
Merge branch 'master' into pytorch-2.10-currency
bhanutejagk Feb 11, 2026
a94e483
Fix torch 2.10 version pinning and remove setuptools pin
Feb 11, 2026
0bffd28
Revert pytorch install changes to match 2.9 style
Feb 12, 2026
89aeed1
Set build_inference to false
Feb 12, 2026
5df3ed9
Remove fastai - requires torch<2.10, not compatible with PyTorch 2.10
Feb 12, 2026
80b9b3f
Merge upstream/master into pytorch-2.10-currency
Feb 18, 2026
91d0000
Migrate PyTorch 2.10 training tests from SageMaker SDK v2 to v3
Feb 18, 2026
02af756
Merge remote-tracking branch 'upstream/master' into pytorch-2.10-curr…
Mar 10, 2026
c917cde
Run tests.
Mar 10, 2026
5886962
Run tests.
Mar 10, 2026
46c033c
Fix SageMaker v2 import errors for PyTorch 2.10 tests
Mar 10, 2026
a4f435a
Fix version fixture prefix collision in lookup_condition
Mar 10, 2026
ee3a20a
Enable extended tests and apply black formatting
Mar 11, 2026
3c251e7
Merge remote-tracking branch 'upstream/master' into pytorch-2.10-curr…
Mar 11, 2026
75e2e56
Disable sagemaker_benchmark_tests - all PT benchmarks are skipped and…
Mar 11, 2026
3af55f1
Revert dlc_developer_config.toml to defaults
Mar 11, 2026
029617b
Wrap SM SDK v2 imports in try/except to prevent pytest collection fai…
Mar 11, 2026
95fcf60
Also wrap sagemaker.utils import in try/except for SM SDK v3 compatib…
Mar 11, 2026
e30a718
fix: migrate sagemaker.exceptions imports for SM SDK v3 compatibility
Mar 11, 2026
86bc532
Add SM SDK v3 test files for PyTorch 2.10, route v3 tests in sagemake…
Mar 12, 2026
36574d7
Add fastai back to 2.10 Dockerfiles (fastai 2.8.7 supports torch<3), …
Mar 12, 2026
6c1fc9a
Switch to SM buildspec with do_build=true, enable all SM tests (efa, …
Mar 13, 2026
79ad620
Fix sanity test failures for SM SDK v3: sagemaker version check, remo…
Mar 13, 2026
cfb9f1d
fix: sanity test compatibility for SM SDK v3
Mar 13, 2026
6bd9cee
fix: SM v3 tests - fix sagemaker.modules imports and tighten boto bounds
Mar 13, 2026
1328a53
Fix safety scan CVE allowlists, v3 requirements, and trigger rebuild
Mar 13, 2026
4cb3681
Fix v3 requirements: boto3>=1.42.2, mock>=4.0; run SM tests only
Mar 13, 2026
f072a4c
Fix v2/v3 SDK conflicts: conditional imports + standalone v3 timeout
Mar 13, 2026
c38e56c
fix: guard all v2-only sagemaker imports for SM SDK v3 compatibility
Mar 13, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions pytorch/training/buildspec-2-10-ec2.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Buildspec for the PyTorch 2.10 training images that use the `ec2` Dockerfile
# target (one CPU image, one CUDA 13.0 GPU image).
#
# Conventions used throughout this file:
#   * `&NAME` anchors a value and `*NAME` re-uses it later in the document.
#   * `!join` is a custom tag resolved by the consuming loader, not by YAML
#     itself. NOTE(review): presumably it concatenates the listed items into
#     one string - confirm against the loader.
#   * `<<: *NAME` merges the anchored mapping into the current one (shallow).

# Placeholder values; the text itself says they are to be set from the
# environment.
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
prod_account_id: &PROD_ACCOUNT_ID 763104351884
region: &REGION <set-$REGION-in-environment>
framework: &FRAMEWORK pytorch
version: &VERSION 2.10.0
# Quoted on purpose: a bare 2.10 would be read as the float 2.1.
short_version: &SHORT_VERSION "2.10"
arch_type: x86
# autopatch_build: "True"

repository_info:
  # Repository settings shared by every entry under `images` via
  # `<<: *TRAINING_REPOSITORY`.
  training_repository: &TRAINING_REPOSITORY
    image_type: &TRAINING_IMAGE_TYPE training
    root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
    repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
    release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
    release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]

context:
  # Named artifacts, each a `source` path paired with a `target` file name.
  # Merged into every image's own `context` mapping below.
  training_context: &TRAINING_CONTEXT
    start_cuda_compat:
      source: docker/build_artifacts/start_cuda_compat.sh
      target: start_cuda_compat.sh
    dockerd_entrypoint:
      source: docker/build_artifacts/dockerd_entrypoint.sh
      target: dockerd_entrypoint.sh
    changehostname:
      source: docker/build_artifacts/changehostname.c
      target: changehostname.c
    start_with_right_hostname:
      source: docker/build_artifacts/start_with_right_hostname.sh
      target: start_with_right_hostname.sh
    example_mnist_file:
      source: docker/build_artifacts/mnist.py
      target: mnist.py
    deep_learning_container:
      source: ../../src/deep_learning_container.py
      target: deep_learning_container.py
    setup_oss_compliance:
      source: ../../scripts/setup_oss_compliance.sh
      target: setup_oss_compliance.sh

images:
  # CPU training image, `ec2` Dockerfile target.
  BuildEC2CPUPTTrainPy3DockerImage:
    <<: *TRAINING_REPOSITORY
    build: &PYTORCH_CPU_TRAINING_PY3 false
    image_size_baseline: 7200
    device_type: &DEVICE_TYPE cpu
    python_version: &DOCKER_PYTHON_VERSION py3
    tag_python_version: &TAG_PYTHON_VERSION py313
    os_version: &OS_VERSION ubuntu22.04
    # `tag` and `latest_release_tag` are built from the same parts.
    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
    # skip_build: "False"
    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
    target: ec2
    context:
      <<: *TRAINING_CONTEXT
  # GPU (cu130) training image, `ec2` Dockerfile target.
  # The anchors DEVICE_TYPE, DOCKER_PYTHON_VERSION, TAG_PYTHON_VERSION and
  # OS_VERSION are defined again here; an alias resolves to the nearest
  # preceding definition, so the aliases below pick up this entry's values.
  BuildEC2GPUPTTrainPy3cu130DockerImage:
    <<: *TRAINING_REPOSITORY
    build: &PYTORCH_GPU_TRAINING_PY3 false
    image_size_baseline: 28000
    device_type: &DEVICE_TYPE gpu
    python_version: &DOCKER_PYTHON_VERSION py3
    tag_python_version: &TAG_PYTHON_VERSION py313
    cuda_version: &CUDA_VERSION cu130
    os_version: &OS_VERSION ubuntu22.04
    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
    # skip_build: "False"
    # Unlike the CPU entry, the Dockerfile path includes the CUDA version.
    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
                         *DEVICE_TYPE ]
    target: ec2
    context:
      <<: *TRAINING_CONTEXT
75 changes: 75 additions & 0 deletions pytorch/training/buildspec-2-10-sm.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Buildspec for the PyTorch 2.10 training images that use the `sagemaker`
# Dockerfile target (one CPU image, one CUDA 13.0 GPU image).
#
# Conventions used throughout this file:
#   * `&NAME` anchors a value and `*NAME` re-uses it later in the document.
#   * `!join` is a custom tag resolved by the consuming loader, not by YAML
#     itself. NOTE(review): presumably it concatenates the listed items into
#     one string - confirm against the loader.
#   * `<<: *NAME` merges the anchored mapping into the current one (shallow).

# Placeholder values; the text itself says they are to be set from the
# environment.
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
prod_account_id: &PROD_ACCOUNT_ID 763104351884
region: &REGION <set-$REGION-in-environment>
framework: &FRAMEWORK pytorch
version: &VERSION 2.10.0
# Quoted on purpose: a bare 2.10 would be read as the float 2.1.
short_version: &SHORT_VERSION "2.10"
arch_type: x86
# autopatch_build: "True"

repository_info:
  # Repository settings shared by every entry under `images` via
  # `<<: *TRAINING_REPOSITORY`.
  training_repository: &TRAINING_REPOSITORY
    image_type: &TRAINING_IMAGE_TYPE training
    root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
    repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
    release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
    release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]

context:
  # Named artifacts, each a `source` path paired with a `target` file name.
  # Merged into every image's own `context` mapping below.
  training_context: &TRAINING_CONTEXT
    start_cuda_compat:
      source: docker/build_artifacts/start_cuda_compat.sh
      target: start_cuda_compat.sh
    dockerd_entrypoint:
      source: docker/build_artifacts/dockerd_entrypoint.sh
      target: dockerd_entrypoint.sh
    changehostname:
      source: docker/build_artifacts/changehostname.c
      target: changehostname.c
    start_with_right_hostname:
      source: docker/build_artifacts/start_with_right_hostname.sh
      target: start_with_right_hostname.sh
    example_mnist_file:
      source: docker/build_artifacts/mnist.py
      target: mnist.py
    deep_learning_container:
      source: ../../src/deep_learning_container.py
      target: deep_learning_container.py
    setup_oss_compliance:
      source: ../../scripts/setup_oss_compliance.sh
      target: setup_oss_compliance.sh

images:
  # CPU training image, `sagemaker` Dockerfile target.
  BuildSageMakerCPUPTTrainPy3DockerImage:
    <<: *TRAINING_REPOSITORY
    build: &PYTORCH_CPU_TRAINING_PY3 false
    image_size_baseline: 7200
    device_type: &DEVICE_TYPE cpu
    python_version: &DOCKER_PYTHON_VERSION py3
    tag_python_version: &TAG_PYTHON_VERSION py313
    os_version: &OS_VERSION ubuntu22.04
    # `tag` and `latest_release_tag` are built from the same parts.
    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
    # skip_build: "False"
    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
    target: sagemaker
    context:
      <<: *TRAINING_CONTEXT
  # GPU (cu130) training image, `sagemaker` Dockerfile target.
  # The anchors DEVICE_TYPE, DOCKER_PYTHON_VERSION, TAG_PYTHON_VERSION and
  # OS_VERSION are defined again here; an alias resolves to the nearest
  # preceding definition, so the aliases below pick up this entry's values.
  BuildSageMakerGPUPTTrainPy3DockerImage:
    <<: *TRAINING_REPOSITORY
    build: &PYTORCH_GPU_TRAINING_PY3 false
    image_size_baseline: 28000
    device_type: &DEVICE_TYPE gpu
    python_version: &DOCKER_PYTHON_VERSION py3
    tag_python_version: &TAG_PYTHON_VERSION py313
    cuda_version: &CUDA_VERSION cu130
    os_version: &OS_VERSION ubuntu22.04
    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
    # skip_build: "False"
    # Unlike the CPU entry, the Dockerfile path includes the CUDA version.
    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
                         *DEVICE_TYPE ]
    target: sagemaker
    context:
      <<: *TRAINING_CONTEXT
Loading
Loading