aws · fgbelidji · Aug 8, 2025 · Aug 8, 2025 · Aug 8, 2025 · Aug 8, 2025
@@ -36,7 +36,7 @@ deep_canary_mode = false
 
 [build]
 # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
-# available frameworks - ["base", "vllm", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
+# available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
 build_frameworks = []
 
 

@@ -0,0 +1,40 @@
+account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
+region: &REGION <set-$REGION-in-environment>
+base_framework: &BASE_FRAMEWORK pytorch  # anchor reused by framework, root and repository_name below
+framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK]  # presumably "huggingface_pytorch" if !join concatenates its items - confirm in the buildspec loader
+version: &VERSION 2.5.1  # first component of the image tag
+short_version: &SHORT_VERSION "2.5"  # directory component of docker_file
+contributor: huggingface
+arch_type: x86
+
+repository_info:
+  training_repository: &TRAINING_REPOSITORY  # merged into the image entry under images via "<<"
+    image_type: &TRAINING_IMAGE_TYPE training
+    root: !join [ "huggingface/", *BASE_FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
+    repository_name: &REPOSITORY_NAME !join ["pr", "-", "huggingface", "-", *BASE_FRAMEWORK, "-", *TRAINING_IMAGE_TYPE]
+    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]  # built from account id, region and repository_name
+
+context:
+  training_context: &TRAINING_CONTEXT
+    cuda-compatibility-lib:
+      source: ../../build_artifacts/training/cuda-compatibility-lib.sh
+      target: cuda-compatibility-lib.sh
+
+images:
+  BuildHuggingFacePytorchGpuPy311Cu124TrainingDockerImage:
+    <<: *TRAINING_REPOSITORY
+    build: &HUGGINGFACE_PYTORCH_GPU_TRAINING_PY3 false  # NOTE(review): presumably switched to true to build this image - confirm
+    image_size_baseline: &IMAGE_SIZE_BASELINE 21500  # NOTE(review): unit is not stated in this file - confirm
+    device_type: &DEVICE_TYPE gpu
+    python_version: &DOCKER_PYTHON_VERSION py3  # used in the docker_file path
+    tag_python_version: &TAG_PYTHON_VERSION py311  # used in the image tag
+    cuda_version: &CUDA_VERSION cu124
+    os_version: &OS_VERSION ubuntu22.04
+    transformers_version: &TRANSFORMERS_VERSION 4.49.0
+    datasets_version: &DATASETS_VERSION 3.3.2
+    tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *TAG_PYTHON_VERSION, '-',
+                 *CUDA_VERSION, '-', *OS_VERSION ]  # e.g. 2.5.1-transformers4.49.0-gpu-py311-cu124-ubuntu22.04, assuming !join concatenates
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /,
+                         *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]  # e.g. docker/2.5/py3/cu124/Dockerfile.gpu, assuming !join concatenates
+    context:
+      <<: *TRAINING_CONTEXT  # merges the cuda-compatibility-lib entry defined under context.training_context
@@ -2,8 +2,9 @@ account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
 region: &REGION <set-$REGION-in-environment>
 base_framework: &BASE_FRAMEWORK pytorch
 framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK]
-version: &VERSION 2.6.0
-short_version: &SHORT_VERSION "2.6"
+
+version: &VERSION 2.7.1
+short_version: &SHORT_VERSION "2.7"
 contributor: huggingface
 arch_type: x86
 
@@ -21,17 +22,17 @@ context:
       target: cuda-compatibility-lib.sh
 
 images:
-  BuildHuggingFacePytorchGpuPy312Cu126TrainingDockerImage:
+  BuildHuggingFacePytorchGpuPy312Cu128TrainingDockerImage:
     <<: *TRAINING_REPOSITORY
     build: &HUGGINGFACE_PYTORCH_GPU_TRAINING_PY3 false
-    image_size_baseline: &IMAGE_SIZE_BASELINE 21500
+    image_size_baseline: &IMAGE_SIZE_BASELINE 25000
     device_type: &DEVICE_TYPE gpu
     python_version: &DOCKER_PYTHON_VERSION py3
     tag_python_version: &TAG_PYTHON_VERSION py312
-    cuda_version: &CUDA_VERSION cu126
+    cuda_version: &CUDA_VERSION cu128
     os_version: &OS_VERSION ubuntu22.04
-    transformers_version: &TRANSFORMERS_VERSION 4.51.3
-    datasets_version: &DATASETS_VERSION 3.5.0
+    transformers_version: &TRANSFORMERS_VERSION 4.55.0
+    datasets_version: &DATASETS_VERSION 4.0.0
     tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *TAG_PYTHON_VERSION, '-',
                  *CUDA_VERSION, '-', *OS_VERSION ]
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /,

@@ -0,0 +1,85 @@
+# https://github.com/aws/deep-learning-containers/blob/master/available_images.md
+# Refer to the page above to pull the latest PyTorch image.
+
+# docker image region us-west-2
+FROM 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.7.1-gpu-py312-cu128-ubuntu22.04-sagemaker
+
+RUN apt-get remove -y --purge emacs && \
+apt-get autoremove -y
+
+LABEL maintainer="Amazon AI"
+LABEL dlc_major_version="2"
+
+# version args
+ARG TRANSFORMERS_VERSION=4.55.0
+ARG DATASETS_VERSION=4.0.0
+ARG HUGGINGFACE_HUB_VERSION=0.34.0
+ARG DIFFUSERS_VERSION=0.34.0
+ARG EVALUATE_VERSION=0.4.3
+ARG ACCELERATE_VERSION=1.4.0
+ARG TRL_VERSION=0.21.0
+ARG PEFT_VERSION=0.17.0
+ARG FLASH_ATTN_VERSION=2.8.2
+ARG NINJA_VERSION=1.11.1.4
+ARG KERNELS_VERSION=0.9.0
+ARG PYTHON=python3
+
+# TODO: Remove when the base image is updated. Refreshes pip and reinstalls packages; every install skips the pip cache to keep the layer small.
+RUN pip install --no-cache-dir --upgrade pip \
+ && pip uninstall -y transformer-engine flash-attn pyarrow cryptography \
+ && pip install --no-cache-dir -U pyarrow cryptography pyopenssl Pillow \
+ && pip install --no-cache-dir --upgrade wheel setuptools \
+ && pip install --no-cache-dir -U "werkzeug==3.0.6"
+
+# Pre-install kenlm without build isolation so it uses system cmake
+RUN pip install --no-cache-dir --no-build-isolation kenlm
+
+# Install Hugging Face libraries and dependencies; specs with extras are quoted so the shell never treats "[...]" as a glob pattern
+RUN pip install --no-cache-dir \
+    "huggingface_hub[hf_transfer]==${HUGGINGFACE_HUB_VERSION}" \
+    "transformers[sklearn,sentencepiece,audio,vision,pipelines]==${TRANSFORMERS_VERSION}" \
+    datasets==${DATASETS_VERSION} \
+    diffusers==${DIFFUSERS_VERSION} \
+    Jinja2 \
+    tensorboard \
+    bitsandbytes \
+    kernels==${KERNELS_VERSION} \
+    evaluate==${EVALUATE_VERSION} \
+    accelerate==${ACCELERATE_VERSION} \
+    ninja==${NINJA_VERSION} \
+    trl==${TRL_VERSION} \
+    peft==${PEFT_VERSION} \
+    flash-attn==${FLASH_ATTN_VERSION}
+
+# TODO: Remove this env variable once hf_transfer is a built-in feature.
+ENV HF_HUB_ENABLE_HF_TRANSFER="1"
+ENV HF_HUB_USER_AGENT_ORIGIN="aws:sagemaker:gpu-cuda:training"
+
+RUN apt-get update \
+ # TODO: Remove upgrade statements once packages are updated in base image
+ && apt-get -y upgrade --only-upgrade \
+     systemd openssl cryptsetup libkrb5-3 \
+     libgdk-pixbuf-2.0-0 libgdk-pixbuf2.0-bin libgdk-pixbuf2.0-common libglib2.0-0 \
+ && apt-get install -y git git-lfs wget tar \
+ # Best-effort purge of gdk-pixbuf (outstanding CVEs until the base image has fixed builds); the subshell keeps "|| true" from masking failures of the steps above
+ && (apt-get purge -y libgdk-pixbuf-2.0-0 libgdk-pixbuf2.0-bin libgdk-pixbuf2.0-common || true) \
+ && apt-get autoremove -y \
+ && wget https://go.dev/dl/go1.22.3.linux-amd64.tar.gz \
+ && rm -rf /usr/local/go \
+ && tar -C /usr/local -xzf go1.22.3.linux-amd64.tar.gz \
+ && ln -s /usr/local/go/bin/go /usr/bin/go \
+ && rm go1.22.3.linux-amd64.tar.gz \
+ && apt-get clean \
+ && rm -rf /var/lib/apt/lists/*
+
+COPY cuda-compatibility-lib.sh /usr/local/bin/cuda-compatibility-lib.sh
+RUN chmod +x /usr/local/bin/cuda-compatibility-lib.sh
+
+RUN HOME_DIR=/root \
+ && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
+ && unzip -o ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
+ && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
+ && chmod +x /usr/local/bin/testOSSCompliance \
+ && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
+ && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
+ && rm -rf ${HOME_DIR}/oss_compliance*