diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml
index 1962bfd69e21..c1521bb24e45 100644
--- a/dlc_developer_config.toml
+++ b/dlc_developer_config.toml
@@ -36,7 +36,7 @@ deep_canary_mode = false
 
 [build]
 # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
-# available frameworks - ["base", "vllm", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
+# available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
 build_frameworks = []
 
diff --git a/huggingface/pytorch/training/buildspec-2-5-1.yml b/huggingface/pytorch/training/buildspec-2-5-1.yml
new file mode 100644
index 000000000000..8b92a4482803
--- /dev/null
+++ b/huggingface/pytorch/training/buildspec-2-5-1.yml
@@ -0,0 +1,40 @@
+account_id: &ACCOUNT_ID
+region: &REGION
+base_framework: &BASE_FRAMEWORK pytorch
+framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK]
+version: &VERSION 2.5.1
+short_version: &SHORT_VERSION "2.5"
+contributor: huggingface
+arch_type: x86
+
+repository_info:
+  training_repository: &TRAINING_REPOSITORY
+    image_type: &TRAINING_IMAGE_TYPE training
+    root: !join [ "huggingface/", *BASE_FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
+    repository_name: &REPOSITORY_NAME !join ["pr", "-", "huggingface", "-", *BASE_FRAMEWORK, "-", *TRAINING_IMAGE_TYPE]
+    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
+
+context:
+  training_context: &TRAINING_CONTEXT
+    cuda-compatibility-lib:
+      source: ../../build_artifacts/training/cuda-compatibility-lib.sh
+      target: cuda-compatibility-lib.sh
+
+images:
+  BuildHuggingFacePytorchGpuPy311Cu124TrainingDockerImage:
+    <<: *TRAINING_REPOSITORY
+    build: &HUGGINGFACE_PYTORCH_GPU_TRAINING_PY3 false
+    image_size_baseline: &IMAGE_SIZE_BASELINE 21500
+    device_type: &DEVICE_TYPE gpu
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py311
+    cuda_version: &CUDA_VERSION cu124
+    os_version: &OS_VERSION ubuntu22.04
+    transformers_version: &TRANSFORMERS_VERSION 4.49.0
+    datasets_version: &DATASETS_VERSION 3.3.2
+    tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *TAG_PYTHON_VERSION, '-',
+                 *CUDA_VERSION, '-', *OS_VERSION ]
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /,
+                         *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
+    context:
+      <<: *TRAINING_CONTEXT
diff --git a/huggingface/pytorch/training/buildspec.yml b/huggingface/pytorch/training/buildspec.yml
index 59ab91ed6402..7b6ee01d4bce 100644
--- a/huggingface/pytorch/training/buildspec.yml
+++ b/huggingface/pytorch/training/buildspec.yml
@@ -2,8 +2,9 @@ account_id: &ACCOUNT_ID
 region: &REGION
 base_framework: &BASE_FRAMEWORK pytorch
 framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK]
-version: &VERSION 2.6.0
-short_version: &SHORT_VERSION "2.6"
+
+version: &VERSION 2.7.1
+short_version: &SHORT_VERSION "2.7"
 contributor: huggingface
 arch_type: x86
 
@@ -21,17 +22,17 @@ context:
       target: cuda-compatibility-lib.sh
 
 images:
-  BuildHuggingFacePytorchGpuPy312Cu126TrainingDockerImage:
+  BuildHuggingFacePytorchGpuPy312Cu128TrainingDockerImage:
     <<: *TRAINING_REPOSITORY
     build: &HUGGINGFACE_PYTORCH_GPU_TRAINING_PY3 false
-    image_size_baseline: &IMAGE_SIZE_BASELINE 21500
+    image_size_baseline: &IMAGE_SIZE_BASELINE 25000
     device_type: &DEVICE_TYPE gpu
     python_version: &DOCKER_PYTHON_VERSION py3
     tag_python_version: &TAG_PYTHON_VERSION py312
-    cuda_version: &CUDA_VERSION cu126
+    cuda_version: &CUDA_VERSION cu128
     os_version: &OS_VERSION ubuntu22.04
-    transformers_version: &TRANSFORMERS_VERSION 4.51.3
-    datasets_version: &DATASETS_VERSION 3.5.0
+    transformers_version: &TRANSFORMERS_VERSION 4.55.0
+    datasets_version: &DATASETS_VERSION 4.0.0
     tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *TAG_PYTHON_VERSION, '-',
                  *CUDA_VERSION, '-', *OS_VERSION ]
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /,
diff --git a/huggingface/pytorch/training/docker/2.7/py3/cu128/Dockerfile.gpu b/huggingface/pytorch/training/docker/2.7/py3/cu128/Dockerfile.gpu
new file mode 100644
index 000000000000..425497998b87
--- /dev/null
+++ b/huggingface/pytorch/training/docker/2.7/py3/cu128/Dockerfile.gpu
@@ -0,0 +1,85 @@
+# https://github.com/aws/deep-learning-containers/blob/master/available_images.md
+# refer to the above page to pull latest Pytorch image
+
+# docker image region us-west-2
+FROM 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.7.1-gpu-py312-cu128-ubuntu22.04-sagemaker
+
+RUN apt-get remove -y --purge emacs && \
+apt-get autoremove -y
+
+LABEL maintainer="Amazon AI"
+LABEL dlc_major_version="2"
+
+# version args
+ARG TRANSFORMERS_VERSION=4.55.0
+ARG DATASETS_VERSION=4.0.0
+ARG HUGGINGFACE_HUB_VERSION=0.34.0
+ARG DIFFUSERS_VERSION=0.34.0
+ARG EVALUATE_VERSION=0.4.3
+ARG ACCELERATE_VERSION=1.4.0
+ARG TRL_VERSION=0.21.0
+ARG PEFT_VERSION=0.17.0
+ARG FLASH_ATTN_VERSION=2.8.2
+ARG NINJA_VERSION=1.11.1.4
+ARG KERNELS_VERSION=0.9.0
+ARG PYTHON=python3
+
+# TODO: Remove when the base image is updated
+RUN pip install --upgrade pip \
+ && pip uninstall -y transformer-engine flash-attn pyarrow cryptography \
+ && pip install --no-cache-dir -U pyarrow cryptography pyopenssl Pillow \
+ && pip --no-cache-dir install --upgrade wheel setuptools \
+ && pip install --no-cache-dir -U "werkzeug==3.0.6"
+
+# Pre-install kenlm without build isolation so it uses system cmake
+RUN pip install --no-cache-dir --no-build-isolation kenlm
+
+# Install Hugging Face libraries and dependencies
+RUN pip install --no-cache-dir \
+    huggingface_hub[hf_transfer]==${HUGGINGFACE_HUB_VERSION} \
+    transformers[sklearn,sentencepiece,audio,vision,pipelines]==${TRANSFORMERS_VERSION} \
+    datasets==${DATASETS_VERSION} \
+    diffusers==${DIFFUSERS_VERSION} \
+    Jinja2 \
+    tensorboard \
+    bitsandbytes \
+    kernels==${KERNELS_VERSION} \
+    evaluate==${EVALUATE_VERSION} \
+    accelerate==${ACCELERATE_VERSION} \
+    ninja==${NINJA_VERSION} \
+    trl==${TRL_VERSION} \
+    peft==${PEFT_VERSION} \
+    flash-attn==${FLASH_ATTN_VERSION}
+
+# hf_transfer will be a built-in feature, remove the env variable then
+ENV HF_HUB_ENABLE_HF_TRANSFER="1"
+ENV HF_HUB_USER_AGENT_ORIGIN="aws:sagemaker:gpu-cuda:training"
+
+RUN apt-get update \
+ # TODO: Remove upgrade statements once packages are updated in base image
+ && apt-get -y upgrade --only-upgrade \
+    systemd openssl cryptsetup libkrb5-3 \
+    libgdk-pixbuf-2.0-0 libgdk-pixbuf2.0-bin libgdk-pixbuf2.0-common libglib2.0-0 \
+ && apt-get install -y git git-lfs wget tar \
+ # Remove gdk-pixbuf packages entirely to mitigate outstanding CVEs until base image contains fixed builds
+ && apt-get purge -y libgdk-pixbuf-2.0-0 libgdk-pixbuf2.0-bin libgdk-pixbuf2.0-common || true \
+ && apt-get autoremove -y \
+ && wget https://go.dev/dl/go1.22.3.linux-amd64.tar.gz \
+ && rm -rf /usr/local/go \
+ && tar -C /usr/local -xzf go1.22.3.linux-amd64.tar.gz \
+ && ln -s /usr/local/go/bin/go /usr/bin/go \
+ && rm go1.22.3.linux-amd64.tar.gz \
+ && apt-get clean \
+ && rm -rf /var/lib/apt/lists/*
+
+COPY cuda-compatibility-lib.sh /usr/local/bin/cuda-compatibility-lib.sh
+RUN chmod +x /usr/local/bin/cuda-compatibility-lib.sh
+
+RUN HOME_DIR=/root \
+ && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
+ && unzip -o ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
+ && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
+ && chmod +x /usr/local/bin/testOSSCompliance \
+ && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
+ && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
+ && rm -rf ${HOME_DIR}/oss_compliance*