diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index ce4cad98d4e8..06bf3c839302 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -37,12 +37,12 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["base", "vllm", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] +build_frameworks = ["pytorch"] # By default we build both training and inference containers. Set true/false values to determine which to build. build_training = true -build_inference = true +build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" @@ -69,13 +69,13 @@ ecs_tests = true eks_tests = true ec2_tests = true # Set it to true if you are preparing a Benchmark related PR -ec2_benchmark_tests = false +ec2_benchmark_tests = true ### Set ec2_tests_on_heavy_instances = true to be able to run any EC2 tests that use large/expensive instance types by ### default. If false, these types of tests will be skipped while other tests will run as usual. ### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true. 
### Off by default (set to false) -ec2_tests_on_heavy_instances = false +ec2_tests_on_heavy_instances = true ### SM specific tests ### On by default sagemaker_local_tests = true @@ -124,7 +124,7 @@ nightly_pr_test_mode = false dlc-pr-base = "" # Standard Framework Training -dlc-pr-pytorch-training = "" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-9-ec2.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" diff --git a/pytorch/training/buildspec-2-9-ec2.yml b/pytorch/training/buildspec-2-9-ec2.yml new file mode 100644 index 000000000000..3f2cef8d599f --- /dev/null +++ b/pytorch/training/buildspec-2-9-ec2.yml @@ -0,0 +1,75 @@ +account_id: &ACCOUNT_ID +prod_account_id: &PROD_ACCOUNT_ID 763104351884 +region: ®ION +framework: &FRAMEWORK pytorch +version: &VERSION 2.9.0 +short_version: &SHORT_VERSION "2.9" +arch_type: x86 +# autopatch_build: "True" + +repository_info: + training_repository: &TRAINING_REPOSITORY + image_type: &TRAINING_IMAGE_TYPE training + root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ] + repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ] + repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] + release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ] + release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ] + +context: + training_context: &TRAINING_CONTEXT + start_cuda_compat: + source: docker/build_artifacts/start_cuda_compat.sh + target: start_cuda_compat.sh + dockerd_entrypoint: + source: docker/build_artifacts/dockerd_entrypoint.sh + target: dockerd_entrypoint.sh + changehostname: + source: docker/build_artifacts/changehostname.c + target: changehostname.c + start_with_right_hostname: + source: docker/build_artifacts/start_with_right_hostname.sh + target: start_with_right_hostname.sh + example_mnist_file: + source: 
docker/build_artifacts/mnist.py + target: mnist.py + deep_learning_container: + source: ../../src/deep_learning_container.py + target: deep_learning_container.py + setup_oss_compliance: + source: ../../scripts/setup_oss_compliance.sh + target: setup_oss_compliance.sh + +images: + BuildEC2CPUPTTrainPy3DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_CPU_TRAINING_PY3 false + image_size_baseline: 7200 + device_type: &DEVICE_TYPE cpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py312 + os_version: &OS_VERSION ubuntu22.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] + latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] + # skip_build: "False" + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] + target: ec2 + context: + <<: *TRAINING_CONTEXT + BuildEC2GPUPTTrainPy3cu130DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_GPU_TRAINING_PY3 false + image_size_baseline: 28000 + device_type: &DEVICE_TYPE gpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py312 + cuda_version: &CUDA_VERSION cu130 + os_version: &OS_VERSION ubuntu22.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] + latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] + # skip_build: "False" + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., + *DEVICE_TYPE ] + target: ec2 + context: + <<: *TRAINING_CONTEXT diff --git a/pytorch/training/buildspec-2-9-sm.yml b/pytorch/training/buildspec-2-9-sm.yml new file mode 100644 index 000000000000..c4b2f8dcdff2 --- /dev/null +++ b/pytorch/training/buildspec-2-9-sm.yml @@ -0,0 +1,75 @@ +account_id: &ACCOUNT_ID 
+prod_account_id: &PROD_ACCOUNT_ID 763104351884 +region: ®ION +framework: &FRAMEWORK pytorch +version: &VERSION 2.9.0 +short_version: &SHORT_VERSION "2.9" +arch_type: x86 +# autopatch_build: "True" + +repository_info: + training_repository: &TRAINING_REPOSITORY + image_type: &TRAINING_IMAGE_TYPE training + root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ] + repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ] + repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] + release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ] + release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ] + +context: + training_context: &TRAINING_CONTEXT + start_cuda_compat: + source: docker/build_artifacts/start_cuda_compat.sh + target: start_cuda_compat.sh + dockerd_entrypoint: + source: docker/build_artifacts/dockerd_entrypoint.sh + target: dockerd_entrypoint.sh + changehostname: + source: docker/build_artifacts/changehostname.c + target: changehostname.c + start_with_right_hostname: + source: docker/build_artifacts/start_with_right_hostname.sh + target: start_with_right_hostname.sh + example_mnist_file: + source: docker/build_artifacts/mnist.py + target: mnist.py + deep_learning_container: + source: ../../src/deep_learning_container.py + target: deep_learning_container.py + setup_oss_compliance: + source: ../../scripts/setup_oss_compliance.sh + target: setup_oss_compliance.sh + +images: + BuildSageMakerCPUPTTrainPy3DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_CPU_TRAINING_PY3 false + image_size_baseline: 7200 + device_type: &DEVICE_TYPE cpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py312 + os_version: &OS_VERSION ubuntu22.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] + 
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] + # skip_build: "False" + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] + target: sagemaker + context: + <<: *TRAINING_CONTEXT + BuildSageMakerGPUPTTrainPy3DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_GPU_TRAINING_PY3 false + image_size_baseline: 28000 + device_type: &DEVICE_TYPE gpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py312 + cuda_version: &CUDA_VERSION cu130 + os_version: &OS_VERSION ubuntu22.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] + latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] + # skip_build: "False" + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., + *DEVICE_TYPE ] + target: sagemaker + context: + <<: *TRAINING_CONTEXT diff --git a/pytorch/training/buildspec.yml b/pytorch/training/buildspec.yml index e9f328177b4b..e7a0d5614f66 100644 --- a/pytorch/training/buildspec.yml +++ b/pytorch/training/buildspec.yml @@ -1 +1 @@ -buildspec_pointer: buildspec-2-8-sm.yml +buildspec_pointer: buildspec-2-9-ec2.yml diff --git a/pytorch/training/docker/2.9/py3/Dockerfile.cpu b/pytorch/training/docker/2.9/py3/Dockerfile.cpu new file mode 100644 index 000000000000..34f1f71c786c --- /dev/null +++ b/pytorch/training/docker/2.9/py3/Dockerfile.cpu @@ -0,0 +1,312 @@ +ARG PYTHON=python3 +ARG PYTHON_VERSION=3.12.10 +ARG PYTHON_SHORT_VERSION=3.12 +ARG PYTORCH_VERSION=2.9.0 + +ARG OPEN_MPI_VERSION=4.1.7 + +ARG TORCHTNT_VERSION=0.2.4 +ARG TORCHDATA_VERSION=0.11.0 +ARG TORCHAUDIO_VERSION=2.9.0 +ARG TORCHVISION_VERSION=0.24.0 + +FROM ubuntu:22.04 AS base_image + +# This arg required to stop docker build waiting for region 
configuration while installing tz data from ubuntu 20 +ENV DEBIAN_FRONTEND=noninteractive +ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" + +RUN apt-get update \ + && apt-get upgrade -y \ + && apt-get autoremove -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +################################################################# +# ____ +# / ___| ___ _ __ ___ _ __ ___ ___ _ __ +# | | / _ \| '_ ` _ \| '_ ` _ \ / _ \| '_ \ +# | |___ (_) | | | | | | | | | | | (_) | | | | +# \____|\___/|_| |_| |_|_| |_| |_|\___/|_| |_| +# ___ ____ _ +# |_ _|_ __ ___ __ _ __ _ ___ | _ \ ___ ___(_)_ __ ___ +# | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \ +# | || | | | | | (_| | (_| | __/ | _ < __/ (__| | |_) | __/ +# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___| +# |___/ |_| +################################################################# + +FROM base_image AS common + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ARG PYTHON +ARG PYTHON_VERSION +ARG PYTHON_SHORT_VERSION +ARG PYTORCH_VERSION +ARG TORCHTNT_VERSION +ARG TORCHDATA_VERSION +ARG TORCHAUDIO_VERSION +ARG TORCHVISION_VERSION + +ARG OPEN_MPI_VERSION + +ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" +ENV LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" + +# Python won’t try to write .pyc or .pyo files on the import of source modules +# Force stdin, stdout and stderr to be totally unbuffered. 
Good for logging +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 + +ENV DLC_CONTAINER_TYPE=training +WORKDIR / + +RUN apt-get update \ + && apt-get -y upgrade --only-upgrade systemd \ + && apt-get install -y --no-install-recommends \ + automake \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + emacs \ + git \ + jq \ + libcurl4-openssl-dev \ + libglib2.0-0 \ + libgl1-mesa-glx \ + libsm6 \ + libssl-dev \ + libxext6 \ + libxrender-dev \ + zlib1g-dev \ + unzip \ + vim \ + wget \ + libbz2-dev \ + libreadline-dev \ + libsqlite3-dev \ + llvm \ + libncurses5-dev \ + libncursesw5-dev \ + xz-utils \ + tk-dev \ + liblzma-dev \ + libffi-dev \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Install Open MPI +RUN wget https://www.open-mpi.org/software/ompi/v4.1/downloads/openmpi-${OPEN_MPI_VERSION}.tar.gz \ + && gunzip -c openmpi-${OPEN_MPI_VERSION}.tar.gz | tar xf - \ + && cd openmpi-${OPEN_MPI_VERSION} \ + && ./configure --prefix=/home/.openmpi \ + && make all install \ + && cd .. \ + && rm openmpi-${OPEN_MPI_VERSION}.tar.gz \ + && rm -rf openmpi-${OPEN_MPI_VERSION} + +# The ENV variables declared below are changed in the previous section +# Grouping these ENV variables in the first section causes +# ompi_info to fail. 
This is only observed in CPU containers +ENV PATH="/home/.openmpi/bin:${PATH}" +ENV LD_LIBRARY_PATH="/home/.openmpi/lib:${LD_LIBRARY_PATH}" +RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value + +# Install OpenSSH for MPI to communicate between containers, allow OpenSSH to talk to containers without asking for confirmation +RUN apt-get update \ + && apt-get install -y --no-install-recommends openssh-client openssh-server \ + && mkdir -p /var/run/sshd \ + && cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \ + && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \ + && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Configure OpenSSH so that nodes can communicate with each other +RUN mkdir -p /var/run/sshd \ + && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd + +RUN rm -rf /root/.ssh/ \ + && mkdir -p /root/.ssh/ \ + && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ + && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ + && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config + +# install python +RUN cd /tmp/ \ +&& wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \ +&& tar xzf Python-${PYTHON_VERSION}.tgz \ +&& cd Python-${PYTHON_VERSION} \ +&& ./configure --enable-optimizations --with-lto --with-computed-gotos --with-system-ffi \ +&& make -j "$(nproc)" \ +&& make altinstall \ +&& cd .. 
\ +&& rm -rf Python-${PYTHON_VERSION} \ +&& rm Python-${PYTHON_VERSION}.tgz \ +&& ln -s /usr/local/bin/python${PYTHON_SHORT_VERSION} /usr/local/bin/python \ +&& ln -s /usr/local/bin/python${PYTHON_SHORT_VERSION} /usr/local/bin/python3 \ +# This installation generates a .python_history file in the root directory, which leads the sanity check to fail +&& rm -f /root/.python_history + +# Python Path +ENV PATH="/usr/local/bin:${PATH}" + +# this will add pip systemlink to pip${PYTHON_SHORT_VERSION} +RUN python -m pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org + +# Install common packages +RUN pip install --no-cache-dir \ + cython \ + cryptography \ + pyOpenSSL \ + pybind11 \ + mkl \ + mkl-include \ + parso \ + typing \ + charset-normalizer \ + packaging \ + boto3 \ + PyYAML \ + numpy \ + scipy \ + click \ + psutil \ + ipython \ + ipykernel \ + pillow \ + h5py \ + fsspec \ + "idna>=3.7" \ + "tqdm>=4.66.3" \ + "requests>=2.32.0" \ + "setuptools>=70.0.0" \ + "urllib3>=2.5.0" \ + "awscli" \ + opencv-python==4.11.0.86 \ + mpi4py \ + jinja2>=3.1.6 \ + tornado>=6.5.1 + +# Install PyTorch +RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ + torchvision==${TORCHVISION_VERSION} \ + torchaudio==${TORCHAUDIO_VERSION} \ + --index-url https://download.pytorch.org/whl/cpu \ + && pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \ + torchdata==${TORCHDATA_VERSION} \ + s3torchconnector \ + accelerate \ + # pin numpy requirement for fastai dependency + # requires explicit declaration of spacy, thinc, blis + spacy \ + # pin thinc due to incompatibility with numpy 1.26.4 (sagemaker doesn't support latest numpy) + thinc \ + blis \ + numpy \ + && pip uninstall -y dataclasses + +RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.9/license.txt + +COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py + +RUN chmod +x /usr/local/bin/deep_learning_container.py + +COPY bash_telemetry.sh 
/usr/local/bin/bash_telemetry.sh +RUN chmod +x /usr/local/bin/bash_telemetry.sh +RUN echo 'source /usr/local/bin/bash_telemetry.sh' >> /etc/bash.bashrc + +# Removing the cache as it is needed for security verification +RUN rm -rf /root/.cache | true + +######################################################## +# _____ ____ ____ ___ +# | ____/ ___|___ \ |_ _|_ __ ___ __ _ __ _ ___ +# | _|| | __) | | || '_ ` _ \ / _` |/ _` |/ _ \ +# | |__| |___ / __/ | || | | | | | (_| | (_| | __/ +# |_____\____|_____| |___|_| |_| |_|\__,_|\__, |\___| +# |___/ +# ____ _ +# | _ \ ___ ___(_)_ __ ___ +# | |_) / _ \/ __| | '_ \ / _ \ +# | _ < __/ (__| | |_) | __/ +# |_| \_\___|\___|_| .__/ \___| +# |_| +######################################################## + +FROM common AS ec2 + +WORKDIR / + +COPY setup_oss_compliance.sh setup_oss_compliance.sh +RUN bash setup_oss_compliance.sh ${PYTHON} && rm setup_oss_compliance.sh + +COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh +RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh +ENTRYPOINT ["bash", "-m", "dockerd_entrypoint.sh"] + +# Starts framework +CMD ["/bin/bash"] + +################################################################# +# ____ __ __ _ +# / ___| __ _ __ _ ___| \/ | __ _| | _____ _ __ +# \___ \ / _` |/ _` |/ _ \ |\/| |/ _` | |/ / _ \ '__| +# ___) | (_| | (_| | __/ | | | (_| | < __/ | +# |____/ \__,_|\__, |\___|_| |_|\__,_|_|\_\___|_| +# |___/ +# ___ ____ _ +# |_ _|_ __ ___ __ _ __ _ ___ | _ \ ___ ___(_)_ __ ___ +# | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \ +# | || | | | | | (_| | (_| | __/ | _ < __/ (__| | |_) | __/ +# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___| +# |___/ |_| +################################################################# + +FROM common AS sagemaker + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main + +WORKDIR / + +# Install SM packages +RUN pip install --no-cache-dir -U \ + 
smclarify \ + "sagemaker>=2" \ + sagemaker-experiments \ + sagemaker-pytorch-training \ + sagemaker-training + +# Install extra packages +RUN pip install --no-cache-dir -U \ + bokeh \ + imageio \ + numba \ + pandas \ + plotly \ + scikit-learn \ + seaborn \ + shap \ + cloudpickle + +# Copy workaround script for incorrect hostname +COPY changehostname.c / +COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh + +RUN chmod +x /usr/local/bin/start_with_right_hostname.sh + +COPY setup_oss_compliance.sh setup_oss_compliance.sh +RUN bash setup_oss_compliance.sh ${PYTHON} && rm setup_oss_compliance.sh + +ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"] +CMD ["/bin/bash"] + diff --git a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu new file mode 100644 index 000000000000..e727351a74d3 --- /dev/null +++ b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu @@ -0,0 +1,286 @@ +ARG PYTHON=python3 +ARG PYTHON_VERSION=3.12.10 +ARG PYTHON_SHORT_VERSION=3.12 +ARG PYTORCH_VERSION=2.9.0 +ARG TORCHTNT_VERSION=0.2.4 +ARG TORCHAUDIO_VERSION=2.9.0 +ARG TORCHVISION_VERSION=0.24.0 +ARG TORCHDATA_VERSION=0.11.0 + +ARG GDRCOPY_VERSION=2.5.1 +ARG TE_VERSION=2.8 +ARG FLASH_ATTN_VERSION=2.8.3 + +################################################################# +# ____ +# / ___| ___ _ __ ___ _ __ ___ ___ _ __ +# | | / _ \| '_ ` _ \| '_ ` _ \ / _ \| '_ \ +# | |___ (_) | | | | | | | | | | | (_) | | | | +# \____|\___/|_| |_| |_|_| |_| |_|\___/|_| |_| +# ___ ____ _ +# |_ _|_ __ ___ __ _ __ _ ___ | _ \ ___ ___(_)_ __ ___ +# | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \ +# | || | | | | | (_| | (_| | __/ | _ < __/ (__| | |_) | __/ +# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___| +# |___/ |_| +################################################################# + +FROM public.ecr.aws/deep-learning-containers/base:13.0.0-gpu-py312-ubuntu22.04-ec2 AS common +# base has EFA, PYTHON 
and CUDA 13.0 + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ARG PYTHON +ARG PYTORCH_VERSION +ARG TORCHDATA_VERSION +ARG TORCHAUDIO_VERSION +ARG TORCHVISION_VERSION +ARG TORCHTNT_VERSION +ARG TE_VERSION +ARG FLASH_ATTN_VERSION +ARG GDRCOPY_VERSION + +ENV CUDA_HOME="/usr/local/cuda" +ENV PATH="${CUDA_HOME}/bin:${PATH}" +ENV EFA_PATH="/opt/amazon/efa" +ENV OPEN_MPI_PATH="/opt/amazon/openmpi" + +# Python won’t try to write .pyc or .pyo files on the import of source modules +# Force stdin, stdout and stderr to be totally unbuffered. Good for logging +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 + +ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all" + +ENV DLC_CONTAINER_TYPE=training +WORKDIR / + +RUN apt-get update \ + && apt-get -y upgrade --only-upgrade systemd \ + && apt-get install -y --allow-change-held-packages --no-install-recommends \ + libgl1-mesa-glx \ + build-essential \ + ca-certificates \ + zlib1g-dev \ + openssl \ + python3-dev \ + pkg-config \ + check \ + llvm \ + xz-utils \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +ENV PATH="${OPEN_MPI_PATH}/bin:${EFA_PATH}/bin:${PATH}" +ENV LD_LIBRARY_PATH="/usr/local/lib:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" +# Python Path +ENV PATH="/usr/local/bin:${PATH}" + +# Install common conda packages +RUN pip install --no-cache-dir \ + cython \ + cryptography \ + pyOpenSSL \ + pybind11 \ + mkl \ + mkl-include \ + parso \ + typing \ + charset-normalizer \ + packaging \ + PyYAML \ + numpy \ + scipy \ + click \ + psutil \ + ipython \ + ipykernel \ + pillow \ + h5py \ + fsspec \ + "idna>=3.7" \ + "tqdm>=4.66.3" \ + "requests>=2.32.0" \ + "setuptools>=70.0.0" \ + "urllib3>=2.5.0" \ + ninja \ + opencv-python==4.11.0.86 \ + mpi4py \ + jinja2>=3.1.6 \ + tornado>=6.5.1 + +# Install PyTorch +RUN pip install --no-cache-dir -U 
torch==${PYTORCH_VERSION} \ + torchvision==${TORCHVISION_VERSION} \ + torchaudio==${TORCHAUDIO_VERSION} \ + --index-url https://download.pytorch.org/whl/cu130 \ + && pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \ + torchdata==${TORCHDATA_VERSION} \ + triton \ + s3torchconnector \ + accelerate \ + # pin numpy requirement for fastai dependency + # requires explicit declaration of spacy, thinc, blis + spacy \ + # pin thinc due to incompatibility with numpy 1.26.4 (sagemaker doesn't support latest numpy) + thinc \ + blis \ + numpy \ + && pip uninstall -y dataclasses + +# Install flash attn and NVIDIA transformer engine. NOTE(review): the prebuilt wheel below is tagged cu12torch2.8, but this image ships CUDA 13.0 + torch 2.9 — confirm ABI compatibility or build flash-attn from source before release +# Optionally set NVTE_FRAMEWORK to avoid bringing in additional frameworks during TE install +ENV NVTE_FRAMEWORK=pytorch + +RUN curl -LO https://github.com/Dao-AILab/flash-attention/releases/download/v${FLASH_ATTN_VERSION}/flash_attn-${FLASH_ATTN_VERSION}+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl \ + && pip install flash_attn-${FLASH_ATTN_VERSION}+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl --no-build-isolation \ + && rm flash_attn-${FLASH_ATTN_VERSION}+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl + +RUN pip install --no-cache-dir nvidia-mathdx + +# Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html +RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v${TE_VERSION} --no-build-isolation + +RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.9/license.txt + +COPY start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh +RUN chmod +x /usr/local/bin/start_cuda_compat.sh + +COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh +RUN chmod +x /usr/local/bin/bash_telemetry.sh +RUN echo 'source /usr/local/bin/bash_telemetry.sh' >> /etc/bash.bashrc + +# Install GDRCopy which is a dependency of SM Distributed DataParallel binary +# The test binaries requires cuda driver library which 
could be found in conda +# So update the linker path to point to it to avoid -Lcuda not found +RUN cd /tmp \ + && git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \ + && cd gdrcopy \ + && sed -ie '13s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \ + && CUDA=${CUDA_HOME} make install \ + && rm -rf /tmp/gdrcopy + +# Install common packages used by both EC2 and SageMaker +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + curl \ + wget \ + git \ + jq \ + emacs \ + vim \ + unzip \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +# Removing the cache as it is needed for security verification +RUN rm -rf /root/.cache | true + +######################################################## +# _____ ____ ____ ___ +# | ____/ ___|___ \ |_ _|_ __ ___ __ _ __ _ ___ +# | _|| | __) | | || '_ ` _ \ / _` |/ _` |/ _ \ +# | |__| |___ / __/ | || | | | | | (_| | (_| | __/ +# |_____\____|_____| |___|_| |_| |_|\__,_|\__, |\___| +# |___/ +# ____ _ +# | _ \ ___ ___(_)_ __ ___ +# | |_) / _ \/ __| | '_ \ / _ \ +# | _ < __/ (__| | |_) | __/ +# |_| \_\___|\___|_| .__/ \___| +# |_| +######################################################## + +FROM common AS ec2 + +ARG PYTHON + +WORKDIR / + +COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh +RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh + +COPY setup_oss_compliance.sh setup_oss_compliance.sh +RUN bash setup_oss_compliance.sh ${PYTHON} && rm setup_oss_compliance.sh + +# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20 +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update \ + && apt-get upgrade -y \ + && apt-get autoremove -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +ENTRYPOINT ["bash", "-m", "dockerd_entrypoint.sh"] +CMD ["/bin/bash"] + +################################################################# +# ____ __ __ _ +# / ___| __ _ __ _ ___| \/ | __ _| | _____ _ __ +# \___ \ / _` |/ _` |/ _ \ |\/| |/ 
_` | |/ / _ \ '__| +# ___) | (_| | (_| | __/ | | | (_| | < __/ | +# |____/ \__,_|\__, |\___|_| |_|\__,_|_|\_\___|_| +# |___/ +# ___ ____ _ +# |_ _|_ __ ___ __ _ __ _ ___ | _ \ ___ ___(_)_ __ ___ +# | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \ +# | || | | | | | (_| | (_| | __/ | _ < __/ (__| | |_) | __/ +# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___| +# |___/ |_| +################################################################# + +FROM common AS sagemaker + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main + +ARG PYTHON + +WORKDIR / + +# Install SM packages +RUN pip install --no-cache-dir -U \ + smclarify \ + "sagemaker>=2" \ + sagemaker-experiments \ + sagemaker-pytorch-training \ + sagemaker-training + +# Install extra packages +RUN pip install --no-cache-dir -U \ + bokeh \ + imageio \ + numba \ + pandas \ + plotly \ + shap \ + scikit-learn \ + seaborn \ + cloudpickle + +COPY setup_oss_compliance.sh setup_oss_compliance.sh +RUN bash setup_oss_compliance.sh ${PYTHON} && rm setup_oss_compliance.sh + +# Copy workaround script for incorrect hostname +COPY changehostname.c / +COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh +RUN chmod +x /usr/local/bin/start_with_right_hostname.sh + +# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20 +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update \ + && apt-get upgrade -y \ + && apt-get autoremove -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"] +CMD ["/bin/bash"] \ No newline at end of file diff --git a/test/dlc_tests/conftest.py b/test/dlc_tests/conftest.py index dc6fc2ea624e..7e7522995fca 100644 --- a/test/dlc_tests/conftest.py +++ b/test/dlc_tests/conftest.py @@ -55,6 +55,7 @@ # ECR repo name fixtures # PyTorch "pytorch_training", + 
"pytorch_training___2__9", "pytorch_training___2__8", "pytorch_training___2__7", "pytorch_training___2__6", diff --git a/test/dlc_tests/container_tests/bin/efa/testEFA b/test/dlc_tests/container_tests/bin/efa/testEFA index 4b676249d816..52f5664625d8 100755 --- a/test/dlc_tests/container_tests/bin/efa/testEFA +++ b/test/dlc_tests/container_tests/bin/efa/testEFA @@ -89,7 +89,7 @@ check_efa_nccl_all_reduce(){ RETURN_VAL=${PIPESTATUS[0]} # In case, if you would like see logs, uncomment below line - # RESULT=$(cat ${TRAINING_LOG}) + # RESULT=$(cat ${TRAINING_LOG}) if [ ${RETURN_VAL} -eq 0 ]; then echo "***************************** check_efa_nccl_all_reduce passed *****************************" diff --git a/test/dlc_tests/container_tests/bin/transformerengine/testPTTransformerEngine b/test/dlc_tests/container_tests/bin/transformerengine/testPTTransformerEngine index e057b957ab54..490c727c4840 100755 --- a/test/dlc_tests/container_tests/bin/transformerengine/testPTTransformerEngine +++ b/test/dlc_tests/container_tests/bin/transformerengine/testPTTransformerEngine @@ -36,6 +36,26 @@ elif [ $(version $TE_VERSION) -lt $(version "2.0") ]; then pytest -v -s $TE_PATH/tests/pytorch/test_fused_optimizer.py pytest -v -s $TE_PATH/tests/pytorch/test_multi_tensor.py pytest -v -s $TE_PATH/tests/pytorch/test_fusible_ops.py +elif [ $(version $TE_VERSION) -lt $(version "3.0") ]; then + pip install pytest==8.2.1 onnxruntime onnx expecttest + pytest -v -s $TE_PATH/tests/pytorch/test_sanity.py + pytest -v -s $TE_PATH/tests/pytorch/test_recipe.py + pytest -v -s $TE_PATH/tests/pytorch/test_deferred_init.py + # Disabled test due to bug: https://github.com/NVIDIA/TransformerEngine/issues/1165 + # PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s $TE_PATH/tests/pytorch/test_numerics.py + PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s $TE_PATH/tests/pytorch/test_cuda_graphs.py + pytest -v -s $TE_PATH/tests/pytorch/test_jit.py + # 
Skip test_fused_attn.py as it doesn't exist in TE 2.8 + # NVTE_TORCH_COMPILE=0 pytest -v -s $TE_PATH/tests/pytorch/fused_attn/test_fused_attn.py + pytest -v -s $TE_PATH/tests/pytorch/test_fused_rope.py + # Disable onnx test due lack of TE prioritization on onnx: https://github.com/NVIDIA/TransformerEngine/issues/528 + # NVTE_TORCH_COMPILE=0 pytest -v -s $TE_PATH/tests/pytorch/test_onnx_export.py + pytest -v -s $TE_PATH/tests/pytorch/test_float8tensor.py + pytest -v -s $TE_PATH/tests/pytorch/test_gqa.py + pytest -v -s $TE_PATH/tests/pytorch/test_fused_optimizer.py + pytest -v -s $TE_PATH/tests/pytorch/test_multi_tensor.py + pytest -v -s $TE_PATH/tests/pytorch/test_fusible_ops.py + else pip install pytest==8.2.1 onnxruntime onnx expecttest pytest -v -s $TE_PATH/tests/pytorch/test_sanity.py diff --git a/test/dlc_tests/ec2/pytorch/training/common_cases.py b/test/dlc_tests/ec2/pytorch/training/common_cases.py index 14a1dc4a0ced..e5af63c44e93 100644 --- a/test/dlc_tests/ec2/pytorch/training/common_cases.py +++ b/test/dlc_tests/ec2/pytorch/training/common_cases.py @@ -259,6 +259,7 @@ def pytorch_gloo(pytorch_training, ec2_connection): container_name="pytorch_gloo", large_shm=True, timeout=1500, + host_network=True, ) @@ -274,6 +275,7 @@ def pytorch_gloo_inductor_gpu(pytorch_training, ec2_connection): container_name="pytorch_gloo_inductor", large_shm=True, timeout=1500, + host_network=True, ) @@ -286,7 +288,11 @@ def pytorch_mpi( """ test_cmd = f"{PT_COMMON_GLOO_MPI_CMD} mpi 0" # input: backend, inductor flags execute_ec2_training_test( - ec2_connection, pytorch_training, test_cmd, container_name="pytorch_mpi_gloo" + ec2_connection, + pytorch_training, + test_cmd, + container_name="pytorch_mpi_gloo", + host_network=True, ) @@ -296,7 +302,11 @@ def pytorch_mpi_inductor_gpu(pytorch_training, ec2_connection): """ test_cmd = f"{PT_COMMON_GLOO_MPI_CMD} mpi 1" # input: backend, inductor flags execute_ec2_training_test( - ec2_connection, pytorch_training, test_cmd, 
container_name="pytorch_mpi_gloo_inductor" + ec2_connection, + pytorch_training, + test_cmd, + container_name="pytorch_mpi_gloo_inductor", + host_network=True, ) @@ -306,7 +316,12 @@ def pytorch_nccl(pytorch_training, ec2_connection): """ test_cmd = f"{PT_COMMON_NCCL_CMD} 0" # input: inductor flags execute_ec2_training_test( - ec2_connection, pytorch_training, test_cmd, container_name="pytorch_nccl", large_shm=True + ec2_connection, + pytorch_training, + test_cmd, + container_name="pytorch_nccl", + large_shm=True, + host_network=True, ) @@ -321,6 +336,7 @@ def pytorch_nccl_inductor(pytorch_training, ec2_connection): test_cmd, container_name="pytorch_nccl_inductor", large_shm=True, + host_network=True, ) diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_9.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_9.py new file mode 100644 index 000000000000..7441b4933750 --- /dev/null +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_9.py @@ -0,0 +1,137 @@ +import pytest + +import test.test_utils as test_utils + +from test.test_utils import ec2 + +from test.dlc_tests.ec2.pytorch.training import common_cases +from test.dlc_tests.ec2 import smclarify_cases + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("pytorch_gpu_tests") +@pytest.mark.model("N/A") +@pytest.mark.team("conda") +@pytest.mark.parametrize( + "ec2_instance_type, region", common_cases.PT_EC2_GPU_INSTANCE_TYPE_AND_REGION, indirect=True +) +def test_pytorch_2_9_gpu( + pytorch_training___2__9, ec2_connection, region, gpu_only, ec2_instance_type +): + pytorch_training = pytorch_training___2__9 + if test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type): + pytest.skip( + f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}" + ) + + test_cases = [ + (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_mnist, (pytorch_training, 
ec2_connection)),
+        (common_cases.pytorch_linear_regression_gpu, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_nccl, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)),
+        (common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_telemetry_bashrc_gpu, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_telemetry_entrypoint_gpu, (pytorch_training, ec2_connection)),
+    ]
+
+    if "sagemaker" in pytorch_training:
+        test_cases.append(
+            (smclarify_cases.smclarify_metrics_gpu, (pytorch_training, ec2_connection)),
+        )
+
+    # AMP must be run on multi_gpu
+    if ec2.is_instance_multi_gpu(ec2_instance_type):
+        test_cases.append((common_cases.pytorch_amp, (pytorch_training, ec2_connection)))
+
+    test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.9 GPU")
+
+
+@pytest.mark.usefixtures("sagemaker")
+@pytest.mark.integration("pytorch_gpu_heavy_tests")
+@pytest.mark.model("N/A")
+@pytest.mark.team("conda")
+@pytest.mark.parametrize(
+    "ec2_instance_type, region",
+    common_cases.PT_EC2_HEAVY_GPU_INSTANCE_TYPE_AND_REGION,
+    indirect=True,
+)
+@pytest.mark.skipif(
+    test_utils.is_pr_context() and not ec2.are_heavy_instance_ec2_tests_enabled(),
+    reason="Skip GPU Heavy tests in PR context unless explicitly enabled",
+)
+def test_pytorch_2_9_gpu_heavy(
+    pytorch_training___2__9, ec2_connection, region, gpu_only, ec2_instance_type
+):
+    pytorch_training = pytorch_training___2__9
+    if test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type):
+        pytest.skip(
+            f"Image {pytorch_training} is incompatible with instance type
{ec2_instance_type}"
+        )
+
+    test_cases = [
+        (common_cases.pytorch_gdrcopy, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_transformer_engine, (pytorch_training, ec2_connection)),
+    ]
+
+    test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.9 GPU Heavy")
+
+
+@pytest.mark.usefixtures("sagemaker")
+@pytest.mark.integration("inductor")
+@pytest.mark.model("N/A")
+@pytest.mark.team("training-compiler")
+@pytest.mark.parametrize(
+    "ec2_instance_type, region",
+    common_cases.PT_EC2_GPU_INDUCTOR_INSTANCE_TYPE_AND_REGION,
+    indirect=True,
+)
+def test_pytorch_2_9_gpu_inductor(
+    pytorch_training___2__9, ec2_connection, region, gpu_only, ec2_instance_type
+):
+    pytorch_training = pytorch_training___2__9
+    if test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type):
+        pytest.skip(
+            f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}"
+        )
+
+    test_cases = [
+        (common_cases.pytorch_gloo_inductor_gpu, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_mpi_inductor_gpu, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_nccl_inductor, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_amp_inductor, (pytorch_training, ec2_connection)),
+    ]
+
+    test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.9 GPU Inductor")
+
+
+@pytest.mark.usefixtures("sagemaker")
+@pytest.mark.integration("pytorch_cpu_tests")
+@pytest.mark.model("N/A")
+@pytest.mark.team("conda")
+@pytest.mark.parametrize("ec2_instance_type", common_cases.PT_EC2_CPU_INSTANCE_TYPE, indirect=True)
+def test_pytorch_2_9_cpu(pytorch_training___2__9, ec2_connection, cpu_only):
+    pytorch_training = pytorch_training___2__9
+
+    test_cases = [
+        (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_linear_regression_cpu, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_telemetry_bashrc_cpu, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_telemetry_entrypoint_cpu, (pytorch_training, ec2_connection)),
+    ]
+
+    if "sagemaker" in pytorch_training:
+        test_cases += [
+            (smclarify_cases.smclarify_metrics_cpu, (pytorch_training, ec2_connection)),
+        ]
+
+    test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.9 CPU")