From d27e5232ac3276d6eba5ce60e66f1868c1422877 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Sun, 10 Aug 2025 14:38:22 -0700 Subject: [PATCH 01/16] pt 2.8 training ec2 --- dlc_developer_config.toml | 10 +- pytorch/training/buildspec-2-8-ec2.yml | 72 +++ pytorch/training/buildspec-2-8-sm.yml | 72 +++ pytorch/training/buildspec.yml | 2 +- .../training/docker/2.8/py3/Dockerfile.cpu | 364 ++++++++++++++ .../docker/2.8/py3/cu128/Dockerfile.gpu | 464 ++++++++++++++++++ 6 files changed, 978 insertions(+), 6 deletions(-) create mode 100644 pytorch/training/buildspec-2-8-ec2.yml create mode 100644 pytorch/training/buildspec-2-8-sm.yml create mode 100644 pytorch/training/docker/2.8/py3/Dockerfile.cpu create mode 100644 pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 1962bfd69e21..f8a4b4214d7d 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -37,12 +37,12 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["base", "vllm", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] +build_frameworks = ["pytorch"] # By default we build both training and inference containers. Set true/false values to determine which to build. 
build_training = true -build_inference = true +build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" @@ -65,13 +65,13 @@ ecs_tests = true eks_tests = true ec2_tests = true # Set it to true if you are preparing a Benchmark related PR -ec2_benchmark_tests = false +ec2_benchmark_tests = true ### Set ec2_tests_on_heavy_instances = true to be able to run any EC2 tests that use large/expensive instance types by ### default. If false, these types of tests will be skipped while other tests will run as usual. ### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true. ### Off by default (set to false) -ec2_tests_on_heavy_instances = false +ec2_tests_on_heavy_instances = true ### SM specific tests ### On by default sagemaker_local_tests = true @@ -119,7 +119,7 @@ use_scheduler = false ### TRAINING PR JOBS ### # Standard Framework Training -dlc-pr-pytorch-training = "" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-ec2.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" diff --git a/pytorch/training/buildspec-2-8-ec2.yml b/pytorch/training/buildspec-2-8-ec2.yml new file mode 100644 index 000000000000..07bc4e559755 --- /dev/null +++ b/pytorch/training/buildspec-2-8-ec2.yml @@ -0,0 +1,72 @@ +account_id: &ACCOUNT_ID +prod_account_id: &PROD_ACCOUNT_ID 763104351884 +region: &REGION +framework: &FRAMEWORK pytorch +version: &VERSION 2.8.0 +short_version: &SHORT_VERSION "2.8" +arch_type: x86 +# autopatch_build: "True" + +repository_info: + training_repository: &TRAINING_REPOSITORY + image_type: &TRAINING_IMAGE_TYPE training + root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ] + repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ] + repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] + 
release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ] + release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ] + +context: + training_context: &TRAINING_CONTEXT + start_cuda_compat: + source: docker/build_artifacts/start_cuda_compat.sh + target: start_cuda_compat.sh + dockerd_entrypoint: + source: docker/build_artifacts/dockerd_entrypoint.sh + target: dockerd_entrypoint.sh + changehostname: + source: docker/build_artifacts/changehostname.c + target: changehostname.c + start_with_right_hostname: + source: docker/build_artifacts/start_with_right_hostname.sh + target: start_with_right_hostname.sh + example_mnist_file: + source: docker/build_artifacts/mnist.py + target: mnist.py + deep_learning_container: + source: ../../src/deep_learning_container.py + target: deep_learning_container.py + +images: + BuildEC2CPUPTTrainPy3DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_CPU_TRAINING_PY3 false + image_size_baseline: 6500 + device_type: &DEVICE_TYPE cpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py312 + os_version: &OS_VERSION ubuntu22.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] + latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] + # skip_build: "False" + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] + target: ec2 + context: + <<: *TRAINING_CONTEXT + BuildEC2GPUPTTrainPy3cu128DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_GPU_TRAINING_PY3 false + image_size_baseline: 24000 + device_type: &DEVICE_TYPE gpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py312 + cuda_version: &CUDA_VERSION cu128 + os_version: &OS_VERSION ubuntu22.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", 
*TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] + latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] + # skip_build: "False" + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., + *DEVICE_TYPE ] + target: ec2 + context: + <<: *TRAINING_CONTEXT diff --git a/pytorch/training/buildspec-2-8-sm.yml b/pytorch/training/buildspec-2-8-sm.yml new file mode 100644 index 000000000000..a29f64b2a761 --- /dev/null +++ b/pytorch/training/buildspec-2-8-sm.yml @@ -0,0 +1,72 @@ +account_id: &ACCOUNT_ID +prod_account_id: &PROD_ACCOUNT_ID 763104351884 +region: &REGION +framework: &FRAMEWORK pytorch +version: &VERSION 2.8.0 +short_version: &SHORT_VERSION "2.8" +arch_type: x86 +# autopatch_build: "True" + +repository_info: + training_repository: &TRAINING_REPOSITORY + image_type: &TRAINING_IMAGE_TYPE training + root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ] + repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ] + repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] + release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ] + release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ] + +context: + training_context: &TRAINING_CONTEXT + start_cuda_compat: + source: docker/build_artifacts/start_cuda_compat.sh + target: start_cuda_compat.sh + dockerd_entrypoint: + source: docker/build_artifacts/dockerd_entrypoint.sh + target: dockerd_entrypoint.sh + changehostname: + source: docker/build_artifacts/changehostname.c + target: changehostname.c + start_with_right_hostname: + source: docker/build_artifacts/start_with_right_hostname.sh + target: start_with_right_hostname.sh + example_mnist_file: + source: docker/build_artifacts/mnist.py + target: 
mnist.py + deep_learning_container: + source: ../../src/deep_learning_container.py + target: deep_learning_container.py + +images: + BuildSageMakerCPUPTTrainPy3DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_CPU_TRAINING_PY3 false + image_size_baseline: 6500 + device_type: &DEVICE_TYPE cpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py312 + os_version: &OS_VERSION ubuntu22.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] + latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] + # skip_build: "False" + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] + target: sagemaker + context: + <<: *TRAINING_CONTEXT + BuildSageMakerGPUPTTrainPy3DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_GPU_TRAINING_PY3 false + image_size_baseline: 24000 + device_type: &DEVICE_TYPE gpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py312 + cuda_version: &CUDA_VERSION cu128 + os_version: &OS_VERSION ubuntu22.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] + latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] + # skip_build: "False" + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., + *DEVICE_TYPE ] + target: sagemaker + context: + <<: *TRAINING_CONTEXT diff --git a/pytorch/training/buildspec.yml b/pytorch/training/buildspec.yml index b332931b2e40..78ac196ed806 100644 --- a/pytorch/training/buildspec.yml +++ b/pytorch/training/buildspec.yml @@ -1 +1 @@ -buildspec_pointer: buildspec-2-7-sm.yml +buildspec_pointer: buildspec-2-8-ec2.yml diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu 
b/pytorch/training/docker/2.8/py3/Dockerfile.cpu new file mode 100644 index 000000000000..2577f237b6f3 --- /dev/null +++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu @@ -0,0 +1,364 @@ +ARG PYTHON=python3 +ARG PYTHON_VERSION=3.12.10 +ARG PYTHON_SHORT_VERSION=3.12 +ARG PYTORCH_VERSION=2.8.0 + +ARG OPEN_MPI_VERSION=4.1.7 + +ARG TORCHTNT_VERSION=0.2.4 +ARG TORCHDATA_VERSION=0.11.0 +ARG TORCHAUDIO_VERSION=2.8.0 +ARG TORCHVISION_VERSION=0.23.0 + +FROM ubuntu:22.04 AS base_image + +# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20 +ENV DEBIAN_FRONTEND=noninteractive +ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" + +RUN apt-get update \ + && apt-get upgrade -y \ + && apt-get autoremove -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +################################################################# +# ____ +# / ___| ___ _ __ ___ _ __ ___ ___ _ __ +# | | / _ \| '_ ` _ \| '_ ` _ \ / _ \| '_ \ +# | |___ (_) | | | | | | | | | | | (_) | | | | +# \____|\___/|_| |_| |_|_| |_| |_|\___/|_| |_| +# ___ ____ _ +# |_ _|_ __ ___ __ _ __ _ ___ | _ \ ___ ___(_)_ __ ___ +# | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \ +# | || | | | | | (_| | (_| | __/ | _ < __/ (__| | |_) | __/ +# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___| +# |___/ |_| +################################################################# + +FROM base_image AS common + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ARG PYTHON +ARG PYTHON_VERSION +ARG PYTHON_SHORT_VERSION + +ARG OPEN_MPI_VERSION + +ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" +ENV LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" + +# Python won’t try to write .pyc or .pyo files on the import of source modules +# Force stdin, stdout and stderr to be totally unbuffered. 
Good for logging +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 + +ENV DLC_CONTAINER_TYPE=training +WORKDIR / + +RUN apt-get update \ + && apt-get -y upgrade --only-upgrade systemd \ + && apt-get install -y --no-install-recommends \ + automake \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + emacs \ + git \ + jq \ + libcurl4-openssl-dev \ + libglib2.0-0 \ + libgl1-mesa-glx \ + libsm6 \ + libssl-dev \ + libxext6 \ + libxrender-dev \ + zlib1g-dev \ + unzip \ + vim \ + wget \ + libbz2-dev \ + libreadline-dev \ + libsqlite3-dev \ + llvm \ + libncurses5-dev \ + libncursesw5-dev \ + xz-utils \ + tk-dev \ + liblzma-dev \ + libffi-dev \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Install Open MPI +RUN wget https://www.open-mpi.org/software/ompi/v4.1/downloads/openmpi-${OPEN_MPI_VERSION}.tar.gz \ + && gunzip -c openmpi-${OPEN_MPI_VERSION}.tar.gz | tar xf - \ + && cd openmpi-${OPEN_MPI_VERSION} \ + && ./configure --prefix=/home/.openmpi \ + && make all install \ + && cd .. \ + && rm openmpi-${OPEN_MPI_VERSION}.tar.gz \ + && rm -rf openmpi-${OPEN_MPI_VERSION} + +# The ENV variables declared below are changed in the previous section +# Grouping these ENV variables in the first section causes +# ompi_info to fail. 
This is only observed in CPU containers +ENV PATH="/home/.openmpi/bin:${PATH}" +ENV LD_LIBRARY_PATH="/home/.openmpi/lib:${LD_LIBRARY_PATH}" +RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value + +# Install OpenSSH for MPI to communicate between containers, allow OpenSSH to talk to containers without asking for confirmation +RUN apt-get update \ + && apt-get install -y --no-install-recommends openssh-client openssh-server \ + && mkdir -p /var/run/sshd \ + && cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \ + && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \ + && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Configure OpenSSH so that nodes can communicate with each other +RUN mkdir -p /var/run/sshd \ + && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd + +RUN rm -rf /root/.ssh/ \ + && mkdir -p /root/.ssh/ \ + && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ + && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ + && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config + + +# install python +RUN cd /tmp/ \ +&& wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \ +&& tar xzf Python-${PYTHON_VERSION}.tgz \ +&& cd Python-${PYTHON_VERSION} \ +&& ./configure --enable-optimizations --with-lto --with-computed-gotos --with-system-ffi \ +&& make -j "$(nproc)" \ +&& make altinstall \ +&& cd .. 
\ +&& rm -rf Python-${PYTHON_VERSION} \ +&& rm Python-${PYTHON_VERSION}.tgz \ +&& ln -s /usr/local/bin/python${PYTHON_SHORT_VERSION} /usr/local/bin/python \ +&& ln -s /usr/local/bin/python${PYTHON_SHORT_VERSION} /usr/local/bin/python3 \ +# This installation generate a .python_history file in the root directory leads sanity check to fail +&& rm -f /root/.python_history + +# Python Path +ENV PATH="/usr/local/bin:${PATH}" + +# this will add pip systemlink to pip${PYTHON_SHORT_VERSION} +RUN python -m pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org + +# Install common packages +RUN pip install --no-cache-dir \ + cython \ + cryptography \ + pyOpenSSL \ + pybind11 \ + mkl \ + mkl-include \ + parso \ + typing \ + charset-normalizer \ + packaging \ + boto3 \ + PyYAML \ + numpy \ + scipy \ + click \ + psutil \ + ipython \ + ipykernel \ + pillow \ + h5py \ + fsspec \ + "idna>=3.7" \ + "tqdm>=4.66.3" \ + "requests>=2.32.0" \ + "setuptools>=70.0.0" \ + "urllib3>=2.5.0" \ + "awscli<2" \ + "opencv-python==4.11.0.86" \ + mpi4py \ + jinja2>=3.1.6 \ + tornado>=6.5.1 + +RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.7/license.txt + +COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py + +RUN chmod +x /usr/local/bin/deep_learning_container.py + +COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh +RUN chmod +x /usr/local/bin/bash_telemetry.sh +RUN echo 'source /usr/local/bin/bash_telemetry.sh' >> /etc/bash.bashrc + +# Removing the cache as it is needed for security verification +RUN rm -rf /root/.cache | true + +######################################################## +# _____ ____ ____ ___ +# | ____/ ___|___ \ |_ _|_ __ ___ __ _ __ _ ___ +# | _|| | __) | | || '_ ` _ \ / _` |/ _` |/ _ \ +# | |__| |___ / __/ | || | | | | | (_| | (_| | __/ +# |_____\____|_____| |___|_| |_| |_|\__,_|\__, |\___| +# |___/ +# ____ _ +# | _ \ ___ ___(_)_ __ ___ +# | |_) / _ \/ __| | '_ \ / _ \ +# | _ < __/ (__| | 
|_) | __/ +# |_| \_\___|\___|_| .__/ \___| +# |_| +######################################################## + +FROM common AS ec2 + +ARG PYTHON +ARG PYTHON_SHORT_VERSION +ARG PYTORCH_VERSION +ARG TORCHTNT_VERSION +ARG TORCHDATA_VERSION +ARG TORCHAUDIO_VERSION +ARG TORCHVISION_VERSION + +WORKDIR / + +# Install PyTorch +RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ + torchvision==${TORCHVISION_VERSION} \ + torchaudio==${TORCHAUDIO_VERSION} \ + --index-url https://download.pytorch.org/whl/cpu \ + && pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \ + torchdata==${TORCHDATA_VERSION} \ + s3torchconnector \ + fastai==2.8.2 \ + accelerate \ + # pin numpy requirement for fastai dependency + # requires explicit declaration of spacy, thic, blis + spacy \ + #thinc 8.3.6 is not compatible with numpy 1.26.4 (sagemaker doesn't support latest numpy) + thinc==8.3.4 \ + blis \ + numpy \ + && pip uninstall -y dataclasses + +RUN HOME_DIR=/root \ + && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ + && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ + && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ + && chmod +x /usr/local/bin/testOSSCompliance \ + && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ + && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ + && rm -rf ${HOME_DIR}/oss_compliance* \ + && rm -rf /tmp/tmp* + +# Removing the cache as it is needed for security verification +RUN rm -rf /root/.cache | true + +COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh +RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh +ENTRYPOINT ["bash", "-m", "dockerd_entrypoint.sh"] + +# Starts framework +CMD ["/bin/bash"] + +################################################################# +# ____ __ __ _ +# / ___| __ _ __ _ ___| \/ | __ _| | _____ _ __ +# \___ \ / _` |/ _` |/ _ \ |\/| |/ _` | |/ / _ \ '__| 
+# ___) | (_| | (_| | __/ | | | (_| | < __/ | +# |____/ \__,_|\__, |\___|_| |_|\__,_|_|\_\___|_| +# |___/ +# ___ ____ _ +# |_ _|_ __ ___ __ _ __ _ ___ | _ \ ___ ___(_)_ __ ___ +# | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \ +# | || | | | | | (_| | (_| | __/ | _ < __/ (__| | |_) | __/ +# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___| +# |___/ |_| +################################################################# + +FROM common AS sagemaker + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ARG PYTHON +ARG PYTHON_SHORT_VERSION +ARG PYTORCH_VERSION +ARG TORCHTNT_VERSION +ARG TORCHDATA_VERSION +ARG TORCHAUDIO_VERSION +ARG TORCHVISION_VERSION + +ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main + +WORKDIR / + +# Install PyTorch +RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ + torchvision==${TORCHVISION_VERSION} \ + torchaudio==${TORCHAUDIO_VERSION} \ + --index-url https://download.pytorch.org/whl/cpu \ + && pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \ + torchdata==${TORCHDATA_VERSION} \ + s3torchconnector \ + fastai==2.8.2 \ + accelerate \ + # pin numpy requirement for fastai dependency + # requires explicit declaration of spacy, thic, blis + spacy \ + thinc==8.3.4 \ + blis \ + numpy \ + && pip uninstall -y dataclasses + +# Install SM packages +RUN pip install --no-cache-dir -U \ + smclarify \ + "sagemaker>=2,<3" \ + "sagemaker-experiments<1" \ + sagemaker-pytorch-training \ + sagemaker-training + +# Install extra packages +RUN pip install --no-cache-dir -U \ + bokeh \ + imageio \ + numba \ + pandas \ + plotly \ + scikit-learn \ + seaborn \ + shap \ + # pinned for sagemaker==2.233.0 + cloudpickle + +# Copy workaround script for incorrect hostname +COPY changehostname.c / +COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh + +RUN chmod +x /usr/local/bin/start_with_right_hostname.sh + +RUN HOME_DIR=/root \ + && curl -o 
${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ + && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ + && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ + && chmod +x /usr/local/bin/testOSSCompliance \ + && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ + && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ + && rm -rf ${HOME_DIR}/oss_compliance* \ + && rm -rf /tmp/tmp* + +# Removing the cache as it is needed for security verification +RUN rm -rf /root/.cache | true + +ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"] +CMD ["/bin/bash"] + diff --git a/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu new file mode 100644 index 000000000000..cbd2a01401f0 --- /dev/null +++ b/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu @@ -0,0 +1,464 @@ +ARG PYTHON=python3 +ARG PYTHON_VERSION=3.12.10 +ARG PYTHON_SHORT_VERSION=3.12 +ARG PYTORCH_VERSION=2.8.0 +ARG TORCHTNT_VERSION=0.2.4 +ARG TORCHAUDIO_VERSION=2.8.0 +ARG TORCHVISION_VERSION=0.23.0 +ARG TORCHDATA_VERSION=0.11.0 + +ARG CUDA_VERSION=12.8.1 +ARG CUDNN_VERSION=9.10.2.21 +ARG NCCL_VERSION=2.27.3 +ARG EFA_VERSION=1.43.1 +ARG GDRCOPY_VERSION=2.5 +ARG TE_VERSION=2.5 +ARG FLASH_ATTN_VERSION=2.8.2 + +FROM nvidia/cuda:12.8.1-base-ubuntu22.04 AS base_image + +# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20 +ENV DEBIAN_FRONTEND=noninteractive +ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" + +RUN apt-get update \ + && apt-get upgrade -y \ + && apt-get autoremove -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +################################################################# +# ____ +# / ___| ___ _ __ ___ _ __ ___ ___ _ __ +# | | / _ \| '_ ` _ \| '_ ` _ \ / _ \| '_ \ +# | |___ (_) | | | | | | | | | | | (_) | | | | +# \____|\___/|_| 
|_| |_|_| |_| |_|\___/|_| |_| +# ___ ____ _ +# |_ _|_ __ ___ __ _ __ _ ___ | _ \ ___ ___(_)_ __ ___ +# | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \ +# | || | | | | | (_| | (_| | __/ | _ < __/ (__| | |_) | __/ +# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___| +# |___/ |_| +################################################################# + +FROM base_image AS common + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ARG PYTHON +ARG PYTHON_VERSION +ARG PYTHON_SHORT_VERSION +ARG PYTORCH_VERSION +ARG TORCHDATA_VERSION +ARG TORCHAUDIO_VERSION +ARG TORCHVISION_VERSION +ARG TORCHTNT_VERSION + +ARG CUDA_VERSION +ARG CUDNN_VERSION +ARG NCCL_VERSION +ARG EFA_VERSION + +ENV CUDA_HOME="/usr/local/cuda" +ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" +ENV LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" +ENV PATH="${CUDA_HOME}/bin:${PATH}" +ENV EFA_PATH="/opt/amazon/efa" +ENV OPEN_MPI_PATH="/opt/amazon/openmpi" + +# Python won’t try to write .pyc or .pyo files on the import of source modules +# Force stdin, stdout and stderr to be totally unbuffered. 
Good for logging +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 + +ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all" + +ENV DLC_CONTAINER_TYPE=training +WORKDIR / + +RUN apt-get update \ + && apt-get -y upgrade --only-upgrade systemd \ + && apt-get install -y --allow-change-held-packages --no-install-recommends \ + automake \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + emacs \ + git \ + jq \ + libcurl4-openssl-dev \ + libglib2.0-0 \ + libgl1-mesa-glx \ + libsm6 \ + libssl-dev \ + libxext6 \ + libxrender-dev \ + zlib1g-dev \ + unzip \ + vim \ + wget \ + cuda-toolkit-12=${CUDA_VERSION}-1 \ + libcudnn9-cuda-12=${CUDNN_VERSION}-1 \ + libcudnn9-dev-cuda-12=${CUDNN_VERSION}-1 \ + libhwloc-dev \ + libgomp1 \ + libibverbs-dev \ + libnuma1 \ + libnuma-dev \ + libtool \ + openssl \ + python3-dev \ + autoconf \ + pkg-config \ + check \ + libsubunit0 \ + libsubunit-dev \ + libbz2-dev \ + libreadline-dev \ + libsqlite3-dev \ + llvm \ + libncurses5-dev \ + libncursesw5-dev \ + xz-utils \ + tk-dev \ + liblzma-dev \ + libffi-dev \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + + # patch nvjpeg to fix CVE +RUN mkdir -p /tmp/nvjpeg \ +&& cd /tmp/nvjpeg \ +&& wget https://developer.download.nvidia.com/compute/cuda/redist/libnvjpeg/linux-x86_64/libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \ +&& tar -xvf libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \ +&& rm -rf /usr/local/cuda/targets/x86_64-linux/lib/libnvjpeg* \ +&& rm -rf /usr/local/cuda/targets/x86_64-linux/include/nvjpeg.h \ +&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/lib/libnvjpeg* /usr/local/cuda/targets/x86_64-linux/lib/ \ +&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/include/* /usr/local/cuda/targets/x86_64-linux/include/ \ +&& rm -rf /tmp/nvjpeg \ +# patch cuobjdump and nvdisasm +&& rm -rf /usr/local/cuda/bin/cuobjdump* \ +&& rm -rf /usr/local/cuda/bin/nvdisasm* + +# For EFA, below flags are needed to install EFA on 
docker image +# -n, --no-verify Skip EFA device verification and test +# -l, --skip-limit-conf Skip EFA limit configuration +# -k, --skip-kmod Skip EFA kmod installation +# start from 0.38.0 EFA now bundles the AWS OFI NCCL plugin, +# which can now be found in /opt/amazon/ofi-nccl/lib/x86_64-linux-gnu rather than the original /opt/aws-ofi-nccl/. +RUN mkdir /tmp/efa \ + && cd /tmp/efa \ + && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ + && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ + && cd aws-efa-installer \ + && apt-get update \ + && ./efa_installer.sh -y --skip-kmod --skip-limit-conf --no-verify \ + && rm -rf /tmp/efa \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +ENV PATH="${OPEN_MPI_PATH}/bin:${EFA_PATH}/bin:${PATH}" +ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${EFA_PATH}/lib:${LD_LIBRARY_PATH}" + +# Configure Open MPI and configure NCCL parameters +RUN mv ${OPEN_MPI_PATH}/bin/mpirun ${OPEN_MPI_PATH}/bin/mpirun.real \ + && echo '#!/bin/bash' > ${OPEN_MPI_PATH}/bin/mpirun \ + && echo "${OPEN_MPI_PATH}/bin/mpirun.real --allow-run-as-root \"\$@\"" >> ${OPEN_MPI_PATH}/bin/mpirun \ + && chmod a+x ${OPEN_MPI_PATH}/bin/mpirun \ + && echo "hwloc_base_binding_policy = none" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf \ + && echo "rmaps_base_mapping_policy = slot" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf \ + && echo NCCL_DEBUG=INFO >> /etc/nccl.conf \ + && echo NCCL_SOCKET_IFNAME=^lo,docker >> /etc/nccl.conf + +# Install OpenSSH for MPI to communicate between containers, allow OpenSSH to talk to containers without asking for confirmation +RUN apt-get update \ + && apt-get install -y --no-install-recommends openssh-client openssh-server \ + && mkdir -p /var/run/sshd \ + && cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \ + && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \ + && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config \ + && rm -rf 
/var/lib/apt/lists/* \ + && apt-get clean + +# Configure OpenSSH so that nodes can communicate with each other +RUN mkdir -p /var/run/sshd \ + && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd + +RUN rm -rf /root/.ssh/ \ + && mkdir -p /root/.ssh/ \ + && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ + && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ + && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config + +# install python +RUN cd /tmp/ \ +&& wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \ +&& tar xzf Python-${PYTHON_VERSION}.tgz \ +&& cd Python-${PYTHON_VERSION} \ +&& ./configure --enable-optimizations --with-lto --with-computed-gotos --with-system-ffi \ +&& make -j "$(nproc)" \ +&& make altinstall \ +&& cd .. \ +&& rm -rf Python-${PYTHON_VERSION} \ +&& rm Python-${PYTHON_VERSION}.tgz \ +&& ln -s /usr/local/bin/python${PYTHON_SHORT_VERSION} /usr/local/bin/python \ +&& ln -s /usr/local/bin/python${PYTHON_SHORT_VERSION} /usr/local/bin/python3 \ +# This installation generate a .python_history file in the root directory leads sanity check to fail +&& rm -f /root/.python_history + +# Python Path +ENV PATH="/usr/local/bin:${PATH}" + +# this will add pip systemlink to pip${PYTHON_SHORT_VERSION} +RUN python -m pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org + +# Install common conda packages +RUN pip install --no-cache-dir \ + cython \ + cryptography \ + pyOpenSSL \ + pybind11 \ + mkl \ + mkl-include \ + parso \ + typing \ + charset-normalizer \ + packaging \ + boto3 \ + PyYAML \ + numpy \ + scipy \ + click \ + psutil \ + ipython \ + ipykernel \ + pillow \ + h5py \ + fsspec \ + "idna>=3.7" \ + "tqdm>=4.66.3" \ + "requests>=2.32.0" \ + "setuptools>=70.0.0" \ + "urllib3>=2.5.0" \ + "awscli<2" \ + ninja \ + # pencv-python 4.12.0.88 reuqires numpy<2.3.0, which is not compatible with previous prod image(2.3.1) + 
opencv-python==4.11.0.86 \ + mpi4py \ + jinja2>=3.1.6 \ + tornado>=6.5.1 + +# Install PyTorch +RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ + torchvision==${TORCHVISION_VERSION} \ + torchaudio==${TORCHAUDIO_VERSION} \ + --index-url https://download.pytorch.org/whl/cu128 \ + && pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \ + torchdata==${TORCHDATA_VERSION} \ + triton \ + s3torchconnector \ + fastai==2.8.2 \ + accelerate \ + # pin numpy requirement for fastai dependency + # requires explicit declaration of spacy, thic, blis + spacy \ + #thinc 8.3.6 is not compatible with numpy 1.26.4 (sagemaker doesn't support latest numpy) + thinc==8.3.4 \ + blis \ + numpy \ + && pip uninstall -y dataclasses + +RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.7/license.txt + +COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py + +RUN chmod +x /usr/local/bin/deep_learning_container.py + +COPY start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh +RUN chmod +x /usr/local/bin/start_cuda_compat.sh + +COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh +RUN chmod +x /usr/local/bin/bash_telemetry.sh +RUN echo 'source /usr/local/bin/bash_telemetry.sh' >> /etc/bash.bashrc + +# Removing the cache as it is needed for security verification +RUN rm -rf /root/.cache | true + +######################################################## +# _____ ____ ____ ___ +# | ____/ ___|___ \ |_ _|_ __ ___ __ _ __ _ ___ +# | _|| | __) | | || '_ ` _ \ / _` |/ _` |/ _ \ +# | |__| |___ / __/ | || | | | | | (_| | (_| | __/ +# |_____\____|_____| |___|_| |_| |_|\__,_|\__, |\___| +# |___/ +# ____ _ +# | _ \ ___ ___(_)_ __ ___ +# | |_) / _ \/ __| | '_ \ / _ \ +# | _ < __/ (__| | |_) | __/ +# |_| \_\___|\___|_| .__/ \___| +# |_| +######################################################## + +FROM common AS ec2 + +ARG PYTHON +ARG PYTHON_SHORT_VERSION +ARG NCCL_VERSION +ARG GDRCOPY_VERSION +ARG TE_VERSION +ARG FLASH_ATTN_VERSION + 
+WORKDIR / + + +# Install GDRCopy which is a dependency of SM Distributed DataParallel binary +# The test binaries requires cuda driver library which could be found in conda +# So update the linker path to point to it to avoid -Lcuda not found +RUN cd /tmp \ + && git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \ + && cd gdrcopy \ + && sed -ie '12s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \ + && CUDA=${CUDA_HOME} make install \ + && rm -rf /tmp/gdrcopy + +# Install NCCL +RUN cd /tmp \ + && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION}-1 \ + && cd nccl \ + && make -j64 src.build BUILDDIR=/usr/local \ + && rm -rf /tmp/nccl + +# Install flash attn and NVIDIA transformer engine. +# Optionally set NVTE_FRAMEWORK to avoid bringing in additional frameworks during TE install +ENV NVTE_FRAMEWORK=pytorch +# Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features +# Set MAX_JOBS=4 to avoid OOM issues in installation process +RUN MAX_JOBS=4 pip install --no-cache-dir flash-attn==${FLASH_ATTN_VERSION} --no-build-isolation +# Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html +RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v${TE_VERSION} --no-build-isolation + +COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh +RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh + +RUN HOME_DIR=/root \ + && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ + && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ + && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ + && chmod +x /usr/local/bin/testOSSCompliance \ + && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ + && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ + && rm -rf 
${HOME_DIR}/oss_compliance* \ + && rm -rf /tmp/tmp* + +# Removing the cache as it is needed for security verification +RUN rm -rf /root/.cache | true + +ENTRYPOINT ["bash", "-m", "dockerd_entrypoint.sh"] +CMD ["/bin/bash"] + + +################################################################# +# ____ __ __ _ +# / ___| __ _ __ _ ___| \/ | __ _| | _____ _ __ +# \___ \ / _` |/ _` |/ _ \ |\/| |/ _` | |/ / _ \ '__| +# ___) | (_| | (_| | __/ | | | (_| | < __/ | +# |____/ \__,_|\__, |\___|_| |_|\__,_|_|\_\___|_| +# |___/ +# ___ ____ _ +# |_ _|_ __ ___ __ _ __ _ ___ | _ \ ___ ___(_)_ __ ___ +# | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \ +# | || | | | | | (_| | (_| | __/ | _ < __/ (__| | |_) | __/ +# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___| +# |___/ |_| +################################################################# + +FROM common AS sagemaker + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main + +ARG PYTHON +ARG PYTHON_SHORT_VERSION +ARG NCCL_VERSION +ARG GDRCOPY_VERSION +ARG TE_VERSION +ARG FLASH_ATTN_VERSION + +WORKDIR / + +# Install GDRCopy which is a dependency of SM Distributed DataParallel binary +# The test binaries requires cuda driver library which could be found in conda +# So update the linker path to point to it to avoid -Lcuda not found +RUN cd /tmp \ + && git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \ + && cd gdrcopy \ + && sed -ie '12s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \ + && CUDA=${CUDA_HOME} make install \ + && rm -rf /tmp/gdrcopy + +# Install NCCL +RUN cd /tmp \ + && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION}-1 \ + && cd nccl \ + && make -j64 src.build BUILDDIR=/usr/local \ + && rm -rf /tmp/nccl + +RUN pip uninstall -y ninja && pip install ninja + +# Install flash attn and NVIDIA transformer engine. 
+# Optionally set NVTE_FRAMEWORK to avoid bringing in additional frameworks during TE install +ENV NVTE_FRAMEWORK=pytorch +# Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features +# Set MAX_JOBS=4 to avoid OOM issues in installation process +RUN MAX_JOBS=4 pip install --no-cache-dir flash-attn==${FLASH_ATTN_VERSION} --no-build-isolation +# Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html +RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v${TE_VERSION} --no-build-isolation + +# Install SM packages +RUN pip install --no-cache-dir -U \ + smclarify \ + "sagemaker>=2,<3" \ + "sagemaker-experiments<1" \ + sagemaker-pytorch-training \ + sagemaker-training + +# Install extra packages +RUN pip install --no-cache-dir -U \ + bokeh \ + imageio \ + numba \ + pandas \ + plotly \ + shap \ + scikit-learn \ + seaborn \ + # pinned for sagemaker==2.233.0 + cloudpickle + +RUN HOME_DIR=/root \ + && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ + && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ + && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ + && chmod +x /usr/local/bin/testOSSCompliance \ + && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ + && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ + && rm -rf ${HOME_DIR}/oss_compliance* \ + && rm -rf /tmp/tmp* + +# Removing the cache as it is needed for security verification +RUN rm -rf /root/.cache | true + +# Copy workaround script for incorrect hostname +COPY changehostname.c / +COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh +RUN chmod +x /usr/local/bin/start_with_right_hostname.sh + +ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"] +CMD ["/bin/bash"] From 
ac530dd87f4e54d22bd9f7c14933ba086745eb07 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Sun, 10 Aug 2025 16:52:52 -0700 Subject: [PATCH 02/16] fix typo --- dlc_developer_config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index f8a4b4214d7d..cd7d59b57a9c 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -119,7 +119,7 @@ use_scheduler = false ### TRAINING PR JOBS ### # Standard Framework Training -dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-ec2.yml"" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-ec2.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" From 1793c2a0913798514de517db690ba02c5c56845b Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Sun, 10 Aug 2025 18:51:25 -0700 Subject: [PATCH 03/16] increase cpu image baseline size, add libcudnn9-dev-cuda-12 dependency --- pytorch/training/buildspec-2-8-ec2.yml | 2 +- pytorch/training/buildspec-2-8-sm.yml | 2 +- pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pytorch/training/buildspec-2-8-ec2.yml b/pytorch/training/buildspec-2-8-ec2.yml index 07bc4e559755..e7222c4ba584 100644 --- a/pytorch/training/buildspec-2-8-ec2.yml +++ b/pytorch/training/buildspec-2-8-ec2.yml @@ -41,7 +41,7 @@ images: BuildEC2CPUPTTrainPy3DockerImage: <<: *TRAINING_REPOSITORY build: &PYTORCH_CPU_TRAINING_PY3 false - image_size_baseline: 6500 + image_size_baseline: 7200 device_type: &DEVICE_TYPE cpu python_version: &DOCKER_PYTHON_VERSION py3 tag_python_version: &TAG_PYTHON_VERSION py312 diff --git a/pytorch/training/buildspec-2-8-sm.yml b/pytorch/training/buildspec-2-8-sm.yml index a29f64b2a761..bfb7c6195d5a 100644 --- a/pytorch/training/buildspec-2-8-sm.yml +++ b/pytorch/training/buildspec-2-8-sm.yml @@ -41,7 +41,7 @@ images: BuildSageMakerCPUPTTrainPy3DockerImage: <<: *TRAINING_REPOSITORY build: &PYTORCH_CPU_TRAINING_PY3 false - 
image_size_baseline: 6500 + image_size_baseline: 7200 device_type: &DEVICE_TYPE cpu python_version: &DOCKER_PYTHON_VERSION py3 tag_python_version: &TAG_PYTHON_VERSION py312 diff --git a/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu index cbd2a01401f0..3e60f8b895ad 100644 --- a/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu @@ -105,6 +105,7 @@ RUN apt-get update \ cuda-toolkit-12=${CUDA_VERSION}-1 \ libcudnn9-cuda-12=${CUDNN_VERSION}-1 \ libcudnn9-dev-cuda-12=${CUDNN_VERSION}-1 \ + libcudnn9-headers-cuda-12=${CUDNN_VERSION}-1 \ libhwloc-dev \ libgomp1 \ libibverbs-dev \ From f94d72d97f384e0479d4871ccb075d023c49fb31 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Sun, 10 Aug 2025 22:56:44 -0700 Subject: [PATCH 04/16] add test and update dockerfiles --- .../training/docker/2.8/py3/Dockerfile.cpu | 2 +- .../docker/2.8/py3/cu128/Dockerfile.gpu | 2 +- test/dlc_tests/conftest.py | 7 +- .../training/test_pytorch_training_2_8.py | 137 ++++++++++++++++++ 4 files changed, 143 insertions(+), 5 deletions(-) create mode 100644 test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_8.py diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu index 2577f237b6f3..9399ede0d55d 100644 --- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu @@ -190,7 +190,7 @@ RUN pip install --no-cache-dir \ jinja2>=3.1.6 \ tornado>=6.5.1 -RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.7/license.txt +RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.8/license.txt COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py diff --git a/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu index 3e60f8b895ad..b26cd87ce89e 100644 --- 
a/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu @@ -274,7 +274,7 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ numpy \ && pip uninstall -y dataclasses -RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.7/license.txt +RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.8/license.txt COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py diff --git a/test/dlc_tests/conftest.py b/test/dlc_tests/conftest.py index 788057ad4b9e..7f2d8651c1a3 100644 --- a/test/dlc_tests/conftest.py +++ b/test/dlc_tests/conftest.py @@ -55,6 +55,7 @@ # ECR repo name fixtures # PyTorch "pytorch_training", + "pytorch_training___2__8", "pytorch_training___2__7", "pytorch_training___2__6", "pytorch_training___2__5", @@ -942,7 +943,7 @@ def skip_smdebug_v1_test(request): ">=2.1,<2.4": ["cpu", "cu121"], ">=2.4,<2.6": ["cpu", "cu124"], ">=2.6,<2.7.1": ["cpu", "cu126"], - ">=2.7.1,<2.8": ["cpu", "cu128"], + ">=2.7.1,<=2.8": ["cpu", "cu128"], } if _validate_pytorch_framework_version(request, image_uri, "skip_smdebug_v1_test", skip_dict): pytest.skip(f"SM Profiler v1 is on path for deprecation, skipping test") @@ -966,7 +967,7 @@ def skip_dgl_test(request): ">=2.1,<2.4": ["cpu", "cu121"], ">=2.4,<2.6": ["cpu", "cu124"], ">=2.6,<2.7.1": ["cpu", "cu126"], - ">=2.7.1,<2.8": ["cpu", "cu128"], + ">=2.7.1,<=2.8": ["cpu", "cu128"], } if _validate_pytorch_framework_version(request, image_uri, "skip_dgl_test", skip_dict): pytest.skip(f"DGL binaries are removed, skipping test") @@ -1031,7 +1032,7 @@ def skip_serialized_release_pt_test(request): ">=2.1,<2.4": ["cpu", "cu121"], ">=2.4,<2.6": ["cpu", "cu124"], ">=2.6,<2.7.1": ["cpu", "cu126"], - ">=2.7.1,<2.8": ["cpu", "cu128"], + ">=2.7.1,<=2.8": ["cpu", "cu128"], } if _validate_pytorch_framework_version( request, image_uri, "skip_serialized_release_pt_test", skip_dict diff --git 
a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_8.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_8.py new file mode 100644 index 000000000000..ec2c00a6ef14 --- /dev/null +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_8.py @@ -0,0 +1,137 @@ +import pytest + +import test.test_utils as test_utils + +from test.test_utils import ec2 + +from test.dlc_tests.ec2.pytorch.training import common_cases +from test.dlc_tests.ec2 import smclarify_cases + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("pytorch_gpu_tests") +@pytest.mark.model("N/A") +@pytest.mark.team("conda") +@pytest.mark.parametrize( + "ec2_instance_type, region", common_cases.PT_EC2_GPU_INSTANCE_TYPE_AND_REGION, indirect=True +) +def test_pytorch_2_8_gpu( + pytorch_training___2__8, ec2_connection, region, gpu_only, ec2_instance_type +): + pytorch_training = pytorch_training___2__8 + if test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type): + pytest.skip( + f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}" + ) + + test_cases = [ + (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)), + (common_cases.pytorch_linear_regression_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), + (common_cases.pytorch_nccl, (pytorch_training, ec2_connection)), + (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), + (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)), + (common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_telemetry_bashrc_gpu, (pytorch_training, ec2_connection)), + 
(common_cases.pytorch_telemetry_entrypoint_gpu, (pytorch_training, ec2_connection)), + ] + + if "sagemaker" in pytorch_training: + test_cases.append( + (smclarify_cases.smclarify_metrics_gpu, (pytorch_training, ec2_connection)), + ) + + # AMP must be run on multi_gpu + if ec2.is_instance_multi_gpu(ec2_instance_type): + test_cases.append((common_cases.pytorch_amp, (pytorch_training, ec2_connection))) + + test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.8 GPU") + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("pytorch_gpu_heavy_tests") +@pytest.mark.model("N/A") +@pytest.mark.team("conda") +@pytest.mark.parametrize( + "ec2_instance_type, region", + common_cases.PT_EC2_HEAVY_GPU_INSTANCE_TYPE_AND_REGION, + indirect=True, +) +@pytest.mark.skipif( + test_utils.is_pr_context() and not ec2.are_heavy_instance_ec2_tests_enabled(), + reason="Skip GPU Heavy tests in PR context unless explicitly enabled", +) +def test_pytorch_2_8_gpu_heavy( + pytorch_training___2__8, ec2_connection, region, gpu_only, ec2_instance_type +): + pytorch_training = pytorch_training___2__8 + if test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type): + pytest.skip( + f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}" + ) + + test_cases = [ + (common_cases.pytorch_gdrcopy, (pytorch_training, ec2_connection)), + (common_cases.pytorch_transformer_engine, (pytorch_training, ec2_connection)), + ] + + test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.8 GPU Heavy") + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("inductor") +@pytest.mark.model("N/A") +@pytest.mark.team("training-compiler") +@pytest.mark.parametrize( + "ec2_instance_type, region", + common_cases.PT_EC2_GPU_INDUCTOR_INSTANCE_TYPE_AND_REGION, + indirect=True, +) +def test_pytorch_2_8_gpu_inductor( + pytorch_training___2__8, ec2_connection, region, gpu_only, ec2_instance_type +): + 
pytorch_training = pytorch_training___2__8 + if test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type): + pytest.skip( + f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}" + ) + + test_cases = [ + (common_cases.pytorch_gloo_inductor_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_mpi_inductor_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_nccl_inductor, (pytorch_training, ec2_connection)), + (common_cases.pytorch_amp_inductor, (pytorch_training, ec2_connection)), + ] + + test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.8 GPU Inductor") + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("pytorch_cpu_tests") +@pytest.mark.model("N/A") +@pytest.mark.team("conda") +@pytest.mark.parametrize("ec2_instance_type", common_cases.PT_EC2_CPU_INSTANCE_TYPE, indirect=True) +def test_pytorch_2_8_cpu(pytorch_training___2__8, ec2_connection, cpu_only): + pytorch_training = pytorch_training___2__8 + + test_cases = [ + (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)), + (common_cases.pytorch_linear_regression_cpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), + (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), + (common_cases.pytorch_telemetry_bashrc_cpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_telemetry_entrypoint_cpu, (pytorch_training, ec2_connection)), + ] + + if "sagemaker" in pytorch_training: + test_cases += [ + (smclarify_cases.smclarify_metrics_cpu, (pytorch_training, ec2_connection)), + ] + + test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.8 CPU") From 
4e7caeac3a15907ae18916d82af462a5e6e96485 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Mon, 11 Aug 2025 15:07:44 -0700 Subject: [PATCH 05/16] bump fastai version and change CUDA version --- pytorch/training/docker/2.8/py3/Dockerfile.cpu | 4 ++-- pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu index 9399ede0d55d..87fd09ac6513 100644 --- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu @@ -238,7 +238,7 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ && pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \ torchdata==${TORCHDATA_VERSION} \ s3torchconnector \ - fastai==2.8.2 \ + fastai==2.8.3 \ accelerate \ # pin numpy requirement for fastai dependency # requires explicit declaration of spacy, thic, blis @@ -309,7 +309,7 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ && pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \ torchdata==${TORCHDATA_VERSION} \ s3torchconnector \ - fastai==2.8.2 \ + fastai==2.8.3 \ accelerate \ # pin numpy requirement for fastai dependency # requires explicit declaration of spacy, thic, blis diff --git a/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu index b26cd87ce89e..5c912c339de5 100644 --- a/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu @@ -7,7 +7,7 @@ ARG TORCHAUDIO_VERSION=2.8.0 ARG TORCHVISION_VERSION=0.23.0 ARG TORCHDATA_VERSION=0.11.0 -ARG CUDA_VERSION=12.8.1 +ARG CUDA_VERSION=12.9.1 ARG CUDNN_VERSION=9.10.2.21 ARG NCCL_VERSION=2.27.3 ARG EFA_VERSION=1.43.1 @@ -15,7 +15,7 @@ ARG GDRCOPY_VERSION=2.5 ARG TE_VERSION=2.5 ARG FLASH_ATTN_VERSION=2.8.2 -FROM nvidia/cuda:12.8.1-base-ubuntu22.04 AS base_image +FROM 
nvidia/cuda:12.9.1-base-ubuntu22.04 AS base_image # This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20 ENV DEBIAN_FRONTEND=noninteractive @@ -258,12 +258,12 @@ RUN pip install --no-cache-dir \ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ torchvision==${TORCHVISION_VERSION} \ torchaudio==${TORCHAUDIO_VERSION} \ - --index-url https://download.pytorch.org/whl/cu128 \ + --index-url https://download.pytorch.org/whl/cu129 \ && pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \ torchdata==${TORCHDATA_VERSION} \ triton \ s3torchconnector \ - fastai==2.8.2 \ + fastai==2.8.3 \ accelerate \ # pin numpy requirement for fastai dependency # requires explicit declaration of spacy, thic, blis From 26fd8639c7e2b3615f9065c24cc0a233b85e1535 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Mon, 11 Aug 2025 16:18:08 -0700 Subject: [PATCH 06/16] update buildspecs and confest to cu129 --- pytorch/training/buildspec-2-8-ec2.yml | 4 ++-- pytorch/training/buildspec-2-8-sm.yml | 2 +- .../docker/2.8/py3/{cu128 => cu129}/Dockerfile.gpu | 0 test/dlc_tests/conftest.py | 9 ++++++--- 4 files changed, 9 insertions(+), 6 deletions(-) rename pytorch/training/docker/2.8/py3/{cu128 => cu129}/Dockerfile.gpu (100%) diff --git a/pytorch/training/buildspec-2-8-ec2.yml b/pytorch/training/buildspec-2-8-ec2.yml index e7222c4ba584..023d84b6867d 100644 --- a/pytorch/training/buildspec-2-8-ec2.yml +++ b/pytorch/training/buildspec-2-8-ec2.yml @@ -53,14 +53,14 @@ images: target: ec2 context: <<: *TRAINING_CONTEXT - BuildEC2GPUPTTrainPy3cu128DockerImage: + BuildEC2GPUPTTrainPy3cu129DockerImage: <<: *TRAINING_REPOSITORY build: &PYTORCH_GPU_TRAINING_PY3 false image_size_baseline: 24000 device_type: &DEVICE_TYPE gpu python_version: &DOCKER_PYTHON_VERSION py3 tag_python_version: &TAG_PYTHON_VERSION py312 - cuda_version: &CUDA_VERSION cu128 + cuda_version: &CUDA_VERSION cu129 os_version: &OS_VERSION ubuntu22.04 tag: !join [ *VERSION, 
"-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] diff --git a/pytorch/training/buildspec-2-8-sm.yml b/pytorch/training/buildspec-2-8-sm.yml index bfb7c6195d5a..aa7372fb0ad5 100644 --- a/pytorch/training/buildspec-2-8-sm.yml +++ b/pytorch/training/buildspec-2-8-sm.yml @@ -60,7 +60,7 @@ images: device_type: &DEVICE_TYPE gpu python_version: &DOCKER_PYTHON_VERSION py3 tag_python_version: &TAG_PYTHON_VERSION py312 - cuda_version: &CUDA_VERSION cu128 + cuda_version: &CUDA_VERSION cu129 os_version: &OS_VERSION ubuntu22.04 tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] diff --git a/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu similarity index 100% rename from pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu rename to pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu diff --git a/test/dlc_tests/conftest.py b/test/dlc_tests/conftest.py index 7f2d8651c1a3..f18a289e5f94 100644 --- a/test/dlc_tests/conftest.py +++ b/test/dlc_tests/conftest.py @@ -943,7 +943,8 @@ def skip_smdebug_v1_test(request): ">=2.1,<2.4": ["cpu", "cu121"], ">=2.4,<2.6": ["cpu", "cu124"], ">=2.6,<2.7.1": ["cpu", "cu126"], - ">=2.7.1,<=2.8": ["cpu", "cu128"], + ">=2.7.1,<2.8": ["cpu", "cu128"], + ">=2.8,<2.9": ["cpu", "cu129"], } if _validate_pytorch_framework_version(request, image_uri, "skip_smdebug_v1_test", skip_dict): pytest.skip(f"SM Profiler v1 is on path for deprecation, skipping test") @@ -967,7 +968,8 @@ def skip_dgl_test(request): ">=2.1,<2.4": ["cpu", "cu121"], ">=2.4,<2.6": ["cpu", "cu124"], ">=2.6,<2.7.1": ["cpu", "cu126"], - ">=2.7.1,<=2.8": ["cpu", 
"cu128"], + ">=2.7.1,<2.8": ["cpu", "cu128"], + ">=2.8,<2.9": ["cpu", "cu129"], } if _validate_pytorch_framework_version(request, image_uri, "skip_dgl_test", skip_dict): pytest.skip(f"DGL binaries are removed, skipping test") @@ -1032,7 +1034,8 @@ def skip_serialized_release_pt_test(request): ">=2.1,<2.4": ["cpu", "cu121"], ">=2.4,<2.6": ["cpu", "cu124"], ">=2.6,<2.7.1": ["cpu", "cu126"], - ">=2.7.1,<=2.8": ["cpu", "cu128"], + ">=2.7.1,<2.8": ["cpu", "cu128"], + ">=2.8,<2.9": ["cpu", "cu129"], } if _validate_pytorch_framework_version( request, image_uri, "skip_serialized_release_pt_test", skip_dict From b5cde7048642addb9ed0e00cd87dd3757f681b88 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Mon, 11 Aug 2025 20:29:13 -0700 Subject: [PATCH 07/16] modify dockerfile gpu and add logging in efa test --- .../training/docker/2.8/py3/cu129/Dockerfile.gpu | 1 - test/dlc_tests/ec2/test_efa.py | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index 5c912c339de5..ae14ae2032ee 100644 --- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -105,7 +105,6 @@ RUN apt-get update \ cuda-toolkit-12=${CUDA_VERSION}-1 \ libcudnn9-cuda-12=${CUDNN_VERSION}-1 \ libcudnn9-dev-cuda-12=${CUDNN_VERSION}-1 \ - libcudnn9-headers-cuda-12=${CUDNN_VERSION}-1 \ libhwloc-dev \ libgomp1 \ libibverbs-dev \ diff --git a/test/dlc_tests/ec2/test_efa.py b/test/dlc_tests/ec2/test_efa.py index 9543d783f21c..4725950dccd8 100644 --- a/test/dlc_tests/ec2/test_efa.py +++ b/test/dlc_tests/ec2/test_efa.py @@ -376,9 +376,14 @@ def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, inst ) else: # Configure MPI hosts file with IP addresses and slots for worker nodes + # TODO: remove debug logging after testing + LOGGER.info(f"Creating hosts file with master_ip={master_ip}, slots={slots}") + 
LOGGER.info(f"Worker IPs: {worker_instance_private_ips}") + hosts_string = f"localhost slots={slots} " for worker_ip in worker_instance_private_ips: hosts_string += f"\n{worker_ip} slots={slots} " + LOGGER.info(f"Final hosts file content:\n{hosts_string}") run_cmd_on_container( MASTER_CONTAINER_NAME, @@ -386,6 +391,16 @@ def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, inst f"""echo -e "{hosts_string}" > {HOSTS_FILE_LOCATION}""", ) + # TODO: remove debug logging after testing + # check to make sure file was created + LOGGER.info("Verifying hosts file creation:") + run_cmd_on_container( + MASTER_CONTAINER_NAME, + master_connection, + f"ls -l {HOSTS_FILE_LOCATION} && cat {HOSTS_FILE_LOCATION}", + hide=False + ) + def _setup_worker_efa_ssh_config(connection, master_pub_key): """ From d2632edc3665e520dfe788b8a59085f938b48349 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Mon, 11 Aug 2025 21:13:44 -0700 Subject: [PATCH 08/16] add back libcudnn9-headers-cuda-12 --- pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index ae14ae2032ee..5c912c339de5 100644 --- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -105,6 +105,7 @@ RUN apt-get update \ cuda-toolkit-12=${CUDA_VERSION}-1 \ libcudnn9-cuda-12=${CUDNN_VERSION}-1 \ libcudnn9-dev-cuda-12=${CUDNN_VERSION}-1 \ + libcudnn9-headers-cuda-12=${CUDNN_VERSION}-1 \ libhwloc-dev \ libgomp1 \ libibverbs-dev \ From ca1d99b9765aed96ab9eae3eb5c6c05f217836b9 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Tue, 12 Aug 2025 17:25:18 -0700 Subject: [PATCH 09/16] remove version pins, update cudnn header path check, change efa test logging --- .../training/docker/2.8/py3/Dockerfile.cpu | 15 ++++---- .../docker/2.8/py3/cu129/Dockerfile.gpu | 14 ++++---- 
.../ec2/pytorch/training/common_cases.py | 36 ++++++++++++++++--- test/dlc_tests/ec2/test_efa.py | 1 + 4 files changed, 46 insertions(+), 20 deletions(-) diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu index 87fd09ac6513..141a253463c9 100644 --- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu @@ -184,8 +184,8 @@ RUN pip install --no-cache-dir \ "requests>=2.32.0" \ "setuptools>=70.0.0" \ "urllib3>=2.5.0" \ - "awscli<2" \ - "opencv-python==4.11.0.86" \ + "awscli" \ + "opencv-python" \ mpi4py \ jinja2>=3.1.6 \ tornado>=6.5.1 @@ -238,13 +238,12 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ && pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \ torchdata==${TORCHDATA_VERSION} \ s3torchconnector \ - fastai==2.8.3 \ + fastai \ accelerate \ # pin numpy requirement for fastai dependency # requires explicit declaration of spacy, thic, blis spacy \ - #thinc 8.3.6 is not compatible with numpy 1.26.4 (sagemaker doesn't support latest numpy) - thinc==8.3.4 \ + thinc \ blis \ numpy \ && pip uninstall -y dataclasses @@ -309,7 +308,7 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ && pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \ torchdata==${TORCHDATA_VERSION} \ s3torchconnector \ - fastai==2.8.3 \ + fastai \ accelerate \ # pin numpy requirement for fastai dependency # requires explicit declaration of spacy, thic, blis @@ -322,8 +321,8 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ # Install SM packages RUN pip install --no-cache-dir -U \ smclarify \ - "sagemaker>=2,<3" \ - "sagemaker-experiments<1" \ + "sagemaker>=2" \ + sagemaker-experiments \ sagemaker-pytorch-training \ sagemaker-training diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index 5c912c339de5..eada47ca9e51 100644 --- 
a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -246,10 +246,9 @@ RUN pip install --no-cache-dir \ "requests>=2.32.0" \ "setuptools>=70.0.0" \ "urllib3>=2.5.0" \ - "awscli<2" \ + "awscli" \ ninja \ - # pencv-python 4.12.0.88 reuqires numpy<2.3.0, which is not compatible with previous prod image(2.3.1) - opencv-python==4.11.0.86 \ + opencv-python \ mpi4py \ jinja2>=3.1.6 \ tornado>=6.5.1 @@ -263,13 +262,12 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ torchdata==${TORCHDATA_VERSION} \ triton \ s3torchconnector \ - fastai==2.8.3 \ + fastai \ accelerate \ # pin numpy requirement for fastai dependency # requires explicit declaration of spacy, thic, blis spacy \ - #thinc 8.3.6 is not compatible with numpy 1.26.4 (sagemaker doesn't support latest numpy) - thinc==8.3.4 \ + thinc \ blis \ numpy \ && pip uninstall -y dataclasses @@ -425,8 +423,8 @@ RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.g # Install SM packages RUN pip install --no-cache-dir -U \ smclarify \ - "sagemaker>=2,<3" \ - "sagemaker-experiments<1" \ + "sagemaker>=2" \ + sagemaker-experiments \ sagemaker-pytorch-training \ sagemaker-training diff --git a/test/dlc_tests/ec2/pytorch/training/common_cases.py b/test/dlc_tests/ec2/pytorch/training/common_cases.py index 8f8cc7cc03e3..cd58a03eca10 100644 --- a/test/dlc_tests/ec2/pytorch/training/common_cases.py +++ b/test/dlc_tests/ec2/pytorch/training/common_cases.py @@ -1,4 +1,6 @@ import os +import sys +import logging from packaging.version import Version from packaging.specifiers import SpecifierSet @@ -19,6 +21,10 @@ get_efa_ec2_instance_type, ) +LOGGER = logging.getLogger(__name__) +LOGGER.setLevel(logging.INFO) +LOGGER.addHandler(logging.StreamHandler(sys.stderr)) + # Test functions PT_STANDALONE_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testPyTorchStandalone") PT_MNIST_CMD = os.path.join(CONTAINER_TESTS_PREFIX, 
"pytorch_tests", "testPyTorch") @@ -351,6 +357,7 @@ def pytorch_cudnn_match_gpu(pytorch_training, ec2_connection, region): """ Test cuDNN Package PT 2.1 reintroduces a dependency on CUDNN to support NVDA TransformerEngine. This test is to ensure that torch CUDNN matches system CUDNN in the container. + Checks both /usr/include/ and /usr/include/x86_64-linux-gnu/ paths to support different cuDNN package installations. """ container_name = "pytorch_cudnn" account_id = get_account_id_from_image_uri(pytorch_training) @@ -360,9 +367,30 @@ def pytorch_cudnn_match_gpu(pytorch_training, ec2_connection, region): f"docker run --runtime=nvidia --gpus all --name {container_name} -itd {pytorch_training}", hide=True, ) - major_cmd = 'cat /usr/include/cudnn_version.h | grep "#define CUDNN_MAJOR"' - minor_cmd = 'cat /usr/include/cudnn_version.h | grep "#define CUDNN_MINOR"' - patch_cmd = 'cat /usr/include/cudnn_version.h | grep "#define CUDNN_PATCHLEVEL"' + + cudnn_paths = [ + "/usr/include/cudnn_version.h", + "/usr/include/x86_64-linux-gnu/cudnn_version.h" + ] + + for path in cudnn_paths: + check_cmd = f"[ -f {path} ] && echo 'Found'" + result = ec2_connection.run( + f"docker exec --user root {container_name} bash -c '{check_cmd}'", + hide=True, + warn=True + ) + if result.ok and result.stdout.strip() == 'Found': + cudnn_path = path + LOGGER.info(f"Found cuDNN header at: {cudnn_path}") + break + else: + raise FileNotFoundError("Could not find cudnn_version.h in any standard location") + + major_cmd = f'cat {cudnn_path} | grep "#define CUDNN_MAJOR"' + minor_cmd = f'cat {cudnn_path} | grep "#define CUDNN_MINOR"' + patch_cmd = f'cat {cudnn_path} | grep "#define CUDNN_PATCHLEVEL"' + major = ec2_connection.run( f"docker exec --user root {container_name} bash -c '{major_cmd}'", hide=True ).stdout.split()[-1] @@ -385,7 +413,7 @@ def pytorch_cudnn_match_gpu(pytorch_training, ec2_connection, region): assert ( system_cudnn == cudnn_from_torch - ), f"System CUDNN {system_cudnn} and torch 
cudnn {cudnn_from_torch} do not match. Please downgrade system CUDNN or recompile torch with correct CUDNN verson." + ), f"System CUDNN {system_cudnn} (from {cudnn_path}) and torch cudnn {cudnn_from_torch} do not match. Please downgrade system CUDNN or recompile torch with correct CUDNN verson." def pytorch_curand_gpu(pytorch_training, ec2_connection): diff --git a/test/dlc_tests/ec2/test_efa.py b/test/dlc_tests/ec2/test_efa.py index 4725950dccd8..faa193025fcc 100644 --- a/test/dlc_tests/ec2/test_efa.py +++ b/test/dlc_tests/ec2/test_efa.py @@ -377,6 +377,7 @@ def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, inst else: # Configure MPI hosts file with IP addresses and slots for worker nodes # TODO: remove debug logging after testing + master_ip = ec2_utils.get_private_ip(master_connection.host, region) LOGGER.info(f"Creating hosts file with master_ip={master_ip}, slots={slots}") LOGGER.info(f"Worker IPs: {worker_instance_private_ips}") From 4079253b43ab9696a1f54fd8fe6fba0cea7addeb Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Tue, 12 Aug 2025 21:25:08 -0700 Subject: [PATCH 10/16] pin opencv-python version and simplify efa test logging --- pytorch/training/docker/2.8/py3/Dockerfile.cpu | 3 ++- .../training/docker/2.8/py3/cu129/Dockerfile.gpu | 2 +- test/dlc_tests/ec2/test_efa.py | 14 +++++--------- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu index 141a253463c9..7d78611d3290 100644 --- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu @@ -185,7 +185,8 @@ RUN pip install --no-cache-dir \ "setuptools>=70.0.0" \ "urllib3>=2.5.0" \ "awscli" \ - "opencv-python" \ + # opencv-python 4.12.0.88 reuqires numpy<2.3.0, which is not compatible with previous prod image(2.3.1) + opencv-python==4.11.0.86 \ mpi4py \ jinja2>=3.1.6 \ tornado>=6.5.1 diff --git 
a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index eada47ca9e51..90ffb7f565b2 100644 --- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -248,7 +248,7 @@ RUN pip install --no-cache-dir \ "urllib3>=2.5.0" \ "awscli" \ ninja \ - opencv-python \ + opencv-python==4.11.0.86 \ mpi4py \ jinja2>=3.1.6 \ tornado>=6.5.1 diff --git a/test/dlc_tests/ec2/test_efa.py b/test/dlc_tests/ec2/test_efa.py index faa193025fcc..53ef210d4bdc 100644 --- a/test/dlc_tests/ec2/test_efa.py +++ b/test/dlc_tests/ec2/test_efa.py @@ -376,15 +376,12 @@ def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, inst ) else: # Configure MPI hosts file with IP addresses and slots for worker nodes - # TODO: remove debug logging after testing - master_ip = ec2_utils.get_private_ip(master_connection.host, region) - LOGGER.info(f"Creating hosts file with master_ip={master_ip}, slots={slots}") - LOGGER.info(f"Worker IPs: {worker_instance_private_ips}") - hosts_string = f"localhost slots={slots} " for worker_ip in worker_instance_private_ips: hosts_string += f"\n{worker_ip} slots={slots} " - LOGGER.info(f"Final hosts file content:\n{hosts_string}") + + # TODO: remove logging + LOGGER.info(f"Attempting to create hosts file with content:\n{hosts_string}") run_cmd_on_container( MASTER_CONTAINER_NAME, @@ -392,9 +389,8 @@ def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, inst f"""echo -e "{hosts_string}" > {HOSTS_FILE_LOCATION}""", ) - # TODO: remove debug logging after testing - # check to make sure file was created - LOGGER.info("Verifying hosts file creation:") + # TODO: remove logging + LOGGER.info("Verifying hosts file:") run_cmd_on_container( MASTER_CONTAINER_NAME, master_connection, From e9da30499772e7d69510393ad9dc99b1046cbfd7 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 13 Aug 2025 10:51:52 -0700 Subject: [PATCH 11/16] 
comment out some tests and change test_path to only run test_efa --- dlc_developer_config.toml | 2 +- .../training/test_pytorch_training_2_8.py | 46 +++++----- test/dlc_tests/ec2/test_efa.py | 91 ++++++++++--------- test/testrunner.py | 3 +- 4 files changed, 72 insertions(+), 70 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index cd7d59b57a9c..687079b9de8a 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -46,7 +46,7 @@ build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = true +do_build = false [notify] ### Notify on test failures diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_8.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_8.py index ec2c00a6ef14..2a9b678105ab 100644 --- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_8.py +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_8.py @@ -25,18 +25,18 @@ def test_pytorch_2_8_gpu( ) test_cases = [ - (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), - (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)), - (common_cases.pytorch_linear_regression_gpu, (pytorch_training, ec2_connection)), - (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), - (common_cases.pytorch_nccl, (pytorch_training, ec2_connection)), - (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), - (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), - (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), - (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)), - (common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)), - (common_cases.pytorch_telemetry_bashrc_gpu, (pytorch_training, ec2_connection)), - 
(common_cases.pytorch_telemetry_entrypoint_gpu, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_linear_regression_gpu, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_nccl, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)), + # (common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_telemetry_bashrc_gpu, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_telemetry_entrypoint_gpu, (pytorch_training, ec2_connection)), ] if "sagemaker" in pytorch_training: @@ -74,8 +74,8 @@ def test_pytorch_2_8_gpu_heavy( ) test_cases = [ - (common_cases.pytorch_gdrcopy, (pytorch_training, ec2_connection)), - (common_cases.pytorch_transformer_engine, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_gdrcopy, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_transformer_engine, (pytorch_training, ec2_connection)), ] test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.8 GPU Heavy") @@ -118,15 +118,15 @@ def test_pytorch_2_8_cpu(pytorch_training___2__8, ec2_connection, cpu_only): pytorch_training = pytorch_training___2__8 test_cases = [ - (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), - (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)), - (common_cases.pytorch_linear_regression_cpu, (pytorch_training, ec2_connection)), - (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), - 
(common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), - (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), - (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), - (common_cases.pytorch_telemetry_bashrc_cpu, (pytorch_training, ec2_connection)), - (common_cases.pytorch_telemetry_entrypoint_cpu, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_linear_regression_cpu, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_telemetry_bashrc_cpu, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_telemetry_entrypoint_cpu, (pytorch_training, ec2_connection)), ] if "sagemaker" in pytorch_training: diff --git a/test/dlc_tests/ec2/test_efa.py b/test/dlc_tests/ec2/test_efa.py index 53ef210d4bdc..6c692751841a 100644 --- a/test/dlc_tests/ec2/test_efa.py +++ b/test/dlc_tests/ec2/test_efa.py @@ -147,50 +147,50 @@ def test_efa_tensorflow( ) -@pytest.mark.skip( - "EFA healthcheck binaries are not maintained by DLC, we will skip these tests moving foward unless binaries are added otherwise." 
-) -@pytest.mark.processor("gpu") -@pytest.mark.model("N/A") -@pytest.mark.integration("efa") -@pytest.mark.usefixtures("sagemaker_only") -@pytest.mark.usefixtures("pt201_and_above_only") -@pytest.mark.allow_p4de_use -@pytest.mark.parametrize("ec2_instance_type,region", EC2_EFA_GPU_ONLY_P4_INSTANCE_TYPE_AND_REGION) -@pytest.mark.team("conda") -@pytest.mark.skipif( - is_pr_context() and not are_heavy_instance_ec2_tests_enabled(), - reason="Skip EFA test in PR context unless explicitly enabled", -) -def test_pytorch_efa_healthcheck( - pytorch_training, - efa_ec2_instances, - efa_ec2_connections, - ec2_instance_type, - region, - gpu_only, -): - """ - Run EFA Health Check tests on DLC. - :param pytorch_training: str PyTorch Training DLC image URI - :param efa_ec2_instances: list of tuples of instance-ids and SSH-keys for EFA-enabled instances - :param efa_ec2_connections: list of Fabric Connection objects for EFA-enabled instances - :param ec2_instance_type: str Instance Type being tested - :param region: str Region in which EFA-enabled instances are launched - :param gpu_only: pytest fixture to limit test only to GPU DLCs - """ - _setup_multinode_efa_instances( - pytorch_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region - ) - master_connection = efa_ec2_connections[0] - run_cmd_on_container(MASTER_CONTAINER_NAME, master_connection, EFA_SANITY_TEST_CMD, hide=False) - run_cmd_on_container( - MASTER_CONTAINER_NAME, - master_connection, - f"{EFA_PYTORCH_HEALTHCHECK_TEST_CMD}", - hide=False, - timeout=DEFAULT_EFA_TIMEOUT, - ) +# @pytest.mark.skip( +# "EFA healthcheck binaries are not maintained by DLC, we will skip these tests moving foward unless binaries are added otherwise." 
+# ) +# @pytest.mark.processor("gpu") +# @pytest.mark.model("N/A") +# @pytest.mark.integration("efa") +# @pytest.mark.usefixtures("sagemaker_only") +# @pytest.mark.usefixtures("pt201_and_above_only") +# @pytest.mark.allow_p4de_use +# @pytest.mark.parametrize("ec2_instance_type,region", EC2_EFA_GPU_ONLY_P4_INSTANCE_TYPE_AND_REGION) +# @pytest.mark.team("conda") +# @pytest.mark.skipif( +# is_pr_context() and not are_heavy_instance_ec2_tests_enabled(), +# reason="Skip EFA test in PR context unless explicitly enabled", +# ) +# def test_pytorch_efa_healthcheck( +# pytorch_training, +# efa_ec2_instances, +# efa_ec2_connections, +# ec2_instance_type, +# region, +# gpu_only, +# ): +# """ +# Run EFA Health Check tests on DLC. +# :param pytorch_training: str PyTorch Training DLC image URI +# :param efa_ec2_instances: list of tuples of instance-ids and SSH-keys for EFA-enabled instances +# :param efa_ec2_connections: list of Fabric Connection objects for EFA-enabled instances +# :param ec2_instance_type: str Instance Type being tested +# :param region: str Region in which EFA-enabled instances are launched +# :param gpu_only: pytest fixture to limit test only to GPU DLCs +# """ +# _setup_multinode_efa_instances( +# pytorch_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region +# ) +# master_connection = efa_ec2_connections[0] +# run_cmd_on_container(MASTER_CONTAINER_NAME, master_connection, EFA_SANITY_TEST_CMD, hide=False) +# run_cmd_on_container( +# MASTER_CONTAINER_NAME, +# master_connection, +# f"{EFA_PYTORCH_HEALTHCHECK_TEST_CMD}", +# hide=False, +# timeout=DEFAULT_EFA_TIMEOUT, +# ) def _setup_multinode_efa_instances( @@ -383,6 +383,7 @@ def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, inst # TODO: remove logging LOGGER.info(f"Attempting to create hosts file with content:\n{hosts_string}") + LOGGER.info(f"Running command: {f"""echo -e "{hosts_string}" > {HOSTS_FILE_LOCATION}"""}") run_cmd_on_container( 
MASTER_CONTAINER_NAME, master_connection, @@ -390,7 +391,7 @@ def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, inst ) # TODO: remove logging - LOGGER.info("Verifying hosts file:") + LOGGER.info(f"Verifying hosts file {HOSTS_FILE_LOCATION}:") run_cmd_on_container( MASTER_CONTAINER_NAME, master_connection, diff --git a/test/testrunner.py b/test/testrunner.py index 4746740437bc..86b2ed4692de 100644 --- a/test/testrunner.py +++ b/test/testrunner.py @@ -444,7 +444,8 @@ def main(): pytest_cmd = [ "-s", "-rA", - test_path, + # test_path, + os.path.join(test_path, "test_efa.py::test_pytorch_efa"), f"--junitxml={report}", "-n=auto", ] From bcd8969b77478231df237db28844b000365e898d Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 13 Aug 2025 11:07:37 -0700 Subject: [PATCH 12/16] fix syntax --- test/dlc_tests/ec2/test_efa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/dlc_tests/ec2/test_efa.py b/test/dlc_tests/ec2/test_efa.py index 6c692751841a..233f75056ed0 100644 --- a/test/dlc_tests/ec2/test_efa.py +++ b/test/dlc_tests/ec2/test_efa.py @@ -383,7 +383,7 @@ def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, inst # TODO: remove logging LOGGER.info(f"Attempting to create hosts file with content:\n{hosts_string}") - LOGGER.info(f"Running command: {f"""echo -e "{hosts_string}" > {HOSTS_FILE_LOCATION}"""}") + LOGGER.info(f"""echo -e "{hosts_string}" > {HOSTS_FILE_LOCATION}""") run_cmd_on_container( MASTER_CONTAINER_NAME, master_connection, From 915eea225221d2bdd0b8a6373ed73fc36e97c201 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 13 Aug 2025 11:48:10 -0700 Subject: [PATCH 13/16] uncomment training log --- test/dlc_tests/container_tests/bin/efa/testEFA | 2 +- test/dlc_tests/ec2/test_efa.py | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/test/dlc_tests/container_tests/bin/efa/testEFA b/test/dlc_tests/container_tests/bin/efa/testEFA index 420cd711dc18..043344639ccb 
100755 --- a/test/dlc_tests/container_tests/bin/efa/testEFA +++ b/test/dlc_tests/container_tests/bin/efa/testEFA @@ -89,7 +89,7 @@ check_efa_nccl_all_reduce(){ RETURN_VAL=${PIPESTATUS[0]} # In case, if you would like see logs, uncomment below line - # RESULT=$(cat ${TRAINING_LOG}) + RESULT=$(cat ${TRAINING_LOG}) if [ ${RETURN_VAL} -eq 0 ]; then echo "***************************** check_efa_nccl_all_reduce passed *****************************" diff --git a/test/dlc_tests/ec2/test_efa.py b/test/dlc_tests/ec2/test_efa.py index 233f75056ed0..053eb97c81ce 100644 --- a/test/dlc_tests/ec2/test_efa.py +++ b/test/dlc_tests/ec2/test_efa.py @@ -391,11 +391,19 @@ def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, inst ) # TODO: remove logging - LOGGER.info(f"Verifying hosts file {HOSTS_FILE_LOCATION}:") + LOGGER.info(f"Checking if hosts file exists:") run_cmd_on_container( MASTER_CONTAINER_NAME, master_connection, - f"ls -l {HOSTS_FILE_LOCATION} && cat {HOSTS_FILE_LOCATION}", + f"ls -l {HOSTS_FILE_LOCATION}", + hide=False + ) + + LOGGER.info(f"Checking hosts file contents:") + run_cmd_on_container( + MASTER_CONTAINER_NAME, + master_connection, + f"cat {HOSTS_FILE_LOCATION}", hide=False ) From 2d98fb1681764d7c1bf5fec1841b5f86f3489ffb Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 13 Aug 2025 12:32:09 -0700 Subject: [PATCH 14/16] change log validation --- test/dlc_tests/container_tests/bin/efa/testEFA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/dlc_tests/container_tests/bin/efa/testEFA b/test/dlc_tests/container_tests/bin/efa/testEFA index 043344639ccb..52f5664625d8 100755 --- a/test/dlc_tests/container_tests/bin/efa/testEFA +++ b/test/dlc_tests/container_tests/bin/efa/testEFA @@ -36,7 +36,7 @@ validate_all_reduce_performance_logs(){ # EFA 1.37.0 using "Using network Libfabric" instead of "Using network AWS Libfabric" grep -E "Using network (AWS )?Libfabric" ${TRAINING_LOG} || { echo "efa is not working, please check if it 
is installed correctly"; exit 1; } if [[ ${INSTANCE_TYPE} == p4d* || ${INSTANCE_TYPE} == p5* ]]; then - grep "Setting NCCL_TOPO_FILE environment variable to" ${TRAINING_LOG} + grep "NCCL_TOPO_FILE set by environment to" ${TRAINING_LOG} # EFA 1.37.0 change from NET/AWS Libfabric/0/GDRDMA to NET/Libfabric/0/GDRDMA grep -E "NET/(AWS )?Libfabric/0/GDRDMA" ${TRAINING_LOG} fi From 454b69362999531b583b5a13fef95650c81ac59c Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 13 Aug 2025 15:57:27 -0700 Subject: [PATCH 15/16] change LD_LIBRARY path and rebuild --- dlc_developer_config.toml | 2 +- pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 687079b9de8a..cd7d59b57a9c 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -46,7 +46,7 @@ build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = false +do_build = true [notify] ### Notify on test failures diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index 90ffb7f565b2..70d9ba03c31f 100644 --- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -19,7 +19,7 @@ FROM nvidia/cuda:12.9.1-base-ubuntu22.04 AS base_image # This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20 ENV DEBIAN_FRONTEND=noninteractive -ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" +# ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" RUN apt-get update \ && apt-get upgrade -y \ @@ -61,8 +61,8 @@ ARG NCCL_VERSION ARG EFA_VERSION ENV CUDA_HOME="/usr/local/cuda" -ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" -ENV 
LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" +# ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" +# ENV LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" ENV PATH="${CUDA_HOME}/bin:${PATH}" ENV EFA_PATH="/opt/amazon/efa" ENV OPEN_MPI_PATH="/opt/amazon/openmpi" @@ -164,7 +164,7 @@ RUN mkdir /tmp/efa \ && apt-get clean ENV PATH="${OPEN_MPI_PATH}/bin:${EFA_PATH}/bin:${PATH}" -ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${EFA_PATH}/lib:${LD_LIBRARY_PATH}" +ENV LD_LIBRARY_PATH="/usr/local/lib:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" # Configure Open MPI and configure NCCL parameters RUN mv ${OPEN_MPI_PATH}/bin/mpirun ${OPEN_MPI_PATH}/bin/mpirun.real \ From 93b6143b5eeb72781788773c3d4690ccc9566783 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 13 Aug 2025 20:30:09 -0700 Subject: [PATCH 16/16] Remove nvjpeg patching script and rebuild with normal test path --- dlc_developer_config.toml | 8 +- .../docker/2.8/py3/cu129/Dockerfile.gpu | 17 --- .../training/test_pytorch_training_2_8.py | 46 ++++---- test/dlc_tests/ec2/test_efa.py | 109 +++++++----------- test/testrunner.py | 3 +- 5 files changed, 72 insertions(+), 111 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index cd7d59b57a9c..308e46f01ace 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -41,12 +41,12 @@ build_frameworks = ["pytorch"] # By default we build both training and inference containers. Set true/false values to determine which to build. -build_training = true +build_training = true build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = true +do_build = true [notify] ### Notify on test failures @@ -71,7 +71,7 @@ ec2_benchmark_tests = true ### default. 
If false, these types of tests will be skipped while other tests will run as usual. ### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true. ### Off by default (set to false) -ec2_tests_on_heavy_instances = true +ec2_tests_on_heavy_instances = true ### SM specific tests ### On by default sagemaker_local_tests = true @@ -119,7 +119,7 @@ use_scheduler = false ### TRAINING PR JOBS ### # Standard Framework Training -dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-ec2.yml" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-ec2.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index 70d9ba03c31f..0baf63080ce8 100644 --- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -19,7 +19,6 @@ FROM nvidia/cuda:12.9.1-base-ubuntu22.04 AS base_image # This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20 ENV DEBIAN_FRONTEND=noninteractive -# ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" RUN apt-get update \ && apt-get upgrade -y \ @@ -61,8 +60,6 @@ ARG NCCL_VERSION ARG EFA_VERSION ENV CUDA_HOME="/usr/local/cuda" -# ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" -# ENV LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" ENV PATH="${CUDA_HOME}/bin:${PATH}" ENV EFA_PATH="/opt/amazon/efa" ENV OPEN_MPI_PATH="/opt/amazon/openmpi" @@ -132,20 +129,6 @@ RUN apt-get update \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean - # patch nvjpeg to fix CVE -RUN mkdir -p /tmp/nvjpeg \ -&& cd /tmp/nvjpeg \ -&& wget https://developer.download.nvidia.com/compute/cuda/redist/libnvjpeg/linux-x86_64/libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \ -&& tar -xvf libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \ -&& rm -rf 
/usr/local/cuda/targets/x86_64-linux/lib/libnvjpeg* \ -&& rm -rf /usr/local/cuda/targets/x86_64-linux/include/nvjpeg.h \ -&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/lib/libnvjpeg* /usr/local/cuda/targets/x86_64-linux/lib/ \ -&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/include/* /usr/local/cuda/targets/x86_64-linux/include/ \ -&& rm -rf /tmp/nvjpeg \ -# patch cuobjdump and nvdisasm -&& rm -rf /usr/local/cuda/bin/cuobjdump* \ -&& rm -rf /usr/local/cuda/bin/nvdisasm* - # For EFA, below flags are needed to install EFA on docker image # -n, --no-verify Skip EFA device verification and test # -l, --skip-limit-conf Skip EFA limit configuration diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_8.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_8.py index 2a9b678105ab..ec2c00a6ef14 100644 --- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_8.py +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_8.py @@ -25,18 +25,18 @@ def test_pytorch_2_8_gpu( ) test_cases = [ - # (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_linear_regression_gpu, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_nccl, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)), - # (common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_telemetry_bashrc_gpu, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_telemetry_entrypoint_gpu, (pytorch_training, ec2_connection)), + 
(common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)), + (common_cases.pytorch_linear_regression_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), + (common_cases.pytorch_nccl, (pytorch_training, ec2_connection)), + (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), + (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)), + (common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_telemetry_bashrc_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_telemetry_entrypoint_gpu, (pytorch_training, ec2_connection)), ] if "sagemaker" in pytorch_training: @@ -74,8 +74,8 @@ def test_pytorch_2_8_gpu_heavy( ) test_cases = [ - # (common_cases.pytorch_gdrcopy, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_transformer_engine, (pytorch_training, ec2_connection)), + (common_cases.pytorch_gdrcopy, (pytorch_training, ec2_connection)), + (common_cases.pytorch_transformer_engine, (pytorch_training, ec2_connection)), ] test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.8 GPU Heavy") @@ -118,15 +118,15 @@ def test_pytorch_2_8_cpu(pytorch_training___2__8, ec2_connection, cpu_only): pytorch_training = pytorch_training___2__8 test_cases = [ - # (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_linear_regression_cpu, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), - # 
(common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_telemetry_bashrc_cpu, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_telemetry_entrypoint_cpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)), + (common_cases.pytorch_linear_regression_cpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), + (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), + (common_cases.pytorch_telemetry_bashrc_cpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_telemetry_entrypoint_cpu, (pytorch_training, ec2_connection)), ] if "sagemaker" in pytorch_training: diff --git a/test/dlc_tests/ec2/test_efa.py b/test/dlc_tests/ec2/test_efa.py index 053eb97c81ce..9543d783f21c 100644 --- a/test/dlc_tests/ec2/test_efa.py +++ b/test/dlc_tests/ec2/test_efa.py @@ -147,50 +147,50 @@ def test_efa_tensorflow( ) -# @pytest.mark.skip( -# "EFA healthcheck binaries are not maintained by DLC, we will skip these tests moving foward unless binaries are added otherwise." 
-# ) -# @pytest.mark.processor("gpu") -# @pytest.mark.model("N/A") -# @pytest.mark.integration("efa") -# @pytest.mark.usefixtures("sagemaker_only") -# @pytest.mark.usefixtures("pt201_and_above_only") -# @pytest.mark.allow_p4de_use -# @pytest.mark.parametrize("ec2_instance_type,region", EC2_EFA_GPU_ONLY_P4_INSTANCE_TYPE_AND_REGION) -# @pytest.mark.team("conda") -# @pytest.mark.skipif( -# is_pr_context() and not are_heavy_instance_ec2_tests_enabled(), -# reason="Skip EFA test in PR context unless explicitly enabled", -# ) -# def test_pytorch_efa_healthcheck( -# pytorch_training, -# efa_ec2_instances, -# efa_ec2_connections, -# ec2_instance_type, -# region, -# gpu_only, -# ): -# """ -# Run EFA Health Check tests on DLC. -# :param pytorch_training: str PyTorch Training DLC image URI -# :param efa_ec2_instances: list of tuples of instance-ids and SSH-keys for EFA-enabled instances -# :param efa_ec2_connections: list of Fabric Connection objects for EFA-enabled instances -# :param ec2_instance_type: str Instance Type being tested -# :param region: str Region in which EFA-enabled instances are launched -# :param gpu_only: pytest fixture to limit test only to GPU DLCs -# """ -# _setup_multinode_efa_instances( -# pytorch_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region -# ) -# master_connection = efa_ec2_connections[0] -# run_cmd_on_container(MASTER_CONTAINER_NAME, master_connection, EFA_SANITY_TEST_CMD, hide=False) -# run_cmd_on_container( -# MASTER_CONTAINER_NAME, -# master_connection, -# f"{EFA_PYTORCH_HEALTHCHECK_TEST_CMD}", -# hide=False, -# timeout=DEFAULT_EFA_TIMEOUT, -# ) +@pytest.mark.skip( + "EFA healthcheck binaries are not maintained by DLC, we will skip these tests moving foward unless binaries are added otherwise." 
+) +@pytest.mark.processor("gpu") +@pytest.mark.model("N/A") +@pytest.mark.integration("efa") +@pytest.mark.usefixtures("sagemaker_only") +@pytest.mark.usefixtures("pt201_and_above_only") +@pytest.mark.allow_p4de_use +@pytest.mark.parametrize("ec2_instance_type,region", EC2_EFA_GPU_ONLY_P4_INSTANCE_TYPE_AND_REGION) +@pytest.mark.team("conda") +@pytest.mark.skipif( + is_pr_context() and not are_heavy_instance_ec2_tests_enabled(), + reason="Skip EFA test in PR context unless explicitly enabled", +) +def test_pytorch_efa_healthcheck( + pytorch_training, + efa_ec2_instances, + efa_ec2_connections, + ec2_instance_type, + region, + gpu_only, +): + """ + Run EFA Health Check tests on DLC. + :param pytorch_training: str PyTorch Training DLC image URI + :param efa_ec2_instances: list of tuples of instance-ids and SSH-keys for EFA-enabled instances + :param efa_ec2_connections: list of Fabric Connection objects for EFA-enabled instances + :param ec2_instance_type: str Instance Type being tested + :param region: str Region in which EFA-enabled instances are launched + :param gpu_only: pytest fixture to limit test only to GPU DLCs + """ + _setup_multinode_efa_instances( + pytorch_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region + ) + master_connection = efa_ec2_connections[0] + run_cmd_on_container(MASTER_CONTAINER_NAME, master_connection, EFA_SANITY_TEST_CMD, hide=False) + run_cmd_on_container( + MASTER_CONTAINER_NAME, + master_connection, + f"{EFA_PYTORCH_HEALTHCHECK_TEST_CMD}", + hide=False, + timeout=DEFAULT_EFA_TIMEOUT, + ) def _setup_multinode_efa_instances( @@ -380,33 +380,12 @@ def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, inst for worker_ip in worker_instance_private_ips: hosts_string += f"\n{worker_ip} slots={slots} " - # TODO: remove logging - LOGGER.info(f"Attempting to create hosts file with content:\n{hosts_string}") - - LOGGER.info(f"""echo -e "{hosts_string}" > {HOSTS_FILE_LOCATION}""") 
run_cmd_on_container( MASTER_CONTAINER_NAME, master_connection, f"""echo -e "{hosts_string}" > {HOSTS_FILE_LOCATION}""", ) - # TODO: remove logging - LOGGER.info(f"Checking if hosts file exists:") - run_cmd_on_container( - MASTER_CONTAINER_NAME, - master_connection, - f"ls -l {HOSTS_FILE_LOCATION}", - hide=False - ) - - LOGGER.info(f"Checking hosts file contents:") - run_cmd_on_container( - MASTER_CONTAINER_NAME, - master_connection, - f"cat {HOSTS_FILE_LOCATION}", - hide=False - ) - def _setup_worker_efa_ssh_config(connection, master_pub_key): """ diff --git a/test/testrunner.py b/test/testrunner.py index 86b2ed4692de..4746740437bc 100644 --- a/test/testrunner.py +++ b/test/testrunner.py @@ -444,8 +444,7 @@ def main(): pytest_cmd = [ "-s", "-rA", - # test_path, - os.path.join(test_path, "test_efa.py::test_pytorch_efa"), + test_path, f"--junitxml={report}", "-n=auto", ]