From d27e5232ac3276d6eba5ce60e66f1868c1422877 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Sun, 10 Aug 2025 14:38:22 -0700 Subject: [PATCH 01/16] pt 2.8 training ec2 --- dlc_developer_config.toml | 10 +- pytorch/training/buildspec-2-8-ec2.yml | 72 +++ pytorch/training/buildspec-2-8-sm.yml | 72 +++ pytorch/training/buildspec.yml | 2 +- .../training/docker/2.8/py3/Dockerfile.cpu | 364 ++++++++++++++ .../docker/2.8/py3/cu128/Dockerfile.gpu | 464 ++++++++++++++++++ 6 files changed, 978 insertions(+), 6 deletions(-) create mode 100644 pytorch/training/buildspec-2-8-ec2.yml create mode 100644 pytorch/training/buildspec-2-8-sm.yml create mode 100644 pytorch/training/docker/2.8/py3/Dockerfile.cpu create mode 100644 pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 1962bfd69e21..f8a4b4214d7d 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -37,12 +37,12 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["base", "vllm", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] +build_frameworks = ["pytorch"] # By default we build both training and inference containers. Set true/false values to determine which to build. 
build_training = true -build_inference = true +build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" @@ -65,13 +65,13 @@ ecs_tests = true eks_tests = true ec2_tests = true # Set it to true if you are preparing a Benchmark related PR -ec2_benchmark_tests = false +ec2_benchmark_tests = true ### Set ec2_tests_on_heavy_instances = true to be able to run any EC2 tests that use large/expensive instance types by ### default. If false, these types of tests will be skipped while other tests will run as usual. ### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true. ### Off by default (set to false) -ec2_tests_on_heavy_instances = false +ec2_tests_on_heavy_instances = true ### SM specific tests ### On by default sagemaker_local_tests = true @@ -119,7 +119,7 @@ use_scheduler = false ### TRAINING PR JOBS ### # Standard Framework Training -dlc-pr-pytorch-training = "" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-ec2.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" diff --git a/pytorch/training/buildspec-2-8-ec2.yml b/pytorch/training/buildspec-2-8-ec2.yml new file mode 100644 index 000000000000..07bc4e559755 --- /dev/null +++ b/pytorch/training/buildspec-2-8-ec2.yml @@ -0,0 +1,72 @@ +account_id: &ACCOUNT_ID +prod_account_id: &PROD_ACCOUNT_ID 763104351884 +region: &REGION +framework: &FRAMEWORK pytorch +version: &VERSION 2.8.0 +short_version: &SHORT_VERSION "2.8" +arch_type: x86 +# autopatch_build: "True" + +repository_info: + training_repository: &TRAINING_REPOSITORY + image_type: &TRAINING_IMAGE_TYPE training + root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ] + repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ] + repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] + 
release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ] + release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ] + +context: + training_context: &TRAINING_CONTEXT + start_cuda_compat: + source: docker/build_artifacts/start_cuda_compat.sh + target: start_cuda_compat.sh + dockerd_entrypoint: + source: docker/build_artifacts/dockerd_entrypoint.sh + target: dockerd_entrypoint.sh + changehostname: + source: docker/build_artifacts/changehostname.c + target: changehostname.c + start_with_right_hostname: + source: docker/build_artifacts/start_with_right_hostname.sh + target: start_with_right_hostname.sh + example_mnist_file: + source: docker/build_artifacts/mnist.py + target: mnist.py + deep_learning_container: + source: ../../src/deep_learning_container.py + target: deep_learning_container.py + +images: + BuildEC2CPUPTTrainPy3DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_CPU_TRAINING_PY3 false + image_size_baseline: 6500 + device_type: &DEVICE_TYPE cpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py312 + os_version: &OS_VERSION ubuntu22.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] + latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] + # skip_build: "False" + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] + target: ec2 + context: + <<: *TRAINING_CONTEXT + BuildEC2GPUPTTrainPy3cu128DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_GPU_TRAINING_PY3 false + image_size_baseline: 24000 + device_type: &DEVICE_TYPE gpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py312 + cuda_version: &CUDA_VERSION cu128 + os_version: &OS_VERSION ubuntu22.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", 
*TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] + latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] + # skip_build: "False" + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., + *DEVICE_TYPE ] + target: ec2 + context: + <<: *TRAINING_CONTEXT diff --git a/pytorch/training/buildspec-2-8-sm.yml b/pytorch/training/buildspec-2-8-sm.yml new file mode 100644 index 000000000000..a29f64b2a761 --- /dev/null +++ b/pytorch/training/buildspec-2-8-sm.yml @@ -0,0 +1,72 @@ +account_id: &ACCOUNT_ID +prod_account_id: &PROD_ACCOUNT_ID 763104351884 +region: &REGION +framework: &FRAMEWORK pytorch +version: &VERSION 2.8.0 +short_version: &SHORT_VERSION "2.8" +arch_type: x86 +# autopatch_build: "True" + +repository_info: + training_repository: &TRAINING_REPOSITORY + image_type: &TRAINING_IMAGE_TYPE training + root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ] + repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ] + repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] + release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ] + release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ] + +context: + training_context: &TRAINING_CONTEXT + start_cuda_compat: + source: docker/build_artifacts/start_cuda_compat.sh + target: start_cuda_compat.sh + dockerd_entrypoint: + source: docker/build_artifacts/dockerd_entrypoint.sh + target: dockerd_entrypoint.sh + changehostname: + source: docker/build_artifacts/changehostname.c + target: changehostname.c + start_with_right_hostname: + source: docker/build_artifacts/start_with_right_hostname.sh + target: start_with_right_hostname.sh + example_mnist_file: + source: docker/build_artifacts/mnist.py + target: 
mnist.py + deep_learning_container: + source: ../../src/deep_learning_container.py + target: deep_learning_container.py + +images: + BuildSageMakerCPUPTTrainPy3DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_CPU_TRAINING_PY3 false + image_size_baseline: 6500 + device_type: &DEVICE_TYPE cpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py312 + os_version: &OS_VERSION ubuntu22.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] + latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] + # skip_build: "False" + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] + target: sagemaker + context: + <<: *TRAINING_CONTEXT + BuildSageMakerGPUPTTrainPy3DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_GPU_TRAINING_PY3 false + image_size_baseline: 24000 + device_type: &DEVICE_TYPE gpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py312 + cuda_version: &CUDA_VERSION cu128 + os_version: &OS_VERSION ubuntu22.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] + latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] + # skip_build: "False" + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., + *DEVICE_TYPE ] + target: sagemaker + context: + <<: *TRAINING_CONTEXT diff --git a/pytorch/training/buildspec.yml b/pytorch/training/buildspec.yml index b332931b2e40..78ac196ed806 100644 --- a/pytorch/training/buildspec.yml +++ b/pytorch/training/buildspec.yml @@ -1 +1 @@ -buildspec_pointer: buildspec-2-7-sm.yml +buildspec_pointer: buildspec-2-8-ec2.yml diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu 
b/pytorch/training/docker/2.8/py3/Dockerfile.cpu new file mode 100644 index 000000000000..2577f237b6f3 --- /dev/null +++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu @@ -0,0 +1,364 @@ +ARG PYTHON=python3 +ARG PYTHON_VERSION=3.12.10 +ARG PYTHON_SHORT_VERSION=3.12 +ARG PYTORCH_VERSION=2.8.0 + +ARG OPEN_MPI_VERSION=4.1.7 + +ARG TORCHTNT_VERSION=0.2.4 +ARG TORCHDATA_VERSION=0.11.0 +ARG TORCHAUDIO_VERSION=2.8.0 +ARG TORCHVISION_VERSION=0.23.0 + +FROM ubuntu:22.04 AS base_image + +# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20 +ENV DEBIAN_FRONTEND=noninteractive +ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" + +RUN apt-get update \ + && apt-get upgrade -y \ + && apt-get autoremove -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +################################################################# +# ____ +# / ___| ___ _ __ ___ _ __ ___ ___ _ __ +# | | / _ \| '_ ` _ \| '_ ` _ \ / _ \| '_ \ +# | |___ (_) | | | | | | | | | | | (_) | | | | +# \____|\___/|_| |_| |_|_| |_| |_|\___/|_| |_| +# ___ ____ _ +# |_ _|_ __ ___ __ _ __ _ ___ | _ \ ___ ___(_)_ __ ___ +# | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \ +# | || | | | | | (_| | (_| | __/ | _ < __/ (__| | |_) | __/ +# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___| +# |___/ |_| +################################################################# + +FROM base_image AS common + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ARG PYTHON +ARG PYTHON_VERSION +ARG PYTHON_SHORT_VERSION + +ARG OPEN_MPI_VERSION + +ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" +ENV LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" + +# Python won’t try to write .pyc or .pyo files on the import of source modules +# Force stdin, stdout and stderr to be totally unbuffered. 
Good for logging +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 + +ENV DLC_CONTAINER_TYPE=training +WORKDIR / + +RUN apt-get update \ + && apt-get -y upgrade --only-upgrade systemd \ + && apt-get install -y --no-install-recommends \ + automake \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + emacs \ + git \ + jq \ + libcurl4-openssl-dev \ + libglib2.0-0 \ + libgl1-mesa-glx \ + libsm6 \ + libssl-dev \ + libxext6 \ + libxrender-dev \ + zlib1g-dev \ + unzip \ + vim \ + wget \ + libbz2-dev \ + libreadline-dev \ + libsqlite3-dev \ + llvm \ + libncurses5-dev \ + libncursesw5-dev \ + xz-utils \ + tk-dev \ + liblzma-dev \ + libffi-dev \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Install Open MPI +RUN wget https://www.open-mpi.org/software/ompi/v4.1/downloads/openmpi-${OPEN_MPI_VERSION}.tar.gz \ + && gunzip -c openmpi-${OPEN_MPI_VERSION}.tar.gz | tar xf - \ + && cd openmpi-${OPEN_MPI_VERSION} \ + && ./configure --prefix=/home/.openmpi \ + && make all install \ + && cd .. \ + && rm openmpi-${OPEN_MPI_VERSION}.tar.gz \ + && rm -rf openmpi-${OPEN_MPI_VERSION} + +# The ENV variables declared below are changed in the previous section +# Grouping these ENV variables in the first section causes +# ompi_info to fail. 
This is only observed in CPU containers +ENV PATH="/home/.openmpi/bin:${PATH}" +ENV LD_LIBRARY_PATH="/home/.openmpi/lib:${LD_LIBRARY_PATH}" +RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value + +# Install OpenSSH for MPI to communicate between containers, allow OpenSSH to talk to containers without asking for confirmation +RUN apt-get update \ + && apt-get install -y --no-install-recommends openssh-client openssh-server \ + && mkdir -p /var/run/sshd \ + && cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \ + && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \ + && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Configure OpenSSH so that nodes can communicate with each other +RUN mkdir -p /var/run/sshd \ + && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd + +RUN rm -rf /root/.ssh/ \ + && mkdir -p /root/.ssh/ \ + && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ + && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ + && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config + + +# install python +RUN cd /tmp/ \ +&& wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \ +&& tar xzf Python-${PYTHON_VERSION}.tgz \ +&& cd Python-${PYTHON_VERSION} \ +&& ./configure --enable-optimizations --with-lto --with-computed-gotos --with-system-ffi \ +&& make -j "$(nproc)" \ +&& make altinstall \ +&& cd .. 
\ +&& rm -rf Python-${PYTHON_VERSION} \ +&& rm Python-${PYTHON_VERSION}.tgz \ +&& ln -s /usr/local/bin/python${PYTHON_SHORT_VERSION} /usr/local/bin/python \ +&& ln -s /usr/local/bin/python${PYTHON_SHORT_VERSION} /usr/local/bin/python3 \ +# This installation generate a .python_history file in the root directory leads sanity check to fail +&& rm -f /root/.python_history + +# Python Path +ENV PATH="/usr/local/bin:${PATH}" + +# this will add pip systemlink to pip${PYTHON_SHORT_VERSION} +RUN python -m pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org + +# Install common packages +RUN pip install --no-cache-dir \ + cython \ + cryptography \ + pyOpenSSL \ + pybind11 \ + mkl \ + mkl-include \ + parso \ + typing \ + charset-normalizer \ + packaging \ + boto3 \ + PyYAML \ + numpy \ + scipy \ + click \ + psutil \ + ipython \ + ipykernel \ + pillow \ + h5py \ + fsspec \ + "idna>=3.7" \ + "tqdm>=4.66.3" \ + "requests>=2.32.0" \ + "setuptools>=70.0.0" \ + "urllib3>=2.5.0" \ + "awscli<2" \ + "opencv-python==4.11.0.86" \ + mpi4py \ + jinja2>=3.1.6 \ + tornado>=6.5.1 + +RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.7/license.txt + +COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py + +RUN chmod +x /usr/local/bin/deep_learning_container.py + +COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh +RUN chmod +x /usr/local/bin/bash_telemetry.sh +RUN echo 'source /usr/local/bin/bash_telemetry.sh' >> /etc/bash.bashrc + +# Removing the cache as it is needed for security verification +RUN rm -rf /root/.cache | true + +######################################################## +# _____ ____ ____ ___ +# | ____/ ___|___ \ |_ _|_ __ ___ __ _ __ _ ___ +# | _|| | __) | | || '_ ` _ \ / _` |/ _` |/ _ \ +# | |__| |___ / __/ | || | | | | | (_| | (_| | __/ +# |_____\____|_____| |___|_| |_| |_|\__,_|\__, |\___| +# |___/ +# ____ _ +# | _ \ ___ ___(_)_ __ ___ +# | |_) / _ \/ __| | '_ \ / _ \ +# | _ < __/ (__| | 
|_) | __/ +# |_| \_\___|\___|_| .__/ \___| +# |_| +######################################################## + +FROM common AS ec2 + +ARG PYTHON +ARG PYTHON_SHORT_VERSION +ARG PYTORCH_VERSION +ARG TORCHTNT_VERSION +ARG TORCHDATA_VERSION +ARG TORCHAUDIO_VERSION +ARG TORCHVISION_VERSION + +WORKDIR / + +# Install PyTorch +RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ + torchvision==${TORCHVISION_VERSION} \ + torchaudio==${TORCHAUDIO_VERSION} \ + --index-url https://download.pytorch.org/whl/cpu \ + && pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \ + torchdata==${TORCHDATA_VERSION} \ + s3torchconnector \ + fastai==2.8.2 \ + accelerate \ + # pin numpy requirement for fastai dependency + # requires explicit declaration of spacy, thic, blis + spacy \ + #thinc 8.3.6 is not compatible with numpy 1.26.4 (sagemaker doesn't support latest numpy) + thinc==8.3.4 \ + blis \ + numpy \ + && pip uninstall -y dataclasses + +RUN HOME_DIR=/root \ + && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ + && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ + && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ + && chmod +x /usr/local/bin/testOSSCompliance \ + && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ + && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ + && rm -rf ${HOME_DIR}/oss_compliance* \ + && rm -rf /tmp/tmp* + +# Removing the cache as it is needed for security verification +RUN rm -rf /root/.cache | true + +COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh +RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh +ENTRYPOINT ["bash", "-m", "dockerd_entrypoint.sh"] + +# Starts framework +CMD ["/bin/bash"] + +################################################################# +# ____ __ __ _ +# / ___| __ _ __ _ ___| \/ | __ _| | _____ _ __ +# \___ \ / _` |/ _` |/ _ \ |\/| |/ _` | |/ / _ \ '__| 
+# ___) | (_| | (_| | __/ | | | (_| | < __/ | +# |____/ \__,_|\__, |\___|_| |_|\__,_|_|\_\___|_| +# |___/ +# ___ ____ _ +# |_ _|_ __ ___ __ _ __ _ ___ | _ \ ___ ___(_)_ __ ___ +# | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \ +# | || | | | | | (_| | (_| | __/ | _ < __/ (__| | |_) | __/ +# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___| +# |___/ |_| +################################################################# + +FROM common AS sagemaker + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ARG PYTHON +ARG PYTHON_SHORT_VERSION +ARG PYTORCH_VERSION +ARG TORCHTNT_VERSION +ARG TORCHDATA_VERSION +ARG TORCHAUDIO_VERSION +ARG TORCHVISION_VERSION + +ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main + +WORKDIR / + +# Install PyTorch +RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ + torchvision==${TORCHVISION_VERSION} \ + torchaudio==${TORCHAUDIO_VERSION} \ + --index-url https://download.pytorch.org/whl/cpu \ + && pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \ + torchdata==${TORCHDATA_VERSION} \ + s3torchconnector \ + fastai==2.8.2 \ + accelerate \ + # pin numpy requirement for fastai dependency + # requires explicit declaration of spacy, thic, blis + spacy \ + thinc==8.3.4 \ + blis \ + numpy \ + && pip uninstall -y dataclasses + +# Install SM packages +RUN pip install --no-cache-dir -U \ + smclarify \ + "sagemaker>=2,<3" \ + "sagemaker-experiments<1" \ + sagemaker-pytorch-training \ + sagemaker-training + +# Install extra packages +RUN pip install --no-cache-dir -U \ + bokeh \ + imageio \ + numba \ + pandas \ + plotly \ + scikit-learn \ + seaborn \ + shap \ + # pinned for sagemaker==2.233.0 + cloudpickle + +# Copy workaround script for incorrect hostname +COPY changehostname.c / +COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh + +RUN chmod +x /usr/local/bin/start_with_right_hostname.sh + +RUN HOME_DIR=/root \ + && curl -o 
${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ + && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ + && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ + && chmod +x /usr/local/bin/testOSSCompliance \ + && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ + && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ + && rm -rf ${HOME_DIR}/oss_compliance* \ + && rm -rf /tmp/tmp* + +# Removing the cache as it is needed for security verification +RUN rm -rf /root/.cache | true + +ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"] +CMD ["/bin/bash"] + diff --git a/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu new file mode 100644 index 000000000000..cbd2a01401f0 --- /dev/null +++ b/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu @@ -0,0 +1,464 @@ +ARG PYTHON=python3 +ARG PYTHON_VERSION=3.12.10 +ARG PYTHON_SHORT_VERSION=3.12 +ARG PYTORCH_VERSION=2.8.0 +ARG TORCHTNT_VERSION=0.2.4 +ARG TORCHAUDIO_VERSION=2.8.0 +ARG TORCHVISION_VERSION=0.23.0 +ARG TORCHDATA_VERSION=0.11.0 + +ARG CUDA_VERSION=12.8.1 +ARG CUDNN_VERSION=9.10.2.21 +ARG NCCL_VERSION=2.27.3 +ARG EFA_VERSION=1.43.1 +ARG GDRCOPY_VERSION=2.5 +ARG TE_VERSION=2.5 +ARG FLASH_ATTN_VERSION=2.8.2 + +FROM nvidia/cuda:12.8.1-base-ubuntu22.04 AS base_image + +# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20 +ENV DEBIAN_FRONTEND=noninteractive +ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" + +RUN apt-get update \ + && apt-get upgrade -y \ + && apt-get autoremove -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +################################################################# +# ____ +# / ___| ___ _ __ ___ _ __ ___ ___ _ __ +# | | / _ \| '_ ` _ \| '_ ` _ \ / _ \| '_ \ +# | |___ (_) | | | | | | | | | | | (_) | | | | +# \____|\___/|_| 
|_| |_|_| |_| |_|\___/|_| |_| +# ___ ____ _ +# |_ _|_ __ ___ __ _ __ _ ___ | _ \ ___ ___(_)_ __ ___ +# | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \ +# | || | | | | | (_| | (_| | __/ | _ < __/ (__| | |_) | __/ +# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___| +# |___/ |_| +################################################################# + +FROM base_image AS common + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ARG PYTHON +ARG PYTHON_VERSION +ARG PYTHON_SHORT_VERSION +ARG PYTORCH_VERSION +ARG TORCHDATA_VERSION +ARG TORCHAUDIO_VERSION +ARG TORCHVISION_VERSION +ARG TORCHTNT_VERSION + +ARG CUDA_VERSION +ARG CUDNN_VERSION +ARG NCCL_VERSION +ARG EFA_VERSION + +ENV CUDA_HOME="/usr/local/cuda" +ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" +ENV LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" +ENV PATH="${CUDA_HOME}/bin:${PATH}" +ENV EFA_PATH="/opt/amazon/efa" +ENV OPEN_MPI_PATH="/opt/amazon/openmpi" + +# Python won’t try to write .pyc or .pyo files on the import of source modules +# Force stdin, stdout and stderr to be totally unbuffered. 
Good for logging +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 + +ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all" + +ENV DLC_CONTAINER_TYPE=training +WORKDIR / + +RUN apt-get update \ + && apt-get -y upgrade --only-upgrade systemd \ + && apt-get install -y --allow-change-held-packages --no-install-recommends \ + automake \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + emacs \ + git \ + jq \ + libcurl4-openssl-dev \ + libglib2.0-0 \ + libgl1-mesa-glx \ + libsm6 \ + libssl-dev \ + libxext6 \ + libxrender-dev \ + zlib1g-dev \ + unzip \ + vim \ + wget \ + cuda-toolkit-12=${CUDA_VERSION}-1 \ + libcudnn9-cuda-12=${CUDNN_VERSION}-1 \ + libcudnn9-dev-cuda-12=${CUDNN_VERSION}-1 \ + libhwloc-dev \ + libgomp1 \ + libibverbs-dev \ + libnuma1 \ + libnuma-dev \ + libtool \ + openssl \ + python3-dev \ + autoconf \ + pkg-config \ + check \ + libsubunit0 \ + libsubunit-dev \ + libbz2-dev \ + libreadline-dev \ + libsqlite3-dev \ + llvm \ + libncurses5-dev \ + libncursesw5-dev \ + xz-utils \ + tk-dev \ + liblzma-dev \ + libffi-dev \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + + # patch nvjpeg to fix CVE +RUN mkdir -p /tmp/nvjpeg \ +&& cd /tmp/nvjpeg \ +&& wget https://developer.download.nvidia.com/compute/cuda/redist/libnvjpeg/linux-x86_64/libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \ +&& tar -xvf libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \ +&& rm -rf /usr/local/cuda/targets/x86_64-linux/lib/libnvjpeg* \ +&& rm -rf /usr/local/cuda/targets/x86_64-linux/include/nvjpeg.h \ +&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/lib/libnvjpeg* /usr/local/cuda/targets/x86_64-linux/lib/ \ +&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/include/* /usr/local/cuda/targets/x86_64-linux/include/ \ +&& rm -rf /tmp/nvjpeg \ +# patch cuobjdump and nvdisasm +&& rm -rf /usr/local/cuda/bin/cuobjdump* \ +&& rm -rf /usr/local/cuda/bin/nvdisasm* + +# For EFA, below flags are needed to install EFA on 
docker image +# -n, --no-verify Skip EFA device verification and test +# -l, --skip-limit-conf Skip EFA limit configuration +# -k, --skip-kmod Skip EFA kmod installation +# start from 0.38.0 EFA now bundles the AWS OFI NCCL plugin, +# which can now be found in /opt/amazon/ofi-nccl/lib/x86_64-linux-gnu rather than the original /opt/aws-ofi-nccl/. +RUN mkdir /tmp/efa \ + && cd /tmp/efa \ + && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ + && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ + && cd aws-efa-installer \ + && apt-get update \ + && ./efa_installer.sh -y --skip-kmod --skip-limit-conf --no-verify \ + && rm -rf /tmp/efa \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +ENV PATH="${OPEN_MPI_PATH}/bin:${EFA_PATH}/bin:${PATH}" +ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${EFA_PATH}/lib:${LD_LIBRARY_PATH}" + +# Configure Open MPI and configure NCCL parameters +RUN mv ${OPEN_MPI_PATH}/bin/mpirun ${OPEN_MPI_PATH}/bin/mpirun.real \ + && echo '#!/bin/bash' > ${OPEN_MPI_PATH}/bin/mpirun \ + && echo "${OPEN_MPI_PATH}/bin/mpirun.real --allow-run-as-root \"\$@\"" >> ${OPEN_MPI_PATH}/bin/mpirun \ + && chmod a+x ${OPEN_MPI_PATH}/bin/mpirun \ + && echo "hwloc_base_binding_policy = none" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf \ + && echo "rmaps_base_mapping_policy = slot" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf \ + && echo NCCL_DEBUG=INFO >> /etc/nccl.conf \ + && echo NCCL_SOCKET_IFNAME=^lo,docker >> /etc/nccl.conf + +# Install OpenSSH for MPI to communicate between containers, allow OpenSSH to talk to containers without asking for confirmation +RUN apt-get update \ + && apt-get install -y --no-install-recommends openssh-client openssh-server \ + && mkdir -p /var/run/sshd \ + && cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \ + && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \ + && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config \ + && rm -rf 
/var/lib/apt/lists/* \ + && apt-get clean + +# Configure OpenSSH so that nodes can communicate with each other +RUN mkdir -p /var/run/sshd \ + && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd + +RUN rm -rf /root/.ssh/ \ + && mkdir -p /root/.ssh/ \ + && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ + && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ + && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config + +# install python +RUN cd /tmp/ \ +&& wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \ +&& tar xzf Python-${PYTHON_VERSION}.tgz \ +&& cd Python-${PYTHON_VERSION} \ +&& ./configure --enable-optimizations --with-lto --with-computed-gotos --with-system-ffi \ +&& make -j "$(nproc)" \ +&& make altinstall \ +&& cd .. \ +&& rm -rf Python-${PYTHON_VERSION} \ +&& rm Python-${PYTHON_VERSION}.tgz \ +&& ln -s /usr/local/bin/python${PYTHON_SHORT_VERSION} /usr/local/bin/python \ +&& ln -s /usr/local/bin/python${PYTHON_SHORT_VERSION} /usr/local/bin/python3 \ +# This installation generate a .python_history file in the root directory leads sanity check to fail +&& rm -f /root/.python_history + +# Python Path +ENV PATH="/usr/local/bin:${PATH}" + +# this will add pip systemlink to pip${PYTHON_SHORT_VERSION} +RUN python -m pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org + +# Install common conda packages +RUN pip install --no-cache-dir \ + cython \ + cryptography \ + pyOpenSSL \ + pybind11 \ + mkl \ + mkl-include \ + parso \ + typing \ + charset-normalizer \ + packaging \ + boto3 \ + PyYAML \ + numpy \ + scipy \ + click \ + psutil \ + ipython \ + ipykernel \ + pillow \ + h5py \ + fsspec \ + "idna>=3.7" \ + "tqdm>=4.66.3" \ + "requests>=2.32.0" \ + "setuptools>=70.0.0" \ + "urllib3>=2.5.0" \ + "awscli<2" \ + ninja \ + # pencv-python 4.12.0.88 reuqires numpy<2.3.0, which is not compatible with previous prod image(2.3.1) + 
opencv-python==4.11.0.86 \ + mpi4py \ + jinja2>=3.1.6 \ + tornado>=6.5.1 + +# Install PyTorch +RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ + torchvision==${TORCHVISION_VERSION} \ + torchaudio==${TORCHAUDIO_VERSION} \ + --index-url https://download.pytorch.org/whl/cu128 \ + && pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \ + torchdata==${TORCHDATA_VERSION} \ + triton \ + s3torchconnector \ + fastai==2.8.2 \ + accelerate \ + # pin numpy requirement for fastai dependency + # requires explicit declaration of spacy, thic, blis + spacy \ + #thinc 8.3.6 is not compatible with numpy 1.26.4 (sagemaker doesn't support latest numpy) + thinc==8.3.4 \ + blis \ + numpy \ + && pip uninstall -y dataclasses + +RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.7/license.txt + +COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py + +RUN chmod +x /usr/local/bin/deep_learning_container.py + +COPY start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh +RUN chmod +x /usr/local/bin/start_cuda_compat.sh + +COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh +RUN chmod +x /usr/local/bin/bash_telemetry.sh +RUN echo 'source /usr/local/bin/bash_telemetry.sh' >> /etc/bash.bashrc + +# Removing the cache as it is needed for security verification +RUN rm -rf /root/.cache | true + +######################################################## +# _____ ____ ____ ___ +# | ____/ ___|___ \ |_ _|_ __ ___ __ _ __ _ ___ +# | _|| | __) | | || '_ ` _ \ / _` |/ _` |/ _ \ +# | |__| |___ / __/ | || | | | | | (_| | (_| | __/ +# |_____\____|_____| |___|_| |_| |_|\__,_|\__, |\___| +# |___/ +# ____ _ +# | _ \ ___ ___(_)_ __ ___ +# | |_) / _ \/ __| | '_ \ / _ \ +# | _ < __/ (__| | |_) | __/ +# |_| \_\___|\___|_| .__/ \___| +# |_| +######################################################## + +FROM common AS ec2 + +ARG PYTHON +ARG PYTHON_SHORT_VERSION +ARG NCCL_VERSION +ARG GDRCOPY_VERSION +ARG TE_VERSION +ARG FLASH_ATTN_VERSION + 
+WORKDIR / + + +# Install GDRCopy which is a dependency of SM Distributed DataParallel binary +# The test binaries requires cuda driver library which could be found in conda +# So update the linker path to point to it to avoid -Lcuda not found +RUN cd /tmp \ + && git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \ + && cd gdrcopy \ + && sed -ie '12s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \ + && CUDA=${CUDA_HOME} make install \ + && rm -rf /tmp/gdrcopy + +# Install NCCL +RUN cd /tmp \ + && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION}-1 \ + && cd nccl \ + && make -j64 src.build BUILDDIR=/usr/local \ + && rm -rf /tmp/nccl + +# Install flash attn and NVIDIA transformer engine. +# Optionally set NVTE_FRAMEWORK to avoid bringing in additional frameworks during TE install +ENV NVTE_FRAMEWORK=pytorch +# Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features +# Set MAX_JOBS=4 to avoid OOM issues in installation process +RUN MAX_JOBS=4 pip install --no-cache-dir flash-attn==${FLASH_ATTN_VERSION} --no-build-isolation +# Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html +RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v${TE_VERSION} --no-build-isolation + +COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh +RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh + +RUN HOME_DIR=/root \ + && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ + && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ + && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ + && chmod +x /usr/local/bin/testOSSCompliance \ + && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ + && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ + && rm -rf 
${HOME_DIR}/oss_compliance* \ + && rm -rf /tmp/tmp* + +# Removing the cache as it is needed for security verification +RUN rm -rf /root/.cache | true + +ENTRYPOINT ["bash", "-m", "dockerd_entrypoint.sh"] +CMD ["/bin/bash"] + + +################################################################# +# ____ __ __ _ +# / ___| __ _ __ _ ___| \/ | __ _| | _____ _ __ +# \___ \ / _` |/ _` |/ _ \ |\/| |/ _` | |/ / _ \ '__| +# ___) | (_| | (_| | __/ | | | (_| | < __/ | +# |____/ \__,_|\__, |\___|_| |_|\__,_|_|\_\___|_| +# |___/ +# ___ ____ _ +# |_ _|_ __ ___ __ _ __ _ ___ | _ \ ___ ___(_)_ __ ___ +# | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \ +# | || | | | | | (_| | (_| | __/ | _ < __/ (__| | |_) | __/ +# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___| +# |___/ |_| +################################################################# + +FROM common AS sagemaker + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main + +ARG PYTHON +ARG PYTHON_SHORT_VERSION +ARG NCCL_VERSION +ARG GDRCOPY_VERSION +ARG TE_VERSION +ARG FLASH_ATTN_VERSION + +WORKDIR / + +# Install GDRCopy which is a dependency of SM Distributed DataParallel binary +# The test binaries requires cuda driver library which could be found in conda +# So update the linker path to point to it to avoid -Lcuda not found +RUN cd /tmp \ + && git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \ + && cd gdrcopy \ + && sed -ie '12s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \ + && CUDA=${CUDA_HOME} make install \ + && rm -rf /tmp/gdrcopy + +# Install NCCL +RUN cd /tmp \ + && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION}-1 \ + && cd nccl \ + && make -j64 src.build BUILDDIR=/usr/local \ + && rm -rf /tmp/nccl + +RUN pip uninstall -y ninja && pip install ninja + +# Install flash attn and NVIDIA transformer engine. 
+# Optionally set NVTE_FRAMEWORK to avoid bringing in additional frameworks during TE install +ENV NVTE_FRAMEWORK=pytorch +# Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features +# Set MAX_JOBS=4 to avoid OOM issues in installation process +RUN MAX_JOBS=4 pip install --no-cache-dir flash-attn==${FLASH_ATTN_VERSION} --no-build-isolation +# Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html +RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v${TE_VERSION} --no-build-isolation + +# Install SM packages +RUN pip install --no-cache-dir -U \ + smclarify \ + "sagemaker>=2,<3" \ + "sagemaker-experiments<1" \ + sagemaker-pytorch-training \ + sagemaker-training + +# Install extra packages +RUN pip install --no-cache-dir -U \ + bokeh \ + imageio \ + numba \ + pandas \ + plotly \ + shap \ + scikit-learn \ + seaborn \ + # pinned for sagemaker==2.233.0 + cloudpickle + +RUN HOME_DIR=/root \ + && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ + && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ + && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ + && chmod +x /usr/local/bin/testOSSCompliance \ + && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ + && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ + && rm -rf ${HOME_DIR}/oss_compliance* \ + && rm -rf /tmp/tmp* + +# Removing the cache as it is needed for security verification +RUN rm -rf /root/.cache | true + +# Copy workaround script for incorrect hostname +COPY changehostname.c / +COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh +RUN chmod +x /usr/local/bin/start_with_right_hostname.sh + +ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"] +CMD ["/bin/bash"] From 
ac530dd87f4e54d22bd9f7c14933ba086745eb07 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Sun, 10 Aug 2025 16:52:52 -0700 Subject: [PATCH 02/16] fix typo --- dlc_developer_config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index f8a4b4214d7d..cd7d59b57a9c 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -119,7 +119,7 @@ use_scheduler = false ### TRAINING PR JOBS ### # Standard Framework Training -dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-ec2.yml"" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-ec2.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" From 1793c2a0913798514de517db690ba02c5c56845b Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Sun, 10 Aug 2025 18:51:25 -0700 Subject: [PATCH 03/16] increase cpu image baseline size, add libcudnn9-dev-cuda-12 dependency --- pytorch/training/buildspec-2-8-ec2.yml | 2 +- pytorch/training/buildspec-2-8-sm.yml | 2 +- pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pytorch/training/buildspec-2-8-ec2.yml b/pytorch/training/buildspec-2-8-ec2.yml index 07bc4e559755..e7222c4ba584 100644 --- a/pytorch/training/buildspec-2-8-ec2.yml +++ b/pytorch/training/buildspec-2-8-ec2.yml @@ -41,7 +41,7 @@ images: BuildEC2CPUPTTrainPy3DockerImage: <<: *TRAINING_REPOSITORY build: &PYTORCH_CPU_TRAINING_PY3 false - image_size_baseline: 6500 + image_size_baseline: 7200 device_type: &DEVICE_TYPE cpu python_version: &DOCKER_PYTHON_VERSION py3 tag_python_version: &TAG_PYTHON_VERSION py312 diff --git a/pytorch/training/buildspec-2-8-sm.yml b/pytorch/training/buildspec-2-8-sm.yml index a29f64b2a761..bfb7c6195d5a 100644 --- a/pytorch/training/buildspec-2-8-sm.yml +++ b/pytorch/training/buildspec-2-8-sm.yml @@ -41,7 +41,7 @@ images: BuildSageMakerCPUPTTrainPy3DockerImage: <<: *TRAINING_REPOSITORY build: &PYTORCH_CPU_TRAINING_PY3 false - 
image_size_baseline: 6500 + image_size_baseline: 7200 device_type: &DEVICE_TYPE cpu python_version: &DOCKER_PYTHON_VERSION py3 tag_python_version: &TAG_PYTHON_VERSION py312 diff --git a/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu index cbd2a01401f0..3e60f8b895ad 100644 --- a/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu @@ -105,6 +105,7 @@ RUN apt-get update \ cuda-toolkit-12=${CUDA_VERSION}-1 \ libcudnn9-cuda-12=${CUDNN_VERSION}-1 \ libcudnn9-dev-cuda-12=${CUDNN_VERSION}-1 \ + libcudnn9-headers-cuda-12=${CUDNN_VERSION}-1 \ libhwloc-dev \ libgomp1 \ libibverbs-dev \ From f94d72d97f384e0479d4871ccb075d023c49fb31 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Sun, 10 Aug 2025 22:56:44 -0700 Subject: [PATCH 04/16] add test and update dockerfiles --- .../training/docker/2.8/py3/Dockerfile.cpu | 2 +- .../docker/2.8/py3/cu128/Dockerfile.gpu | 2 +- test/dlc_tests/conftest.py | 7 +- .../training/test_pytorch_training_2_8.py | 137 ++++++++++++++++++ 4 files changed, 143 insertions(+), 5 deletions(-) create mode 100644 test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_8.py diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu index 2577f237b6f3..9399ede0d55d 100644 --- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu @@ -190,7 +190,7 @@ RUN pip install --no-cache-dir \ jinja2>=3.1.6 \ tornado>=6.5.1 -RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.7/license.txt +RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.8/license.txt COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py diff --git a/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu index 3e60f8b895ad..b26cd87ce89e 100644 --- 
a/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu @@ -274,7 +274,7 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ numpy \ && pip uninstall -y dataclasses -RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.7/license.txt +RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.8/license.txt COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py diff --git a/test/dlc_tests/conftest.py b/test/dlc_tests/conftest.py index 788057ad4b9e..7f2d8651c1a3 100644 --- a/test/dlc_tests/conftest.py +++ b/test/dlc_tests/conftest.py @@ -55,6 +55,7 @@ # ECR repo name fixtures # PyTorch "pytorch_training", + "pytorch_training___2__8", "pytorch_training___2__7", "pytorch_training___2__6", "pytorch_training___2__5", @@ -942,7 +943,7 @@ def skip_smdebug_v1_test(request): ">=2.1,<2.4": ["cpu", "cu121"], ">=2.4,<2.6": ["cpu", "cu124"], ">=2.6,<2.7.1": ["cpu", "cu126"], - ">=2.7.1,<2.8": ["cpu", "cu128"], + ">=2.7.1,<=2.8": ["cpu", "cu128"], } if _validate_pytorch_framework_version(request, image_uri, "skip_smdebug_v1_test", skip_dict): pytest.skip(f"SM Profiler v1 is on path for deprecation, skipping test") @@ -966,7 +967,7 @@ def skip_dgl_test(request): ">=2.1,<2.4": ["cpu", "cu121"], ">=2.4,<2.6": ["cpu", "cu124"], ">=2.6,<2.7.1": ["cpu", "cu126"], - ">=2.7.1,<2.8": ["cpu", "cu128"], + ">=2.7.1,<=2.8": ["cpu", "cu128"], } if _validate_pytorch_framework_version(request, image_uri, "skip_dgl_test", skip_dict): pytest.skip(f"DGL binaries are removed, skipping test") @@ -1031,7 +1032,7 @@ def skip_serialized_release_pt_test(request): ">=2.1,<2.4": ["cpu", "cu121"], ">=2.4,<2.6": ["cpu", "cu124"], ">=2.6,<2.7.1": ["cpu", "cu126"], - ">=2.7.1,<2.8": ["cpu", "cu128"], + ">=2.7.1,<=2.8": ["cpu", "cu128"], } if _validate_pytorch_framework_version( request, image_uri, "skip_serialized_release_pt_test", skip_dict diff --git 
a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_8.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_8.py new file mode 100644 index 000000000000..ec2c00a6ef14 --- /dev/null +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_8.py @@ -0,0 +1,137 @@ +import pytest + +import test.test_utils as test_utils + +from test.test_utils import ec2 + +from test.dlc_tests.ec2.pytorch.training import common_cases +from test.dlc_tests.ec2 import smclarify_cases + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("pytorch_gpu_tests") +@pytest.mark.model("N/A") +@pytest.mark.team("conda") +@pytest.mark.parametrize( + "ec2_instance_type, region", common_cases.PT_EC2_GPU_INSTANCE_TYPE_AND_REGION, indirect=True +) +def test_pytorch_2_8_gpu( + pytorch_training___2__8, ec2_connection, region, gpu_only, ec2_instance_type +): + pytorch_training = pytorch_training___2__8 + if test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type): + pytest.skip( + f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}" + ) + + test_cases = [ + (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)), + (common_cases.pytorch_linear_regression_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), + (common_cases.pytorch_nccl, (pytorch_training, ec2_connection)), + (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), + (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)), + (common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_telemetry_bashrc_gpu, (pytorch_training, ec2_connection)), + 
(common_cases.pytorch_telemetry_entrypoint_gpu, (pytorch_training, ec2_connection)), + ] + + if "sagemaker" in pytorch_training: + test_cases.append( + (smclarify_cases.smclarify_metrics_gpu, (pytorch_training, ec2_connection)), + ) + + # AMP must be run on multi_gpu + if ec2.is_instance_multi_gpu(ec2_instance_type): + test_cases.append((common_cases.pytorch_amp, (pytorch_training, ec2_connection))) + + test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.8 GPU") + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("pytorch_gpu_heavy_tests") +@pytest.mark.model("N/A") +@pytest.mark.team("conda") +@pytest.mark.parametrize( + "ec2_instance_type, region", + common_cases.PT_EC2_HEAVY_GPU_INSTANCE_TYPE_AND_REGION, + indirect=True, +) +@pytest.mark.skipif( + test_utils.is_pr_context() and not ec2.are_heavy_instance_ec2_tests_enabled(), + reason="Skip GPU Heavy tests in PR context unless explicitly enabled", +) +def test_pytorch_2_8_gpu_heavy( + pytorch_training___2__8, ec2_connection, region, gpu_only, ec2_instance_type +): + pytorch_training = pytorch_training___2__8 + if test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type): + pytest.skip( + f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}" + ) + + test_cases = [ + (common_cases.pytorch_gdrcopy, (pytorch_training, ec2_connection)), + (common_cases.pytorch_transformer_engine, (pytorch_training, ec2_connection)), + ] + + test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.8 GPU Heavy") + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("inductor") +@pytest.mark.model("N/A") +@pytest.mark.team("training-compiler") +@pytest.mark.parametrize( + "ec2_instance_type, region", + common_cases.PT_EC2_GPU_INDUCTOR_INSTANCE_TYPE_AND_REGION, + indirect=True, +) +def test_pytorch_2_8_gpu_inductor( + pytorch_training___2__8, ec2_connection, region, gpu_only, ec2_instance_type +): + 
pytorch_training = pytorch_training___2__8 + if test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type): + pytest.skip( + f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}" + ) + + test_cases = [ + (common_cases.pytorch_gloo_inductor_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_mpi_inductor_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_nccl_inductor, (pytorch_training, ec2_connection)), + (common_cases.pytorch_amp_inductor, (pytorch_training, ec2_connection)), + ] + + test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.8 GPU Inductor") + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("pytorch_cpu_tests") +@pytest.mark.model("N/A") +@pytest.mark.team("conda") +@pytest.mark.parametrize("ec2_instance_type", common_cases.PT_EC2_CPU_INSTANCE_TYPE, indirect=True) +def test_pytorch_2_8_cpu(pytorch_training___2__8, ec2_connection, cpu_only): + pytorch_training = pytorch_training___2__8 + + test_cases = [ + (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)), + (common_cases.pytorch_linear_regression_cpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), + (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), + (common_cases.pytorch_telemetry_bashrc_cpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_telemetry_entrypoint_cpu, (pytorch_training, ec2_connection)), + ] + + if "sagemaker" in pytorch_training: + test_cases += [ + (smclarify_cases.smclarify_metrics_cpu, (pytorch_training, ec2_connection)), + ] + + test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.8 CPU") From 
4e7caeac3a15907ae18916d82af462a5e6e96485 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Mon, 11 Aug 2025 15:07:44 -0700 Subject: [PATCH 05/16] bump fastai version and change CUDA version --- pytorch/training/docker/2.8/py3/Dockerfile.cpu | 4 ++-- pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu index 9399ede0d55d..87fd09ac6513 100644 --- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu @@ -238,7 +238,7 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ && pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \ torchdata==${TORCHDATA_VERSION} \ s3torchconnector \ - fastai==2.8.2 \ + fastai==2.8.3 \ accelerate \ # pin numpy requirement for fastai dependency # requires explicit declaration of spacy, thic, blis @@ -309,7 +309,7 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ && pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \ torchdata==${TORCHDATA_VERSION} \ s3torchconnector \ - fastai==2.8.2 \ + fastai==2.8.3 \ accelerate \ # pin numpy requirement for fastai dependency # requires explicit declaration of spacy, thic, blis diff --git a/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu index b26cd87ce89e..5c912c339de5 100644 --- a/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu @@ -7,7 +7,7 @@ ARG TORCHAUDIO_VERSION=2.8.0 ARG TORCHVISION_VERSION=0.23.0 ARG TORCHDATA_VERSION=0.11.0 -ARG CUDA_VERSION=12.8.1 +ARG CUDA_VERSION=12.9.1 ARG CUDNN_VERSION=9.10.2.21 ARG NCCL_VERSION=2.27.3 ARG EFA_VERSION=1.43.1 @@ -15,7 +15,7 @@ ARG GDRCOPY_VERSION=2.5 ARG TE_VERSION=2.5 ARG FLASH_ATTN_VERSION=2.8.2 -FROM nvidia/cuda:12.8.1-base-ubuntu22.04 AS base_image +FROM 
nvidia/cuda:12.9.1-base-ubuntu22.04 AS base_image # This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20 ENV DEBIAN_FRONTEND=noninteractive @@ -258,12 +258,12 @@ RUN pip install --no-cache-dir \ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ torchvision==${TORCHVISION_VERSION} \ torchaudio==${TORCHAUDIO_VERSION} \ - --index-url https://download.pytorch.org/whl/cu128 \ + --index-url https://download.pytorch.org/whl/cu129 \ && pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \ torchdata==${TORCHDATA_VERSION} \ triton \ s3torchconnector \ - fastai==2.8.2 \ + fastai==2.8.3 \ accelerate \ # pin numpy requirement for fastai dependency # requires explicit declaration of spacy, thic, blis From 26fd8639c7e2b3615f9065c24cc0a233b85e1535 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Mon, 11 Aug 2025 16:18:08 -0700 Subject: [PATCH 06/16] update buildspecs and confest to cu129 --- pytorch/training/buildspec-2-8-ec2.yml | 4 ++-- pytorch/training/buildspec-2-8-sm.yml | 2 +- .../docker/2.8/py3/{cu128 => cu129}/Dockerfile.gpu | 0 test/dlc_tests/conftest.py | 9 ++++++--- 4 files changed, 9 insertions(+), 6 deletions(-) rename pytorch/training/docker/2.8/py3/{cu128 => cu129}/Dockerfile.gpu (100%) diff --git a/pytorch/training/buildspec-2-8-ec2.yml b/pytorch/training/buildspec-2-8-ec2.yml index e7222c4ba584..023d84b6867d 100644 --- a/pytorch/training/buildspec-2-8-ec2.yml +++ b/pytorch/training/buildspec-2-8-ec2.yml @@ -53,14 +53,14 @@ images: target: ec2 context: <<: *TRAINING_CONTEXT - BuildEC2GPUPTTrainPy3cu128DockerImage: + BuildEC2GPUPTTrainPy3cu129DockerImage: <<: *TRAINING_REPOSITORY build: &PYTORCH_GPU_TRAINING_PY3 false image_size_baseline: 24000 device_type: &DEVICE_TYPE gpu python_version: &DOCKER_PYTHON_VERSION py3 tag_python_version: &TAG_PYTHON_VERSION py312 - cuda_version: &CUDA_VERSION cu128 + cuda_version: &CUDA_VERSION cu129 os_version: &OS_VERSION ubuntu22.04 tag: !join [ *VERSION, 
"-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] diff --git a/pytorch/training/buildspec-2-8-sm.yml b/pytorch/training/buildspec-2-8-sm.yml index bfb7c6195d5a..aa7372fb0ad5 100644 --- a/pytorch/training/buildspec-2-8-sm.yml +++ b/pytorch/training/buildspec-2-8-sm.yml @@ -60,7 +60,7 @@ images: device_type: &DEVICE_TYPE gpu python_version: &DOCKER_PYTHON_VERSION py3 tag_python_version: &TAG_PYTHON_VERSION py312 - cuda_version: &CUDA_VERSION cu128 + cuda_version: &CUDA_VERSION cu129 os_version: &OS_VERSION ubuntu22.04 tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] diff --git a/pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu similarity index 100% rename from pytorch/training/docker/2.8/py3/cu128/Dockerfile.gpu rename to pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu diff --git a/test/dlc_tests/conftest.py b/test/dlc_tests/conftest.py index 7f2d8651c1a3..f18a289e5f94 100644 --- a/test/dlc_tests/conftest.py +++ b/test/dlc_tests/conftest.py @@ -943,7 +943,8 @@ def skip_smdebug_v1_test(request): ">=2.1,<2.4": ["cpu", "cu121"], ">=2.4,<2.6": ["cpu", "cu124"], ">=2.6,<2.7.1": ["cpu", "cu126"], - ">=2.7.1,<=2.8": ["cpu", "cu128"], + ">=2.7.1,<2.8": ["cpu", "cu128"], + ">=2.8,<2.9": ["cpu", "cu129"], } if _validate_pytorch_framework_version(request, image_uri, "skip_smdebug_v1_test", skip_dict): pytest.skip(f"SM Profiler v1 is on path for deprecation, skipping test") @@ -967,7 +968,8 @@ def skip_dgl_test(request): ">=2.1,<2.4": ["cpu", "cu121"], ">=2.4,<2.6": ["cpu", "cu124"], ">=2.6,<2.7.1": ["cpu", "cu126"], - ">=2.7.1,<=2.8": ["cpu", 
"cu128"], + ">=2.7.1,<2.8": ["cpu", "cu128"], + ">=2.8,<2.9": ["cpu", "cu129"], } if _validate_pytorch_framework_version(request, image_uri, "skip_dgl_test", skip_dict): pytest.skip(f"DGL binaries are removed, skipping test") @@ -1032,7 +1034,8 @@ def skip_serialized_release_pt_test(request): ">=2.1,<2.4": ["cpu", "cu121"], ">=2.4,<2.6": ["cpu", "cu124"], ">=2.6,<2.7.1": ["cpu", "cu126"], - ">=2.7.1,<=2.8": ["cpu", "cu128"], + ">=2.7.1,<2.8": ["cpu", "cu128"], + ">=2.8,<2.9": ["cpu", "cu129"], } if _validate_pytorch_framework_version( request, image_uri, "skip_serialized_release_pt_test", skip_dict From b5cde7048642addb9ed0e00cd87dd3757f681b88 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Mon, 11 Aug 2025 20:29:13 -0700 Subject: [PATCH 07/16] modify dockerfile gpu and add logging in efa test --- .../training/docker/2.8/py3/cu129/Dockerfile.gpu | 1 - test/dlc_tests/ec2/test_efa.py | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index 5c912c339de5..ae14ae2032ee 100644 --- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -105,7 +105,6 @@ RUN apt-get update \ cuda-toolkit-12=${CUDA_VERSION}-1 \ libcudnn9-cuda-12=${CUDNN_VERSION}-1 \ libcudnn9-dev-cuda-12=${CUDNN_VERSION}-1 \ - libcudnn9-headers-cuda-12=${CUDNN_VERSION}-1 \ libhwloc-dev \ libgomp1 \ libibverbs-dev \ diff --git a/test/dlc_tests/ec2/test_efa.py b/test/dlc_tests/ec2/test_efa.py index 9543d783f21c..4725950dccd8 100644 --- a/test/dlc_tests/ec2/test_efa.py +++ b/test/dlc_tests/ec2/test_efa.py @@ -376,9 +376,14 @@ def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, inst ) else: # Configure MPI hosts file with IP addresses and slots for worker nodes + # TODO: remove debug logging after testing + LOGGER.info(f"Creating hosts file with master_ip={master_ip}, slots={slots}") + 
LOGGER.info(f"Worker IPs: {worker_instance_private_ips}") + hosts_string = f"localhost slots={slots} " for worker_ip in worker_instance_private_ips: hosts_string += f"\n{worker_ip} slots={slots} " + LOGGER.info(f"Final hosts file content:\n{hosts_string}") run_cmd_on_container( MASTER_CONTAINER_NAME, @@ -386,6 +391,16 @@ def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, inst f"""echo -e "{hosts_string}" > {HOSTS_FILE_LOCATION}""", ) + # TODO: remove debug logging after testing + # check to make sure file was created + LOGGER.info("Verifying hosts file creation:") + run_cmd_on_container( + MASTER_CONTAINER_NAME, + master_connection, + f"ls -l {HOSTS_FILE_LOCATION} && cat {HOSTS_FILE_LOCATION}", + hide=False + ) + def _setup_worker_efa_ssh_config(connection, master_pub_key): """ From d2632edc3665e520dfe788b8a59085f938b48349 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Mon, 11 Aug 2025 21:13:44 -0700 Subject: [PATCH 08/16] add back libcudnn9-headers-cuda-12 --- pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index ae14ae2032ee..5c912c339de5 100644 --- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -105,6 +105,7 @@ RUN apt-get update \ cuda-toolkit-12=${CUDA_VERSION}-1 \ libcudnn9-cuda-12=${CUDNN_VERSION}-1 \ libcudnn9-dev-cuda-12=${CUDNN_VERSION}-1 \ + libcudnn9-headers-cuda-12=${CUDNN_VERSION}-1 \ libhwloc-dev \ libgomp1 \ libibverbs-dev \ From ca1d99b9765aed96ab9eae3eb5c6c05f217836b9 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Tue, 12 Aug 2025 17:25:18 -0700 Subject: [PATCH 09/16] remove version pins, update cudnn header path check, change efa test logging --- .../training/docker/2.8/py3/Dockerfile.cpu | 15 ++++---- .../docker/2.8/py3/cu129/Dockerfile.gpu | 14 ++++---- 
.../ec2/pytorch/training/common_cases.py | 36 ++++++++++++++++--- test/dlc_tests/ec2/test_efa.py | 1 + 4 files changed, 46 insertions(+), 20 deletions(-) diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu index 87fd09ac6513..141a253463c9 100644 --- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu @@ -184,8 +184,8 @@ RUN pip install --no-cache-dir \ "requests>=2.32.0" \ "setuptools>=70.0.0" \ "urllib3>=2.5.0" \ - "awscli<2" \ - "opencv-python==4.11.0.86" \ + "awscli" \ + "opencv-python" \ mpi4py \ jinja2>=3.1.6 \ tornado>=6.5.1 @@ -238,13 +238,12 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ && pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \ torchdata==${TORCHDATA_VERSION} \ s3torchconnector \ - fastai==2.8.3 \ + fastai \ accelerate \ # pin numpy requirement for fastai dependency # requires explicit declaration of spacy, thic, blis spacy \ - #thinc 8.3.6 is not compatible with numpy 1.26.4 (sagemaker doesn't support latest numpy) - thinc==8.3.4 \ + thinc \ blis \ numpy \ && pip uninstall -y dataclasses @@ -309,7 +308,7 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ && pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \ torchdata==${TORCHDATA_VERSION} \ s3torchconnector \ - fastai==2.8.3 \ + fastai \ accelerate \ # pin numpy requirement for fastai dependency # requires explicit declaration of spacy, thic, blis @@ -322,8 +321,8 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ # Install SM packages RUN pip install --no-cache-dir -U \ smclarify \ - "sagemaker>=2,<3" \ - "sagemaker-experiments<1" \ + "sagemaker>=2" \ + sagemaker-experiments \ sagemaker-pytorch-training \ sagemaker-training diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index 5c912c339de5..eada47ca9e51 100644 --- 
a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -246,10 +246,9 @@ RUN pip install --no-cache-dir \ "requests>=2.32.0" \ "setuptools>=70.0.0" \ "urllib3>=2.5.0" \ - "awscli<2" \ + "awscli" \ ninja \ - # pencv-python 4.12.0.88 reuqires numpy<2.3.0, which is not compatible with previous prod image(2.3.1) - opencv-python==4.11.0.86 \ + opencv-python \ mpi4py \ jinja2>=3.1.6 \ tornado>=6.5.1 @@ -263,13 +262,12 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ torchdata==${TORCHDATA_VERSION} \ triton \ s3torchconnector \ - fastai==2.8.3 \ + fastai \ accelerate \ # pin numpy requirement for fastai dependency # requires explicit declaration of spacy, thic, blis spacy \ - #thinc 8.3.6 is not compatible with numpy 1.26.4 (sagemaker doesn't support latest numpy) - thinc==8.3.4 \ + thinc \ blis \ numpy \ && pip uninstall -y dataclasses @@ -425,8 +423,8 @@ RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.g # Install SM packages RUN pip install --no-cache-dir -U \ smclarify \ - "sagemaker>=2,<3" \ - "sagemaker-experiments<1" \ + "sagemaker>=2" \ + sagemaker-experiments \ sagemaker-pytorch-training \ sagemaker-training diff --git a/test/dlc_tests/ec2/pytorch/training/common_cases.py b/test/dlc_tests/ec2/pytorch/training/common_cases.py index 8f8cc7cc03e3..cd58a03eca10 100644 --- a/test/dlc_tests/ec2/pytorch/training/common_cases.py +++ b/test/dlc_tests/ec2/pytorch/training/common_cases.py @@ -1,4 +1,6 @@ import os +import sys +import logging from packaging.version import Version from packaging.specifiers import SpecifierSet @@ -19,6 +21,10 @@ get_efa_ec2_instance_type, ) +LOGGER = logging.getLogger(__name__) +LOGGER.setLevel(logging.INFO) +LOGGER.addHandler(logging.StreamHandler(sys.stderr)) + # Test functions PT_STANDALONE_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testPyTorchStandalone") PT_MNIST_CMD = os.path.join(CONTAINER_TESTS_PREFIX, 
"pytorch_tests", "testPyTorch") @@ -351,6 +357,7 @@ def pytorch_cudnn_match_gpu(pytorch_training, ec2_connection, region): """ Test cuDNN Package PT 2.1 reintroduces a dependency on CUDNN to support NVDA TransformerEngine. This test is to ensure that torch CUDNN matches system CUDNN in the container. + Checks both /usr/include/ and /usr/include/x86_64-linux-gnu/ paths to support different cuDNN package installations. """ container_name = "pytorch_cudnn" account_id = get_account_id_from_image_uri(pytorch_training) @@ -360,9 +367,30 @@ def pytorch_cudnn_match_gpu(pytorch_training, ec2_connection, region): f"docker run --runtime=nvidia --gpus all --name {container_name} -itd {pytorch_training}", hide=True, ) - major_cmd = 'cat /usr/include/cudnn_version.h | grep "#define CUDNN_MAJOR"' - minor_cmd = 'cat /usr/include/cudnn_version.h | grep "#define CUDNN_MINOR"' - patch_cmd = 'cat /usr/include/cudnn_version.h | grep "#define CUDNN_PATCHLEVEL"' + + cudnn_paths = [ + "/usr/include/cudnn_version.h", + "/usr/include/x86_64-linux-gnu/cudnn_version.h" + ] + + for path in cudnn_paths: + check_cmd = f"[ -f {path} ] && echo 'Found'" + result = ec2_connection.run( + f"docker exec --user root {container_name} bash -c '{check_cmd}'", + hide=True, + warn=True + ) + if result.ok and result.stdout.strip() == 'Found': + cudnn_path = path + LOGGER.info(f"Found cuDNN header at: {cudnn_path}") + break + else: + raise FileNotFoundError("Could not find cudnn_version.h in any standard location") + + major_cmd = f'cat {cudnn_path} | grep "#define CUDNN_MAJOR"' + minor_cmd = f'cat {cudnn_path} | grep "#define CUDNN_MINOR"' + patch_cmd = f'cat {cudnn_path} | grep "#define CUDNN_PATCHLEVEL"' + major = ec2_connection.run( f"docker exec --user root {container_name} bash -c '{major_cmd}'", hide=True ).stdout.split()[-1] @@ -385,7 +413,7 @@ def pytorch_cudnn_match_gpu(pytorch_training, ec2_connection, region): assert ( system_cudnn == cudnn_from_torch - ), f"System CUDNN {system_cudnn} and torch 
cudnn {cudnn_from_torch} do not match. Please downgrade system CUDNN or recompile torch with correct CUDNN verson." + ), f"System CUDNN {system_cudnn} (from {cudnn_path}) and torch cudnn {cudnn_from_torch} do not match. Please downgrade system CUDNN or recompile torch with correct CUDNN verson." def pytorch_curand_gpu(pytorch_training, ec2_connection): diff --git a/test/dlc_tests/ec2/test_efa.py b/test/dlc_tests/ec2/test_efa.py index 4725950dccd8..faa193025fcc 100644 --- a/test/dlc_tests/ec2/test_efa.py +++ b/test/dlc_tests/ec2/test_efa.py @@ -377,6 +377,7 @@ def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, inst else: # Configure MPI hosts file with IP addresses and slots for worker nodes # TODO: remove debug logging after testing + master_ip = ec2_utils.get_private_ip(master_connection.host, region) LOGGER.info(f"Creating hosts file with master_ip={master_ip}, slots={slots}") LOGGER.info(f"Worker IPs: {worker_instance_private_ips}") From 4079253b43ab9696a1f54fd8fe6fba0cea7addeb Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Tue, 12 Aug 2025 21:25:08 -0700 Subject: [PATCH 10/16] pin opencv-python version and simplify efa test logging --- pytorch/training/docker/2.8/py3/Dockerfile.cpu | 3 ++- .../training/docker/2.8/py3/cu129/Dockerfile.gpu | 2 +- test/dlc_tests/ec2/test_efa.py | 14 +++++--------- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu index 141a253463c9..7d78611d3290 100644 --- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu @@ -185,7 +185,8 @@ RUN pip install --no-cache-dir \ "setuptools>=70.0.0" \ "urllib3>=2.5.0" \ "awscli" \ - "opencv-python" \ + # opencv-python 4.12.0.88 reuqires numpy<2.3.0, which is not compatible with previous prod image(2.3.1) + opencv-python==4.11.0.86 \ mpi4py \ jinja2>=3.1.6 \ tornado>=6.5.1 diff --git 
a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index eada47ca9e51..90ffb7f565b2 100644 --- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -248,7 +248,7 @@ RUN pip install --no-cache-dir \ "urllib3>=2.5.0" \ "awscli" \ ninja \ - opencv-python \ + opencv-python==4.11.0.86 \ mpi4py \ jinja2>=3.1.6 \ tornado>=6.5.1 diff --git a/test/dlc_tests/ec2/test_efa.py b/test/dlc_tests/ec2/test_efa.py index faa193025fcc..53ef210d4bdc 100644 --- a/test/dlc_tests/ec2/test_efa.py +++ b/test/dlc_tests/ec2/test_efa.py @@ -376,15 +376,12 @@ def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, inst ) else: # Configure MPI hosts file with IP addresses and slots for worker nodes - # TODO: remove debug logging after testing - master_ip = ec2_utils.get_private_ip(master_connection.host, region) - LOGGER.info(f"Creating hosts file with master_ip={master_ip}, slots={slots}") - LOGGER.info(f"Worker IPs: {worker_instance_private_ips}") - hosts_string = f"localhost slots={slots} " for worker_ip in worker_instance_private_ips: hosts_string += f"\n{worker_ip} slots={slots} " - LOGGER.info(f"Final hosts file content:\n{hosts_string}") + + # TODO: remove logging + LOGGER.info(f"Attempting to create hosts file with content:\n{hosts_string}") run_cmd_on_container( MASTER_CONTAINER_NAME, @@ -392,9 +389,8 @@ def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, inst f"""echo -e "{hosts_string}" > {HOSTS_FILE_LOCATION}""", ) - # TODO: remove debug logging after testing - # check to make sure file was created - LOGGER.info("Verifying hosts file creation:") + # TODO: remove logging + LOGGER.info("Verifying hosts file:") run_cmd_on_container( MASTER_CONTAINER_NAME, master_connection, From e9da30499772e7d69510393ad9dc99b1046cbfd7 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 13 Aug 2025 10:51:52 -0700 Subject: [PATCH 11/16] 
comment out some tests and change test_path to only run test_efa --- dlc_developer_config.toml | 2 +- .../training/test_pytorch_training_2_8.py | 46 +++++----- test/dlc_tests/ec2/test_efa.py | 91 ++++++++++--------- test/testrunner.py | 3 +- 4 files changed, 72 insertions(+), 70 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index cd7d59b57a9c..687079b9de8a 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -46,7 +46,7 @@ build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = true +do_build = false [notify] ### Notify on test failures diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_8.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_8.py index ec2c00a6ef14..2a9b678105ab 100644 --- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_8.py +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_8.py @@ -25,18 +25,18 @@ def test_pytorch_2_8_gpu( ) test_cases = [ - (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), - (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)), - (common_cases.pytorch_linear_regression_gpu, (pytorch_training, ec2_connection)), - (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), - (common_cases.pytorch_nccl, (pytorch_training, ec2_connection)), - (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), - (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), - (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), - (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)), - (common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)), - (common_cases.pytorch_telemetry_bashrc_gpu, (pytorch_training, ec2_connection)), - 
(common_cases.pytorch_telemetry_entrypoint_gpu, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_linear_regression_gpu, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_nccl, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)), + # (common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_telemetry_bashrc_gpu, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_telemetry_entrypoint_gpu, (pytorch_training, ec2_connection)), ] if "sagemaker" in pytorch_training: @@ -74,8 +74,8 @@ def test_pytorch_2_8_gpu_heavy( ) test_cases = [ - (common_cases.pytorch_gdrcopy, (pytorch_training, ec2_connection)), - (common_cases.pytorch_transformer_engine, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_gdrcopy, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_transformer_engine, (pytorch_training, ec2_connection)), ] test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.8 GPU Heavy") @@ -118,15 +118,15 @@ def test_pytorch_2_8_cpu(pytorch_training___2__8, ec2_connection, cpu_only): pytorch_training = pytorch_training___2__8 test_cases = [ - (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), - (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)), - (common_cases.pytorch_linear_regression_cpu, (pytorch_training, ec2_connection)), - (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), - 
(common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), - (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), - (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), - (common_cases.pytorch_telemetry_bashrc_cpu, (pytorch_training, ec2_connection)), - (common_cases.pytorch_telemetry_entrypoint_cpu, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_linear_regression_cpu, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_telemetry_bashrc_cpu, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_telemetry_entrypoint_cpu, (pytorch_training, ec2_connection)), ] if "sagemaker" in pytorch_training: diff --git a/test/dlc_tests/ec2/test_efa.py b/test/dlc_tests/ec2/test_efa.py index 53ef210d4bdc..6c692751841a 100644 --- a/test/dlc_tests/ec2/test_efa.py +++ b/test/dlc_tests/ec2/test_efa.py @@ -147,50 +147,50 @@ def test_efa_tensorflow( ) -@pytest.mark.skip( - "EFA healthcheck binaries are not maintained by DLC, we will skip these tests moving foward unless binaries are added otherwise." 
-) -@pytest.mark.processor("gpu") -@pytest.mark.model("N/A") -@pytest.mark.integration("efa") -@pytest.mark.usefixtures("sagemaker_only") -@pytest.mark.usefixtures("pt201_and_above_only") -@pytest.mark.allow_p4de_use -@pytest.mark.parametrize("ec2_instance_type,region", EC2_EFA_GPU_ONLY_P4_INSTANCE_TYPE_AND_REGION) -@pytest.mark.team("conda") -@pytest.mark.skipif( - is_pr_context() and not are_heavy_instance_ec2_tests_enabled(), - reason="Skip EFA test in PR context unless explicitly enabled", -) -def test_pytorch_efa_healthcheck( - pytorch_training, - efa_ec2_instances, - efa_ec2_connections, - ec2_instance_type, - region, - gpu_only, -): - """ - Run EFA Health Check tests on DLC. - :param pytorch_training: str PyTorch Training DLC image URI - :param efa_ec2_instances: list of tuples of instance-ids and SSH-keys for EFA-enabled instances - :param efa_ec2_connections: list of Fabric Connection objects for EFA-enabled instances - :param ec2_instance_type: str Instance Type being tested - :param region: str Region in which EFA-enabled instances are launched - :param gpu_only: pytest fixture to limit test only to GPU DLCs - """ - _setup_multinode_efa_instances( - pytorch_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region - ) - master_connection = efa_ec2_connections[0] - run_cmd_on_container(MASTER_CONTAINER_NAME, master_connection, EFA_SANITY_TEST_CMD, hide=False) - run_cmd_on_container( - MASTER_CONTAINER_NAME, - master_connection, - f"{EFA_PYTORCH_HEALTHCHECK_TEST_CMD}", - hide=False, - timeout=DEFAULT_EFA_TIMEOUT, - ) +# @pytest.mark.skip( +# "EFA healthcheck binaries are not maintained by DLC, we will skip these tests moving foward unless binaries are added otherwise." 
+# ) +# @pytest.mark.processor("gpu") +# @pytest.mark.model("N/A") +# @pytest.mark.integration("efa") +# @pytest.mark.usefixtures("sagemaker_only") +# @pytest.mark.usefixtures("pt201_and_above_only") +# @pytest.mark.allow_p4de_use +# @pytest.mark.parametrize("ec2_instance_type,region", EC2_EFA_GPU_ONLY_P4_INSTANCE_TYPE_AND_REGION) +# @pytest.mark.team("conda") +# @pytest.mark.skipif( +# is_pr_context() and not are_heavy_instance_ec2_tests_enabled(), +# reason="Skip EFA test in PR context unless explicitly enabled", +# ) +# def test_pytorch_efa_healthcheck( +# pytorch_training, +# efa_ec2_instances, +# efa_ec2_connections, +# ec2_instance_type, +# region, +# gpu_only, +# ): +# """ +# Run EFA Health Check tests on DLC. +# :param pytorch_training: str PyTorch Training DLC image URI +# :param efa_ec2_instances: list of tuples of instance-ids and SSH-keys for EFA-enabled instances +# :param efa_ec2_connections: list of Fabric Connection objects for EFA-enabled instances +# :param ec2_instance_type: str Instance Type being tested +# :param region: str Region in which EFA-enabled instances are launched +# :param gpu_only: pytest fixture to limit test only to GPU DLCs +# """ +# _setup_multinode_efa_instances( +# pytorch_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region +# ) +# master_connection = efa_ec2_connections[0] +# run_cmd_on_container(MASTER_CONTAINER_NAME, master_connection, EFA_SANITY_TEST_CMD, hide=False) +# run_cmd_on_container( +# MASTER_CONTAINER_NAME, +# master_connection, +# f"{EFA_PYTORCH_HEALTHCHECK_TEST_CMD}", +# hide=False, +# timeout=DEFAULT_EFA_TIMEOUT, +# ) def _setup_multinode_efa_instances( @@ -383,6 +383,7 @@ def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, inst # TODO: remove logging LOGGER.info(f"Attempting to create hosts file with content:\n{hosts_string}") + LOGGER.info(f"Running command: {f"""echo -e "{hosts_string}" > {HOSTS_FILE_LOCATION}"""}") run_cmd_on_container( 
MASTER_CONTAINER_NAME, master_connection, @@ -390,7 +391,7 @@ def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, inst ) # TODO: remove logging - LOGGER.info("Verifying hosts file:") + LOGGER.info(f"Verifying hosts file {HOSTS_FILE_LOCATION}:") run_cmd_on_container( MASTER_CONTAINER_NAME, master_connection, diff --git a/test/testrunner.py b/test/testrunner.py index 4746740437bc..86b2ed4692de 100644 --- a/test/testrunner.py +++ b/test/testrunner.py @@ -444,7 +444,8 @@ def main(): pytest_cmd = [ "-s", "-rA", - test_path, + # test_path, + os.path.join(test_path, "test_efa.py::test_pytorch_efa"), f"--junitxml={report}", "-n=auto", ] From bcd8969b77478231df237db28844b000365e898d Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 13 Aug 2025 11:07:37 -0700 Subject: [PATCH 12/16] fix syntax --- test/dlc_tests/ec2/test_efa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/dlc_tests/ec2/test_efa.py b/test/dlc_tests/ec2/test_efa.py index 6c692751841a..233f75056ed0 100644 --- a/test/dlc_tests/ec2/test_efa.py +++ b/test/dlc_tests/ec2/test_efa.py @@ -383,7 +383,7 @@ def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, inst # TODO: remove logging LOGGER.info(f"Attempting to create hosts file with content:\n{hosts_string}") - LOGGER.info(f"Running command: {f"""echo -e "{hosts_string}" > {HOSTS_FILE_LOCATION}"""}") + LOGGER.info(f"""echo -e "{hosts_string}" > {HOSTS_FILE_LOCATION}""") run_cmd_on_container( MASTER_CONTAINER_NAME, master_connection, From 915eea225221d2bdd0b8a6373ed73fc36e97c201 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 13 Aug 2025 11:48:10 -0700 Subject: [PATCH 13/16] uncomment training log --- test/dlc_tests/container_tests/bin/efa/testEFA | 2 +- test/dlc_tests/ec2/test_efa.py | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/test/dlc_tests/container_tests/bin/efa/testEFA b/test/dlc_tests/container_tests/bin/efa/testEFA index 420cd711dc18..043344639ccb 
100755 --- a/test/dlc_tests/container_tests/bin/efa/testEFA +++ b/test/dlc_tests/container_tests/bin/efa/testEFA @@ -89,7 +89,7 @@ check_efa_nccl_all_reduce(){ RETURN_VAL=${PIPESTATUS[0]} # In case, if you would like see logs, uncomment below line - # RESULT=$(cat ${TRAINING_LOG}) + RESULT=$(cat ${TRAINING_LOG}) if [ ${RETURN_VAL} -eq 0 ]; then echo "***************************** check_efa_nccl_all_reduce passed *****************************" diff --git a/test/dlc_tests/ec2/test_efa.py b/test/dlc_tests/ec2/test_efa.py index 233f75056ed0..053eb97c81ce 100644 --- a/test/dlc_tests/ec2/test_efa.py +++ b/test/dlc_tests/ec2/test_efa.py @@ -391,11 +391,19 @@ def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, inst ) # TODO: remove logging - LOGGER.info(f"Verifying hosts file {HOSTS_FILE_LOCATION}:") + LOGGER.info(f"Checking if hosts file exists:") run_cmd_on_container( MASTER_CONTAINER_NAME, master_connection, - f"ls -l {HOSTS_FILE_LOCATION} && cat {HOSTS_FILE_LOCATION}", + f"ls -l {HOSTS_FILE_LOCATION}", + hide=False + ) + + LOGGER.info(f"Checking hosts file contents:") + run_cmd_on_container( + MASTER_CONTAINER_NAME, + master_connection, + f"cat {HOSTS_FILE_LOCATION}", hide=False ) From 2d98fb1681764d7c1bf5fec1841b5f86f3489ffb Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 13 Aug 2025 12:32:09 -0700 Subject: [PATCH 14/16] change log validation --- test/dlc_tests/container_tests/bin/efa/testEFA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/dlc_tests/container_tests/bin/efa/testEFA b/test/dlc_tests/container_tests/bin/efa/testEFA index 043344639ccb..52f5664625d8 100755 --- a/test/dlc_tests/container_tests/bin/efa/testEFA +++ b/test/dlc_tests/container_tests/bin/efa/testEFA @@ -36,7 +36,7 @@ validate_all_reduce_performance_logs(){ # EFA 1.37.0 using "Using network Libfabric" instead of "Using network AWS Libfabric" grep -E "Using network (AWS )?Libfabric" ${TRAINING_LOG} || { echo "efa is not working, please check if it 
is installed correctly"; exit 1; } if [[ ${INSTANCE_TYPE} == p4d* || ${INSTANCE_TYPE} == p5* ]]; then - grep "Setting NCCL_TOPO_FILE environment variable to" ${TRAINING_LOG} + grep "NCCL_TOPO_FILE set by environment to" ${TRAINING_LOG} # EFA 1.37.0 change from NET/AWS Libfabric/0/GDRDMA to NET/Libfabric/0/GDRDMA grep -E "NET/(AWS )?Libfabric/0/GDRDMA" ${TRAINING_LOG} fi From 454b69362999531b583b5a13fef95650c81ac59c Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 13 Aug 2025 15:57:27 -0700 Subject: [PATCH 15/16] change LD_LIBRARY path and rebuild --- dlc_developer_config.toml | 2 +- pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 687079b9de8a..cd7d59b57a9c 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -46,7 +46,7 @@ build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = false +do_build = true [notify] ### Notify on test failures diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index 90ffb7f565b2..70d9ba03c31f 100644 --- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -19,7 +19,7 @@ FROM nvidia/cuda:12.9.1-base-ubuntu22.04 AS base_image # This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20 ENV DEBIAN_FRONTEND=noninteractive -ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" +# ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" RUN apt-get update \ && apt-get upgrade -y \ @@ -61,8 +61,8 @@ ARG NCCL_VERSION ARG EFA_VERSION ENV CUDA_HOME="/usr/local/cuda" -ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" -ENV 
LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" +# ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" +# ENV LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" ENV PATH="${CUDA_HOME}/bin:${PATH}" ENV EFA_PATH="/opt/amazon/efa" ENV OPEN_MPI_PATH="/opt/amazon/openmpi" @@ -164,7 +164,7 @@ RUN mkdir /tmp/efa \ && apt-get clean ENV PATH="${OPEN_MPI_PATH}/bin:${EFA_PATH}/bin:${PATH}" -ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${EFA_PATH}/lib:${LD_LIBRARY_PATH}" +ENV LD_LIBRARY_PATH="/usr/local/lib:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" # Configure Open MPI and configure NCCL parameters RUN mv ${OPEN_MPI_PATH}/bin/mpirun ${OPEN_MPI_PATH}/bin/mpirun.real \ From 93b6143b5eeb72781788773c3d4690ccc9566783 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 13 Aug 2025 20:30:09 -0700 Subject: [PATCH 16/16] Remove nvjpeg patching script and rebuild with normal test path --- dlc_developer_config.toml | 8 +- .../docker/2.8/py3/cu129/Dockerfile.gpu | 17 --- .../training/test_pytorch_training_2_8.py | 46 ++++---- test/dlc_tests/ec2/test_efa.py | 109 +++++++----------- test/testrunner.py | 3 +- 5 files changed, 72 insertions(+), 111 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index cd7d59b57a9c..308e46f01ace 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -41,12 +41,12 @@ build_frameworks = ["pytorch"] # By default we build both training and inference containers. Set true/false values to determine which to build. -build_training = true +build_training = true build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = true +do_build = true [notify] ### Notify on test failures @@ -71,7 +71,7 @@ ec2_benchmark_tests = true ### default. 
If false, these types of tests will be skipped while other tests will run as usual. ### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true. ### Off by default (set to false) -ec2_tests_on_heavy_instances = true +ec2_tests_on_heavy_instances = true ### SM specific tests ### On by default sagemaker_local_tests = true @@ -119,7 +119,7 @@ use_scheduler = false ### TRAINING PR JOBS ### # Standard Framework Training -dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-ec2.yml" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-ec2.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index 70d9ba03c31f..0baf63080ce8 100644 --- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -19,7 +19,6 @@ FROM nvidia/cuda:12.9.1-base-ubuntu22.04 AS base_image # This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20 ENV DEBIAN_FRONTEND=noninteractive -# ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" RUN apt-get update \ && apt-get upgrade -y \ @@ -61,8 +60,6 @@ ARG NCCL_VERSION ARG EFA_VERSION ENV CUDA_HOME="/usr/local/cuda" -# ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" -# ENV LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" ENV PATH="${CUDA_HOME}/bin:${PATH}" ENV EFA_PATH="/opt/amazon/efa" ENV OPEN_MPI_PATH="/opt/amazon/openmpi" @@ -132,20 +129,6 @@ RUN apt-get update \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean - # patch nvjpeg to fix CVE -RUN mkdir -p /tmp/nvjpeg \ -&& cd /tmp/nvjpeg \ -&& wget https://developer.download.nvidia.com/compute/cuda/redist/libnvjpeg/linux-x86_64/libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \ -&& tar -xvf libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \ -&& rm -rf 
/usr/local/cuda/targets/x86_64-linux/lib/libnvjpeg* \ -&& rm -rf /usr/local/cuda/targets/x86_64-linux/include/nvjpeg.h \ -&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/lib/libnvjpeg* /usr/local/cuda/targets/x86_64-linux/lib/ \ -&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/include/* /usr/local/cuda/targets/x86_64-linux/include/ \ -&& rm -rf /tmp/nvjpeg \ -# patch cuobjdump and nvdisasm -&& rm -rf /usr/local/cuda/bin/cuobjdump* \ -&& rm -rf /usr/local/cuda/bin/nvdisasm* - # For EFA, below flags are needed to install EFA on docker image # -n, --no-verify Skip EFA device verification and test # -l, --skip-limit-conf Skip EFA limit configuration diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_8.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_8.py index 2a9b678105ab..ec2c00a6ef14 100644 --- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_8.py +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_8.py @@ -25,18 +25,18 @@ def test_pytorch_2_8_gpu( ) test_cases = [ - # (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_linear_regression_gpu, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_nccl, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)), - # (common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_telemetry_bashrc_gpu, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_telemetry_entrypoint_gpu, (pytorch_training, ec2_connection)), + 
(common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)), + (common_cases.pytorch_linear_regression_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), + (common_cases.pytorch_nccl, (pytorch_training, ec2_connection)), + (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), + (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)), + (common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_telemetry_bashrc_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_telemetry_entrypoint_gpu, (pytorch_training, ec2_connection)), ] if "sagemaker" in pytorch_training: @@ -74,8 +74,8 @@ def test_pytorch_2_8_gpu_heavy( ) test_cases = [ - # (common_cases.pytorch_gdrcopy, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_transformer_engine, (pytorch_training, ec2_connection)), + (common_cases.pytorch_gdrcopy, (pytorch_training, ec2_connection)), + (common_cases.pytorch_transformer_engine, (pytorch_training, ec2_connection)), ] test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.8 GPU Heavy") @@ -118,15 +118,15 @@ def test_pytorch_2_8_cpu(pytorch_training___2__8, ec2_connection, cpu_only): pytorch_training = pytorch_training___2__8 test_cases = [ - # (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_linear_regression_cpu, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), - # 
(common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_telemetry_bashrc_cpu, (pytorch_training, ec2_connection)), - # (common_cases.pytorch_telemetry_entrypoint_cpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)), + (common_cases.pytorch_linear_regression_cpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), + (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), + (common_cases.pytorch_telemetry_bashrc_cpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_telemetry_entrypoint_cpu, (pytorch_training, ec2_connection)), ] if "sagemaker" in pytorch_training: diff --git a/test/dlc_tests/ec2/test_efa.py b/test/dlc_tests/ec2/test_efa.py index 053eb97c81ce..9543d783f21c 100644 --- a/test/dlc_tests/ec2/test_efa.py +++ b/test/dlc_tests/ec2/test_efa.py @@ -147,50 +147,50 @@ def test_efa_tensorflow( ) -# @pytest.mark.skip( -# "EFA healthcheck binaries are not maintained by DLC, we will skip these tests moving foward unless binaries are added otherwise." 
-# ) -# @pytest.mark.processor("gpu") -# @pytest.mark.model("N/A") -# @pytest.mark.integration("efa") -# @pytest.mark.usefixtures("sagemaker_only") -# @pytest.mark.usefixtures("pt201_and_above_only") -# @pytest.mark.allow_p4de_use -# @pytest.mark.parametrize("ec2_instance_type,region", EC2_EFA_GPU_ONLY_P4_INSTANCE_TYPE_AND_REGION) -# @pytest.mark.team("conda") -# @pytest.mark.skipif( -# is_pr_context() and not are_heavy_instance_ec2_tests_enabled(), -# reason="Skip EFA test in PR context unless explicitly enabled", -# ) -# def test_pytorch_efa_healthcheck( -# pytorch_training, -# efa_ec2_instances, -# efa_ec2_connections, -# ec2_instance_type, -# region, -# gpu_only, -# ): -# """ -# Run EFA Health Check tests on DLC. -# :param pytorch_training: str PyTorch Training DLC image URI -# :param efa_ec2_instances: list of tuples of instance-ids and SSH-keys for EFA-enabled instances -# :param efa_ec2_connections: list of Fabric Connection objects for EFA-enabled instances -# :param ec2_instance_type: str Instance Type being tested -# :param region: str Region in which EFA-enabled instances are launched -# :param gpu_only: pytest fixture to limit test only to GPU DLCs -# """ -# _setup_multinode_efa_instances( -# pytorch_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region -# ) -# master_connection = efa_ec2_connections[0] -# run_cmd_on_container(MASTER_CONTAINER_NAME, master_connection, EFA_SANITY_TEST_CMD, hide=False) -# run_cmd_on_container( -# MASTER_CONTAINER_NAME, -# master_connection, -# f"{EFA_PYTORCH_HEALTHCHECK_TEST_CMD}", -# hide=False, -# timeout=DEFAULT_EFA_TIMEOUT, -# ) +@pytest.mark.skip( + "EFA healthcheck binaries are not maintained by DLC, we will skip these tests moving foward unless binaries are added otherwise." 
+) +@pytest.mark.processor("gpu") +@pytest.mark.model("N/A") +@pytest.mark.integration("efa") +@pytest.mark.usefixtures("sagemaker_only") +@pytest.mark.usefixtures("pt201_and_above_only") +@pytest.mark.allow_p4de_use +@pytest.mark.parametrize("ec2_instance_type,region", EC2_EFA_GPU_ONLY_P4_INSTANCE_TYPE_AND_REGION) +@pytest.mark.team("conda") +@pytest.mark.skipif( + is_pr_context() and not are_heavy_instance_ec2_tests_enabled(), + reason="Skip EFA test in PR context unless explicitly enabled", +) +def test_pytorch_efa_healthcheck( + pytorch_training, + efa_ec2_instances, + efa_ec2_connections, + ec2_instance_type, + region, + gpu_only, +): + """ + Run EFA Health Check tests on DLC. + :param pytorch_training: str PyTorch Training DLC image URI + :param efa_ec2_instances: list of tuples of instance-ids and SSH-keys for EFA-enabled instances + :param efa_ec2_connections: list of Fabric Connection objects for EFA-enabled instances + :param ec2_instance_type: str Instance Type being tested + :param region: str Region in which EFA-enabled instances are launched + :param gpu_only: pytest fixture to limit test only to GPU DLCs + """ + _setup_multinode_efa_instances( + pytorch_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region + ) + master_connection = efa_ec2_connections[0] + run_cmd_on_container(MASTER_CONTAINER_NAME, master_connection, EFA_SANITY_TEST_CMD, hide=False) + run_cmd_on_container( + MASTER_CONTAINER_NAME, + master_connection, + f"{EFA_PYTORCH_HEALTHCHECK_TEST_CMD}", + hide=False, + timeout=DEFAULT_EFA_TIMEOUT, + ) def _setup_multinode_efa_instances( @@ -380,33 +380,12 @@ def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, inst for worker_ip in worker_instance_private_ips: hosts_string += f"\n{worker_ip} slots={slots} " - # TODO: remove logging - LOGGER.info(f"Attempting to create hosts file with content:\n{hosts_string}") - - LOGGER.info(f"""echo -e "{hosts_string}" > {HOSTS_FILE_LOCATION}""") 
run_cmd_on_container( MASTER_CONTAINER_NAME, master_connection, f"""echo -e "{hosts_string}" > {HOSTS_FILE_LOCATION}""", ) - # TODO: remove logging - LOGGER.info(f"Checking if hosts file exists:") - run_cmd_on_container( - MASTER_CONTAINER_NAME, - master_connection, - f"ls -l {HOSTS_FILE_LOCATION}", - hide=False - ) - - LOGGER.info(f"Checking hosts file contents:") - run_cmd_on_container( - MASTER_CONTAINER_NAME, - master_connection, - f"cat {HOSTS_FILE_LOCATION}", - hide=False - ) - def _setup_worker_efa_ssh_config(connection, master_pub_key): """ diff --git a/test/testrunner.py b/test/testrunner.py index 86b2ed4692de..4746740437bc 100644 --- a/test/testrunner.py +++ b/test/testrunner.py @@ -444,8 +444,7 @@ def main(): pytest_cmd = [ "-s", "-rA", - # test_path, - os.path.join(test_path, "test_efa.py::test_pytorch_efa"), + test_path, f"--junitxml={report}", "-n=auto", ]