Skip to content

Commit f69f963

Browse files
Yadan-Wei (Yadan Wei)
and others authored
[PATCH] PyTorch Inference Images CVE Patch (#5017)
* Patch nvjpeg for PyTorch Inference Image --------- Co-authored-by: Yadan Wei <[email protected]>
1 parent 35e5515 commit f69f963

File tree

11 files changed

+173
-10
lines changed

11 files changed

+173
-10
lines changed

pytorch/inference/buildspec-2-5-ec2.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
55
version: &VERSION 2.5.1
66
short_version: &SHORT_VERSION "2.5"
77
arch_type: x86
8-
autopatch_build: "True"
8+
#autopatch_build: "True"
99

1010
repository_info:
1111
inference_repository: &INFERENCE_REPOSITORY

pytorch/inference/buildspec-2-5-sm.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
55
version: &VERSION 2.5.1
66
short_version: &SHORT_VERSION "2.5"
77
arch_type: x86
8-
autopatch_build: "True"
8+
# autopatch_build: "True"
99

1010
repository_info:
1111
inference_repository: &INFERENCE_REPOSITORY

pytorch/inference/buildspec-2-6-ec2.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
55
version: &VERSION 2.6.0
66
short_version: &SHORT_VERSION "2.6"
77
arch_type: x86
8-
autopatch_build: "True"
8+
# autopatch_build: "True"
99

1010
repository_info:
1111
inference_repository: &INFERENCE_REPOSITORY

pytorch/inference/buildspec-2-6-sm.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
55
version: &VERSION 2.6.0
66
short_version: &SHORT_VERSION "2.6"
77
arch_type: x86
8-
autopatch_build: "True"
8+
# autopatch_build: "True"
99

1010
repository_info:
1111
inference_repository: &INFERENCE_REPOSITORY

pytorch/inference/docker/2.5/py3/Dockerfile.cpu

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ RUN curl -L -o ~/miniforge3.sh https://github.com/conda-forge/miniforge/releases
123123
&& ~/miniforge3.sh -b -p /opt/conda \
124124
&& rm ~/miniforge3.sh
125125

126-
RUN pip install --no-cache-dir --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org \
126+
RUN pip install --no-cache-dir --upgrade "pip>=25.1.1" --trusted-host pypi.org --trusted-host files.pythonhosted.org \
127127
&& ln -s /opt/conda/bin/pip /usr/local/bin/pip3
128128

129129
# Install common conda packages
@@ -150,7 +150,7 @@ RUN /opt/conda/bin/mamba install -y -c conda-forge \
150150
tqdm \
151151
requests \
152152
setuptools \
153-
urllib3 \
153+
"urllib3>=2.5.0" \
154154
awscli \
155155
&& /opt/conda/bin/mamba clean -afy \
156156
&& rm -rf /etc/apt/sources.list.d/*
@@ -175,6 +175,9 @@ RUN pip install --no-cache-dir -U -r https://raw.githubusercontent.com/pytorch/s
175175
# py-vuln: 71064
176176
RUN pip install --no-cache-dir -U "requests>=2.32.3"
177177

178+
# address pip vulnerability
179+
RUN pip install --no-cache-dir --upgrade "pip>=25.1.1"
180+
178181
# Create user and folders
179182
RUN useradd -m model-server \
180183
&& mkdir -p ${TEMP} /opt/ml/model \

pytorch/inference/docker/2.5/py3/cu124/Dockerfile.gpu

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -130,10 +130,26 @@ RUN apt-get update \
130130
openssl \
131131
python3-dev \
132132
libgssapi-krb5-2 \
133+
openssh-client \
134+
openssh-server \
133135
&& apt-get autoremove -y \
134136
&& rm -rf /var/lib/apt/lists/* \
135137
&& apt-get clean
136138

139+
# patch nvjpeg
140+
RUN mkdir -p /tmp/nvjpeg \
141+
&& cd /tmp/nvjpeg \
142+
&& wget https://developer.download.nvidia.com/compute/cuda/redist/libnvjpeg/linux-x86_64/libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \
143+
&& tar -xvf libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \
144+
&& rm -rf /usr/local/cuda/targets/x86_64-linux/lib/libnvjpeg* \
145+
&& rm -rf /usr/local/cuda/targets/x86_64-linux/include/nvjpeg.h \
146+
&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/lib/libnvjpeg* /usr/local/cuda/targets/x86_64-linux/lib/ \
147+
&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/include/* /usr/local/cuda/targets/x86_64-linux/include/ \
148+
&& rm -rf /tmp/nvjpeg \
149+
# patch cuobjdump and nvdisasm
150+
&& rm -rf /usr/local/cuda/bin/cuobjdump* \
151+
&& rm -rf /usr/local/cuda/bin/nvdisasm* \
152+
137153
# Install OpenMPI
138154
RUN wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OPEN_MPI_VERSION}.tar.gz \
139155
&& gunzip -c openmpi-${OPEN_MPI_VERSION}.tar.gz | tar xf - \
@@ -158,7 +174,7 @@ RUN curl -L -o ~/miniforge3.sh https://github.com/conda-forge/miniforge/releases
158174
&& ~/miniforge3.sh -b -p /opt/conda \
159175
&& rm ~/miniforge3.sh
160176

161-
RUN pip install --no-cache-dir --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org \
177+
RUN pip install --no-cache-dir --upgrade "pip>=25.1.1" --trusted-host pypi.org --trusted-host files.pythonhosted.org \
162178
&& ln -s /opt/conda/bin/pip /usr/local/bin/pip3
163179

164180
# Install common conda packages
@@ -187,7 +203,7 @@ RUN /opt/conda/bin/mamba install -y -c conda-forge \
187203
tqdm \
188204
requests \
189205
setuptools \
190-
urllib3 \
206+
"urllib3>=2.5.0" \
191207
awscli \
192208
libgcc \
193209
# ninja is needed for apex to speed up the build
@@ -209,7 +225,7 @@ RUN pip install --no-cache-dir -U \
209225
${TORCHVISION_URL} \
210226
${TORCHAUDIO_URL} \
211227
${TORCHTEXT_URL} \
212-
triton
228+
"triton==3.1.0"
213229

214230
# Install NCCL
215231
RUN cd /tmp \
@@ -228,6 +244,9 @@ RUN pip install --no-cache-dir -U -r https://raw.githubusercontent.com/pytorch/s
228244
# py-vuln: 71064
229245
RUN pip install --no-cache-dir -U "requests>=2.32.3"
230246

247+
# address pip vulnerability
248+
RUN pip install --no-cache-dir --upgrade "pip>=25.1.1"
249+
231250
# create user and folders
232251
RUN useradd -m model-server \
233252
&& mkdir -p ${TEMP} /opt/ml/model \

pytorch/inference/docker/2.6/py3/cu124/Dockerfile.gpu

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,10 +139,28 @@ RUN apt-get update \
139139
liblzma-dev \
140140
tk-dev \
141141
libffi-dev \
142+
# make mpi4py work
143+
openssh-client \
144+
openssh-server \
142145
&& apt-get autoremove -y \
143146
&& rm -rf /var/lib/apt/lists/* \
144147
&& apt-get clean
145148

149+
150+
# patch nvjpeg
151+
RUN mkdir -p /tmp/nvjpeg \
152+
&& cd /tmp/nvjpeg \
153+
&& wget https://developer.download.nvidia.com/compute/cuda/redist/libnvjpeg/linux-x86_64/libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \
154+
&& tar -xvf libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \
155+
&& rm -rf /usr/local/cuda/targets/x86_64-linux/lib/libnvjpeg* \
156+
&& rm -rf /usr/local/cuda/targets/x86_64-linux/include/nvjpeg.h \
157+
&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/lib/libnvjpeg* /usr/local/cuda/targets/x86_64-linux/lib/ \
158+
&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/include/* /usr/local/cuda/targets/x86_64-linux/include/ \
159+
&& rm -rf /tmp/nvjpeg \
160+
# patch cuobjdump and nvdisasm
161+
&& rm -rf /usr/local/cuda/bin/cuobjdump* \
162+
&& rm -rf /usr/local/cuda/bin/nvdisasm* \
163+
146164
# Install OpenMPI
147165
RUN wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OPEN_MPI_VERSION}.tar.gz \
148166
&& gunzip -c openmpi-${OPEN_MPI_VERSION}.tar.gz | tar xf - \
@@ -223,7 +241,7 @@ RUN pip install --no-cache-dir -U \
223241
${TORCHAUDIO_URL} \
224242
${TORCHTEXT_URL} \
225243
${TORCHDATA_URL} \
226-
triton
244+
"triton==3.2.0"
227245

228246
# Install NCCL
229247
RUN cd /tmp \

test/dlc_tests/conftest.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
is_tf_version,
2626
is_above_framework_version,
2727
is_below_framework_version,
28+
is_below_cuda_version,
2829
is_equal_to_framework_version,
2930
is_ec2_image,
3031
is_sagemaker_image,
@@ -1294,6 +1295,11 @@ def below_tf219_only():
12941295
pass
12951296

12961297

1298+
@pytest.fixture(scope="session")
1299+
def below_cuda129_only():
1300+
pass
1301+
1302+
12971303
@pytest.fixture(scope="session")
12981304
def skip_tf216():
12991305
pass
@@ -1409,6 +1415,23 @@ def _version_skip(img_uri, ver):
14091415
return _version_skip
14101416

14111417

1418+
def cuda_version_within_limit(metafunc_obj, image):
1419+
"""
1420+
Test all pytest fixtures for CUDA version limits, and return True if all requirements are satisfied
1421+
1422+
:param metafunc_obj: pytest metafunc object from which fixture names used by test function will be obtained
1423+
:param image: Image URI for which the validation must be performed
1424+
:return: True if all validation succeeds, else False
1425+
"""
1426+
cuda129_requirement_failed = (
1427+
"below_cuda129_only" in metafunc_obj.fixturenames
1428+
and not is_below_cuda_version("12.9", image)
1429+
)
1430+
if cuda129_requirement_failed:
1431+
return False
1432+
return True
1433+
1434+
14121435
def framework_version_within_limit(metafunc_obj, image):
14131436
"""
14141437
Test all pytest fixtures for TensorFlow version limits, and return True if all requirements are satisfied
@@ -1817,6 +1840,8 @@ def pytest_generate_tests(metafunc):
18171840
continue
18181841
if not framework_version_within_limit(metafunc, image):
18191842
continue
1843+
if not cuda_version_within_limit(metafunc, image):
1844+
continue
18201845
if "non_huggingface_only" in metafunc.fixturenames and "huggingface" in image:
18211846
continue
18221847
if (

test/dlc_tests/ec2/test_nvjpeg.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
import time
2+
3+
import pytest
4+
5+
from test import test_utils
6+
from test.test_utils import ec2 as ec2_utils
7+
from test.test_utils import LOGGER
8+
from packaging.version import Version
9+
from packaging.specifiers import SpecifierSet
10+
11+
12+
@pytest.mark.usefixtures("sagemaker")
13+
@pytest.mark.model("N/A")
14+
@pytest.mark.processor("gpu")
15+
@pytest.mark.parametrize("ec2_instance_type", ["g5.8xlarge"], indirect=True)
16+
@pytest.mark.timeout(1200)
17+
@pytest.mark.skipif(
18+
not test_utils.is_pr_context(),
19+
reason="Only run nvjpeg test in PR context to avoid block MAINLINE",
20+
)
21+
def test_nvjpeg_gpu_x86(gpu, ec2_connection, ec2_instance, x86_compatible_only, below_cuda129_only):
22+
_run_nvjpeg_test(gpu, ec2_connection)
23+
24+
25+
def _run_nvjpeg_test(image_uri, ec2_connection):
26+
"""
27+
Runs the nvJPEG test on the specified image URI.
28+
"""
29+
LOGGER.info(f"starting _run_nvjpeg_test with {image_uri}")
30+
31+
account_id = test_utils.get_account_id_from_image_uri(image_uri)
32+
image_region = test_utils.get_region_from_image_uri(image_uri)
33+
repo_name, image_tag = test_utils.get_repository_and_tag_from_image_uri(image_uri)
34+
cuda_version = test_utils.get_cuda_version_from_tag(image_uri)
35+
36+
container_name = f"{repo_name}-test-nvjpeg"
37+
38+
LOGGER.info(f"_run_nvjpeg_test pulling: {image_uri}")
39+
test_utils.login_to_ecr_registry(ec2_connection, account_id, image_region)
40+
41+
ec2_connection.run(f"docker pull {image_uri}", hide="out")
42+
43+
LOGGER.info(f"_run_nvjpeg_test running: {image_uri}")
44+
ec2_connection.run(
45+
f"docker run --runtime=nvidia --gpus all --name {container_name} -id {image_uri}"
46+
)
47+
cuda_version_numeric = cuda_version.strip("cu")
48+
if Version(cuda_version_numeric) < Version("126"):
49+
# 12.4.1 has a different branch tag in cuda-samples
50+
if Version(cuda_version_numeric) == Version("124"):
51+
git_branch_tag = "12.4.1"
52+
else:
53+
git_branch_tag = f"{cuda_version_numeric[:-1]}.{cuda_version_numeric[-1]}"
54+
test_command = (
55+
f"git clone -b v{git_branch_tag} https://github.com/NVIDIA/cuda-samples.git && "
56+
"cd cuda-samples/Samples/4_CUDA_Libraries/nvJPEG && "
57+
"make -j$(nproc) && "
58+
"./nvJPEG"
59+
)
60+
else:
61+
# For CUDA 12.6 and above, we use the v12.8 branch of cuda-samples
62+
# This is a workaround for the issue where the nvJPEG sample in the
63+
# cuda-samples repository does not support compute_100 architecture.
64+
# The v12.8 branch is used to avoid the issue with compute_100 architecture.
65+
# See
66+
# sample 12.9 or master branch has compute_100 arch support issue
67+
# https://github.com/NVIDIA/cuda-samples/issues/367
68+
test_command = (
69+
f"git clone -b v12.8 https://github.com/NVIDIA/cuda-samples.git && "
70+
"cd cuda-samples && "
71+
"mkdir build && cd build && "
72+
"cmake .. && "
73+
"cd Samples/4_CUDA_Libraries/nvJPEG && "
74+
"make -j$(nproc) && "
75+
"./nvJPEG"
76+
)
77+
78+
output = ec2_connection.run(
79+
f"docker exec {container_name} /bin/bash -c '{test_command}'"
80+
).stdout.strip("\n")
81+
82+
return output

test/test_utils/__init__.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -631,6 +631,21 @@ def is_below_framework_version(version_upper_bound, image_uri, framework):
631631
)
632632

633633

634+
def is_below_cuda_version(version_upper_bound, image_uri):
635+
"""
636+
Validate that image_uri has cuda version strictly less than version_upper_bound
637+
638+
:param version_upper_bound: str Cuda version that image_uri is required to be below
639+
:param image_uri: str ECR Image URI for the image to be validated
640+
:return: bool True if image_uri has cuda version less than version_upper_bound, else False
641+
"""
642+
cuda_version = get_cuda_version_from_tag(image_uri)
643+
numbers = cuda_version[2:]
644+
numeric_version = f"{numbers[:-1]}.{numbers[-1]}"
645+
required_version_specifier_set = SpecifierSet(f"<{version_upper_bound}")
646+
return numeric_version in required_version_specifier_set
647+
648+
634649
def is_image_incompatible_with_instance_type(image_uri, ec2_instance_type):
635650
"""
636651
Check for all compatibility issues between DLC Image Types and EC2 Instance Types.

0 commit comments

Comments
 (0)