Skip to content

Commit f69f963

Browse files
Yadan-Wei (Yadan Wei)
and others authored
[PATCH] PyTorch Inference Images CVE Patch (#5017)
* Patch nvjpeg for PyTorch Inference Image --------- Co-authored-by: Yadan Wei <[email protected]>
1 parent 35e5515 commit f69f963

File tree

11 files changed

+173
-10
lines changed

11 files changed

+173
-10
lines changed

pytorch/inference/buildspec-2-5-ec2.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
55
version: &VERSION 2.5.1
66
short_version: &SHORT_VERSION "2.5"
77
arch_type: x86
8-
autopatch_build: "True"
8+
#autopatch_build: "True"
99

1010
repository_info:
1111
inference_repository: &INFERENCE_REPOSITORY

pytorch/inference/buildspec-2-5-sm.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
55
version: &VERSION 2.5.1
66
short_version: &SHORT_VERSION "2.5"
77
arch_type: x86
8-
autopatch_build: "True"
8+
# autopatch_build: "True"
99

1010
repository_info:
1111
inference_repository: &INFERENCE_REPOSITORY

pytorch/inference/buildspec-2-6-ec2.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
55
version: &VERSION 2.6.0
66
short_version: &SHORT_VERSION "2.6"
77
arch_type: x86
8-
autopatch_build: "True"
8+
# autopatch_build: "True"
99

1010
repository_info:
1111
inference_repository: &INFERENCE_REPOSITORY

pytorch/inference/buildspec-2-6-sm.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
55
version: &VERSION 2.6.0
66
short_version: &SHORT_VERSION "2.6"
77
arch_type: x86
8-
autopatch_build: "True"
8+
# autopatch_build: "True"
99

1010
repository_info:
1111
inference_repository: &INFERENCE_REPOSITORY

pytorch/inference/docker/2.5/py3/Dockerfile.cpu

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ RUN curl -L -o ~/miniforge3.sh https://github.com/conda-forge/miniforge/releases
123123
&& ~/miniforge3.sh -b -p /opt/conda \
124124
&& rm ~/miniforge3.sh
125125

126-
RUN pip install --no-cache-dir --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org \
126+
RUN pip install --no-cache-dir --upgrade "pip>=25.1.1" --trusted-host pypi.org --trusted-host files.pythonhosted.org \
127127
&& ln -s /opt/conda/bin/pip /usr/local/bin/pip3
128128

129129
# Install common conda packages
@@ -150,7 +150,7 @@ RUN /opt/conda/bin/mamba install -y -c conda-forge \
150150
tqdm \
151151
requests \
152152
setuptools \
153-
urllib3 \
153+
"urllib3>=2.5.0" \
154154
awscli \
155155
&& /opt/conda/bin/mamba clean -afy \
156156
&& rm -rf /etc/apt/sources.list.d/*
@@ -175,6 +175,9 @@ RUN pip install --no-cache-dir -U -r https://raw.githubusercontent.com/pytorch/s
175175
# py-vuln: 71064
176176
RUN pip install --no-cache-dir -U "requests>=2.32.3"
177177

178+
# address pip vulnerability
179+
RUN pip install --no-cache-dir --upgrade "pip>=25.1.1"
180+
178181
# Create user and folders
179182
RUN useradd -m model-server \
180183
&& mkdir -p ${TEMP} /opt/ml/model \

pytorch/inference/docker/2.5/py3/cu124/Dockerfile.gpu

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -130,10 +130,26 @@ RUN apt-get update \
130130
openssl \
131131
python3-dev \
132132
libgssapi-krb5-2 \
133+
openssh-client \
134+
openssh-server \
133135
&& apt-get autoremove -y \
134136
&& rm -rf /var/lib/apt/lists/* \
135137
&& apt-get clean
136138

139+
# patch nvjpeg
140+
RUN mkdir -p /tmp/nvjpeg \
141+
&& cd /tmp/nvjpeg \
142+
&& wget https://developer.download.nvidia.com/compute/cuda/redist/libnvjpeg/linux-x86_64/libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \
143+
&& tar -xvf libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \
144+
&& rm -rf /usr/local/cuda/targets/x86_64-linux/lib/libnvjpeg* \
145+
&& rm -rf /usr/local/cuda/targets/x86_64-linux/include/nvjpeg.h \
146+
&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/lib/libnvjpeg* /usr/local/cuda/targets/x86_64-linux/lib/ \
147+
&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/include/* /usr/local/cuda/targets/x86_64-linux/include/ \
148+
&& rm -rf /tmp/nvjpeg \
149+
# patch cuobjdump and nvdisasm
150+
&& rm -rf /usr/local/cuda/bin/cuobjdump* \
151+
&& rm -rf /usr/local/cuda/bin/nvdisasm* \
152+
137153
# Install OpenMPI
138154
RUN wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OPEN_MPI_VERSION}.tar.gz \
139155
&& gunzip -c openmpi-${OPEN_MPI_VERSION}.tar.gz | tar xf - \
@@ -158,7 +174,7 @@ RUN curl -L -o ~/miniforge3.sh https://github.com/conda-forge/miniforge/releases
158174
&& ~/miniforge3.sh -b -p /opt/conda \
159175
&& rm ~/miniforge3.sh
160176

161-
RUN pip install --no-cache-dir --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org \
177+
RUN pip install --no-cache-dir --upgrade "pip>=25.1.1" --trusted-host pypi.org --trusted-host files.pythonhosted.org \
162178
&& ln -s /opt/conda/bin/pip /usr/local/bin/pip3
163179

164180
# Install common conda packages
@@ -187,7 +203,7 @@ RUN /opt/conda/bin/mamba install -y -c conda-forge \
187203
tqdm \
188204
requests \
189205
setuptools \
190-
urllib3 \
206+
"urllib3>=2.5.0" \
191207
awscli \
192208
libgcc \
193209
# ninja is needed for apex to speed up the build
@@ -209,7 +225,7 @@ RUN pip install --no-cache-dir -U \
209225
${TORCHVISION_URL} \
210226
${TORCHAUDIO_URL} \
211227
${TORCHTEXT_URL} \
212-
triton
228+
"triton==3.1.0"
213229

214230
# Install NCCL
215231
RUN cd /tmp \
@@ -228,6 +244,9 @@ RUN pip install --no-cache-dir -U -r https://raw.githubusercontent.com/pytorch/s
228244
# py-vuln: 71064
229245
RUN pip install --no-cache-dir -U "requests>=2.32.3"
230246

247+
# address pip vulnerability
248+
RUN pip install --no-cache-dir --upgrade "pip>=25.1.1"
249+
231250
# create user and folders
232251
RUN useradd -m model-server \
233252
&& mkdir -p ${TEMP} /opt/ml/model \

pytorch/inference/docker/2.6/py3/cu124/Dockerfile.gpu

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,10 +139,28 @@ RUN apt-get update \
139139
liblzma-dev \
140140
tk-dev \
141141
libffi-dev \
142+
# make mpi4py work
143+
openssh-client \
144+
openssh-server \
142145
&& apt-get autoremove -y \
143146
&& rm -rf /var/lib/apt/lists/* \
144147
&& apt-get clean
145148

149+
150+
# patch nvjpeg
151+
RUN mkdir -p /tmp/nvjpeg \
152+
&& cd /tmp/nvjpeg \
153+
&& wget https://developer.download.nvidia.com/compute/cuda/redist/libnvjpeg/linux-x86_64/libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \
154+
&& tar -xvf libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \
155+
&& rm -rf /usr/local/cuda/targets/x86_64-linux/lib/libnvjpeg* \
156+
&& rm -rf /usr/local/cuda/targets/x86_64-linux/include/nvjpeg.h \
157+
&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/lib/libnvjpeg* /usr/local/cuda/targets/x86_64-linux/lib/ \
158+
&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/include/* /usr/local/cuda/targets/x86_64-linux/include/ \
159+
&& rm -rf /tmp/nvjpeg \
160+
# patch cuobjdump and nvdisasm
161+
&& rm -rf /usr/local/cuda/bin/cuobjdump* \
162+
&& rm -rf /usr/local/cuda/bin/nvdisasm* \
163+
146164
# Install OpenMPI
147165
RUN wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OPEN_MPI_VERSION}.tar.gz \
148166
&& gunzip -c openmpi-${OPEN_MPI_VERSION}.tar.gz | tar xf - \
@@ -223,7 +241,7 @@ RUN pip install --no-cache-dir -U \
223241
${TORCHAUDIO_URL} \
224242
${TORCHTEXT_URL} \
225243
${TORCHDATA_URL} \
226-
triton
244+
"triton==3.2.0"
227245

228246
# Install NCCL
229247
RUN cd /tmp \

test/dlc_tests/conftest.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
is_tf_version,
2626
is_above_framework_version,
2727
is_below_framework_version,
28+
is_below_cuda_version,
2829
is_equal_to_framework_version,
2930
is_ec2_image,
3031
is_sagemaker_image,
@@ -1294,6 +1295,11 @@ def below_tf219_only():
12941295
pass
12951296

12961297

1298+
@pytest.fixture(scope="session")
1299+
def below_cuda129_only():
1300+
pass
1301+
1302+
12971303
@pytest.fixture(scope="session")
12981304
def skip_tf216():
12991305
pass
@@ -1409,6 +1415,23 @@ def _version_skip(img_uri, ver):
14091415
return _version_skip
14101416

14111417

1418+
def cuda_version_within_limit(metafunc_obj, image):
1419+
"""
1420+
Test all pytest fixtures for CUDA version limits, and return True if all requirements are satisfied
1421+
1422+
:param metafunc_obj: pytest metafunc object from which fixture names used by test function will be obtained
1423+
:param image: Image URI for which the validation must be performed
1424+
:return: True if all validation succeeds, else False
1425+
"""
1426+
cuda129_requirement_failed = (
1427+
"below_cuda129_only" in metafunc_obj.fixturenames
1428+
and not is_below_cuda_version("12.9", image)
1429+
)
1430+
if cuda129_requirement_failed:
1431+
return False
1432+
return True
1433+
1434+
14121435
def framework_version_within_limit(metafunc_obj, image):
14131436
"""
14141437
Test all pytest fixtures for TensorFlow version limits, and return True if all requirements are satisfied
@@ -1817,6 +1840,8 @@ def pytest_generate_tests(metafunc):
18171840
continue
18181841
if not framework_version_within_limit(metafunc, image):
18191842
continue
1843+
if not cuda_version_within_limit(metafunc, image):
1844+
continue
18201845
if "non_huggingface_only" in metafunc.fixturenames and "huggingface" in image:
18211846
continue
18221847
if (

test/dlc_tests/ec2/test_nvjpeg.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
import time
2+
3+
import pytest
4+
5+
from test import test_utils
6+
from test.test_utils import ec2 as ec2_utils
7+
from test.test_utils import LOGGER
8+
from packaging.version import Version
9+
from packaging.specifiers import SpecifierSet
10+
11+
12+
@pytest.mark.usefixtures("sagemaker")
13+
@pytest.mark.model("N/A")
14+
@pytest.mark.processor("gpu")
15+
@pytest.mark.parametrize("ec2_instance_type", ["g5.8xlarge"], indirect=True)
16+
@pytest.mark.timeout(1200)
17+
@pytest.mark.skipif(
18+
not test_utils.is_pr_context(),
19+
reason="Only run nvjpeg test in PR context to avoid block MAINLINE",
20+
)
21+
def test_nvjpeg_gpu_x86(gpu, ec2_connection, ec2_instance, x86_compatible_only, below_cuda129_only):
22+
_run_nvjpeg_test(gpu, ec2_connection)
23+
24+
25+
def _run_nvjpeg_test(image_uri, ec2_connection):
26+
"""
27+
Runs the nvJPEG test on the specified image URI.
28+
"""
29+
LOGGER.info(f"starting _run_nvjpeg_test with {image_uri}")
30+
31+
account_id = test_utils.get_account_id_from_image_uri(image_uri)
32+
image_region = test_utils.get_region_from_image_uri(image_uri)
33+
repo_name, image_tag = test_utils.get_repository_and_tag_from_image_uri(image_uri)
34+
cuda_version = test_utils.get_cuda_version_from_tag(image_uri)
35+
36+
container_name = f"{repo_name}-test-nvjpeg"
37+
38+
LOGGER.info(f"_run_nvjpeg_test pulling: {image_uri}")
39+
test_utils.login_to_ecr_registry(ec2_connection, account_id, image_region)
40+
41+
ec2_connection.run(f"docker pull {image_uri}", hide="out")
42+
43+
LOGGER.info(f"_run_nvjpeg_test running: {image_uri}")
44+
ec2_connection.run(
45+
f"docker run --runtime=nvidia --gpus all --name {container_name} -id {image_uri}"
46+
)
47+
cuda_version_numeric = cuda_version.strip("cu")
48+
if Version(cuda_version_numeric) < Version("126"):
49+
# 12.4.1 has a different branch tag in cuda-samples
50+
if Version(cuda_version_numeric) == Version("124"):
51+
git_branch_tag = "12.4.1"
52+
else:
53+
git_branch_tag = f"{cuda_version_numeric[:-1]}.{cuda_version_numeric[-1]}"
54+
test_command = (
55+
f"git clone -b v{git_branch_tag} https://github.com/NVIDIA/cuda-samples.git && "
56+
"cd cuda-samples/Samples/4_CUDA_Libraries/nvJPEG && "
57+
"make -j$(nproc) && "
58+
"./nvJPEG"
59+
)
60+
else:
61+
# For CUDA 12.6 and above, we use the v12.8 branch of cuda-samples
62+
# This is a workaround for the issue where the nvJPEG sample in the
63+
# cuda-samples repository does not support compute_100 architecture.
64+
# The v12.8 branch is used to avoid the issue with compute_100 architecture.
65+
# See
66+
# sample 12.9 or master branch has compute_100 arch support issue
67+
# https://github.com/NVIDIA/cuda-samples/issues/367
68+
test_command = (
69+
f"git clone -b v12.8 https://github.com/NVIDIA/cuda-samples.git && "
70+
"cd cuda-samples && "
71+
"mkdir build && cd build && "
72+
"cmake .. && "
73+
"cd Samples/4_CUDA_Libraries/nvJPEG && "
74+
"make -j$(nproc) && "
75+
"./nvJPEG"
76+
)
77+
78+
output = ec2_connection.run(
79+
f"docker exec {container_name} /bin/bash -c '{test_command}'"
80+
).stdout.strip("\n")
81+
82+
return output

test/test_utils/__init__.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -631,6 +631,21 @@ def is_below_framework_version(version_upper_bound, image_uri, framework):
631631
)
632632

633633

634+
def is_below_cuda_version(version_upper_bound, image_uri):
635+
"""
636+
Validate that image_uri has cuda version strictly less than version_upper_bound
637+
638+
:param version_upper_bound: str Cuda version that image_uri is required to be below
639+
:param image_uri: str ECR Image URI for the image to be validated
640+
:return: bool True if image_uri has cuda version less than version_upper_bound, else False
641+
"""
642+
cuda_version = get_cuda_version_from_tag(image_uri)
643+
numbers = cuda_version[2:]
644+
numeric_version = f"{numbers[:-1]}.{numbers[-1]}"
645+
required_version_specifier_set = SpecifierSet(f"<{version_upper_bound}")
646+
return numeric_version in required_version_specifier_set
647+
648+
634649
def is_image_incompatible_with_instance_type(image_uri, ec2_instance_type):
635650
"""
636651
Check for all compatibility issues between DLC Image Types and EC2 Instance Types.

0 commit comments

Comments
 (0)