Skip to content

Commit b3b573a

Browse files
committed
temp: DCGM - internal repository.
1 parent 14e1bfb commit b3b573a

File tree

2 files changed

+36
-39
lines changed

2 files changed

+36
-39
lines changed

Dockerfile.sdk

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -259,13 +259,19 @@ RUN pip3 install --upgrade "numpy<2" pillow attrdict && \
259259
"tritonclient-*linux*.whl" | xargs printf -- '%s[all]' | \
260260
xargs pip3 install --upgrade
261261

262+
ARG DCGM_SOURCE_LIST
262263
# Install DCGM
264+
RUN if [ -n "${DCGM_SOURCE_LIST}" ]; then \
265+
echo "deb [trusted=yes] $DCGM_SOURCE_LIST / " > /etc/apt/sources.list.d/dcgm-list.list && \
266+
cat /etc/apt/sources.list.d/dcgm-list.list; \
267+
fi
268+
263269
RUN if [ "$TRITON_ENABLE_GPU" = "ON" ]; then \
264270
[ "$(uname -m)" != "x86_64" ] && arch="sbsa" || arch="x86_64" && \
265271
curl -o /tmp/cuda-keyring.deb \
266272
https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/$arch/cuda-keyring_1.1-1_all.deb \
267273
&& apt install /tmp/cuda-keyring.deb && rm /tmp/cuda-keyring.deb && \
268-
apt-get update && apt-get install -y datacenter-gpu-manager=1:${DCGM_VERSION}; \
274+
apt-get update && apt-get install -y datacenter-gpu-manager-4-dev; \
269275
fi
270276

271277
# Build expects "python" executable (not python3).

build.py

Lines changed: 29 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@
7777
"ort_version": "1.22.0",
7878
"ort_openvino_version": "2025.2.0",
7979
"standalone_openvino_version": "2025.2.0",
80-
"dcgm_version": "3.3.6",
80+
"dcgm_version": "4",
8181
"vllm_version": "0.9.0.1",
8282
"rhel_py_version": "3.12.3",
8383
}
@@ -841,7 +841,15 @@ def tensorrtllm_cmake_args(images):
841841
return cargs
842842

843843

844-
def install_dcgm_libraries(dcgm_version, target_machine):
844+
def install_dcgm_libraries(dcgm_version):
845+
if os.getenv("DCGM_SOURCE_LIST"):
846+
dcgm_source_list = """
847+
RUN echo "deb [trusted=yes] {} / " > /etc/apt/sources.list.d/dcgm-list.list \\
848+
&& cat /etc/apt/sources.list.d/dcgm-list.list""".format(
849+
os.getenv("DCGM_SOURCE_LIST")
850+
)
851+
else:
852+
dcgm_source_list = ""
845853
if dcgm_version == "":
846854
fail(
847855
"unable to determine default repo-tag, DCGM version not known for {}".format(
@@ -852,53 +860,36 @@ def install_dcgm_libraries(dcgm_version, target_machine):
852860
else:
853861
# RHEL has the same install instructions for both aarch64 and x86
854862
if target_platform() == "rhel":
855-
if target_machine == "aarch64":
856-
return """
857-
ENV DCGM_VERSION {}
858-
# Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads
859-
RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo \\
860-
&& dnf clean expire-cache \\
861-
&& dnf install -y datacenter-gpu-manager-{}
862-
""".format(
863-
dcgm_version, dcgm_version
864-
)
865-
else:
866-
return """
863+
return (
864+
dcgm_source_list
865+
+ """
867866
ENV DCGM_VERSION {}
868867
# Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads
869-
RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo \\
868+
RUN ARCH=$( [ $(uname -m) = "x86_64" ] && echo "$(uname -m)" || echo "sbsa" ) \\
869+
&& dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/${{ARCH}}/cuda-rhel8.repo \\
870870
&& dnf clean expire-cache \\
871-
&& dnf install -y datacenter-gpu-manager-{}
871+
&& dnf install -y datacenter-gpu-manager-{}-dev
872872
""".format(
873873
dcgm_version, dcgm_version
874874
)
875+
)
875876
else:
876-
if target_machine == "aarch64":
877-
return """
877+
return (
878+
dcgm_source_list
879+
+ """
878880
ENV DCGM_VERSION {}
879881
# Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads
880-
RUN curl -o /tmp/cuda-keyring.deb \\
881-
https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/sbsa/cuda-keyring_1.1-1_all.deb \\
882+
RUN ARCH=$( [ $(uname -m) = "x86_64" ] && echo "$(uname -m)" || echo "sbsa" ) \\
883+
&& curl -o /tmp/cuda-keyring.deb \\
884+
https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/${{ARCH}}/cuda-keyring_1.1-1_all.deb \\
882885
&& apt install /tmp/cuda-keyring.deb \\
883-
&& rm /tmp/cuda-keyring.deb \\
886+
&& rm /tmp/cuda-keyring.deb \\
884887
&& apt-get update \\
885-
&& apt-get install -y datacenter-gpu-manager=1:{}
886-
""".format(
887-
dcgm_version, dcgm_version
888-
)
889-
else:
890-
return """
891-
ENV DCGM_VERSION {}
892-
# Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads
893-
RUN curl -o /tmp/cuda-keyring.deb \\
894-
https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb \\
895-
&& apt install /tmp/cuda-keyring.deb \\
896-
&& rm /tmp/cuda-keyring.deb \\
897-
&& apt-get update \\
898-
&& apt-get install -y datacenter-gpu-manager=1:{}
888+
&& apt-get install -y datacenter-gpu-manager-{}-dev
899889
""".format(
900890
dcgm_version, dcgm_version
901891
)
892+
)
902893

903894

904895
def create_dockerfile_buildbase_rhel(ddir, dockerfile_name, argmap):
@@ -999,7 +990,7 @@ def create_dockerfile_buildbase_rhel(ddir, dockerfile_name, argmap):
999990
&& mv /tmp/boost_1_80_0/boost /usr/include/boost
1000991
"""
1001992
if FLAGS.enable_gpu:
1002-
df += install_dcgm_libraries(argmap["DCGM_VERSION"], target_machine())
993+
df += install_dcgm_libraries(argmap["DCGM_VERSION"])
1003994
df += """
1004995
ENV TRITON_SERVER_VERSION ${TRITON_VERSION}
1005996
ENV NVIDIA_TRITON_SERVER_VERSION ${TRITON_CONTAINER_VERSION}
@@ -1112,7 +1103,7 @@ def create_dockerfile_buildbase(ddir, dockerfile_name, argmap):
11121103
"""
11131104

11141105
if FLAGS.enable_gpu:
1115-
df += install_dcgm_libraries(argmap["DCGM_VERSION"], target_machine())
1106+
df += install_dcgm_libraries(argmap["DCGM_VERSION"])
11161107

11171108
df += """
11181109
ENV TRITON_SERVER_VERSION ${TRITON_VERSION}
@@ -1404,7 +1395,7 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach
14041395
df += fastertransformer_buildscript.create_postbuild(is_multistage_build=False)
14051396

14061397
if enable_gpu:
1407-
df += install_dcgm_libraries(argmap["DCGM_VERSION"], target_machine)
1398+
df += install_dcgm_libraries(argmap["DCGM_VERSION"])
14081399
# This segment will break the RHEL SBSA build. Need to determine whether
14091400
# this is necessary to incorporate.
14101401
if target_platform() != "rhel":

0 commit comments

Comments
 (0)