temp: DCGM - internal repository.

mc-nv · mc-nv · commit 99d7476159c6 · 2025-07-30T09:53:00.000-07:00
diff --git a/Dockerfile.sdk b/Dockerfile.sdk
@@ -263,7 +263,13 @@ RUN pip3 install --upgrade "numpy<2" pillow attrdict && \
          "tritonclient-*linux*.whl" | xargs printf -- '%s[all]' | \
     xargs pip3 install --upgrade
 
+ARG DCGM_SOURCE_LIST
 # Install DCGM
+RUN if [ -n "${DCGM_SOURCE_LIST}" ]; then \
+        echo "deb [trusted=yes] $DCGM_SOURCE_LIST / " > /etc/apt/sources.list.d/dcgm-list.list && \
+        cat /etc/apt/sources.list.d/dcgm-list.list; \
+    fi
+
 RUN if [ "$TRITON_ENABLE_GPU" = "ON" ]; then \
         [ "$(uname -m)" != "x86_64" ] && arch="sbsa" || arch="x86_64" && \
         curl -o /tmp/cuda-keyring.deb \
diff --git a/build.py b/build.py
@@ -841,7 +841,15 @@ def tensorrtllm_cmake_args(images):
     return cargs
 
 
-def install_dcgm_libraries(dcgm_version, target_machine):
+def install_dcgm_libraries(dcgm_version):
+    if os.getenv("DCGM_SOURCE_LIST"):
+        dcgm_source_list = """
+RUN echo "deb [trusted=yes] {} / " > /etc/apt/sources.list.d/dcgm-list.list \\
+    && cat /etc/apt/sources.list.d/dcgm-list.list""".format(
+            os.getenv("DCGM_SOURCE_LIST")
+        )
+    else:
+        dcgm_source_list = ""
     if dcgm_version == "":
         fail(
             "unable to determine default repo-tag, DCGM version not known for {}".format(
@@ -852,11 +860,13 @@ def install_dcgm_libraries(dcgm_version, target_machine):
     else:
         # RHEL has the same install instructions for both aarch64 and x86
         if target_platform() == "rhel":
-            if target_machine == "aarch64":
-                return """
+            return (
+                dcgm_source_list
+                + """
 ENV DCGM_VERSION {}
 # Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads
-RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo \\
+RUN ARCH=$( [ $(uname -m) = "x86_64" ] && echo "$(uname -m)" || echo "sbsa" ) \\
+    && dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/${{ARCH}}/cuda-rhel8.repo \\
     && dnf clean expire-cache \\
     && dnf install --assumeyes \\
                  datacenter-gpu-manager-4-core=1:{} \\
@@ -876,13 +886,16 @@ def install_dcgm_libraries(dcgm_version, target_machine):
 """.format(
                     dcgm_version, dcgm_version, dcgm_version
                 )
+            )
         else:
-            if target_machine == "aarch64":
-                return """
+            return (
+                dcgm_source_list
+                + """
 ENV DCGM_VERSION {}
 # Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads
-RUN curl -o /tmp/cuda-keyring.deb \\
-        https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/sbsa/cuda-keyring_1.1-1_all.deb \\
+RUN ARCH=$( [ $(uname -m) = "x86_64" ] && echo "$(uname -m)" || echo "sbsa" ) \\
+      && curl -o /tmp/cuda-keyring.deb \\
+        https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/${{ARCH}}/cuda-keyring_1.1-1_all.deb \\
       && apt install /tmp/cuda-keyring.deb \\
       && rm /tmp/cuda-keyring.deb \\
       && apt update \\
@@ -907,6 +920,7 @@ def install_dcgm_libraries(dcgm_version, target_machine):
 """.format(
                     dcgm_version, dcgm_version, dcgm_version
                 )
+            )
 
 
 def create_dockerfile_buildbase_rhel(ddir, dockerfile_name, argmap):
@@ -1007,7 +1021,7 @@ def create_dockerfile_buildbase_rhel(ddir, dockerfile_name, argmap):
       && mv /tmp/boost_1_80_0/boost /usr/include/boost
 """
     if FLAGS.enable_gpu:
-        df += install_dcgm_libraries(argmap["DCGM_VERSION"], target_machine())
+        df += install_dcgm_libraries(argmap["DCGM_VERSION"])
     df += """
 ENV TRITON_SERVER_VERSION ${TRITON_VERSION}
 ENV NVIDIA_TRITON_SERVER_VERSION ${TRITON_CONTAINER_VERSION}
@@ -1120,7 +1134,7 @@ def create_dockerfile_buildbase(ddir, dockerfile_name, argmap):
 """
 
         if FLAGS.enable_gpu:
-            df += install_dcgm_libraries(argmap["DCGM_VERSION"], target_machine())
+            df += install_dcgm_libraries(argmap["DCGM_VERSION"])
 
     df += """
 ENV TRITON_SERVER_VERSION ${TRITON_VERSION}
@@ -1412,7 +1426,7 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach
         df += fastertransformer_buildscript.create_postbuild(is_multistage_build=False)
 
     if enable_gpu:
-        df += install_dcgm_libraries(argmap["DCGM_VERSION"], target_machine)
+        df += install_dcgm_libraries(argmap["DCGM_VERSION"])
         # This segment will break the RHEL SBSA build. Need to determine whether
         # this is necessary to incorporate.
         if target_platform() != "rhel":