     "ort_version": "1.22.0",
     "ort_openvino_version": "2025.2.0",
     "standalone_openvino_version": "2025.2.0",
-    "dcgm_version": "3.3.6",
+    "dcgm_version": "4",
     "vllm_version": "0.9.0.1",
     "rhel_py_version": "3.12.3",
 }
@@ -841,7 +841,15 @@ def tensorrtllm_cmake_args(images):
     return cargs


-def install_dcgm_libraries(dcgm_version, target_machine):
+def install_dcgm_libraries(dcgm_version):
+    if os.getenv("DCGM_SOURCE_LIST"):
+        dcgm_source_list = """
+RUN echo "deb [trusted=yes] {} / " > /etc/apt/sources.list.d/dcgm-list.list \\
+    && cat /etc/apt/sources.list.d/dcgm-list.list""".format(
+            os.getenv("DCGM_SOURCE_LIST")
+        )
+    else:
+        dcgm_source_list = ""
     if dcgm_version == "":
         fail(
             "unable to determine default repo-tag, DCGM version not known for {}".format(
@@ -852,53 +860,36 @@ def install_dcgm_libraries(dcgm_version, target_machine):
     else:
         # RHEL has the same install instructions for both aarch64 and x86
         if target_platform() == "rhel":
-            if target_machine == "aarch64":
-                return """
-ENV DCGM_VERSION {}
-# Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads
-RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo \\
-    && dnf clean expire-cache \\
-    && dnf install -y datacenter-gpu-manager-{}
-""".format(
-                    dcgm_version, dcgm_version
-                )
-            else:
-                return """
+            return (
+                dcgm_source_list
+                + """
 ENV DCGM_VERSION {}
 # Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads
-RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo \\
+RUN ARCH=$( [ $(uname -m) = "x86_64" ] && echo "$(uname -m)" || echo "sbsa" ) \\
+    && dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/${{ARCH}}/cuda-rhel8.repo \\
     && dnf clean expire-cache \\
-    && dnf install -y datacenter-gpu-manager-{}
+    && dnf install -y datacenter-gpu-manager-{}-dev
 """.format(
                     dcgm_version, dcgm_version
                 )
+            )
         else:
-            if target_machine == "aarch64":
-                return """
+            return (
+                dcgm_source_list
+                + """
 ENV DCGM_VERSION {}
 # Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads
-RUN curl -o /tmp/cuda-keyring.deb \\
-    https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/sbsa/cuda-keyring_1.1-1_all.deb \\
+RUN ARCH=$( [ $(uname -m) = "x86_64" ] && echo "$(uname -m)" || echo "sbsa" ) \\
+    && curl -o /tmp/cuda-keyring.deb \\
+    https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/${{ARCH}}/cuda-keyring_1.1-1_all.deb \\
     && apt install /tmp/cuda-keyring.deb \\
-    && rm /tmp/cuda-keyring.deb \\
+    && rm /tmp/cuda-keyring.deb \\
     && apt-get update \\
-    && apt-get install -y datacenter-gpu-manager=1:{}
-""".format(
-                    dcgm_version, dcgm_version
-                )
-            else:
-                return """
-ENV DCGM_VERSION {}
-# Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads
-RUN curl -o /tmp/cuda-keyring.deb \\
-    https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb \\
-    && apt install /tmp/cuda-keyring.deb \\
-    && rm /tmp/cuda-keyring.deb \\
-    && apt-get update \\
-    && apt-get install -y datacenter-gpu-manager=1:{}
+    && apt-get install -y datacenter-gpu-manager-{}-dev
 """.format(
                     dcgm_version, dcgm_version
                 )
+            )


 def create_dockerfile_buildbase_rhel(ddir, dockerfile_name, argmap):
@@ -999,7 +990,7 @@ def create_dockerfile_buildbase_rhel(ddir, dockerfile_name, argmap):
     && mv /tmp/boost_1_80_0/boost /usr/include/boost
 """
     if FLAGS.enable_gpu:
-        df += install_dcgm_libraries(argmap["DCGM_VERSION"], target_machine())
+        df += install_dcgm_libraries(argmap["DCGM_VERSION"])
     df += """
 ENV TRITON_SERVER_VERSION ${TRITON_VERSION}
 ENV NVIDIA_TRITON_SERVER_VERSION ${TRITON_CONTAINER_VERSION}
@@ -1112,7 +1103,7 @@ def create_dockerfile_buildbase(ddir, dockerfile_name, argmap):
 """

     if FLAGS.enable_gpu:
-        df += install_dcgm_libraries(argmap["DCGM_VERSION"], target_machine())
+        df += install_dcgm_libraries(argmap["DCGM_VERSION"])

     df += """
 ENV TRITON_SERVER_VERSION ${TRITON_VERSION}
@@ -1404,7 +1395,7 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach
         df += fastertransformer_buildscript.create_postbuild(is_multistage_build=False)

     if enable_gpu:
-        df += install_dcgm_libraries(argmap["DCGM_VERSION"], target_machine)
+        df += install_dcgm_libraries(argmap["DCGM_VERSION"])
         # This segment will break the RHEL SBSA build. Need to determine whether
         # this is necessary to incorporate.
         if target_platform() != "rhel":
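
Reviewer note (not part of the diff): the new `DCGM_SOURCE_LIST` hook only prepends an extra package-source line to the generated Dockerfile when that environment variable is set at build time, and the single `uname -m` check replaces the old per-architecture branches by mapping anything other than x86_64 to NVIDIA's "sbsa" repo path. The sketch below is a minimal, self-contained illustration of those two behaviors; the helper names and the mirror URL are hypothetical and are not code from this PR.

```python
import os


def arch_token(machine: str) -> str:
    """Mirror the in-container ARCH detection: x86_64 stays x86_64,
    anything else (e.g. aarch64) maps to NVIDIA's "sbsa" repo path."""
    return machine if machine == "x86_64" else "sbsa"


def dcgm_source_snippet() -> str:
    """Emit an extra package-source line for the generated Dockerfile only
    when DCGM_SOURCE_LIST is set; otherwise contribute nothing."""
    src = os.getenv("DCGM_SOURCE_LIST")
    if not src:
        return ""
    template = (
        'RUN echo "deb [trusted=yes] {} / " > /etc/apt/sources.list.d/dcgm-list.list \\\n'
        "    && cat /etc/apt/sources.list.d/dcgm-list.list\n"
    )
    return template.format(src)


if __name__ == "__main__":
    # Hypothetical internal mirror URL, for illustration only.
    os.environ["DCGM_SOURCE_LIST"] = "https://mirror.example.com/dcgm"
    print(arch_token("aarch64"))  # -> sbsa
    print(dcgm_source_snippet(), end="")
```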