     "ort_version": "1.22.0",
     "ort_openvino_version": "2025.2.0",
     "standalone_openvino_version": "2025.2.0",
-    "dcgm_version": "3.3.6",
+    "dcgm_version": "4",
     "vllm_version": "0.9.0.1",
     "rhel_py_version": "3.12.3",
 }
@@ -841,7 +841,15 @@ def tensorrtllm_cmake_args(images):
     return cargs


-def install_dcgm_libraries(dcgm_version, target_machine):
+def install_dcgm_libraries(dcgm_version):
+    if os.getenv("DCGM_SOURCE_LIST"):
+        dcgm_source_list = """
+RUN echo "deb [trusted=yes] {} / " > /etc/apt/sources.list.d/dcgm-list.list \\
+    && cat /etc/apt/sources.list.d/dcgm-list.list""".format(
+            os.getenv("DCGM_SOURCE_LIST")
+        )
+    else:
+        dcgm_source_list = ""
     if dcgm_version == "":
         fail(
             "unable to determine default repo-tag, DCGM version not known for {}".format(
@@ -852,53 +860,36 @@ def install_dcgm_libraries(dcgm_version, target_machine):
     else:
         # RHEL has the same install instructions for both aarch64 and x86
         if target_platform() == "rhel":
-            if target_machine == "aarch64":
-                return """
-ENV DCGM_VERSION {}
-# Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads
-RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo \\
-    && dnf clean expire-cache \\
-    && dnf install -y datacenter-gpu-manager-{}
-""".format(
-                    dcgm_version, dcgm_version
-                )
-            else:
-                return """
+            return (
+                dcgm_source_list
+                + """
 ENV DCGM_VERSION {}
 # Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads
-RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo \\
+RUN ARCH=$( [ $(uname -m) = "x86_64" ] && echo "$(uname -m)" || echo "sbsa" ) \\
+    && dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/${{ARCH}}/cuda-rhel8.repo \\
     && dnf clean expire-cache \\
-    && dnf install -y datacenter-gpu-manager-{}
+    && dnf install -y datacenter-gpu-manager-{}-dev
 """.format(
                     dcgm_version, dcgm_version
                 )
+            )
         else:
-            if target_machine == "aarch64":
-                return """
+            return (
+                dcgm_source_list
+                + """
 ENV DCGM_VERSION {}
 # Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads
-RUN curl -o /tmp/cuda-keyring.deb \\
-    https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/sbsa/cuda-keyring_1.1-1_all.deb \\
+RUN ARCH=$( [ $(uname -m) = "x86_64" ] && echo "$(uname -m)" || echo "sbsa" ) \\
+    && curl -o /tmp/cuda-keyring.deb \\
+    https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/${{ARCH}}/cuda-keyring_1.1-1_all.deb \\
     && apt install /tmp/cuda-keyring.deb \\
-    && rm /tmp/cuda-keyring.deb \\
+    && rm /tmp/cuda-keyring.deb \\
     && apt-get update \\
-    && apt-get install -y datacenter-gpu-manager=1:{}
-""".format(
-                    dcgm_version, dcgm_version
-                )
-            else:
-                return """
-ENV DCGM_VERSION {}
-# Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads
-RUN curl -o /tmp/cuda-keyring.deb \\
-    https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb \\
-    && apt install /tmp/cuda-keyring.deb \\
-    && rm /tmp/cuda-keyring.deb \\
-    && apt-get update \\
-    && apt-get install -y datacenter-gpu-manager=1:{}
+    && apt-get install -y datacenter-gpu-manager-{}-dev
 """.format(
                     dcgm_version, dcgm_version
                 )
+            )


 def create_dockerfile_buildbase_rhel(ddir, dockerfile_name, argmap):
@@ -999,7 +990,7 @@ def create_dockerfile_buildbase_rhel(ddir, dockerfile_name, argmap):
     && mv /tmp/boost_1_80_0/boost /usr/include/boost
 """
     if FLAGS.enable_gpu:
-        df += install_dcgm_libraries(argmap["DCGM_VERSION"], target_machine())
+        df += install_dcgm_libraries(argmap["DCGM_VERSION"])
     df += """
 ENV TRITON_SERVER_VERSION ${TRITON_VERSION}
 ENV NVIDIA_TRITON_SERVER_VERSION ${TRITON_CONTAINER_VERSION}
@@ -1112,7 +1103,7 @@ def create_dockerfile_buildbase(ddir, dockerfile_name, argmap):
 """

     if FLAGS.enable_gpu:
-        df += install_dcgm_libraries(argmap["DCGM_VERSION"], target_machine())
+        df += install_dcgm_libraries(argmap["DCGM_VERSION"])

     df += """
 ENV TRITON_SERVER_VERSION ${TRITON_VERSION}
@@ -1404,7 +1395,7 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach
         df += fastertransformer_buildscript.create_postbuild(is_multistage_build=False)

     if enable_gpu:
-        df += install_dcgm_libraries(argmap["DCGM_VERSION"], target_machine)
+        df += install_dcgm_libraries(argmap["DCGM_VERSION"])
         # This segment will break the RHEL SBSA build. Need to determine whether
         # this is necessary to incorporate.
         if target_platform() != "rhel":
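
Reviewer note (not part of the diff): the new `DCGM_SOURCE_LIST` hook only prepends an extra package-source line to the generated Dockerfile when that environment variable is set at build time, and the single `uname -m` check replaces the old per-architecture branches by mapping anything other than x86_64 to NVIDIA's "sbsa" repo path. The sketch below is a minimal, self-contained illustration of those two behaviors; the helper names and the mirror URL are hypothetical and are not code from this PR.

```python
import os


def arch_token(machine: str) -> str:
    """Mirror the in-container ARCH detection: x86_64 stays x86_64,
    anything else (e.g. aarch64) maps to NVIDIA's "sbsa" repo path."""
    return machine if machine == "x86_64" else "sbsa"


def dcgm_source_snippet() -> str:
    """Emit an extra package-source line for the generated Dockerfile only
    when DCGM_SOURCE_LIST is set; otherwise contribute nothing."""
    src = os.getenv("DCGM_SOURCE_LIST")
    if not src:
        return ""
    template = (
        'RUN echo "deb [trusted=yes] {} / " > /etc/apt/sources.list.d/dcgm-list.list \\\n'
        "    && cat /etc/apt/sources.list.d/dcgm-list.list\n"
    )
    return template.format(src)


if __name__ == "__main__":
    # Hypothetical internal mirror URL, for illustration only.
    os.environ["DCGM_SOURCE_LIST"] = "https://mirror.example.com/dcgm"
    print(arch_token("aarch64"))  # -> sbsa
    print(dcgm_source_snippet(), end="")
```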