11# syntax=docker/dockerfile:1.2
2- ARG MERLIN_VERSION=23.06
3- ARG TRITON_VERSION=23.06
2+ ARG MERLIN_VERSION=24.06
3+ ARG TRITON_VERSION=24.03
44
5- ARG BASE_IMAGE=nvcr.io/nvstaging/merlin/merlin-base:${MERLIN_VERSION}
5+ ARG BASE_IMAGE=nvcr.io/nvstaging/merlin/ctr-base:${MERLIN_VERSION}
66
77FROM ${BASE_IMAGE} as base
88
99ARG HUGECTR_VER=main
1010ARG HUGECTR_BACKEND_VER=main
1111
12- RUN pip install --no-cache-dir --upgrade notebook ipython
13- RUN pip install --no-cache-dir mpi4py
12+ RUN pip install --no-cache-dir --upgrade notebook ipython mpi4py
1413
1514# Install CUDA-Aware hwloc
1615ARG HWLOC_VER=2.4.1
@@ -45,22 +44,86 @@ ENV SHARP_COLL_NUM_COLL_GROUP_RESOURCE_ALLOC_THRESHOLD=0
4544ENV SHARP_COLL_LOCK_ON_COMM_INIT=1
4645ENV SHARP_COLL_LOG_LEVEL=3
4746ENV HCOLL_ENABLE_MCAST=0
47+ ENV LD_LIBRARY_PATH=/usr/local/lib/python${PYTHON_VERSION}/dist-packages/tensorflow:$LD_LIBRARY_PATH \
48+ SOK_COMPILE_UNIT_TEST=ON
4849
4950# link sub modules expected by hugectr cmake
5051RUN ln -s /usr/lib/libcudf.so /usr/lib/libcudf_base.so
5152RUN ln -s /usr/lib/libcudf.so /usr/lib/libcudf_io.so
5253RUN ln -s libibverbs.so.1 $(find /usr/lib/*-linux-gnu/libibverbs.so.1 | sed -e 's/\.1$//g')
5354
54- # Install HugeCTR
55+ # Optional dependency: Build and install protocol buffers and Hadoop/HDFS.
56+ ARG INSTALL_HDFS=false
57+ # Env for HDFS
58+ ENV HADOOP_HOME=/opt/hadoop
59+ ENV PATH=${PATH}:${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin \
60+ HDFS_NAMENODE_USER=root \
61+ HDFS_SECONDARYNAMENODE_USER=root \
62+ HDFS_DATANODE_USER=root \
63+ YARN_RESOURCEMANAGER_USER=root \
64+ YARN_NODEMANAGER_USER=root \
65+ # Works around ThreadReaper stack overflow issues: https://bugs.openjdk.java.net/browse/JDK-8153057
66+ LIBHDFS_OPTS='-Djdk.lang.processReaperUseDefaultStackSize=true' \
67+ # Avoids conflicts with the JVM setting error signals that the UCX library checks (GitLab issue #425).
68+ UCX_ERROR_SIGNALS='' \
69+ CLASSPATH=${CLASSPATH}:\
70+ ${HADOOP_HOME}/etc/hadoop/*:\
71+ ${HADOOP_HOME}/share/hadoop/common/*:\
72+ ${HADOOP_HOME}/share/hadoop/common/lib/*:\
73+ ${HADOOP_HOME}/share/hadoop/hdfs/*:\
74+ ${HADOOP_HOME}/share/hadoop/hdfs/lib/*:\
75+ ${HADOOP_HOME}/share/hadoop/mapreduce/*:\
76+ ${HADOOP_HOME}/share/hadoop/yarn/*:\
77+ ${HADOOP_HOME}/share/hadoop/yarn/lib/*
78+
79+ # Install Inference and HPS Backend
80+ ARG HUGECTR_DEV_MODE=false
81+ ARG HUGECTR_VER=main
82+ ARG _HUGECTR_REPO="github.com/NVIDIA-Merlin/HugeCTR.git"
83+ ARG HUGECTR_BACKEND_VER=main
84+ ARG _CI_JOB_TOKEN=""
85+ ARG _HUGECTR_BACKEND_REPO="github.com/triton-inference-server/hugectr_backend.git"
5586ARG HUGECTR_HOME=/usr/local/hugectr
56- RUN if [[ "${HUGECTR_DEV_MODE}" == "false" ]]; then \
57- rm -rf ${HUGECTR_HOME}/lib/libgmock* ${HUGECTR_HOME}/lib/pkgconfig/gmock* ${HUGECTR_HOME}/include/gmock && \
58- rm -rf ${HUGECTR_HOME}/lib/libgtest* ${HUGECTR_HOME}/lib/pkgconfig/gtest* ${HUGECTR_HOME}/include/gtest && \
87+ ARG TRITON_VERSION
88+
89+ ENV PATH=$PATH:${HUGECTR_HOME}/bin \
90+ CPATH=$CPATH:${HUGECTR_HOME}/include \
91+ LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HUGECTR_HOME}/lib
92+
93+ RUN if [ "${HUGECTR_DEV_MODE}" == "false" ]; then \
94+ # Install HugeCTR inference which is dependency for hps_backend
5995 git clone --branch ${HUGECTR_VER} --depth 1 https://${_CI_JOB_TOKEN}${_HUGECTR_REPO} /hugectr && \
6096 cd /hugectr && \
6197 git submodule update --init --recursive && \
6298 mkdir build && \
6399 cd build && \
100+ if [[ "${INSTALL_HDFS}" == "false" ]]; then \
101+ cmake -DCMAKE_BUILD_TYPE=Release -DSM="70;75;80;90" -DENABLE_INFERENCE=ON .. \
102+ ; else \
103+ cmake -DCMAKE_BUILD_TYPE=Release -DSM="70;75;80;90" -DENABLE_INFERENCE=ON -DENABLE_HDFS=ON .. \
104+ ; fi && \
105+ make -j$(nproc) && \
106+ make install && \
107+ rm -rf ./* && \
108+ # Install hps_backend
109+ git clone --branch ${HUGECTR_BACKEND_VER} --depth 1 https://${_CI_JOB_TOKEN}${_HUGECTR_BACKEND_REPO} /repos/hugectr_triton_backend && \
110+ mkdir /repos/hugectr_triton_backend/hps_backend/build && \
111+ cd /repos/hugectr_triton_backend/hps_backend/build && \
112+ cmake \
113+ -DCMAKE_INSTALL_PREFIX:PATH=${HUGECTR_HOME} \
114+ -DTRITON_COMMON_REPO_TAG="r${TRITON_VERSION}" \
115+ -DTRITON_CORE_REPO_TAG="r${TRITON_VERSION}" \
116+ -DTRITON_BACKEND_REPO_TAG="r${TRITON_VERSION}" .. && \
117+ make -j$(nproc) && \
118+ make install && \
119+ chmod +x ${HUGECTR_HOME}/lib/*.so ${HUGECTR_HOME}/backends/hps/*.so && \
120+ cd ../../.. && \
121+ rm -rf hugectr_triton_backend && \
122+ # Remove the incompatible gmock and gtest installed by hps_backend
123+ rm -rf ${HUGECTR_HOME}/lib/libgmock* ${HUGECTR_HOME}/lib/pkgconfig/gmock* ${HUGECTR_HOME}/include/gmock && \
124+ rm -rf ${HUGECTR_HOME}/lib/libgtest* ${HUGECTR_HOME}/lib/pkgconfig/gtest* ${HUGECTR_HOME}/include/gtest && \
125+ # Install HugeCTR multinode
126+ cd /hugectr/build && \
64127 LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs/:$LD_LIBRARY_PATH && \
65128 export PATH=$PATH:/usr/local/cuda-$(echo $CUDA_VERSION | awk -F'.' '{print $1"."$2}')/compat && \
66129 if [[ "${INSTALL_HDFS}" == "false" ]]; then \
@@ -70,13 +133,34 @@ RUN if [[ "${HUGECTR_DEV_MODE}" == "false" ]]; then \
70133 ; fi && \
71134 make -j$(nproc) && \
72135 make install && \
73- rm -rf ./* && \
74136 chmod +x ${HUGECTR_HOME}/bin/* ${HUGECTR_HOME}/lib/*.so && \
75- cd ../onnx_converter && \
137+ # Install HPS trt plugin
138+ cd ../hps_trt && \
139+ mkdir build && \
140+ cd build && \
141+ cmake -DSM="70;75;80;90" .. && \
142+ make -j$(nproc) && \
143+ make install && \
144+ cd ../../onnx_converter && \
76145 python setup.py install && \
77- mv /hugectr/ci ~/hugectr-ci && rm -rf /hugectr && mkdir -p /hugectr && mv ~/hugectr-ci /hugectr/ci \
146+ pip --no-cache-dir install ninja tf2onnx && \
147+ # Install SOK
148+ cd ../sparse_operation_kit && \
149+ python setup.py install && \
150+ # Install HPS TF plugin
151+ cd ../hps_tf && \
152+ python setup.py install && \
153+ # Install hps_torch
154+ cd ../hps_torch/ && \
155+ TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 9.0" python setup.py install && \
156+ mv /hugectr/ci ~/hugectr-ci && mv /hugectr/sparse_operation_kit/sparse_operation_kit ~/hugectr-sparse_operation_kit && \
157+ rm -rf /hugectr && mkdir -p /hugectr /hugectr/sparse_operation_kit && \
158+ mv ~/hugectr-ci /hugectr/ci && mv ~/hugectr-sparse_operation_kit /hugectr/sparse_operation_kit/sparse_operation_kit && \
159+ chmod +x /hugectr/ci/* /hugectr/sparse_operation_kit/sparse_operation_kit/* \
78160 ; fi
79161
162+ RUN ln -s ${HUGECTR_HOME}/backends/hps /opt/tritonserver/backends/hps
163+
80164ENV PYTHONPATH=${PYTHONPATH}:${HUGECTR_HOME}/lib
81165
82166# Clean up
0 commit comments