Skip to content

Commit c4eb92d

Browse files
authored
[HugeCTR]Add a new base for hugectr (#1098)
* Add ctr base image and install ctr components in merlin-hugectr * Use same version for pytorch base image * Correct torch python folder name for new version * Add merlin and test script for ctr-base * Update test script for new ctr-base and merlin-hugectr * Fix typo * Remove some packages hugectr may not use * Add back keras since SOK uses it * Refactor dockerfiles * Correct relative path * Upgrade upstream image to 24.03 * Remove libboost, which is not in the triton container * Add libhdf5-dev * Add execution privilege for test scripts * Remove unused test script * Correct for base version
1 parent c0f43d6 commit c4eb92d

File tree

4 files changed

+424
-14
lines changed

4 files changed

+424
-14
lines changed

ci/container_hugectr.sh

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#!/bin/bash
#
# Run the unit-test suites for the merlin-hugectr container.
#
# Usage: container_hugectr.sh <container> <devices>
#   container  Name of the container under test; the suites only run when
#              it is exactly "merlin-hugectr".
#   devices    Device specification, forwarded unchanged to
#              /hugectr/ci/test_unit.sh.
#
# Exit status: 0 if every invoked suite succeeded (or none was run),
#              1 if at least one suite failed.

container=$1
devices=$2

echo "##############"
echo "# Unit tests #"
echo "##############"

# Sticky failure flag: a failing suite sets it to 1 but does not stop the
# remaining suites from running.
exit_code=0

## Test HugeCTR
if [[ "$container" == "merlin-hugectr" ]]; then
  echo "Run unit tests for HugeCTR"
  # Arguments are quoted so they are never word-split or glob-expanded.
  /hugectr/ci/test_unit.sh "$container" "$devices" || exit_code=1

  # The merlin-sok suite is requested by passing "merlin-tensorflow" as the
  # container name to the same test driver.
  echo "Run unit tests for merlin-sok"
  /hugectr/ci/test_unit.sh "merlin-tensorflow" "$devices" || exit_code=1
fi

exit "$exit_code"

ci/test_container.sh

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,11 @@ if [ $container != 'merlin-ci-runner' ]; then
1717
fi
1818

1919
${ci_script_dir}container_software.sh $container $devices
20-
${ci_script_dir}container_integration.sh $container $devices $suppress_failures
21-
${ci_script_dir}container_unit.sh $container $devices
20+
21+
if [ $container == 'merlin-hugectr' ]; then
22+
${ci_script_dir}container_hugectr.sh $container $devices
23+
elif [ $container != 'ctr-base' ]; then
24+
${ci_script_dir}container_integration.sh $container $devices $suppress_failures
25+
${ci_script_dir}container_unit.sh $container $devices
26+
fi
2227

docker/dockerfile.ctr

Lines changed: 96 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,15 @@
11
# syntax=docker/dockerfile:1.2
2-
ARG MERLIN_VERSION=23.06
3-
ARG TRITON_VERSION=23.06
2+
ARG MERLIN_VERSION=24.06
3+
ARG TRITON_VERSION=24.03
44

5-
ARG BASE_IMAGE=nvcr.io/nvstaging/merlin/merlin-base:${MERLIN_VERSION}
5+
ARG BASE_IMAGE=nvcr.io/nvstaging/merlin/ctr-base:${MERLIN_VERSION}
66

77
FROM ${BASE_IMAGE} as base
88

99
ARG HUGECTR_VER=main
1010
ARG HUGECTR_BACKEND_VER=main
1111

12-
RUN pip install --no-cache-dir --upgrade notebook ipython
13-
RUN pip install --no-cache-dir mpi4py
12+
RUN pip install --no-cache-dir --upgrade notebook ipython mpi4py
1413

1514
# Install CUDA-Aware hwloc
1615
ARG HWLOC_VER=2.4.1
@@ -45,22 +44,86 @@ ENV SHARP_COLL_NUM_COLL_GROUP_RESOURCE_ALLOC_THRESHOLD=0
4544
ENV SHARP_COLL_LOCK_ON_COMM_INIT=1
4645
ENV SHARP_COLL_LOG_LEVEL=3
4746
ENV HCOLL_ENABLE_MCAST=0
47+
ENV LD_LIBRARY_PATH=/usr/local/lib/python${PYTHON_VERSION}/dist-packages/tensorflow:$LD_LIBRARY_PATH \
48+
SOK_COMPILE_UNIT_TEST=ON
4849

4950
# link sub modules expected by hugectr cmake
5051
RUN ln -s /usr/lib/libcudf.so /usr/lib/libcudf_base.so
5152
RUN ln -s /usr/lib/libcudf.so /usr/lib/libcudf_io.so
5253
RUN ln -s libibverbs.so.1 $(find /usr/lib/*-linux-gnu/libibverbs.so.1 | sed -e 's/\.1$//g')
5354

54-
# Install HugeCTR
55+
# Optional dependency: Build and install protocol buffers and Hadoop/HDFS.
56+
ARG INSTALL_HDFS=false
57+
# Env for HDFS
58+
ENV HADOOP_HOME=/opt/hadoop
59+
ENV PATH=${PATH}:${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin \
60+
HDFS_NAMENODE_USER=root \
61+
HDFS_SECONDARYNAMENODE_USER=root \
62+
HDFS_DATANODE_USER=root \
63+
YARN_RESOURCEMANAGER_USER=root \
64+
YARN_NODEMANAGER_USER=root \
65+
# Works around ThreadReaper stack overflow issues: https://bugs.openjdk.java.net/browse/JDK-8153057
66+
LIBHDFS_OPTS='-Djdk.lang.processReaperUseDefaultStackSize=true' \
67+
# Works around the JVM setting error signals that the UCX library checks (GitLab issue #425).
68+
UCX_ERROR_SIGNALS='' \
69+
CLASSPATH=${CLASSPATH}:\
70+
${HADOOP_HOME}/etc/hadoop/*:\
71+
${HADOOP_HOME}/share/hadoop/common/*:\
72+
${HADOOP_HOME}/share/hadoop/common/lib/*:\
73+
${HADOOP_HOME}/share/hadoop/hdfs/*:\
74+
${HADOOP_HOME}/share/hadoop/hdfs/lib/*:\
75+
${HADOOP_HOME}/share/hadoop/mapreduce/*:\
76+
${HADOOP_HOME}/share/hadoop/yarn/*:\
77+
${HADOOP_HOME}/share/hadoop/yarn/lib/*
78+
79+
# Install Inference and HPS Backend
80+
ARG HUGECTR_DEV_MODE=false
81+
ARG HUGECTR_VER=main
82+
ARG _HUGECTR_REPO="github.com/NVIDIA-Merlin/HugeCTR.git"
83+
ARG HUGECTR_BACKEND_VER=main
84+
ARG _CI_JOB_TOKEN=""
85+
ARG _HUGECTR_BACKEND_REPO="github.com/triton-inference-server/hugectr_backend.git"
5586
ARG HUGECTR_HOME=/usr/local/hugectr
56-
RUN if [[ "${HUGECTR_DEV_MODE}" == "false" ]]; then \
57-
rm -rf ${HUGECTR_HOME}/lib/libgmock* ${HUGECTR_HOME}/lib/pkgconfig/gmock* ${HUGECTR_HOME}/include/gmock && \
58-
rm -rf ${HUGECTR_HOME}/lib/libgtest* ${HUGECTR_HOME}/lib/pkgconfig/gtest* ${HUGECTR_HOME}/include/gtest && \
87+
ARG TRITON_VERSION
88+
89+
ENV PATH=$PATH:${HUGECTR_HOME}/bin \
90+
CPATH=$CPATH:${HUGECTR_HOME}/include \
91+
LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HUGECTR_HOME}/lib
92+
93+
RUN if [ "${HUGECTR_DEV_MODE}" == "false" ]; then \
94+
# Install HugeCTR inference which is dependency for hps_backend
5995
git clone --branch ${HUGECTR_VER} --depth 1 https://${_CI_JOB_TOKEN}${_HUGECTR_REPO} /hugectr && \
6096
cd /hugectr && \
6197
git submodule update --init --recursive && \
6298
mkdir build && \
6399
cd build && \
100+
if [[ "${INSTALL_HDFS}" == "false" ]]; then \
101+
cmake -DCMAKE_BUILD_TYPE=Release -DSM="70;75;80;90" -DENABLE_INFERENCE=ON .. \
102+
; else \
103+
cmake -DCMAKE_BUILD_TYPE=Release -DSM="70;75;80;90" -DENABLE_INFERENCE=ON -DENABLE_HDFS=ON .. \
104+
; fi && \
105+
make -j$(nproc) && \
106+
make install && \
107+
rm -rf ./* && \
108+
# Install hps_backend
109+
git clone --branch ${HUGECTR_BACKEND_VER} --depth 1 https://${_CI_JOB_TOKEN}${_HUGECTR_BACKEND_REPO} /repos/hugectr_triton_backend && \
110+
mkdir /repos/hugectr_triton_backend/hps_backend/build && \
111+
cd /repos/hugectr_triton_backend/hps_backend/build && \
112+
cmake \
113+
-DCMAKE_INSTALL_PREFIX:PATH=${HUGECTR_HOME} \
114+
-DTRITON_COMMON_REPO_TAG="r${TRITON_VERSION}" \
115+
-DTRITON_CORE_REPO_TAG="r${TRITON_VERSION}" \
116+
-DTRITON_BACKEND_REPO_TAG="r${TRITON_VERSION}" .. && \
117+
make -j$(nproc) && \
118+
make install && \
119+
chmod +x ${HUGECTR_HOME}/lib/*.so ${HUGECTR_HOME}/backends/hps/*.so && \
120+
cd ../../.. && \
121+
rm -rf hugectr_triton_backend && \
122+
# Remove the incompatible gmock and gtest installed by hps_backend
123+
rm -rf ${HUGECTR_HOME}/lib/libgmock* ${HUGECTR_HOME}/lib/pkgconfig/gmock* ${HUGECTR_HOME}/include/gmock && \
124+
rm -rf ${HUGECTR_HOME}/lib/libgtest* ${HUGECTR_HOME}/lib/pkgconfig/gtest* ${HUGECTR_HOME}/include/gtest && \
125+
# Install HugeCTR multinode
126+
cd /hugectr/build && \
64127
LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs/:$LD_LIBRARY_PATH && \
65128
export PATH=$PATH:/usr/local/cuda-$(echo $CUDA_VERSION | awk -F'.' '{print $1"."$2}')/compat && \
66129
if [[ "${INSTALL_HDFS}" == "false" ]]; then \
@@ -70,13 +133,34 @@ RUN if [[ "${HUGECTR_DEV_MODE}" == "false" ]]; then \
70133
; fi && \
71134
make -j$(nproc) && \
72135
make install && \
73-
rm -rf ./* && \
74136
chmod +x ${HUGECTR_HOME}/bin/* ${HUGECTR_HOME}/lib/*.so && \
75-
cd ../onnx_converter && \
137+
# Install HPS trt plugin
138+
cd ../hps_trt && \
139+
mkdir build && \
140+
cd build && \
141+
cmake -DSM="70;75;80;90" .. && \
142+
make -j$(nproc) && \
143+
make install && \
144+
cd ../../onnx_converter && \
76145
python setup.py install && \
77-
mv /hugectr/ci ~/hugectr-ci && rm -rf /hugectr && mkdir -p /hugectr && mv ~/hugectr-ci /hugectr/ci \
146+
pip --no-cache-dir install ninja tf2onnx && \
147+
# Install SOK
148+
cd ../sparse_operation_kit && \
149+
python setup.py install && \
150+
# Install HPS TF plugin
151+
cd ../hps_tf && \
152+
python setup.py install && \
153+
# Install hps_torch
154+
cd ../hps_torch/ && \
155+
TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 9.0" python setup.py install && \
156+
mv /hugectr/ci ~/hugectr-ci && mv /hugectr/sparse_operation_kit/sparse_operation_kit ~/hugectr-sparse_operation_kit && \
157+
rm -rf /hugectr && mkdir -p /hugectr /hugectr/sparse_operation_kit && \
158+
mv ~/hugectr-ci /hugectr/ci && mv ~/hugectr-sparse_operation_kit /hugectr/sparse_operation_kit/sparse_operation_kit && \
159+
chmod +x /hugectr/ci/* /hugectr/sparse_operation_kit/sparse_operation_kit/* \
78160
; fi
79161

162+
RUN ln -s ${HUGECTR_HOME}/backends/hps /opt/tritonserver/backends/hps
163+
80164
ENV PYTHONPATH=${PYTHONPATH}:${HUGECTR_HOME}/lib
81165

82166
# Clean up

0 commit comments

Comments
 (0)