@@ -34,7 +34,7 @@ TRT_ROOT="/usr/local/tensorrt"
 MODEL_NAME="gpt2_tensorrt_llm"
 NAME="tensorrt_llm_benchmarking_test"
 MODEL_REPOSITORY="$(pwd)/triton_model_repo"
-TENSORRTLLM_BACKEND_DIR="/opt/tritonserver/tensorrtllm_backend"
+TENSORRTLLM_BACKEND_DIR="/workspace/tensorrtllm_backend"
 GPT_DIR="$TENSORRTLLM_BACKEND_DIR/tensorrt_llm/examples/gpt"
 TOKENIZER_DIR="$GPT_DIR/gpt2"
 ENGINES_DIR="${BASE_DIR}/engines/inflight_batcher_llm/${NUM_GPUS}-gpu"
@@ -47,40 +47,27 @@ SERVER_TIMEOUT=${SERVER_TIMEOUT:=120}
 function clone_tensorrt_llm_backend_repo {
     rm -rf $TENSORRTLLM_BACKEND_DIR && mkdir $TENSORRTLLM_BACKEND_DIR
     apt-get update && apt-get install git-lfs -y --no-install-recommends
-    git clone --single-branch --depth=1 -b ${TENSORRTLLM_BACKEND_REPO_TAG} https://github.com/triton-inference-server/tensorrtllm_backend.git $TENSORRTLLM_BACKEND_DIR
+    git clone --single-branch --depth=1 -b ${TENSORRTLLM_BACKEND_REPO_TAG} ${TRITON_REPO_ORG}/tensorrtllm_backend.git $TENSORRTLLM_BACKEND_DIR
     cd $TENSORRTLLM_BACKEND_DIR && git lfs install && git submodule update --init --recursive
 }
 
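Note on the hunk above: the clone URL's org prefix is now read from ${TRITON_REPO_ORG}, so the script can be pointed at a fork or mirror without editing the clone line. The variable is presumably defined near the top of the script, outside this hunk; a minimal sketch of a backwards-compatible default, where the fallback value is an assumption rather than something shown in this diff:

    # Assumed default: fall back to the upstream org unless the caller overrides it.
    TRITON_REPO_ORG=${TRITON_REPO_ORG:-"https://github.com/triton-inference-server"}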
 # Update Open MPI to a version compatible with SLURM.
 function upgrade_openmpi {
-    cd /tmp/
     local CURRENT_VERSION=$(mpirun --version 2>&1 | awk '/Open MPI/ {gsub(/rc[0-9]+/, "", $NF); print $NF}')
 
     if [ -n "$CURRENT_VERSION" ] && dpkg --compare-versions "$CURRENT_VERSION" lt "5.0.1"; then
         # Uninstall the current version of Open MPI
-        wget "https://download.open-mpi.org/release/open-mpi/v$(echo "${CURRENT_VERSION}" | awk -F. '{print $1"."$2}')/openmpi-${CURRENT_VERSION}.tar.gz" || {
-            echo "Failed to download Open MPI ${CURRENT_VERSION}"
-            exit 1
-        }
-        rm -rf "openmpi-${CURRENT_VERSION}" && tar -xzf "openmpi-${CURRENT_VERSION}.tar.gz" && cd "openmpi-${CURRENT_VERSION}" || {
-            echo "Failed to extract Open MPI ${CURRENT_VERSION}"
-            exit 1
-        }
-        unset PMIX_VERSION && ./configure --prefix=/opt/hpcx/ompi/ && make uninstall || {
-            echo "Failed to uninstall Open MPI ${CURRENT_VERSION}"
-            exit 1
-        }
-        rm -rf /opt/hpcx/ompi/ /usr/local/mpi/ || {
-            echo "Failed to remove Open MPI ${CURRENT_VERSION} installation directories"
+        rm -r /opt/hpcx/ompi/ /usr/local/mpi && rm -rf /usr/lib/$(gcc -print-multiarch)/openmpi || {
+            echo "Failed to uninstall the existing Open MPI version $CURRENT_VERSION."
             exit 1
         }
-        cd ../ && rm -r openmpi-${CURRENT_VERSION}
     else
-        echo "Installed Open MPI version is not less than 5.0.1. Skipping the upgrade."
+        echo "The installed Open MPI version ($CURRENT_VERSION) is 5.0.1 or higher. Skipping the upgrade."
         return
     fi
 
     # Install SLURM supported Open MPI version
+    cd /tmp/
     wget "https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.1.tar.gz" || {
         echo "Failed to download Open MPI 5.0.1"
         exit 1
@@ -108,18 +95,6 @@ function upgrade_openmpi {
     mpirun --version
 }
 
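For readers of upgrade_openmpi above: the version probe is easiest to follow against sample output. Assuming `mpirun --version` prints a banner like `mpirun (Open MPI) 4.1.5rc2` (a hypothetical example, not output from this diff), the awk program matches the "Open MPI" line, strips any rcN suffix from the last field, and emits the bare version, which dpkg --compare-versions then checks against 5.0.1:

    # Hypothetical banner and the resulting value of CURRENT_VERSION:
    #   $ mpirun --version
    #   mpirun (Open MPI) 4.1.5rc2
    mpirun --version 2>&1 | awk '/Open MPI/ {gsub(/rc[0-9]+/, "", $NF); print $NF}'   # -> 4.1.5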
-function install_tensorrt_llm {
-    # Install CMake
-    bash ${TENSORRTLLM_BACKEND_DIR}/tensorrt_llm/docker/common/install_cmake.sh
-    export PATH="/usr/local/cmake/bin:${PATH}"
-
-    TORCH_INSTALL_TYPE="pypi" &&
-        (cd ${TENSORRTLLM_BACKEND_DIR}/tensorrt_llm &&
-            bash docker/common/install_pytorch.sh $TORCH_INSTALL_TYPE &&
-            python3 ./scripts/build_wheel.py --trt_root=/usr/local/tensorrt &&
-            pip3 install ./build/tensorrt_llm*.whl)
-}
-
 function build_gpt2_base_model {
     # Download weights from HuggingFace Transformers
     cd ${GPT_DIR} && rm -rf gpt2 && git clone https://huggingface.co/gpt2-medium gpt2 && cd gpt2
@@ -131,24 +106,21 @@ function build_gpt2_base_model {
     cd ${GPT_DIR}
 
     # Convert weights from HF Transformers to FT format
-    python3 hf_gpt_convert.py -p 1 -i gpt2 -o ./c-model/gpt2 --tensor-parallelism ${NUM_GPUS} --storage-type float16
+    python3 convert_checkpoint.py --model_dir gpt2 --dtype float16 --tp_size ${NUM_GPUS} --output_dir "./c-model/gpt2/${NUM_GPUS}-gpu/"
     cd ${BASE_DIR}
 }
 
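The new convert_checkpoint.py call replaces the legacy FasterTransformer-style converter with TensorRT-LLM's unified checkpoint format. A quick way to sanity-check the conversion step, assuming the usual output layout of a config.json plus one safetensors shard per tensor-parallel rank (exact file names may vary by TensorRT-LLM version):

    # Assumed layout: config.json, rank0.safetensors, ..., rank$((NUM_GPUS-1)).safetensors
    ls -l "./c-model/gpt2/${NUM_GPUS}-gpu/"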
 function build_gpt2_tensorrt_engine {
     # Build TensorRT engines
     cd ${GPT_DIR}
-    python3 build.py --model_dir="./c-model/gpt2/${NUM_GPUS}-gpu/" \
-        --world_size="${NUM_GPUS}" \
-        --dtype float16 \
-        --use_inflight_batching \
-        --use_gpt_attention_plugin float16 \
-        --paged_kv_cache \
-        --use_gemm_plugin float16 \
-        --remove_input_padding \
-        --hidden_act gelu \
-        --parallel_build \
-        --output_dir="${ENGINES_DIR}"
+    trtllm-build --checkpoint_dir "./c-model/gpt2/${NUM_GPUS}-gpu/" \
+        --gpt_attention_plugin float16 \
+        --remove_input_padding enable \
+        --paged_kv_cache enable \
+        --gemm_plugin float16 \
+        --workers "${NUM_GPUS}" \
+        --output_dir "${ENGINES_DIR}"
+
     cd ${BASE_DIR}
 }
 
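In the hunk above, trtllm-build takes the world size from the converted checkpoint's tp_size, so --workers only controls how many per-rank engine builds run in parallel (subsuming the old --world_size/--parallel_build pair). A sketch of a post-build check, assuming trtllm-build's usual output of a config.json plus one engine file per rank:

    # Assumed layout: config.json, rank0.engine, ..., rank$((NUM_GPUS-1)).engine
    ls -l "${ENGINES_DIR}"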
@@ -172,18 +144,18 @@ function prepare_model_repository {
     replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt"
     replace_config_tags '${preprocessing_instance_count}' '1' "${MODEL_REPOSITORY}/preprocessing/config.pbtxt"
     replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt"
-    replace_config_tags '${tokenizer_type}' 'auto' "${MODEL_REPOSITORY}/preprocessing/config.pbtxt"
 
     replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/postprocessing/config.pbtxt"
     replace_config_tags '${postprocessing_instance_count}' '1' "${MODEL_REPOSITORY}/postprocessing/config.pbtxt"
     replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_REPOSITORY}/postprocessing/config.pbtxt"
-    replace_config_tags '${tokenizer_type}' 'auto' "${MODEL_REPOSITORY}/postprocessing/config.pbtxt"
 
     replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
     replace_config_tags '${decoupled_mode}' 'true' "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
     replace_config_tags '${max_queue_delay_microseconds}' "1000000" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
     replace_config_tags '${batching_strategy}' 'inflight_fused_batching' "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
     replace_config_tags '${engine_dir}' "${ENGINES_DIR}" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
+    replace_config_tags '${triton_backend}' "tensorrtllm" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
+    replace_config_tags '${max_queue_size}' "0" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
 }
 
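replace_config_tags is defined earlier in this script, outside the hunks shown here; it is effectively an in-place substitution over the config.pbtxt templates. For context, a minimal sketch of what such a helper presumably looks like (an assumption; the real definition may differ):

    # Sketch (assumption): swap a template placeholder for a concrete value, in place.
    function replace_config_tags {
        local tag_to_replace="$1"
        local new_value="$2"
        local config_file="$3"
        sed -i "s|${tag_to_replace}|${new_value}|g" "${config_file}"
    }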
 # Wait until server health endpoint shows ready. Sets WAIT_RET to 0 on
@@ -244,13 +216,12 @@ function kill_server {
 
 upgrade_openmpi
 clone_tensorrt_llm_backend_repo
-install_tensorrt_llm
 build_gpt2_base_model
 build_gpt2_tensorrt_engine
 prepare_model_repository
 
 # Install perf_analyzer
-pip3 install tritonclient nvidia-ml-py3
+pip3 install tritonclient
 
 ARCH="amd64"
 STATIC_BATCH=1
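Once the server is up, the benchmark drives the model with perf_analyzer, installed via the tritonclient package above. The actual invocation falls outside this diff; a hypothetical example using standard perf_analyzer flags against the decoupled model, where the endpoint and the input-data file are assumptions:

    # Hypothetical invocation; decoupled models require gRPC with async streaming.
    perf_analyzer -m "${MODEL_NAME}" -u localhost:8001 -i grpc --async --streaming \
        --input-data input_data.json --measurement-interval 10000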